# Fine Tuning Transformer for MultiClass Text Classification

In [1]:
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score


In [2]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Import the csv into pandas dataframe and add the headers
train = pd.read_csv('/content/drive/MyDrive/ds_data/train_prepro.csv')
test = pd.read_csv('/content/drive/MyDrive/ds_data/dev_prepro.csv')

train = train[['0','1']]
test = test[['0','1']]
encode_dict = {}
def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

sets = [train, test]
for df in sets:
  df.rename(columns={'1':'CATEGORY', '0':'TEXT'}, inplace=True)
  df.dropna(inplace=True)
  df['ENCODE_CAT'] = df['CATEGORY'].apply(lambda x: encode_cat(x))

train.tail()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'1':'CATEGORY', '0':'TEXT'}, inplace=True)


Unnamed: 0,TEXT,CATEGORY,ENCODE_CAT
162185,projektant pekovi izvijestio je gradona elnika...,hr,4
162186,as declara es de jupp foram posteriormente div...,pt-PT,6
162187,la puesta en marcha de lanbide es consecuencia...,es-ES,2
162188,la asociaci n brasile a de la industria produc...,es-AR,1
162189,premijer i ministar unutra njih poslova ivica ...,bs,0


In [5]:
test.ENCODE_CAT.value_counts()

2     2000
7     1984
5     1981
0     1970
1     1965
4     1962
6     1942
3     1782
8     1528
11    1053
9     1017
10     932
Name: ENCODE_CAT, dtype: int64

In [6]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 6
VALID_BATCH_SIZE = 2
EPOCHS = 2
LEARNING_RATE = 1e-05
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [7]:
class BERTMulti(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        text = str(self.data.TEXT[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.ENCODE_CAT[index], dtype=torch.long)
        }

    def __len__(self):
        return self.len

#### Creating the dataset and dataloader for the neural network

In [8]:
test_dataset= test.reset_index(drop=True)
train_dataset = train.reset_index(drop=True)

print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = BERTMulti(train_dataset, tokenizer, MAX_LEN)
testing_set = BERTMulti(test_dataset, tokenizer, MAX_LEN)

TRAIN Dataset: (162190, 3)
TEST Dataset: (20116, 3)


In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
class BERTMultiClass(torch.nn.Module):
    def __init__(self):
        super(BERTMultiClass, self).__init__()
        self.l1 = AutoModel.from_pretrained(model_name)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 12)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [11]:
model = BERTMultiClass()
model.to(device)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

BERTMultiClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [12]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [13]:
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [14]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    all_preds = []
    all_targets = []
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        all_preds.extend(big_idx.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    f1 = f1_score(all_targets, all_preds, average='weighted')

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
    print(f"Training F1 Score: {f1}")

    return

In [15]:
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 5000 steps: 2.4481234550476074
Training Accuracy per 5000 steps: 16.666666666666668
Training Loss per 5000 steps: 0.8639397441929662
Training Accuracy per 5000 steps: 59.89802039592082
Training Loss per 5000 steps: 0.732766091772919
Training Accuracy per 5000 steps: 66.81498516814985
Training Loss per 5000 steps: 0.6684858259157566
Training Accuracy per 5000 steps: 70.09088283003355
Training Loss per 5000 steps: 0.6241595958113729
Training Accuracy per 5000 steps: 72.3872139726347
Training Loss per 5000 steps: 0.5915529882254984
Training Accuracy per 5000 steps: 74.03903843846246
The Total Accuracy for Epoch 0: 74.6254392995869
Training Loss Epoch: 0.5800709958867917
Training Accuracy Epoch: 74.6254392995869
Training F1 Score: 0.744745695834282
Training Loss per 5000 steps: 0.5788543820381165
Training Accuracy per 5000 steps: 66.66666666666667




Training Loss per 5000 steps: 0.3739815716925205
Training Accuracy per 5000 steps: 84.7997067253216
Training Loss per 5000 steps: 0.3724369057609451
Training Accuracy per 5000 steps: 84.79818684798187
Training Loss per 5000 steps: 0.3678821416150465
Training Accuracy per 5000 steps: 84.93211563673533
Training Loss per 5000 steps: 0.3657464526810812
Training Accuracy per 5000 steps: 85.07574621268937
Training Loss per 5000 steps: 0.363180726508285
Training Accuracy per 5000 steps: 85.19125901630602
The Total Accuracy for Epoch 1: 85.27097848202725
Training Loss Epoch: 0.36163898174933545
Training Accuracy Epoch: 85.27097848202725
Training F1 Score: 0.8521772693835411


### Validating the Model




In [16]:
def valid(model, testing_loader):
    model.eval()
    tr_loss = 0.0
    nb_tr_steps = 0.0
    nb_tr_examples = 0.0
    all_preds = []
    all_targets = []
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            all_preds.extend(big_idx.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")

    epoch_f1 = f1_score(all_targets, all_preds, average='weighted')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    print(f"Validation F1 Score: {epoch_f1}")


    return epoch_f1, epoch_accu


In [17]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')
f1, acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)
print("F1 on test data = %0.2f%%" % f1)

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Validation Loss per 100 steps: 0.0029999681282788515
Validation Accuracy per 100 steps: 100.0




Validation Loss per 100 steps: 0.5003047602223029
Validation Accuracy per 100 steps: 79.61407718456309
Validation Loss per 100 steps: 0.4928196513259778
Validation Accuracy per 100 steps: 80.23197680231976
Validation Loss Epoch: 0.49318093158466086
Validation Accuracy Epoch: 80.22469675879897
Validation F1 Score: 0.7987893412302686
Accuracy on test data = 80.22%
F1 on test data = 0.80%



### Saving the Trained Model + Vocab for inference


In [18]:
output_model_file = '/content/drive/MyDrive/ds_data/models/pytorch_bertmulti.bin'
output_vocab_file = '/content/drive/MyDrive/ds_data/models/vocab_bertmulti.bin'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

Exception: ignored