# Fine Tuning Transformer for Similar South-Slavic Languages (bert-multilingual)

In [None]:
import pandas as pd
from sklearn.metrics import f1_score
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AutoModelForPreTraining, ElectraForSequenceClassification, ElectraTokenizer
import torch.nn as nn



In [None]:
# Choose the compute device: use the GPU when CUDA is available, else the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

<a id='section01'></a>
### Train and test set (DataLoader)
In this subtask, we consider only the group "bks" (German: bosnisch-kroatisch-serbisch), which includes the closely related languages Bosnian, Croatian, and Serbian. For further reading on the challenges and the similarities between these languages, see this [Wiki article](https://en.wikipedia.org/wiki/Comparison_of_standard_Bosnian,_Croatian,_Montenegrin_and_Serbian)

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the files referenced
# under /content/drive/MyDrive in the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the preprocessed train/test CSV files into pandas dataframes.
train = pd.read_csv('/content/drive/MyDrive/ds_data/train_prepro.csv')
test = pd.read_csv('/content/drive/MyDrive/ds_data/test_prepro.csv')
#train = pd.read_csv('dialect-identification/data/all/train_prepro.csv')
#test = pd.read_csv('dialect-identification/data/all/test_prepro.csv')

# Keep only the rows of the "bks" group and the two columns used downstream.
# Selecting rows and columns in one .loc call and taking an explicit copy
# yields independent frames, so adding a column to them later does not write
# into a slice of the loaded data (which triggers SettingWithCopyWarning).
train = train.loc[train['GROUP'] == 'bks', ['TEXT', 'VARIETY']].copy()
test = test.loc[test['GROUP'] == 'bks', ['TEXT', 'VARIETY']].copy()
# Maps each label to a consecutive integer id, in order of first appearance.
encode_dict = {}


def encode_cat(x):
    """Return the integer id of label ``x``, registering it on first sight.

    Ids are handed out consecutively starting at 0 and are remembered in the
    module-level ``encode_dict``, so repeated calls with the same label always
    return the same id.

    Args:
        x: hashable label value (e.g. a variety code string).

    Returns:
        The integer id associated with ``x``.
    """
    # len(encode_dict) is evaluated before the insertion, so a new label
    # receives the next free id; an existing label keeps its stored id.
    return encode_dict.setdefault(x, len(encode_dict))

# Add the integer label column to both splits. Both go through the same
# encode_cat mapping, so a given VARIETY value gets the same id in each frame.
sets = [train, test]
for df in sets:
    df['ENCODE_CAT'] = df['VARIETY'].apply(encode_cat)

train.tail()

Unnamed: 0,TEXT,VARIETY,ENCODE_CAT
162174,Na kraju ćete se osjećati mnogo bolje i svježi...,bs,0
162175,Prema Barišićevom iskazu dok je na čelu strank...,hr,1
162180,Guverner Rohatinski ima potrebu pobjeći od pro...,hr,1
162185,Projektant Peković izvijestio je gradonačelnik...,hr,1
162189,Premijer i ministar unutrašnjih poslova Ivica ...,bs,0


In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512  # length every tokenized text is padded/truncated to
TRAIN_BATCH_SIZE = 6  # batch size of the training DataLoader
VALID_BATCH_SIZE = 2  # batch size of the testing DataLoader
EPOCHS = 3  # number of passes over the training set
LEARNING_RATE = 2e-05  # learning rate handed to the optimizer
# Checkpoint name shared by the tokenizer and the encoder inside the classifier.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

#### *DataPrep* Dataset Class
- This class is defined to accept the Dataframe as input and generate tokenized output that is used by the bert-base-multilingual-cased model for training.
- We are using the bert-base-multilingual-cased tokenizer to tokenize the data in the `TEXT` column of the dataframe.
- The tokenizer uses the `encode_plus` method to perform tokenization and generate the necessary outputs, namely: `ids`, `attention_mask`

In [None]:
class DataPrep(Dataset):
    """Torch dataset that tokenizes the ``TEXT`` column of a dataframe on access.

    Args:
        dataframe: frame with a ``TEXT`` column and an integer ``ENCODE_CAT``
            label column.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask`` (e.g. a Hugging Face tokenizer).
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sample at row position ``index``.

        Returns:
            Dict with ``ids`` (token ids), ``mask`` (attention mask) and
            ``targets`` (label id), each a ``torch.long`` tensor.
        """
        # Positional access, so the dataset also works on a frame whose index
        # was not reset after filtering (label lookup would raise KeyError).
        text = str(self.data['TEXT'].iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Replaces the deprecated pad_to_max_length=True.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data['ENCODE_CAT'].iloc[index], dtype=torch.long)
        }

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return self.len

#### Dataloader
- The DataLoader is used to create the training and testing loaders, which feed the data to the neural network in batches, since we may run out of system memory when trying to feed the whole training set into the network at once.
- This is controlled by parameters such as `batch_size` and `max_len`.
- Training and Testing dataloaders are used in the training and validation part of the flow respectively

In [None]:
# Renumber both frames from 0 so the rows are indexed 0..n-1 after filtering.
train_dataset = train.reset_index(drop=True)
test_dataset = test.reset_index(drop=True)

print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

# Wrap each frame in a dataset that tokenizes its rows on access.
training_set = DataPrep(train_dataset, tokenizer, MAX_LEN)
testing_set = DataPrep(test_dataset, tokenizer, MAX_LEN)

TRAIN Dataset: (55989, 3)
TEST Dataset: (5916, 3)


In [None]:
# Training batches are drawn in a new random order on every pass.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}

# Evaluation does not need shuffling; a fixed order keeps the per-batch
# evaluation sequence the same from run to run.
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0,
}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

<a id='section04'></a>
### Classifier (Neural Network) for Fine Tuning

#### Neural Network
 - We will be creating a neural network with the `BERTMultiClass`.
 - This network will have the pre-trained "bert-base-multilingual-cased" model followed by a `dropout` and finally a `Linear` layer to obtain the final outputs.
 - The data will be then fed to the pretrained "bert-base-multilingual-cased".

In [None]:
# Creating the customized model, by adding a dense layer, a dropout and a final linear layer
# on top of the pretrained encoder named by `model_name` to get the final output for the model.

class BERTMultiClass(torch.nn.Module):
    """Pretrained encoder followed by a small classification head.

    Args:
        num_classes: number of output logits. Defaults to 12, the value that
            was previously hard-coded.
            NOTE(review): the "bks" subset likely has fewer labels than 12 —
            consider passing ``len(encode_dict)``; confirm before changing.
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, num_classes=12, dropout=0.3):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name)
        # Read the width from the loaded encoder instead of hard-coding 768.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw class logits for a batch of encoded texts.

        Args:
            input_ids: token id tensor passed to the encoder.
            attention_mask: attention mask tensor passed to the encoder.

        Returns:
            Output of the final linear layer (unnormalized scores per class).
        """
        encoder_output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = encoder_output[0]
        # Use the state at sequence position 0 as the summary of the text.
        pooled = hidden_state[:, 0]
        pooled = self.pre_classifier(pooled)
        # Functional ReLU avoids building a new module on every forward pass.
        pooled = torch.relu(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)


In [None]:
# Build the classifier (this loads the pretrained encoder weights).
model = BERTMultiClass()
# Move the model to the selected device; left as a bare expression so the
# notebook displays the module structure.
model.to(device)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

BERTMultiClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

We will use some common loss and optimizer functions from the torch library.
 - The `Loss Function` is used to calculate the difference between the output produced by the model and the actual output.
 - `Optimizer` is used to update the weights of the neural network to improve its performance.

In [None]:
# Training objective (cross-entropy on the model outputs) and the optimizer
# that updates every model parameter with the configured learning rate.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

<a id='section05'></a>
### Fine Tuning our Model

First we define the functions for training the model and measuring its accuracy. The model will take our "key variables" defined before as input parameters.  

To get another insight about its performance, we measure the weighted F1 score (even though the target classes are balanced)

In [None]:
# Helper used for the accuracy computation: counts matching predictions.

def calcuate_accu(big_idx, targets):
    """Return how many entries of ``big_idx`` equal the matching ``targets`` entry.

    The result is a count, not a ratio; callers divide it by the number of
    examples to obtain the accuracy.
    """
    matches = torch.eq(big_idx, targets)
    return int(matches.sum())

In [None]:
def train(epoch):
    """Run one training pass over ``training_loader`` and print its metrics.

    Relies on the module-level ``model``, ``optimizer``, ``loss_function``,
    ``training_loader`` and ``device``.

    Args:
        epoch: epoch number; only used in the printed summary.

    Returns:
        Tuple ``(epoch_loss, epoch_accu, f1)``: mean batch loss, accuracy in
        percent and weighted F1 score over the whole pass.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    all_preds = []
    all_targets = []
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest logit. detach() keeps the
        # bookkeeping out of the autograd graph (replaces the legacy `.data`).
        big_idx = outputs.detach().argmax(dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        all_preds.extend(big_idx.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        # Running averages since the start of the epoch, reported every 5000
        # batches (the first report therefore covers a single batch).
        if step % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        # Clear stale gradients, back-propagate this batch, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    f1 = f1_score(all_targets, all_preds, average='weighted')

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
    print(f"Training F1 Score: {f1}")

    return epoch_loss, epoch_accu, f1

In [17]:
# Fine-tune for EPOCHS passes over the training data; train() prints the
# running and per-epoch metrics itself.
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 5000 steps: 2.5304934978485107
Training Accuracy per 5000 steps: 0.0
Training Loss per 5000 steps: 0.6108342436650852
Training Accuracy per 5000 steps: 72.65213623941878
The Total Accuracy for Epoch 0: 76.32570683527122
Training Loss Epoch: 0.5348726041602834
Training Accuracy Epoch: 76.32570683527122
Training F1 Score: 0.7622594656582746




Training Loss per 5000 steps: 0.06851912289857864
Training Accuracy per 5000 steps: 100.0
Training Loss per 5000 steps: 0.36412003814980054
Training Accuracy per 5000 steps: 85.08298340331933
The Total Accuracy for Epoch 1: 85.32211684438015
Training Loss Epoch: 0.35847797532716785
Training Accuracy Epoch: 85.32211684438015
Training F1 Score: 0.8527197454698576




Training Loss per 5000 steps: 0.22089707851409912
Training Accuracy per 5000 steps: 100.0
Training Loss per 5000 steps: 0.2778156732619286
Training Accuracy per 5000 steps: 88.96554022528828
The Total Accuracy for Epoch 2: 88.76743646073336
Training Loss Epoch: 0.27937847187596787
Training Accuracy Epoch: 88.76743646073336
Training F1 Score: 0.8874160877025972


<a id='section06'></a>
### Validating our model

During the validation stage we pass the unseen data (testing dataset) to the model. This step determines how well the model performs on unseen data. The results help us understand how well the model generalizes, for example whether we over- or underfitted it.


In [18]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and print loss, accuracy and F1.

    Relies on the module-level ``loss_function`` and ``device``.

    Args:
        model: module called as ``model(ids, mask)`` that returns class logits.
        testing_loader: iterable of batches with ``ids``, ``mask`` and
            ``targets`` tensors.

    Returns:
        Tuple ``(epoch_f1, epoch_accu)``: weighted F1 score and accuracy in
        percent over all batches.
    """
    model.eval()
    tr_loss = 0.0
    nb_tr_steps = 0
    nb_tr_examples = 0
    n_correct = 0
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for step, data in enumerate(testing_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)
            # No .squeeze() here: it would drop the batch dimension of a
            # batch holding a single example and break the loss call and the
            # dim=1 reduction below.
            outputs = model(ids, mask)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_idx = outputs.argmax(dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            all_preds.extend(big_idx.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            # Running averages, reported every 5000 batches (the message now
            # states the interval that is actually used).
            if step % 5000 == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(f"Validation Loss per 5000 steps: {loss_step}")
                print(f"Validation Accuracy per 5000 steps: {accu_step}")
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    epoch_f1 = f1_score(all_targets, all_preds, average='weighted')
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    print(f"Validation F1 Score (weighted): {epoch_f1}")

    return epoch_f1, epoch_accu


In [19]:
# Score the fine-tuned model on the held-out test split and report the result.
f1, acc = valid(model, testing_loader)
print(f"Accuracy on test data = {acc:0.2f}%")
print(f"Validation F1 Score (weighted) on test data: {f1}")

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Validation Loss per 100 steps: 0.002268342301249504
Validation Accuracy per 100 steps: 100.0




Validation Loss Epoch: 0.36708161642447024
Validation Accuracy Epoch: 85.39553752535497
Validation F1 Score (weighted): 0.8536611742923818
Accuracy on test data = 85.40%
Validation F1 Score (weighted) on test data: 0.8536611742923818


<a id='section07'></a>
### Saving the Trained Model Artifacts for inference

In [21]:
# Saving the files for re-use

# NOTE(review): the file names say "bertic", while the model trained in this
# notebook is the one named by `model_name` — confirm the names are intended.
output_model_file = '/content/drive/MyDrive/ds_data/models/pytorch_bertic_3.bin'
# Despite the .bin suffix this path is used as a directory: save_pretrained()
# writes the tokenizer files into it.
output_vocab_file = '/content/drive/MyDrive/ds_data/models/vocab_bertic_3.bin'

model_to_save = model
# Saves the whole module object (not just a state_dict), so loading it later
# needs the BERTMultiClass class definition to be available.
torch.save(model_to_save, output_model_file)
tokenizer.save_pretrained(output_vocab_file)

('/content/drive/MyDrive/ds_data/models/vocab_bertic_3.bin/tokenizer_config.json',
 '/content/drive/MyDrive/ds_data/models/vocab_bertic_3.bin/special_tokens_map.json',
 '/content/drive/MyDrive/ds_data/models/vocab_bertic_3.bin/vocab.txt',
 '/content/drive/MyDrive/ds_data/models/vocab_bertic_3.bin/added_tokens.json',
 '/content/drive/MyDrive/ds_data/models/vocab_bertic_3.bin/tokenizer.json')