# Fine Tuning Transformer for Similar South-Slavic Languages using BERTic

In this subtask, we consider only the group "bks" (in german: bosnisch-kroatisch-serbisch) which include the similar languages: bosnian, croatian, and serbian. For further reading into the challenges and the similarities between those languages, I refer to this [Wiki article](https://en.wikipedia.org/wiki/Comparison_of_standard_Bosnian,_Croatian,_Montenegrin_and_Serbian)

In [44]:
import pandas as pd
from sklearn.metrics import f1_score
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AutoModelForPreTraining, ElectraForSequenceClassification, ElectraTokenizer
import torch.nn as nn

In [45]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [46]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [47]:
# Import the csv into pandas dataframe and add the headers
train = pd.read_csv('/content/drive/MyDrive/ds_data/train_prepro.csv')
test = pd.read_csv('/content/drive/MyDrive/ds_data/test_prepro.csv')
#train = pd.read_csv('dialect-identification/data/all/train_prepro.csv')
#test = pd.read_csv('dialect-identification/data/all/test_prepro.csv')

train = train[train['GROUP'] == 'bks']
test = test[test['GROUP'] == 'bks']


train = train[['TEXT','VARIETY']]
test = test[['TEXT','VARIETY']]
encode_dict = {}
def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

sets = [train, test]
for df in sets:
  df['ENCODE_CAT'] = df['VARIETY'].apply(lambda x: encode_cat(x))

train.tail()

Unnamed: 0,TEXT,VARIETY,ENCODE_CAT
162174,na kraju ete se osje ati mnogo bolje i svje ij...,bs,0
162175,prema bari i evom iskazu dok je na elu stranke...,hr,1
162180,guverner rohatinski ima potrebu pobje i od pro...,hr,1
162185,projektant pekovi izvijestio je gradona elnika...,hr,1
162189,premijer i ministar unutra njih poslova ivica ...,bs,0


In [48]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 6
VALID_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 2e-05
tokenizer = AutoTokenizer.from_pretrained("classla/bcms-bertic")
bertic_model = AutoModelForPreTraining.from_pretrained("classla/bcms-bertic")


In [49]:
class DataPrep(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        text = str(self.data.TEXT[index])
        text = " ".join(text.split())
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'ids': inputs['input_ids'].squeeze(),
            'mask': inputs['attention_mask'].squeeze(),
            'targets': torch.tensor(self.data.ENCODE_CAT[index], dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [50]:
# Creating the dataset and dataloader for the neural network
test_dataset= test.reset_index(drop=True)
train_dataset = train.reset_index(drop=True)

print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = DataPrep(train_dataset, tokenizer, MAX_LEN)
testing_set = DataPrep(test_dataset, tokenizer, MAX_LEN)

TRAIN Dataset: (55989, 3)
TEST Dataset: (5916, 3)


In [51]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

### Creating the Neural Network for Fine Tuning


In [52]:
class BERTicClass(torch.nn.Module):
    def __init__(self, model_name, num_labels):
        super(BERTicClass, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForPreTraining.from_pretrained(model_name, num_labels=num_labels)
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        if labels is not None:
            loss = self.loss_function(logits, labels)
            return loss
        else:
            return logits


In [53]:
model = BERTicClass(model_name="classla/bcms-bertic", num_labels=len(encode_dict))
model.to(device)

BERTicClass(
  (model): ElectraForPreTraining(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(32000, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0-11): 12 x ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linear(in_features=768, out_

In [54]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [55]:
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [56]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    all_preds = []
    all_targets = []
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        all_preds.extend(big_idx.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        #When using GPU
        optimizer.step()

    f1 = f1_score(all_targets, all_preds, average='weighted')

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
    print(f"Training F1 Score: {f1}")

    return

In [57]:
for epoch in range(EPOCHS):
    train(epoch)

Training Loss per 5000 steps: 13.040238380432129
Training Accuracy per 5000 steps: 0.0
Training Loss per 5000 steps: 0.8201020442378063
Training Accuracy per 5000 steps: 67.52649470105979
The Total Accuracy for Epoch 0: 69.61188804943828
Training Loss Epoch: 0.7694555768192068
Training Accuracy Epoch: 69.61188804943828
Training F1 Score: 0.71073095869026
Training Loss per 5000 steps: 0.4038846790790558
Training Accuracy per 5000 steps: 66.66666666666667
Training Loss per 5000 steps: 0.3729338141210274
Training Accuracy per 5000 steps: 85.49290141971606
The Total Accuracy for Epoch 1: 86.34910428834235
Training Loss Epoch: 0.3473982559818169
Training Accuracy Epoch: 86.34910428834235
Training F1 Score: 0.8686533483125927
Training Loss per 5000 steps: 0.18926799297332764
Training Accuracy per 5000 steps: 83.33333333333333
Training Loss per 5000 steps: 0.23474647588960723
Training Accuracy per 5000 steps: 91.05178964207158
The Total Accuracy for Epoch 2: 90.89821214881495
Training Loss Ep

### Validating the Model

During the validation stage we pass the unseen data(Testing Dataset) to the model. This step determines how good the model performs on the unseen data.

In [58]:
def valid(model, testing_loader):
    model.eval()
    tr_loss = 0.0
    nb_tr_steps = 0.0
    nb_tr_examples = 0.0
    n_correct = 0; n_wrong = 0; total = 0
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            all_preds.extend(big_idx.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    epoch_f1 = f1_score(all_targets, all_preds, average='weighted')
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    print(f"Validation F1 Score (weighted): {epoch_f1}")

    return epoch_f1, epoch_accu


In [59]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')
f1, acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)
print(f"Validation F1 Score (weighted) on test data: {f1}")

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Validation Loss per 100 steps: 0.7864134311676025
Validation Accuracy per 100 steps: 100.0
Validation Loss Epoch: 0.6835617088105231
Validation Accuracy Epoch: 84.80392156862744
Validation F1 Score (weighted): 0.8618189593675848
Accuracy on test data = 84.80%
Validation F1 Score (weighted) on test data: 0.8618189593675848


<a id='section07'></a>
### Saving the Trained Model Artifacts for inference

In [60]:
# Saving the files for re-use

#output_model_file = '/content/drive/MyDrive/ds_data/models/pytorch_bertmulti.bin'
#output_vocab_file = '/content/drive/MyDrive/ds_data/models/vocab_bertmulti.bin'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

Exception: ignored