# Importing and loading the libraries 

In [None]:
# Install the SentencePiece tokenizer backend and the Hugging Face transformers library.
# The recorded output of this cell resolved sentencepiece 0.1.96 and transformers 4.8.2.
!pip install Sentencepiece
!pip install transformers

Collecting Sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/ac/aa/1437691b0c7c83086ebb79ce2da16e00bef024f24fec2a5161c35476f499/sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 8.5MB/s 
[?25hInstalling collected packages: Sentencepiece
Successfully installed Sentencepiece-0.1.96
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 8.8MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac

In [None]:
# Install/upgrade spaCy with its `cuda92` extra and download the small English pipeline.
!pip install -U spacy[cuda92]
!python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm
# Ask spaCy to run on the GPU if it can; the return value is not checked here.
spacy.prefer_gpu()
# Shared pipeline instance; evalSet.irrelevant() uses it to find entities in questions.
spacy_nlp = en_core_web_sm.load()

Collecting spacy[cuda92]
[?25l  Downloading https://files.pythonhosted.org/packages/c1/da/61f934c6ae177a291c77246ef91a78cab44a2d76f79e6892ca7b17571adf/spacy-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4MB)
[K     |████████████████████████████████| 6.4MB 6.8MB/s 
Collecting spacy-legacy<3.1.0,>=3.0.7
  Downloading https://files.pythonhosted.org/packages/d3/e8/1bc00eeff3faf1c50bde941f88a491a5c1128debb75dd8c913401e71585c/spacy_legacy-3.0.8-py2.py3-none-any.whl
Collecting srsly<3.0.0,>=2.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/c3/84/dfdfc9f6f04f6b88207d96d9520b911e5fec0c67ff47a0dea31ab5429a1e/srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl (456kB)
[K     |████████████████████████████████| 460kB 48.9MB/s 
Collecting typer<0.4.0,>=0.3.0
  Downloading https://files.pythonhosted.org/packages/90/34/d138832f6945432c638f32137e6c79a3b682f06a63c488dcfaca6b166c64/typer-0.3.2-py3-none-any.whl
Collecting pathy>=0.3.5
[?25l  Downloading https://files.p

In [None]:
import os
import sys
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
import sentencepiece
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import BertConfig

## Checking the state of hardware accelerator

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


## Mount the Google drive

In [None]:
# Mount Google Drive at /content/gdrive so the paths built from DIR are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# BERT transformer model training and hyper-parameter fine-tuning

## Setting the model name and additional training parameters

In [None]:
# Hugging Face checkpoint name shared by the tokenizer and the classifier.
pretrainedModel = 'bert-base-cased'
# Mini-batch size handed to the training and validation data loaders.
batchSize = 16
# Every question/answer pair is padded or truncated to this many tokens.
seqLength = 512
# Drive folder holding the CSV splits, the saved tokenizer files and the checkpoint.
DIR = os.path.expanduser('/content/gdrive/MyDrive/QG_dataset/')
# Tokenizer matching the pretrained checkpoint; used by evalSet.__getitem__.
tokenizer = BertTokenizer.from_pretrained(pretrainedModel)

## Defining the functions of evalSet class

In [None]:
class evalSet(Dataset):
    """Question/answer pairs for training a binary QA-relevance classifier.

    Each row of the CSV is expected to unpack into exactly three values,
    ``(<ignored>, question, answer)``. On every access a label is drawn at
    random: label 1 keeps the original pair, label 0 corrupts it with one of
    the transforms in ``self.transforms`` so the model sees a mismatched pair.
    """

    def __init__(self, csv):
        """Read the pairs from ``csv`` (a path or file-like object)."""
        self.df = pd.read_csv(csv, engine = 'python')
        # Corruptions applied to negative (label 0) examples.
        self.transforms = [self.shuffle, self.irrelevant]

    def __len__(self):
        """Return the number of question/answer rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(encoded pair, label)`` for row ``idx``, both moved to ``device``.

        The label is random, so the same index can yield a positive or a
        negative example on different calls.
        """
        _, question, answer = self.df.iloc[idx]
        label = random.choice([0, 1])

        if label == 0:
            question, answer = random.choice(self.transforms)(question, answer)

        encodedData = tokenizer(
            text = question,
            text_pair = answer,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding = 'max_length',
            max_length = seqLength,
            truncation = True,
            return_tensors = "pt"
        )

        # The tokenizer returns tensors with a leading batch dimension of 1;
        # drop it so the DataLoader can stack the samples itself.
        for key in ('input_ids', 'token_type_ids', 'attention_mask'):
            encodedData[key] = torch.squeeze(encodedData[key])
        return (encodedData.to(device), torch.tensor(label).to(device))

    def shuffle(self, question, answer, maxTries = 100):
        """Pair ``question`` with an answer taken from a random row.

        Draws rows until the sampled answer differs from ``answer``. The
        number of draws is bounded by ``maxTries`` so a dataset whose answers
        are all identical cannot loop forever; in that case (or when the
        dataset is empty) the pair is returned unchanged.
        """
        # The answer is the third column (see the unpacking in __getitem__).
        answers = self.df.iloc[:, 2]
        if len(answers) == 0:
            return question, answer
        for _ in range(maxTries):
            candidate = answers.iloc[random.randrange(len(answers))]
            if candidate != answer:
                return question, candidate
        return question, answer

    def irrelevant(self, question, answer):
        """Corrupt the pair using the named entities spaCy finds in ``question``.

        * Several entities: every entity in the question is replaced by one
          randomly chosen entity.
        * Exactly one entity: the answer is replaced by that entity.
        * No entities: fall back to ``shuffle`` so the negative example is
          still actually corrupted.
        """
        doc = spacy_nlp(question)
        if len(doc.ents) > 1:
            replaceEntity = str(random.choice(doc.ents))
            for ent in doc.ents:
                question = question.replace(str(ent), replaceEntity)
        elif len(doc.ents) == 1:
            answer = str(doc.ents[0])
        else:
            question, answer = self.shuffle(question, answer)
        return question, answer

# Build the training and validation datasets from the CSV splits on Drive.
# DataLoader's keyword argument is `batch_size`; passing `batchSize=` raises a TypeError.
trainSet = evalSet(os.path.join(DIR, 'qa_eval_train.csv'))
trainLoader = DataLoader(trainSet, batch_size = batchSize, shuffle = True)
validSet = evalSet(os.path.join(DIR, 'qa_eval_valid.csv'))
validLoader = DataLoader(validSet, batch_size = batchSize, shuffle = False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




## Loading the pretrained bert-base model and creating the optimizer

In [None]:
# Training schedule: number of epochs, batches between loss reports, SGD step size.
epochs, logInterval, learningRate = 10, 500, 0.001

# Load the pretrained classifier, place it on the selected device and
# optimise all of its parameters with plain SGD.
model = BertForSequenceClassification.from_pretrained(pretrainedModel).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = learningRate)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=762.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1338740706.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at 

Saving the tokenizer files to the drive

In [None]:
# Display the Drive folder the tokenizer files are written to in the next cell.
os.path.join(DIR, 'Tokenizer_Bert/')

'/content/gdrive/MyDrive/QG_dataset/Tokenizer_Bert/'

In [None]:
# Persist the tokenizer files to Drive so the tokenizer can be reloaded from there later.
tokenizer.save_pretrained(os.path.join(DIR, 'Tokenizer_Bert/'))

('/content/gdrive/MyDrive/QG_dataset/Tokenizer_Bert/tokenizer_config.json',
 '/content/gdrive/MyDrive/QG_dataset/Tokenizer_Bert/special_tokens_map.json',
 '/content/gdrive/MyDrive/QG_dataset/Tokenizer_Bert/vocab.txt',
 '/content/gdrive/MyDrive/QG_dataset/Tokenizer_Bert/added_tokens.json')

## Defining the training and evaluation functions, along with saving and loading the best model checkpoint

In [None]:
# File name and full Drive path of the checkpoint written by save() and read by load().
# NOTE(review): the name says "bert_large" while pretrainedModel is 'bert-base-cased' — confirm which checkpoint is intended.
modelName = 'qa_eval_model_trained_bert_large.pt'
modelPath = os.path.join(DIR, modelName)
modelPath

def train():
    """Run one training epoch over ``trainLoader``.

    Uses the notebook-level ``model``, ``optimizer``, ``trainLoader`` and
    ``logInterval``. The progress message also reads the notebook-level
    ``epoch`` variable set by the epoch loop, so this function must be called
    from inside that loop (or after ``epoch`` has been assigned).

    The mean loss of the last ``logInterval`` batches is printed every
    ``logInterval`` batches.
    """
    model.train()
    totalLoss = 0.
    for batchIndex, batch in enumerate(trainLoader):
        data, labels = batch
        optimizer.zero_grad()
        output = model(**data, labels = labels)
        # With labels supplied, the first element of the output is the loss.
        loss = output[0]
        loss.backward()
        # Clip the gradient norm to 0.5 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        totalLoss += loss.item()

        # Count completed batches (1-based) so every report averages exactly
        # `logInterval` losses; testing `batchIndex % logInterval == 0` made
        # the first window cover logInterval + 1 batches.
        batchesDone = batchIndex + 1
        if batchesDone % logInterval == 0:
            curLoss = totalLoss / logInterval
            print('| Epoch {:3d} | {:2d}/{:2d} Batches | loss {:3.2f}'.format(epoch, batchesDone, len(trainLoader), curLoss))
            totalLoss = 0.

In [None]:
def evaluate(model, dataLoader):
    """Return the classification accuracy of ``model`` over ``dataLoader``.

    Args:
        model: callable invoked as ``model(**data, labels=labels)`` whose
            output holds the logits at index 1.
        dataLoader: iterable of ``(data, labels)`` batches, where ``data`` is
            a mapping of keyword arguments for the model.

    Returns:
        The fraction of correctly classified samples as a Python float in
        ``[0, 1]``; ``0.0`` when the loader yields no samples.
    """
    model.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for data, labels in dataLoader:
            output = model(**data, labels = labels)
            preds = torch.argmax(output[1], dim = 1)
            correct += (preds == labels.to(preds.device)).sum().item()
            # Count real samples so a partial final batch does not deflate
            # the score (the old denominator assumed every batch was full).
            seen += labels.numel()
    if seen == 0:
        return 0.0
    return correct / seen

In [None]:
def load():
    """Load and return the checkpoint dictionary stored at ``modelPath``.

    The tensors are mapped onto the notebook's ``device`` so that a
    checkpoint saved from a GPU session can still be opened on a CPU-only
    runtime.
    """
    return torch.load(modelPath, map_location = device)

In [None]:
def save(epoch, modelStateDict, optimizerStateDict, loss):
    """Write a training checkpoint to ``modelPath`` and print a confirmation.

    The checkpoint is a dictionary with the keys ``epoch``,
    ``model_state_dict``, ``optimizer_state_dict`` and ``best_loss`` (the
    value passed as ``loss``).
    """
    checkpoint = dict(
        epoch = epoch,
        model_state_dict = modelStateDict,
        optimizer_state_dict = optimizerStateDict,
        best_loss = loss,
    )
    torch.save(checkpoint, modelPath)

    print("| Model saved")

## Saving the best model based on accuracy and iterating over several epochs

In [None]:
# Best validation accuracy seen so far; a checkpoint is written only when it improves.
highestAccuracy = 0

# evaluate() yields a fraction in [0, 1]; float() also unwraps a 0-d tensor.
# The `%` presentation type scales by 100, so 0.62 is reported as 62.00%
# rather than the misleading "0.62%".
accuracy = float(evaluate(model, validLoader))
print('| Before Training | Accuracy on Validation Set: {:.2%}'.format(accuracy))

for epoch in range(1, epochs + 1):
    train()
    accuracy = float(evaluate(model, validLoader))
    print('| End of Epoch {:4d} | Accuracy on Validation Set: {:.2%}'.format(epoch, accuracy))

    if accuracy > highestAccuracy:
        highestAccuracy = accuracy
        save(
             epoch,
             model.state_dict(),
             optimizer.state_dict(),
             highestAccuracy
        )



------------------------------------------------------------
| Before training | accuracy on valid set:  0.62%
------------------------------------------------------------
------------------------------------------------------------
| end of epoch   1 | accuracy on valid set:  0.67%
------------------------------------------------------------
| Model saved.
------------------------------------------------------------
------------------------------------------------------------
| end of epoch   2 | accuracy on valid set:  0.71%
------------------------------------------------------------
| Model saved.
------------------------------------------------------------
------------------------------------------------------------
| end of epoch   3 | accuracy on valid set:  0.67%
------------------------------------------------------------
------------------------------------------------------------
| end of epoch   4 | accuracy on valid set:  0.67%
---------------------------------------------