# Importing the libraries and setting google drive path

In [None]:
!pip install Sentencepiece
!pip install transformers

Collecting Sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/ac/aa/1437691b0c7c83086ebb79ce2da16e00bef024f24fec2a5161c35476f499/sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 4.3MB/s 
[?25hInstalling collected packages: Sentencepiece
Successfully installed Sentencepiece-0.1.96
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 4.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 22.9MB/s 
Collecting

In [None]:
import os
import sys
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import math
import numpy as np
import pandas as pd
import spacy

In [None]:
import sentencepiece
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

## Checking the state of hardware accelerator

In [None]:
# Pick the accelerator once; later cells call `.to(device)` with this name.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


## Mount the google drive

In [None]:
# Uses the google.colab helper to mount Google Drive at /content/gdrive; the
# dataset directory used below lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# listing all the files in the data directory
# (expanduser only rewrites a leading "~", so this absolute path comes back unchanged)
downloadPath = os.path.expanduser('/content/gdrive/MyDrive/QG_dataset')
# Bare last expression: the notebook shows the directory listing as the cell output.
os.listdir(downloadPath)

['Dataset.csv',
 'qg_valid.csv',
 'qg_train.csv',
 'qa_eval_train.csv',
 'qa_eval_valid.csv']

# T5 transformer model training and hyper-parameter fine-tuning

## Setting the model name and additional training parameters

In [None]:
# Number of samples per batch handed to the DataLoaders.
batchSize = 4
# max_length used when tokenizing both the input text and the target question.
seqLength = 512
# Data directory; the tokenizer files and the model checkpoint are also written here.
DIR = downloadPath
# Checkpoint name passed to from_pretrained for both the tokenizer and the model.
pretrainedModel = 't5-small'

# Tokenizer matching the pretrained checkpoint.
tokenizer = T5Tokenizer.from_pretrained(pretrainedModel)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




## Adding additional tokens to the tokenizer and defining the QGDataset class

In [None]:
# Register the two marker tokens with the tokenizer; the model's embedding
# table is resized to len(tokenizer) when the model is created.
# NOTE(review): presumably these markers delimit the answer and the context
# inside the `text` column — confirm against the CSV files.
tokenizer.add_special_tokens(
    {'additional_special_tokens': ['<answer>', '<context>']}
)

class QGDataset(Dataset):
    """Question-generation dataset backed by a CSV file.

    Each item is an ``(encodedText, encodedQuestion)`` pair of tokenizer
    outputs, both padded/truncated to ``seqLength`` tokens and moved to
    ``device``.  Relies on the notebook-level ``tokenizer``, ``seqLength``
    and ``device``.

    The CSV must provide ``text`` and ``question`` columns.  The first column
    is dropped before the lookup (presumably a saved index column — confirm
    against the file).
    """

    def __init__(self, csv):
        """Read the CSV at path ``csv`` into a DataFrame kept on the instance."""
        self.df = pd.read_csv(csv, engine='python')

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def _encode(self, text):
        """Tokenize ``text`` to exactly ``seqLength`` tokens and drop the batch axis."""
        encoded = tokenizer(
            text,
            # `padding='max_length'` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding = 'max_length',
            max_length = seqLength,
            truncation = True,
            return_tensors = 'pt'
        )
        # The tokenizer returns (1, seqLength) tensors; squeeze both fields so
        # the DataLoader collates them to (batch, seqLength).
        encoded['input_ids'] = torch.squeeze(encoded['input_ids'])
        encoded['attention_mask'] = torch.squeeze(encoded['attention_mask'])
        return encoded

    def __getitem__(self, idx):
        """Return the encoded ``(text, question)`` pair for row ``idx``, on ``device``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Skip the first column; 'text' and 'question' are looked up by name.
        row = self.df.iloc[idx, 1:]

        encodedText = self._encode(row['text'])
        encodedQuestion = self._encode(row['question'])

        return (encodedText.to(device), encodedQuestion.to(device))

# Build the datasets and their loaders. Note that DataLoader's keyword is
# `batch_size` (snake_case), unlike this notebook's camelCase variable name.
# Only the training data is shuffled; validation keeps file order.
trainSet = QGDataset(os.path.join(DIR, 'qg_train.csv'))
trainLoader = DataLoader(trainSet, batch_size = batchSize, shuffle = True)
validSet = QGDataset(os.path.join(DIR, 'qg_valid.csv'))
validLoader = DataLoader(validSet, batch_size = batchSize, shuffle = False)

## Training the model with the pretrained T5-small model along with the tokenizer

In [None]:
# Setting the configuration of the T5 transformer model.
# from_pretrained is a classmethod, so the config has to be passed to it
# explicitly: load the checkpoint's own config, override the decoder start
# token on it, then load the pretrained weights with that config.
config = T5Config.from_pretrained(
    pretrainedModel,
    decoder_start_token_id=tokenizer.pad_token_id
)
model = T5ForConditionalGeneration.from_pretrained(pretrainedModel, config=config)
model.resize_token_embeddings(len(tokenizer)) # to account for new special tokens
model = model.to(device)

# Setting the learning rate, epoch and log interval parameters
logInterval = 5000   # batches between progress logs / in-epoch checkpoints
epochs = 10
learningRate = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr = learningRate)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242065649.0, style=ProgressStyle(descri…




Saving the tokenizer files to the drive

In [None]:
# Write the tokenizer files into DIR/Tokenizer_T5; the returned tuple of file
# paths is displayed as the cell output.
# NOTE(review): the recorded output below lists .../QG_dataset/Tokenizer/...,
# not Tokenizer_T5 — the saved output looks stale relative to this code;
# confirm which directory name is intended.
tokenizer.save_pretrained(os.path.join(DIR, 'Tokenizer_T5'))

('/content/gdrive/MyDrive/QG_dataset/Tokenizer/tokenizer_config.json',
 '/content/gdrive/MyDrive/QG_dataset/Tokenizer/special_tokens_map.json',
 '/content/gdrive/MyDrive/QG_dataset/Tokenizer/spiece.model',
 '/content/gdrive/MyDrive/QG_dataset/Tokenizer/added_tokens.json')

## Defining the training, label-padding mask and evaluation functions, along with saving and loading the best model checkpoint

In [None]:
# Single checkpoint file inside DIR; both the periodic saves in train() and
# the best-model saves in the epoch loop write to this same path.
savedModelName = "qg_pretrained_t5_model_trained.pth"
savedModelPath = os.path.join(DIR, savedModelName)

def train(epoch, bestValLoss):
    """Run one training epoch over ``trainLoader``.

    Uses the notebook-level ``model``, ``optimizer``, ``trainLoader``,
    ``logInterval`` and ``savedModelPath``.

    Args:
        epoch: Epoch number; used in the log line and stored in the checkpoint.
        bestValLoss: Value stored under ``'best_loss'`` in the periodic
            checkpoint; it does not influence the optimisation itself.

    Returns:
        None.
    """
    model.train()
    totalLoss = 0.
    for batchIndex, batch in enumerate(trainLoader):
        data, target = batch
        optimizer.zero_grad()
        masked_labels = mask_label_padding(target['input_ids'])
        output = model(
            input_ids=data['input_ids'],
            attention_mask=data['attention_mask'],
            labels=masked_labels
        )
        # With `labels` supplied, the first element of the output is the loss.
        loss = output[0]
        # Back-propagate before clipping and stepping: without gradients,
        # optimizer.step() has nothing to apply and the weights never change.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        totalLoss += loss.item()
        if batchIndex % logInterval == 0 and batchIndex > 0:
            curLoss = totalLoss / logInterval
            print('| Epoch {:3d} | {:5d}/{:5d} Batches | loss {:7.2f}'.format(epoch, batchIndex, len(trainLoader), curLoss))
            # NOTE(review): this periodic save writes to the same path as the
            # best-model checkpoint, so a mid-epoch state can overwrite the
            # best weights on disk — consider a separate path.
            save(
                savedModelPath,
                epoch,
                model.state_dict(),
                optimizer.state_dict(),
                bestValLoss
            )
            totalLoss = 0

In [None]:
def evaluate(model, data_loader):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.  Each batch must unpack into ``(data, target)`` mappings that
    provide ``input_ids`` (and ``attention_mask`` for the data side).

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; the first element of its output is taken as the loss.
        data_loader: Iterable of batches; its ``len`` is the divisor.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.eval()
    runningLoss = 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            labels = mask_label_padding(targets['input_ids'])
            result = model(
                input_ids = inputs['input_ids'],
                attention_mask = inputs['attention_mask'],
                labels = labels
            )
            runningLoss += result[0].item()
    batchCount = len(data_loader)
    return runningLoss / batchCount

In [None]:
def load(path, map_location=None):
    """Load an object previously written with ``torch.save``.

    Args:
        path: File to read.
        map_location: Forwarded to ``torch.load``.  Pass e.g. ``'cpu'`` (or the
            notebook's ``device``) to open a checkpoint that was saved on a GPU
            from a runtime without one.  ``None`` keeps torch's default of
            restoring tensors to the device they were saved from.

    Returns:
        Whatever object was stored at ``path``.
    """
    return torch.load(path, map_location=map_location)

In [None]:
def mask_label_padding(labels, pad_token_id=None, ignore_index=-100):
    """Return a copy of ``labels`` with padding positions replaced by the loss ignore index.

    Args:
        labels: Integer tensor of target token ids.
        pad_token_id: Id that marks padding.  Defaults to the notebook-level
            ``tokenizer.pad_token_id``.
        ignore_index: Replacement value.  -100 is the value that the
            Transformers models and ``torch.nn.CrossEntropyLoss`` skip when
            computing the loss; other negative values are treated as invalid
            class targets.

    Returns:
        A new tensor; the input tensor is left unchanged.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
    # Work on a copy so the batch's original input_ids keep their pad tokens.
    masked = labels.clone()
    masked[masked == pad_token_id] = ignore_index
    return masked

In [None]:
def save(path, epoch, modelStateDict, optimizerStateDict, loss):
    """Write a training checkpoint to ``path`` with ``torch.save``.

    Args:
        path: Destination file.
        epoch: Stored under ``'epoch'``.
        modelStateDict: Stored under ``'model_state_dict'``.
        optimizerStateDict: Stored under ``'optimizer_state_dict'``.
        loss: Stored under ``'best_loss'``.
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': modelStateDict,
        'optimizer_state_dict': optimizerStateDict,
        'best_loss': loss,
    }
    torch.save(checkpoint, path)

## Saving the best model based on validation loss and iterating over several epochs

In [None]:
# Baseline: validation loss of the model before any fine-tuning step.
valLoss = evaluate(model, validLoader)
print('| Before training | Valididation Loss {:3f}'.format(valLoss))

bestValLoss = float("inf")
bestModel = None

# range() excludes its stop value, so go up to epochs + 1 to run all `epochs` epochs.
for epoch in range(1, epochs + 1):
    # Hand train() the best loss so far (not the most recent one): it is what
    # the periodic checkpoint records under 'best_loss'.
    train(epoch, bestValLoss)
    valLoss = evaluate(model, validLoader)
    print('| End of the Epoch {:4d} | Valididaton Loss {:2f}'.format(epoch, valLoss))
    if valLoss <= bestValLoss:
        bestValLoss = valLoss
        # NOTE: this is an alias of the live model, not a snapshot; the best
        # weights are the ones written to savedModelPath just below.
        bestModel = model
        save(
             savedModelPath,
             epoch,
             model.state_dict(),
             optimizer.state_dict(),
             bestValLoss
        )
        print("| Model saved")
    



------------------------------------------------------------
| Before training | valid loss  4.65
------------------------------------------------------------
------------------------------------------------------------
| end of epoch   1 | valid loss  3.81
------------------------------------------------------------
| Model saved.
------------------------------------------------------------
------------------------------------------------------------
| end of epoch   2 | valid loss  3.66
------------------------------------------------------------
| Model saved.
------------------------------------------------------------
------------------------------------------------------------
| end of epoch   3 | valid loss  3.56
------------------------------------------------------------
| Model saved.
------------------------------------------------------------
------------------------------------------------------------
| end of epoch   4 | valid loss  3.51
----------------------------------