<a href="https://colab.research.google.com/github/laskari/END-Program/blob/main/Week-9/Model_1_QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only step: mount Google Drive under /content/drive/ so the dataset
# files referenced by the paths below can be read like local files.
from google.colab import drive
drive.mount('/content/drive/')


Mounted at /content/drive/


In [2]:
import pandas as pd

# Locations of the three tab-separated question/answer dumps on the mounted Drive.
path = '/content/drive/MyDrive/END School of AI/Datasets/END Datasets/QA Dataset/question_answer_pairs.txt'
path1 = '/content/drive/MyDrive/END School of AI/Datasets/END Datasets/QA Dataset/question_answer_pairs (1).txt'
path2 = '/content/drive/MyDrive/END School of AI/Datasets/END Datasets/QA Dataset/question_answer_pairs (2).txt'

# Every file is parsed with the same separator and encoding.
read_options = dict(sep = '\t', encoding = 'ISO-8859-1')
df, df1, df2 = (pd.read_csv(source, **read_options) for source in (path, path1, path2))

In [3]:
# (rows, columns) of each raw frame, in the order df, df1, df2.
tuple(frame.shape for frame in (df, df1, df2))

((1715, 6), (1458, 6), (1458, 6))

In [4]:
# Stack the three frames row-wise. pd.concat replaces DataFrame.append, which
# was deprecated in pandas 1.4 and removed in pandas 2.0. The original row
# labels are kept (no ignore_index), exactly as append did.
data1 = pd.concat([df, df1, df2])
data1.shape

(4631, 6)

In [5]:
# Peek at the first three rows of the combined frame.
data1.iloc[:3]

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,data/set3/a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,data/set3/a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,data/set3/a4


In [6]:
# Keep only the two text columns, then one row per distinct question.
df_data = data1[['Question', 'Answer']].drop_duplicates(subset = ['Question'])
# Coerce every cell to str so that any non-string value becomes plain text.
ques = list(map(str, df_data['Question']))
ans = list(map(str, df_data['Answer']))
len(ques), len(ans)

(1863, 1863)

In [7]:
# Write the stringified columns back and renumber the rows from 0; the previous
# row labels are preserved in a new leading 'index' column.
df_data = df_data.assign(Question = ques, Answer = ans).reset_index()
df_data.head()


Unnamed: 0,index,Question,Answer
0,0,Was Abraham Lincoln the sixteenth President of...,yes
1,2,Did Lincoln sign the National Banking Act of 1...,yes
2,4,Did his mother die of pneumonia?,no
3,6,How many long was Lincoln's formal education?,18 months
4,8,When did Lincoln begin his political career?,1832


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data


from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [9]:
SEED = 1234

# Seed every random number source used by the notebook with the same value.
for seed_everything in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_everything(SEED)

# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

In [10]:
# Questions (SRC) and answers (TRG) are preprocessed identically: spaCy
# tokenization, lower-casing, and <sos>/<eos> markers around each sequence.
field_options = dict(tokenize = 'spacy',
                     init_token = '<sos>',
                     eos_token = '<eos>',
                     lower = True)

SRC = Field(**field_options)

TRG = Field(**field_options)

In [11]:
# Pair each example attribute name with the Field that processes it.
fields = list(zip(('Question', 'Answer'), (SRC, TRG)))

In [12]:
# One torchtext Example per row, pairing each question with its answer.
example = [
    data.Example.fromlist([question, answer], fields)
    for question, answer in zip(df_data.Question, df_data.Answer)
]

In [13]:
# Wrap the examples and their field definitions in a torchtext Dataset.
QADataset = data.Dataset(examples = example, fields = fields)

In [14]:
# Inspect the tokenized attributes of the first example.
QADataset.examples[0].__dict__

{'Answer': ['yes'],
 'Question': ['was',
  'abraham',
  'lincoln',
  'the',
  'sixteenth',
  'president',
  'of',
  'the',
  'united',
  'states',
  '?']}

In [15]:
# random.seed() returns None, so passing its result as random_state never
# handed a state to split(). Seed first, then pass the generator state
# explicitly so the 90/10 split is reproducible by construction.
# NOTE: the name `train` is rebound further down by the `train` function
# definition, so this cell and its consumers must run before that cell.
random.seed(SEED)
(train, valid) = QADataset.split(split_ratio=[0.90, 0.10], random_state=random.getstate())

In [16]:
# Number of examples in the training and validation splits.
tuple(len(subset) for subset in (train, valid))

(1677, 186)

In [17]:
# Both vocabularies are built from the training split only, with min_freq = 2.
for text_field in (SRC, TRG):
    text_field.build_vocab(train, min_freq = 2)

In [18]:
# Report how many entries each vocabulary ended up with.
for label, text_field in (("size of SRC vocab: ", SRC), ('size of TRG vocab: ', TRG)):
    print(label, len(text_field.vocab))

size of SRC vocab:  1429
size of TRG vocab:  760


In [19]:
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [20]:
BATCH_SIZE = 128

# One iterator per split; batches are created on `device` and sorting is disabled.
train_iterator, valid_iterator = BucketIterator.splits(
    (train, valid),
    batch_size = BATCH_SIZE,
    sort = False,
    device = device,
)

In [21]:
# Encoder
class Encoder(nn.Module):
  """GRU encoder that reduces a batch of token-id sequences to its final hidden state.

  Args:
    input_dim: number of rows in the embedding table (source vocabulary size).
    emb_dim: width of each token embedding.
    hid_dim: width of the GRU hidden state.
    dropout: dropout probability applied to the embeddings.
  """
  def __init__(self, input_dim, emb_dim, hid_dim, dropout):
    super().__init__()
    self.hid_dim = hid_dim
    self.embedding = nn.Embedding(num_embeddings = input_dim, embedding_dim = emb_dim)
    self.rnn = nn.GRU(input_size = emb_dim, hidden_size = hid_dim)
    self.dropout = nn.Dropout(p = dropout)

  def forward(self, src):
    """Encode `src` and return only the GRU's final hidden state."""
    token_vectors = self.embedding(src)

    # The per-step outputs are not needed; only the last hidden state is used.
    #final_hidden = [n layers * n directions, batch size, hid dim]
    _, final_hidden = self.rnn(self.dropout(token_vectors))
    return final_hidden

In [22]:
# Decoder 
class Decoder(nn.Module):
  """Single-step GRU decoder that is fed the encoder context at every step.

  Args:
    output_dim: size of the target vocabulary (embedding rows and logits width).
    emb_dim: width of each token embedding.
    hid_dim: width of the GRU hidden state and of the context vector.
    dropout: dropout probability applied to the embeddings.
  """
  def __init__(self, output_dim, emb_dim, hid_dim, dropout):
    super().__init__()
    # Stored as a plain int. A trailing comma here previously turned it into
    # the tuple (hid_dim,), which broke any comparison with the encoder's hid_dim.
    self.hid_dim = hid_dim
    self.output_dim = output_dim
    self.embedding = nn.Embedding(output_dim, emb_dim)
    # The GRU consumes the token embedding concatenated with the context.
    self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
    # The classifier sees embedding + new hidden state + context.
    self.fc = nn.Linear(emb_dim + hid_dim * 2, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, input, hidden, context):
    """Run one decoding step.

    Returns:
      (prediction, hidden): logits over the target vocabulary for each batch
      element, and the updated GRU hidden state.
    """
    #input = [batch size]
    #hidden = [1, batch size, hid dim]
    #context = [1, batch size, hid dim]

    # Add a length-1 sequence dimension so the GRU sees a one-step sequence.
    input = input.unsqueeze(0)
    embedded = self.dropout(self.embedding(input))

    emb_con = torch.cat((embedded, context), dim = 2)
    # Only the updated hidden state is used below; the per-step output is discarded.
    _, hidden = self.rnn(emb_con, hidden)
    #hidden = [1, batch size, hid dim]

    features = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), dim = 1)
    prediction = self.fc(features)
    
    return prediction, hidden
    


In [23]:
class Seq2Seq(nn.Module):
  """Couples an encoder and a decoder and unrolls the decoder over the target length.

  The encoder/decoder hidden-size equality check is left disabled, as it was
  in the original definition.
  """
  def __init__(self, encoder, decoder, device):
    super().__init__()
    self.device = device
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, src, trg, teacher_forcing_ratio = 0.5):
    """Return per-step logits for `trg`; row 0 of the result is left as zeros.

    src = [src len, batch size]
    trg = [trg len, batch size]
    teacher_forcing_ratio is the probability of feeding the ground-truth token
    (rather than the model's own prediction) as the next decoder input.
    """
    steps, n_seqs = trg.shape[0], src.shape[1]
    outputs = torch.zeros(steps, n_seqs, self.decoder.output_dim, device = self.device)

    # The encoder's final hidden state is both the fixed context and the
    # decoder's initial hidden state.
    context = self.encoder(src)
    hidden = context

    # Decoding starts from the first target row.
    step_input = trg[0, :]
    for t in range(1, steps):
      logits, hidden = self.decoder(step_input, hidden, context)
      outputs[t] = logits
      use_truth = random.random() < teacher_forcing_ratio
      step_input = trg[t] if use_truth else logits.argmax(1)

    return outputs


In [24]:
# Vocabulary sizes determine the embedding tables and the output layer width.
INPUT_DIM, OUTPUT_DIM = len(SRC.vocab), len(TRG.vocab)
ENC_EMB_DIM = DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the full model and move its parameters to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

In [25]:
def init_weights(m):
  """Redraw every parameter reachable from `m` from a normal(0, 0.01) distribution, in place."""
  for param in m.parameters():
    nn.init.normal_(param.data, mean = 0.0, std = 0.01)
# Module.apply calls init_weights on every sub-module and then on the model itself.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(1429, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(760, 256)
    (rnn): GRU(768, 512)
    (fc): Linear(in_features=1280, out_features=760, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [26]:
def count_parameters(model):
  """Return the total number of scalar entries across parameters that require gradients."""
  trainable = 0
  for tensor in model.parameters():
    if tensor.requires_grad:
      trainable += tensor.numel()
  return trainable


# Report the number of trainable parameters, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,685,816 trainable parameters


In [27]:
# Adam over every model parameter, using the optimizer's default settings.
optimizer = optim.Adam(params = model.parameters())

In [28]:
# Look up the target padding token's index and exclude those positions from the loss.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [29]:
def train(model, iterator, optimizer, criterion, clip):
  """Run one optimization pass over `iterator` and return the mean per-batch loss.

  Each batch must expose `Question` (source) and `Answer` (target) tensors.
  Gradients are clipped to a total norm of `clip` before every optimizer step.
  """
  model.train()
  running_loss = 0
  for batch in iterator:
    questions, answers = batch.Question, batch.Answer

    optimizer.zero_grad()
    logits = model(questions, answers)

    # Row 0 of both tensors is skipped before flattening for the loss.
    flat_logits = logits[1:].view(-1, logits.shape[-1])
    flat_targets = answers[1:].view(-1)

    loss = criterion(flat_logits, flat_targets)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()

    running_loss += loss.item()
  return running_loss / len(iterator)

In [30]:
def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss over `iterator` without updating the model.

    The model is switched to eval mode and called with a teacher-forcing
    ratio of 0. Each batch must expose `Question` and `Answer` tensors.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            questions, answers = batch.Question, batch.Answer

            logits = model(questions, answers, 0) #turn off teacher forcing

            #answers = [trg len, batch size]
            #logits = [trg len, batch size, output dim]

            # Row 0 of both tensors is skipped before flattening for the loss.
            flat_logits = logits[1:].view(-1, logits.shape[-1])
            flat_targets = answers[1:].view(-1)

            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(iterator)

In [31]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole (minutes, seconds)."""
    total = end_time - start_time
    minutes = int(total / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    seconds = int(total - 60 * minutes)
    return minutes, seconds

In [32]:
N_EPOCHS = 10
CLIP = 1

# Lowest validation loss seen so far; any first result beats infinity.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Time one full train + validation pass.
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint the weights only when the validation loss improves.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 1s
	Train Loss: 5.640 | Train PPL: 281.603
	 Val. Loss: 3.605 |  Val. PPL:  36.772
Epoch: 02 | Time: 0m 1s
	Train Loss: 4.205 | Train PPL:  67.047
	 Val. Loss: 3.386 |  Val. PPL:  29.561
Epoch: 03 | Time: 0m 1s
	Train Loss: 3.980 | Train PPL:  53.497
	 Val. Loss: 3.319 |  Val. PPL:  27.633
Epoch: 04 | Time: 0m 1s
	Train Loss: 3.913 | Train PPL:  50.036
	 Val. Loss: 3.281 |  Val. PPL:  26.589
Epoch: 05 | Time: 0m 1s
	Train Loss: 3.811 | Train PPL:  45.201
	 Val. Loss: 3.259 |  Val. PPL:  26.025
Epoch: 06 | Time: 0m 1s
	Train Loss: 3.751 | Train PPL:  42.574
	 Val. Loss: 3.269 |  Val. PPL:  26.277
Epoch: 07 | Time: 0m 1s
	Train Loss: 3.757 | Train PPL:  42.821
	 Val. Loss: 3.295 |  Val. PPL:  26.983
Epoch: 08 | Time: 0m 1s
	Train Loss: 3.701 | Train PPL:  40.476
	 Val. Loss: 3.317 |  Val. PPL:  27.575
Epoch: 09 | Time: 0m 1s
	Train Loss: 3.702 | Train PPL:  40.523
	 Val. Loss: 3.296 |  Val. PPL:  27.016
Epoch: 10 | Time: 0m 1s
	Train Loss: 3.634 | Train PPL:  37.857
