# Dependencies and helper functions

In [1]:
%%capture
!pip install transformers
!pip install git+https://github.com/salaniz/pycocoevalcap
!pip install sentencepiece

In [2]:
import pandas as pd
import os
import time
import datetime
from string import punctuation
import pickle

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

def clean(text):
  '''
  Takes a string, removes leading and trailing whitespace,
  makes it lower case, and removes leading and trailing punctuation.
  '''
  text = text.strip() # remove leading and trailing whitespace
  text = text.lower() # lower case
  text = text.strip(punctuation)

  return text

# Data Pre-processing
In this section, we load the data required for training the model and perform any appropriate filtering/pre-processing

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

ROOT        = 'gdrive/Shared drives/CDT Mini-Project Team 1/Colab Notebooks/'
DATA_DIR    = ROOT + 'data/' 
MODELS_DIR  = ROOT + 'models/'

Mounted at /content/gdrive


In [4]:
#Load all data
qaps = pd.read_csv(DATA_DIR + 'narrativeqa_qas.csv')
summaries = pd.read_csv(DATA_DIR + 'summaries.csv')

In [5]:
summaries = summaries.set_index('document_id')
summaries = summaries.drop(labels=['set','summary_tokenized'],axis='columns')

In [6]:
qaps = qaps.set_index('document_id')
qaps = qaps.drop(labels=['question_tokenized','answer1_tokenized','answer2_tokenized'], axis='columns')

In [7]:
# pair qaps with their relevant summaries and drop non-tokenized fields
qaps = qaps.join(summaries)

In [8]:
qaps.head()

Unnamed: 0_level_0,set,question,answer1,answer2,summary
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0025577043f5090cd603c6aea60f26e236195594,test,Who is Mark Hunter?,He is a high school student in Phoenix.,A loner and outsider student with a radio stat...,"Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,Where does this radio station take place?,It takes place in Mark's parents basement.,"Phoenix, Arizona","Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,Why do more students tune into Mark's show?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...,"Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,Who commits suicide?,Malcolm.,Malcolm.,"Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,What does Paige jam into her microwave?,She jams her medals and accolades.,Her award medals,"Mark Hunter (Slater), a high school student i..."


Acquire data in lists of: contexts (summaries), questions, answers

In [9]:
def format_data(data):
  contexts = []
  questions = []
  answers = []
  for index, row in data.iterrows():
    context   = 'context: ' + row['summary']
    question  = 'question: ' + row['question']
    answer    = {}
    answer['answer1'] = clean(row['answer1'])
    answer['answer2'] = clean(row['answer2'])

    contexts.append(context)
    questions.append(question)
    answers.append(answer)
  
  return contexts, questions, answers

train_contexts, train_questions, train_answers = format_data(qaps[qaps['set']=='train'])
val_contexts, val_questions, val_answers       = format_data(qaps[qaps['set']=='valid'])

dummy_questions = 100* ['How big is the Empire State Building?', 'Who is Shrek married to?']
dummy_contexts = 100 * ['The Empire State building is a very big building. It is one of the biggest buildings in the world. It is large.', 'Shrek is an ogre. There is a common misconception that Shrek is married to Donkey, but he is actually married to Fiona, the princess.']
dummy_answers = 100 * [{'answer1': 'very big', 'answer2': 'large'}, {'answer1': 'Fiona', 'answer2': 'the princess'}]

## Tokenization and Data Preparation

In [10]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration
import torch

MODEL_ID = 'T5_base_finetuned/epoch1'

tokenizer = T5TokenizerFast.from_pretrained(MODELS_DIR+MODEL_ID)
model = T5ForConditionalGeneration.from_pretrained(MODELS_DIR+MODEL_ID, output_attentions=False, output_hidden_states=False)

MODEL_ID = 'T5_base_finetuned'

batch_size = 2
training_stats = []
current_epoch = 1
num_of_epochs = 3
lr = 1e-5

In [11]:
def tokenize_data(questions, contexts, answers, tokenizer):

  input_ids       = []
  labels          = []

  for q, c, a in zip(questions, contexts, answers):
    question_input_ids = tokenizer.encode(q + '\t' + c, max_length=1024, padding='max_length', truncation=True, return_tensors='pt')
    label_input_ids_1  = tokenizer.encode(a['answer1'], return_tensors='pt', max_length=100, truncation=True, padding='max_length')
    label_input_ids_2  = tokenizer.encode(a['answer2'], return_tensors='pt', max_length=100, truncation=True, padding='max_length')

    input_ids.append(torch.tensor(question_input_ids))
    input_ids.append(torch.tensor(question_input_ids))
    labels.append(torch.tensor(label_input_ids_1))
    labels.append(torch.tensor(label_input_ids_2))


  input_ids = torch.cat(input_ids, dim=0)
  labels    = torch.cat(labels, dim=0)

  return torch.utils.data.TensorDataset(input_ids, labels)

dummy_dataset = tokenize_data(dummy_questions, dummy_contexts, dummy_answers, tokenizer)
train_dataset = tokenize_data(train_questions, train_contexts, train_answers, tokenizer)
val_dataset = tokenize_data(val_questions, val_contexts, val_answers, tokenizer)

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  del sys.path[0]
  


## Model Training

In [12]:
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW

# create DataLoaders
train_loader = DataLoader(train_dataset,
                          sampler = RandomSampler(train_dataset),
                          batch_size=batch_size)

# For dev the order doesn't matter, so we'll just read them sequentially.
val_loader = DataLoader(
            val_dataset, # The dev samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

# initialise optimizer
optim = AdamW(model.parameters(), 
              lr=lr,
              eps = 1e-8)

# Total number of training steps is [number of batches] x [number of epochs]. 
total_steps = len(train_loader) * (num_of_epochs - current_epoch)

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optim, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.is_available() : model.cuda()
model.to(device)

# Measure how long the training epoch takes.
total_t0 = time.time()

for epoch in range(current_epoch, num_of_epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, num_of_epochs))
    print('Training...')  

    # Measure how long training epoch takes
    t0 = time.time()

    # Reset loss for epoch
    total_train_loss = 0
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_loader):
        #print(batch)

        # Progress update every 100 batches.
        if (step % 100 == 0 or step < 6) and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))

        optim.zero_grad()
        input_ids = batch[0].to(device)
        labels    = batch[1].to(device)

        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        # accumulate training loss over batches
        total_train_loss += loss.item()

        # clip norm of gradients to 1.0 to address vanishing gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        loss.backward()
        optim.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))



    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running validation...")

    total_val_loss = 0
    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Evaluate data for one epoch
    for batch in val_loader:

        input_ids = batch[0].to(device)
        labels    = batch[1].to(device)

        # Don't construct a compute graph (only required for backprop during training)
        with torch.no_grad():
          outputs = model(input_ids=input_ids, labels=labels)

        loss = outputs.loss
        total_val_loss += loss.item()

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_val_loss / len(val_loader)

    # Measure how long the dev run took.
    dev_time = format_time(time.time() - t0)
    
    print("  Dev Loss: {0:.2f}".format(avg_val_loss))
    print("  Dev took: {:}".format(dev_time))

    training_stats.append(
        {
            'epoch': epoch + 1,
            'training_loss': avg_train_loss,
            'validation_loss': avg_val_loss,
            'total_loss': avg_train_loss + avg_val_loss
        }
    )

    print("")
    print("Saving model")

    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training

    MODEL_PATH = MODELS_DIR + MODEL_ID + '/epoch' + str(epoch + 1)

    if not os.path.exists(MODEL_PATH):
        os.makedirs(MODEL_PATH)

    model_to_save.save_pretrained(MODEL_PATH)
    tokenizer.save_pretrained(MODEL_PATH)
    with open(MODEL_PATH + '/stats', "wb") as stats:
      pickle.dump(training_stats, stats)

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch     1  of  32,747.    Elapsed: 0:00:01.
  Batch     2  of  32,747.    Elapsed: 0:00:02.
  Batch     3  of  32,747.    Elapsed: 0:00:02.
  Batch     4  of  32,747.    Elapsed: 0:00:03.
  Batch     5  of  32,747.    Elapsed: 0:00:04.
  Batch   100  of  32,747.    Elapsed: 0:01:14.
  Batch   200  of  32,747.    Elapsed: 0:02:30.
  Batch   300  of  32,747.    Elapsed: 0:03:47.
  Batch   400  of  32,747.    Elapsed: 0:05:06.
  Batch   500  of  32,747.    Elapsed: 0:06:25.
  Batch   600  of  32,747.    Elapsed: 0:07:45.
  Batch   700  of  32,747.    Elapsed: 0:09:04.
  Batch   800  of  32,747.    Elapsed: 0:10:23.
  Batch   900  of  32,747.    Elapsed: 0:11:43.
  Batch 1,000  of  32,747.    Elapsed: 0:13:02.
  Batch 1,100  of  32,747.    Elapsed: 0:14:21.
  Batch 1,200  of  32,747.    Elapsed: 0:15:40.
  Batch 1,300  of  32,747.    Elapsed: 0:17:00.
  Batch 1,400  of  32,747.    Elapsed: 0:18:19.
  Batch 1,500  of  32,747.    Elapsed: 0:19:38.
  Batch 1,600  of  32,747. 

## Model stats

In [None]:
import pandas as pd

# Display floats with two decimal places.
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# Display the table.
df_stats

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline

import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(df_stats['training_loss'], 'b-o', label="Training")
plt.plot(df_stats['validation_loss'], 'g-o', label="Validation")
plt.plot(df_stats['total_loss'], 'y-o', label="Total")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.xticks(range(1, num_of_epochs+1))
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), shadow=True, ncol=2)

plt.show()

## Sandbox

In [None]:
for question, answer in zip(dummy_questions, generated_answers):
  print('Question : {}'.format(question))
  print('Answer   : {}\n'.format(answer))

# Evaluation metrics

In [None]:
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.bleu.bleu import Bleu

meteor_obj = Meteor()
rouge_obj = Rouge()
cider_obj = Cider()
bleu_obj = Bleu(4)

In [None]:
ref1_strs = [answer['answer1'] for answer in answers]
ref2_strs = [answer['answer2'] for answer in answers]
sys_strs  = [answer['generated_answer'] for answer in answers]

assert len(ref1_strs) == len(ref2_strs)
assert len(ref2_strs) == len(sys_strs)

In [None]:
word_target_dict = {}
word_response_dict = {}

for i in range(len(ref1_strs)):
    word_target_dict[i] = [ref1_strs[i], ref2_strs[i]]
    word_response_dict[i] = [sys_strs[i]]


bleu_score, bleu_scores = bleu_obj.compute_score(
        word_target_dict, word_response_dict,
        verbose=False)
bleu1_score, _, _, bleu4_score = bleu_score
bleu1_scores, _, _, bleu4_scores = bleu_scores
meteor_score, meteor_scores = meteor_obj.compute_score(
        word_target_dict, word_response_dict) 
rouge_score, rouge_scores = rouge_obj.compute_score(
        word_target_dict, word_response_dict) 
cider_score, cider_scores = cider_obj.compute_score(
        word_target_dict, word_response_dict)

ref1_comparison = [a == b for a,b in zip(ref1_strs, sys_strs)]                  # For each question, True if extracted answer matches answer1
ref2_comparison = [a == b for a,b in zip(ref2_strs, sys_strs)]                  # For each question, True if extracted answer matches answer2
ref_comparison = [int(a or b) for a,b in zip(ref1_comparison, ref2_comparison)] # For each question, 1 if extracted answer matches either answer1 or answer2, else 0
accuracy = sum(ref_comparison) / len(ref_comparison)

print("ROUGE-L : ", round(100*rouge_score,2))
print("BLEU-1  : ", round(100*bleu1_score,2))
print("BLEU-4  : ", round(100*bleu4_score,2))
print("METEOR  : ", round(100*meteor_score,2))
print("CiDER   : ", round(100*cider_score,2))
print("Accuracy: ", round(100*accuracy, 2))