# Dependencies and helper functions

In [None]:
%%capture
!pip install transformers
!pip install git+https://github.com/salaniz/pycocoevalcap
!pip install sentencepiece

In [None]:
import pandas as pd
import os
import time
import datetime
from string import punctuation
import pickle

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

def clean(text):
  '''
  Takes a string, removes leading and trailing whitespace,
  makes it lower case, and removes leading and trailing punctuation.
  '''
  text = text.strip() # remove leading and trailing whitespace
  text = text.lower() # lower case
  text = text.strip(punctuation)

  return text

# Data Pre-processing
In this section, we load the data required for training the model and perform any appropriate filtering/pre-processing

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

ROOT        = 'gdrive/Shared drives/CDT Mini-Project Team 1/Colab Notebooks/'
DATA_DIR    = ROOT + 'data/' 
MODELS_DIR  = ROOT + 'models/'

Mounted at /content/gdrive


In [None]:
#Load all data
qaps = pd.read_csv(DATA_DIR + 'narrativeqa_qas.csv')
summaries = pd.read_csv(DATA_DIR + 'summaries.csv')
qaps = qaps[qaps['set']=='test']
summaries = summaries[summaries['set']=='test']

In [None]:
summaries = summaries.set_index('document_id')
summaries = summaries.drop(labels=['set','summary_tokenized'],axis='columns')

In [None]:
qaps = qaps.set_index('document_id')
qaps = qaps.drop(labels=['question_tokenized','answer1_tokenized','answer2_tokenized'], axis='columns')

In [None]:
# pair qaps with their relevant summaries and drop non-tokenized fields
qaps = qaps.join(summaries)

In [None]:
qaps.head()

Unnamed: 0_level_0,set,question,answer1,answer2,summary
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0025577043f5090cd603c6aea60f26e236195594,test,Who is Mark Hunter?,He is a high school student in Phoenix.,A loner and outsider student with a radio stat...,"Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,Where does this radio station take place?,It takes place in Mark's parents basement.,"Phoenix, Arizona","Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,Why do more students tune into Mark's show?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...,"Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,Who commits suicide?,Malcolm.,Malcolm.,"Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,What does Paige jam into her microwave?,She jams her medals and accolades.,Her award medals,"Mark Hunter (Slater), a high school student i..."


Acquire data in lists of: contexts (summaries), questions, answers

In [None]:
def format_data(data):
  contexts = []
  questions = []
  answers = []
  for index, row in data.iterrows():
    context   = 'context: ' + row['summary']
    question  = 'question: ' + row['question']
    answer    = {}
    answer['answer1'] = clean(row['answer1'])
    answer['answer2'] = clean(row['answer2'])

    contexts.append(context)
    questions.append(question)
    answers.append(answer)
  
  return contexts, questions, answers

test_contexts, test_questions, test_answers = format_data(qaps)

In [None]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration
import torch

MODEL_PATH = MODELS_DIR + 'T5_base_finetuned/epoch1'
finetuned = False

if finetuned:
  tokenizer = T5TokenizerFast.from_pretrained(MODEL_PATH)
  model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH, output_attentions=False, output_hidden_states=False)
else:
  tokenizer = T5TokenizerFast.from_pretrained('t5-base')
  model = T5ForConditionalGeneration.from_pretrained('t5-base', output_attentions=False, output_hidden_states=False)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

if torch.cuda.is_available() : model.cuda()
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


def tokenize_data(questions, contexts, tokenizer):

  input_ids       = []

  for q, c in zip(questions, contexts):
    question_input_ids = tokenizer.encode(q + '\t' + c, max_length=1024, padding='max_length', truncation=True, return_tensors='pt')
    input_ids.append(torch.tensor(question_input_ids))

  input_ids = torch.cat(input_ids, dim=0)

  return torch.utils.data.TensorDataset(input_ids)

test_dataset = tokenize_data(test_questions, test_contexts, tokenizer)

  # Remove the CWD from sys.path while we load stuff.


In [None]:
batch_size = 4
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size = batch_size)

In [None]:
generated_answers = []
t0 = time.time()

for step, batch in enumerate(test_loader):

  # Progress update every 40 batches.
  if (step % 100 == 0 or step < 6) and not step == 0:
    # Calculate elapsed time in minutes.
    elapsed = format_time(time.time() - t0)
            
    # Report progress.
    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_loader), elapsed))

  input_ids = batch[0].to(device)

  # summmarize 
  output_ids = model.generate(input_ids,
                              num_beams=10,
                              no_repeat_ngram_size=2,
                              min_length=2,
                              max_length=50,
                              early_stopping=False)
  
  for answer in output_ids:
      output = tokenizer.decode(answer, skip_special_tokens=True) 
      generated_answers.append(output)

  Batch     1  of  2,640.    Elapsed: 0:00:04.
  Batch     2  of  2,640.    Elapsed: 0:00:09.
  Batch     3  of  2,640.    Elapsed: 0:00:11.
  Batch     4  of  2,640.    Elapsed: 0:00:16.
  Batch     5  of  2,640.    Elapsed: 0:00:20.
  Batch   100  of  2,640.    Elapsed: 0:07:26.
  Batch   200  of  2,640.    Elapsed: 0:14:53.
  Batch   300  of  2,640.    Elapsed: 0:21:52.
  Batch   400  of  2,640.    Elapsed: 0:29:28.
  Batch   500  of  2,640.    Elapsed: 0:37:09.
  Batch   600  of  2,640.    Elapsed: 0:45:03.
  Batch   700  of  2,640.    Elapsed: 0:52:56.
  Batch   800  of  2,640.    Elapsed: 1:00:36.
  Batch   900  of  2,640.    Elapsed: 1:08:06.
  Batch 1,000  of  2,640.    Elapsed: 1:15:52.
  Batch 1,100  of  2,640.    Elapsed: 1:23:24.
  Batch 1,200  of  2,640.    Elapsed: 1:31:05.
  Batch 1,300  of  2,640.    Elapsed: 1:38:50.
  Batch 1,400  of  2,640.    Elapsed: 1:46:16.
  Batch 1,500  of  2,640.    Elapsed: 1:54:00.
  Batch 1,600  of  2,640.    Elapsed: 2:01:48.
  Batch 1,700

In [None]:
# Get answers using positions
for i,answer in enumerate(generated_answers):
  test_answers[i]['generated_answer'] = clean(answer)

# Evaluation metrics

In [None]:
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.bleu.bleu import Bleu

meteor_obj = Meteor()
rouge_obj = Rouge()
cider_obj = Cider()
bleu_obj = Bleu(4)

In [None]:
answers = test_answers
ref1_strs = [answer['answer1'] for answer in answers]
ref2_strs = [answer['answer2'] for answer in answers]
sys_strs  = [answer['generated_answer'] for answer in answers]

assert len(ref1_strs) == len(ref2_strs)
assert len(ref2_strs) == len(sys_strs)

In [None]:
word_target_dict = {}
word_response_dict = {}

for i in range(len(ref1_strs)):
    word_target_dict[i] = [ref1_strs[i], ref2_strs[i]]
    word_response_dict[i] = [sys_strs[i]]


bleu_score, bleu_scores = bleu_obj.compute_score(
        word_target_dict, word_response_dict,
        verbose=False)
bleu1_score, _, _, bleu4_score = bleu_score
bleu1_scores, _, _, bleu4_scores = bleu_scores
meteor_score, meteor_scores = meteor_obj.compute_score(
        word_target_dict, word_response_dict) 
rouge_score, rouge_scores = rouge_obj.compute_score(
        word_target_dict, word_response_dict) 
cider_score, cider_scores = cider_obj.compute_score(
        word_target_dict, word_response_dict)

ref1_comparison = [a == b for a,b in zip(ref1_strs, sys_strs)]                  # For each question, True if extracted answer matches answer1
ref2_comparison = [a == b for a,b in zip(ref2_strs, sys_strs)]                  # For each question, True if extracted answer matches answer2
ref_comparison = [int(a or b) for a,b in zip(ref1_comparison, ref2_comparison)] # For each question, 1 if extracted answer matches either answer1 or answer2, else 0
accuracy = sum(ref_comparison) / len(ref_comparison)

print("ROUGE-L : ", round(100*rouge_score,2))
print("BLEU-1  : ", round(100*bleu1_score,2))
print("BLEU-4  : ", round(100*bleu4_score,2))
print("METEOR  : ", round(100*meteor_score,2))
print("CiDER   : ", round(100*cider_score,2))
print("Accuracy: ", round(100*accuracy, 2))

ROUGE-L :  60.91
BLEU-1  :  49.87
BLEU-4  :  24.17
METEOR  :  28.94
CiDER   :  223.94
Accuracy:  36.38


In [None]:
import pickle

with open(DATA_DIR+'/t5_base_non_finetuned', 'wb') as f: 
  pickle.dump(answers, f)

In [36]:
answers[:10]

[{'answer1': 'he is a high school student in phoenix',
  'answer2': 'a loner and outsider student with a radio station',
  'generated_answer': 'a high school student'},
 {'answer1': "it takes place in mark's parents basement",
  'answer2': 'phoenix, arizona',
  'generated_answer': "basement of his parents' house"},
 {'answer1': 'mark talks about what goes on at school and in the community',
  'answer2': 'because he has a thing to say about what is happening at his school and the community',
  'generated_answer': 'when he speaks his mind about what is going on at his school and in the community'},
 {'answer1': 'malcolm',
  'answer2': 'malcolm',
  'generated_answer': 'a student named malcolm'},
 {'answer1': 'she jams her medals and accolades',
  'answer2': 'her award medals',
  'generated_answer': 'various medals and accolades'},
 {'answer1': "he dismantles it and attaches it to his mother's jeep",
  'answer2': 'dismantle it',
  'generated_answer': 'dismantles'},
 {'answer1': 'he tells t

In [37]:
num_extracts = 0

for a,c in zip(test_answers, test_contexts):
  if a['generated_answer'] in clean(c):
    num_extracts += 1

In [38]:
print(num_extracts/len(test_answers))

0.9766979255470304
