# Dependencies and helper functions

In [1]:
#%%capture
#!pip install transformers
#!pip install git+https://github.com/salaniz/pycocoevalcap
#!module load apps/java/jdk1.8.0_102/binary

In [2]:
import pandas as pd
import os.path as op
import time
import datetime
from string import punctuation

def format_time(elapsed):
    '''
    Render a duration given in seconds as a string.

    The value is rounded to a whole number of seconds and formatted the way
    ``datetime.timedelta`` prints itself, i.e. ``h:mm:ss`` (with a leading
    ``N day(s), `` part once the duration reaches 24 hours).
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def clean(text):
  '''
  Normalise a string so that answers can be compared literally.

  The text is lower-cased and every leading/trailing run of whitespace and
  punctuation characters is removed, in whatever order they are interleaved.
  Characters in the interior of the string are left untouched.

  Returns the normalised string (possibly empty).
  '''
  text = text.strip().lower()

  # A single strip(punctuation) leaves whitespace behind when the input has a
  # space before its final punctuation mark (e.g. "fiona ." -> "fiona "),
  # which would make otherwise identical answers compare unequal. Alternate the
  # two strips until nothing changes any more.
  previous = None
  while text != previous:
    previous = text
    text = text.strip(punctuation).strip()

  return text

# Data Pre-processing
In this section, we load the data required for training the model and perform any appropriate filtering/pre-processing.

In [3]:
#from google.colab import drive
#drive.mount('/content/gdrive')

# Base directory of the project; the data and model folders live beneath it.
ROOT = './'
DATA_DIR = f'{ROOT}data/'
MODELS_DIR = f'{ROOT}models/'

In [4]:
# Load the question/answer pairs and the summaries from DATA_DIR.
# Both paths are built with op.join so that DATA_DIR works with or without a
# trailing slash.
qaps = pd.read_csv(op.join(DATA_DIR, 'narrativeqa_qas.csv'))
#qaps_2 = pd.read_csv(op.join(DATA_DIR,'narrativeqa_qaps_single_answer_2.csv'))

summaries = pd.read_csv(op.join(DATA_DIR, 'summaries.csv'))

# Keep only the rows whose 'set' column is 'valid'.
qaps = qaps[qaps['set'] == 'valid']
summaries = summaries[summaries['set'] == 'valid']

In [5]:
# Index the summaries by document id and discard the 'set' and 'summary' columns.
summaries = summaries.set_index('document_id').drop(columns=['set', 'summary'])

In [6]:
# Index the question/answer pairs by document id and discard the 'set' column
# together with the 'question', 'answer1' and 'answer2' columns.
qaps = qaps.set_index('document_id').drop(
    columns=['set', 'question', 'answer1', 'answer2'])

In [7]:
# Attach to every question/answer row the columns of the summary that shares its
# index value (document_id); every qaps row is kept (left join).
qaps = qaps.join(summaries, how='left')

Acquire data in lists of: contexts (summaries), questions, answers

In [8]:
def format_data(data):
  '''
  Split a question/answer DataFrame into three parallel lists.

  data: DataFrame with the columns 'summary_tokenized', 'question_tokenized',
        'answer1_tokenized' and 'answer2_tokenized'.

  Returns (contexts, questions, answers) where
    contexts  - list of the 'summary_tokenized' values,
    questions - list of the 'question_tokenized' values,
    answers   - list of dicts {'answer1': ..., 'answer2': ...} holding the two
                reference answers after they have been passed through clean().
  All three lists follow the row order of `data`.
  '''
  # Read whole columns instead of iterating with iterrows(), which builds a
  # Series object for every row.
  contexts = data['summary_tokenized'].tolist()
  questions = data['question_tokenized'].tolist()
  answers = [
      {'answer1': clean(first), 'answer2': clean(second)}
      for first, second in zip(data['answer1_tokenized'], data['answer2_tokenized'])
  ]

  return contexts, questions, answers

contexts, questions, answers = format_data(qaps)

In [9]:
# IF YOU WANT TO RUN TESTS WITH A SMALLER DATASET, UNCOMMENT THE CODE BELOW (ctrl + /)

# questions = ['How big is the Empire State Building?', 
#                    'Who is Shrek married to?',
#                    'How old is Gandalf?',
#                    'Where does Winnie the Pooh live?']

# contexts = ['The Empire State building is a very big building. It is one of the biggest buildings in the world. It is large.', 
#                   'Shrek is an ogre. There is a common misconception that Shrek is married to Donkey, but he is actually married to Fiona.',
#                   'Gandalf is a 900 year old wizard.',
#                   'Winnie the Pooh lives in Dalston with some of his uni housemates']

# answers = [{'answer1': 'really big', 'answer2': 'very big'},
#                  {'answer1': 'Shrek is married to Fiona', 'answer2': 'Fiona'},
#                  {'answer1': 'very old', 'answer2': '900 years old'},
#                  {'answer1': 'in London', 'answer2': 'near Dalston'}]

# Model information / hyperparameter selection
This section sets the model type and the starting point for training: either a pre-trained model or a partially trained model whose training we wish to resume.

To keep our experiments valid, ensure that the model id is of the form "modelname-learning-rate".

If you resume training for a model, ensure the learning rates are consistent.

In [10]:
from transformers import BertForQuestionAnswering, BertTokenizerFast, DistilBertTokenizerFast, DistilBertForQuestionAnswering, AutoModelForQuestionAnswering, AutoTokenizer
import pickle
import os

# Model ids that can be selected below. Every entry must be followed by a comma:
# adjacent string literals are silently concatenated by Python, which would
# merge two ids into one invalid id and shift the indices of the later entries.
MODEL_IDS = ['distilbert-base-uncased-distilled-squad',
             'distilbert-base-cased-distilled-squad',
             'bert-large-uncased-whole-word-masking-finetuned-squad',
             'mrm8488/longformer-base-4096-finetuned-squadv2',
             'distilbert-squad-nqa-5e-5',
             'distilbert-squad-nqa-3e-5',
             'bert-large-squad-nqa-3e-5',
             'bert-large-squad-nqa-5e-5',
             'bert-large-squad-nqa-5e-6',
             'Primer/bart-squad2']

# Keys of `batch_sizes` below; model_type only selects the batch size.
model_types = ['distilbert', 'bert-base', 'bert-large', 'longformer']

# select model id
MODEL_ID      = MODEL_IDS[0]
model_type    = model_types[0]
finetuned     = False            # is the model saved on Drive
epoch_to_load = 3               # if a model uploaded to Drive, which epoch to load

# Batch size used for each model type.
batch_sizes = {'distilbert': 32, 'bert-base': 32, 'bert-large': 4, 'longformer': 8}

In [11]:
# Decide where the tokenizer and model are loaded from: the saved epoch folder
# under MODELS_DIR for a fine-tuned model, otherwise the model id itself.
if finetuned:
  MODEL_PATH = MODELS_DIR + MODEL_ID + '/epoch' + str(epoch_to_load)
else:
  MODEL_PATH = MODEL_ID

# The loading calls are the same in both cases; only MODEL_PATH differs.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model     = AutoModelForQuestionAnswering.from_pretrained(
    MODEL_PATH,
    output_attentions=False,
    output_hidden_states=False)

if finetuned:
  # NOTE: pickle.load can execute arbitrary code; only load stats files that
  # were written by a trusted training run.
  with open(MODEL_PATH + '/stats', "rb") as stats:
    training_stats = pickle.load(stats)
  print('Evaluating {} (finetuned for {} epochs)'.format(MODEL_ID, str(epoch_to_load)))
else:
  print('Evaluating {}'.format(MODEL_ID))


batch_size = batch_sizes[model_type]

Evaluating distilbert-base-uncased-distilled-squad


# Data tokenization
In this section we format our data so it is of the form required for GPU training.

Additional features are (likely to be) added here.

In [12]:
# Encode every (question, context) pair. Questions are passed as the first
# sequence and contexts as the second; truncation is restricted to the second
# sequence and every example is padded to the maximum length.
encodings = tokenizer(
    questions,
    contexts,
    truncation='only_second',
    padding='max_length',
)

In [13]:
import torch

class NQADataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings.

    `encodings` must offer `.items()` yielding (field name, per-example
    sequence) pairs and expose an `input_ids` attribute.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # One tensor per encoding field, holding only example `idx`.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        return example

    def __len__(self):
        # Number of examples equals the number of input_ids rows.
        input_ids = self.encodings.input_ids
        return len(input_ids)

# Wrap the tokenizer output so that examples can be fetched by index as tensors.
dataset = NQADataset(encodings)

# Get Answers

In [14]:
from torch.utils.data import DataLoader, SequentialSampler

# Iterate over the dev samples in their original order so that the i-th
# prediction lines up with the i-th entry of `answers`.
sequential_sampler = SequentialSampler(dataset)
loader = DataLoader(dataset, sampler=sequential_sampler, batch_size=batch_size)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# .to(device) covers both the GPU and the CPU case, so no separate .cuda() call
# is needed.
model.to(device)

print("")
print("Getting answers positions...")

total_val_loss = 0
t0 = time.time()

# Filled in dataset order, one entry per example:
answer_starts = []   # index of the predicted first answer token
answer_ends   = []   # index of the predicted last answer token (inclusive)
answer_tokens = []   # the example's input ids, used later to decode the span

# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()

# Evaluate data for one epoch
for step, batch in enumerate(loader):

    # Progress update for each of the first 5 batches, then every 40 batches.
    if (step <= 5 or step % 40 == 0) and not step == 0:
        # Elapsed time since the start of the loop, formatted by format_time.
        elapsed = format_time(time.time() - t0)

        # Report progress.
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(loader), elapsed))

    # Forward every field the tokenizer produced, not just input_ids and
    # attention_mask: tokenizers that emit token_type_ids (e.g. BERT) use them
    # to tell the question apart from the context, and dropping them changes
    # the model's predictions.
    model_inputs = {name: tensor.to(device) for name, tensor in batch.items()}
    input_ids = model_inputs['input_ids']

    # Don't construct a compute graph (only required for backprop during training)
    with torch.no_grad():
        outputs = model(**model_inputs)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    for i, score in enumerate(start_scores):
        answer_start = torch.argmax(score)
        # Search for the end only from the start position onwards so that the
        # span can never end before it begins.
        answer_end = answer_start + torch.argmax(end_scores[i][answer_start:])
        answer_starts.append(int(answer_start))
        answer_ends.append(int(answer_end))
        answer_tokens.append(input_ids[i].tolist())

# Measure how long the dev run took.
dev_time = format_time(time.time() - t0)
print("Answering took: {:}".format(dev_time))

  return torch._C._cuda_getDeviceCount() > 0



Getting answers positions...
  Batch     1  of    109.    Elapsed: 0:00:22.
  Batch     2  of    109.    Elapsed: 0:00:44.
  Batch     3  of    109.    Elapsed: 0:01:06.
  Batch     4  of    109.    Elapsed: 0:01:27.
  Batch     5  of    109.    Elapsed: 0:01:49.


In [None]:
# Turn the predicted (start, end) positions back into text.
print("Getting answers...")

for idx, token_ids in enumerate(answer_tokens):
  # The end position is inclusive, hence the +1 in the slice.
  span = token_ids[answer_starts[idx]:answer_ends[idx] + 1]
  decoded = tokenizer.decode(span)
  answers[idx]['extracted_answer'] = clean(decoded)

print("")
print("Extracted answers")

In [None]:
# Show the first five answer records (references plus extracted answer).
preview = answers[:5]
for record in preview:
  print(record)

# Evaluation metrics

In [None]:
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.bleu.bleu import Bleu

# One scorer object per metric; Bleu(4) scores n-grams up to length 4.
bleu_obj = Bleu(4)
rouge_obj = Rouge()
cider_obj = Cider()
meteor_obj = Meteor()

In [None]:
# Pull the two reference answers and the model's answer out of every record.
ref1_strs = [record['answer1'] for record in answers]
ref2_strs = [record['answer2'] for record in answers]
sys_strs  = [record['extracted_answer'] for record in answers]

# All three lists must describe the same examples.
assert len(ref1_strs) == len(ref2_strs) == len(sys_strs)

In [None]:
# The scorers take two dicts keyed by example index: the references (both
# reference answers) and the response (the single extracted answer).
n_examples = len(ref1_strs)
word_target_dict = {i: [ref1_strs[i], ref2_strs[i]] for i in range(n_examples)}
word_response_dict = {i: [sys_strs[i]] for i in range(n_examples)}

# Each compute_score call returns (corpus-level score, per-example scores).
bleu_score, bleu_scores = bleu_obj.compute_score(
        word_target_dict, word_response_dict, verbose=False)
bleu1_score, _, _, bleu4_score = bleu_score
bleu1_scores, _, _, bleu4_scores = bleu_scores
meteor_score, meteor_scores = meteor_obj.compute_score(word_target_dict, word_response_dict)
rouge_score, rouge_scores = rouge_obj.compute_score(word_target_dict, word_response_dict)
cider_score, cider_scores = cider_obj.compute_score(word_target_dict, word_response_dict)

# Exact-match accuracy: an example counts as correct when the extracted answer
# equals either of the two reference answers.
ref1_comparison = [ref == hyp for ref, hyp in zip(ref1_strs, sys_strs)]
ref2_comparison = [ref == hyp for ref, hyp in zip(ref2_strs, sys_strs)]
ref_comparison = [int(hit1 or hit2) for hit1, hit2 in zip(ref1_comparison, ref2_comparison)]
accuracy = sum(ref_comparison) / len(ref_comparison)

# Report every metric as a percentage rounded to two decimals.
report = (
    ("ROUGE-L : ", rouge_score),
    ("BLEU-1  : ", bleu1_score),
    ("BLEU-4  : ", bleu4_score),
    ("METEOR  : ", meteor_score),
    ("CiDER   : ", cider_score),
    ("Accuracy: ", accuracy),
)
for label, value in report:
    print(label, round(100*value, 2))