# Dependencies and helper functions

In [1]:
%%capture
!pip install git+https://github.com/salaniz/pycocoevalcap

In [2]:
import pandas as pd
import os
import time
import datetime
from string import punctuation
import pickle

def format_time(elapsed):
    '''
    Converts a duration in seconds into a human-readable string.

    The value is rounded to a whole number of seconds and rendered the way
    datetime.timedelta prints itself, i.e. 'h:mm:ss', with an 'N day(s), '
    prefix once the duration reaches 24 hours.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def clean(text):
  '''
  Normalises a string so that answers can be compared.

  Takes a string, makes it lower case, removes leading and trailing
  whitespace and punctuation (repeatedly, until neither is left at either
  end), and deletes every backslash. Returns the cleaned string.
  '''
  text = text.lower() # lower case

  # One whitespace strip followed by one punctuation strip can leave
  # whitespace behind (e.g. 'hello .' -> 'hello '), so apply both until the
  # text stops changing.
  while True:
    stripped = text.strip().strip(punctuation)
    if stripped == text:
      break
    text = stripped

  # Backslashes at the ends are already gone (they count as punctuation);
  # this removes the ones inside the text.
  text = text.replace('\\', '')

  return text

# Data Pre-processing
In this section, we load the test-split data used to evaluate the models and perform any appropriate filtering/pre-processing.

In [3]:
from google.colab import drive

# Mount Google Drive and keep the mount point in one place so the project
# paths below are derived from it.
MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT)

# Absolute paths: they resolve to the mounted drive regardless of the
# kernel's current working directory. Each directory constant ends with '/'
# so file names can be appended directly.
ROOT        = MOUNT_POINT + '/Shared drives/CDT Mini-Project Team 1/Colab Notebooks/'
DATA_DIR    = ROOT + 'data/'
MODELS_DIR  = ROOT + 'models/'

Mounted at /content/gdrive


In [4]:
# Read the question/answer pairs and the summaries, keeping only rows of the test split
qaps = pd.read_csv(DATA_DIR + 'narrativeqa_qas.csv')
qaps = qaps.loc[qaps['set'] == 'test']

summaries = pd.read_csv(DATA_DIR + 'summaries.csv')
summaries = summaries.loc[summaries['set'] == 'test']

In [5]:
# Key the summaries by document id and drop the split label and the tokenized text
summaries = (
    summaries
    .set_index('document_id')
    .drop(columns=['set', 'summary_tokenized'])
)

In [6]:
# Key the question/answer pairs by document id and drop the tokenized columns
qaps = (
    qaps
    .set_index('document_id')
    .drop(columns=['question_tokenized', 'answer1_tokenized', 'answer2_tokenized'])
)

In [7]:
# Attach to every question/answer row the summary that shares its document_id index
qaps = qaps.join(summaries, how='left')

In [8]:
# Preview the first five joined rows
qaps.head(n=5)

Unnamed: 0_level_0,set,question,answer1,answer2,summary
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0025577043f5090cd603c6aea60f26e236195594,test,Who is Mark Hunter?,He is a high school student in Phoenix.,A loner and outsider student with a radio stat...,"Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,Where does this radio station take place?,It takes place in Mark's parents basement.,"Phoenix, Arizona","Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,Why do more students tune into Mark's show?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...,"Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,Who commits suicide?,Malcolm.,Malcolm.,"Mark Hunter (Slater), a high school student i..."
0025577043f5090cd603c6aea60f26e236195594,test,What does Paige jam into her microwave?,She jams her medals and accolades.,Her award medals,"Mark Hunter (Slater), a high school student i..."


Acquire data in lists of: contexts (summaries), questions, answers

In [9]:
def format_data(data):
  '''
  Turns a frame of question/answer rows into three parallel lists.

  Each row must provide 'summary', 'question', 'answer1' and 'answer2'.
  Returns (contexts, questions, answers) where contexts are the summaries
  prefixed with 'context: ', questions are prefixed with 'question: ', and
  answers are dicts holding the cleaned 'answer1' and 'answer2'.
  '''
  contexts, questions, answers = [], [], []

  for _, record in data.iterrows():
    context_text = 'context: ' + record['summary']
    question_text = 'question: ' + record['question']
    references = {key: clean(record[key]) for key in ('answer1', 'answer2')}

    contexts.append(context_text)
    questions.append(question_text)
    answers.append(references)

  return contexts, questions, answers

# Build the evaluation inputs from the joined test-split frame
test_contexts, test_questions, test_answers = format_data(qaps)

## Get answers

In [10]:
import pickle

def _load_answers(filename):
  '''
  Loads one pickled answers file stored in DATA_DIR and returns its contents.

  DATA_DIR already ends with '/', so the file name is appended directly,
  the same way the CSV paths are built.
  '''
  # NOTE: pickle.load can execute arbitrary code while deserialising; only
  # load files that were produced by this project.
  with open(DATA_DIR + filename, 'rb') as f:
    return pickle.load(f)

# Answer sets for each model. Each is expected to be a list of dicts with the
# keys 'answer1', 'answer2' and 'generated_answer', which is what
# evaluate_answers reads.
t5_base_finetuned_answers = _load_answers('t5_base_finetuned')
t5_small_finetuned_answers = _load_answers('t5_small_finetuned')
t5_small_answers = _load_answers('t5_small')

In [48]:
# Show the answer dict of the first sample.
# NOTE(review): `sample_questions` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel unless the defining cell is restored.
print(sample_questions[0][1])

{'answer1': 'through the whole in the duster that miss moppet forgot about', 'answer2': 'a hole in the duster', 'generated_answer': 'he dances a jig'}


In [49]:
# Get answers using positions
# NOTE(review): `answer_tokens`, `answer_starts`, `answer_ends`, `tokenizer` and
# `sample_questions` are not defined in any cell visible in this notebook;
# presumably they come from a span-extraction model run - confirm and restore.
print("Getting answers...")

for i,(source,answer_start,answer_end) in enumerate(zip(answer_tokens, answer_starts,answer_ends)):
  # The end position is treated as inclusive, hence the +1 on the slice bound
  answer = tokenizer.decode(source[answer_start:answer_end+1])
  # Store the cleaned span in the answer dict of sample i
  sample_questions[i][1]['extracted_answer'] = clean(answer)

print("")
print("Extracted answers")

Getting answers...

Extracted answers


In [50]:
# Print each sample's question next to the first human reference and both system answers
for sample in sample_questions:
  # Skip the first 10 characters of the stored question (the length of a 'question: ' prefix)
  print('Question     : ', sample[0][10:])
  sample_answers = sample[1]
  print('Human answer : ', sample_answers['answer1'])
  print('t5-base      : ', sample_answers['generated_answer'])
  print('Longformer   : ', sample_answers['extracted_answer'])
  print('')

Question     :  How does the mouse escape?
Human answer :  through the whole in the duster that miss moppet forgot about
t5-base      :  he dances a jig
Longformer   :  he dances a jig

Question     :  Where is Tyler's dad at the beginning of the story?
Human answer :  seaton mansion
t5-base      :  bartholomew seaton
Longformer   :  seaton mansion

Question     :  Capt. Wynnegate took the blame for what?
Human answer :  the loss of the money
t5-base      :  being a thief
Longformer   :  steals from the family trust fund and speculates heavily. henry loses the fortune, causing them to default on a commitment to an orphans' home.
capt. wynnegate is in love with henry's wife, diana. she does not love her husband and returns the affection of the captain. as the money has been lost, capt. wynnegate agrees to leave england and take the blame (see remittance man). he is then accused of being a thief

Question     :  What did Marilyn have when Colin found her crying?
Human answer :  arthur's 

# Evaluation metrics

In [97]:
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.bleu.bleu import Bleu

def evaluate_answers(answers):
  '''
  Scores generated answers against their two human references.

  Takes a list of dicts, each with the keys 'answer1' and 'answer2'
  (references) and 'generated_answer' (system output). Prints the overall
  ROUGE-L, BLEU-1, BLEU-4, METEOR, CIDEr and exact-match figures multiplied
  by 100 and rounded to two decimals.

  Returns a dict with the keys 'bleu1', 'bleu4', 'meteor', 'rouge' and
  'cider' holding the per-item score sequences produced by each scorer.
  '''
  meteor_scorer = Meteor()
  rouge_scorer = Rouge()
  cider_scorer = Cider()
  bleu_scorer = Bleu(4)  # n-gram orders 1 to 4

  first_refs  = [entry['answer1'] for entry in answers]
  second_refs = [entry['answer2'] for entry in answers]
  hypotheses  = [entry['generated_answer'] for entry in answers]

  assert len(first_refs) == len(second_refs)
  assert len(second_refs) == len(hypotheses)

  # The scorers take dicts mapping an item id to a list of strings:
  # both references on the target side, the single system answer on the response side.
  targets = {idx: [ref1, ref2] for idx, (ref1, ref2) in enumerate(zip(first_refs, second_refs))}
  responses = {idx: [hyp] for idx, hyp in enumerate(hypotheses)}

  bleu_overall, bleu_per_item = bleu_scorer.compute_score(targets, responses, verbose=False)
  # BLEU results come back as one entry per n-gram order; keep orders 1 and 4
  bleu1_score, _, _, bleu4_score = bleu_overall
  bleu1_scores, _, _, bleu4_scores = bleu_per_item

  meteor_score, meteor_scores = meteor_scorer.compute_score(targets, responses)
  rouge_score, rouge_scores = rouge_scorer.compute_score(targets, responses)
  cider_score, cider_scores = cider_scorer.compute_score(targets, responses)

  # Exact match: 1 when the generated answer equals either reference, else 0
  exact_matches = [
      int(ref1 == hyp or ref2 == hyp)
      for ref1, ref2, hyp in zip(first_refs, second_refs, hypotheses)
  ]
  accuracy = sum(exact_matches) / len(exact_matches)

  report = [
      ("ROUGE-L     : ", rouge_score),
      ("BLEU-1      : ", bleu1_score),
      ("BLEU-4      : ", bleu4_score),
      ("METEOR      : ", meteor_score),
      ("CiDER       : ", cider_score),
      ("Exact match : ", accuracy),
  ]
  for label, value in report:
    print(label, round(100*value, 2))

  return {
      'bleu1': bleu1_scores,
      'bleu4': bleu4_scores,
      'meteor': meteor_scores,
      'rouge': rouge_scores,
      'cider': cider_scores,
  }

## T5 small

In [98]:
# Score the t5_small and t5_small_finetuned answer sets; each call prints its
# own metric summary and returns the per-item scores used by the significance tests.
t5_small_scores = evaluate_answers(t5_small_answers)
t5_small_finetuned_scores = evaluate_answers(t5_small_finetuned_answers)

ROUGE-L     :  51.01
BLEU-1      :  29.09
BLEU-4      :  11.43
METEOR      :  23.56
CiDER       :  178.84
Exact match :  29.89
ROUGE-L     :  53.16
BLEU-1      :  44.02
BLEU-4      :  21.42
METEOR      :  25.12
CiDER       :  193.76
Exact match :  29.54


## T5 base

In [29]:
# Score the t5_base_finetuned answer set (prints the summary, keeps the per-item scores)
t5_base_finetuned_scores = evaluate_answers(t5_base_finetuned_answers)

ROUGE-L     :  65.47
BLEU-1      :  58.39
BLEU-4      :  31.95
METEOR      :  32.21
CiDER       :  250.51
Exact match :  39.12


## T5 base generated vs extracted

In [30]:
# Partition the t5_base_finetuned answers by whether the generated text occurs
# verbatim (as a substring) in the cleaned context.
t5_base_finetuned_e_answers = []  # generated answer found in the context ("extracted")
t5_base_finetuned_g_answers = []  # generated answer not found in the context ("generated")

for answer, context in zip(t5_base_finetuned_answers, test_contexts):
  found_in_context = answer['generated_answer'] in clean(context)
  bucket = t5_base_finetuned_e_answers if found_in_context else t5_base_finetuned_g_answers
  bucket.append(answer)

## T5 base extracted answers

In [31]:
# Score only the answers whose generated text was found in the context
t5_base_finetuned_e_scores = evaluate_answers(t5_base_finetuned_e_answers)

ROUGE-L     :  73.02
BLEU-1      :  68.86
BLEU-4      :  43.4
METEOR      :  37.77
CiDER       :  284.8
Exact match :  51.29


## T5 base generated answers

In [32]:
# Score only the answers whose generated text was not found in the context
t5_base_finetuned_g_scores = evaluate_answers(t5_base_finetuned_g_answers)

ROUGE-L     :  47.01
BLEU-1      :  46.27
BLEU-4      :  22.45
METEOR      :  24.84
CiDER       :  166.57
Exact match :  9.31


## Statistical significance tests

In [99]:
# Set the two score sets to be compared.
# Both are dicts returned by evaluate_answers; the tests below look up the same
# metric key in each, and the paired tests compare the two sequences element-wise.

a = t5_small_scores
b = t5_small_finetuned_scores

In [113]:
from scipy.stats import wilcoxon
from scipy.stats import ttest_rel
from scipy.stats import ranksums

# Wilcoxon signed-rank test on the paired per-item scores, one metric at a time
for metric, scores_a in a.items():
  scores_b = b[metric]
  _, p_value = wilcoxon(scores_a, scores_b)
  print(metric + ': ' + str(p_value))

bleu1: 6.968127923345122e-34
bleu4: 1.9291740371023352e-26
meteor: 0.9730947489578484
rouge: 9.474828779834348e-19
cider: 2.640050946034867e-31


In [114]:
# Paired t-test (related samples) on the per-item scores, one metric at a time
for metric, scores_a in a.items():
  scores_b = b[metric]
  _, p_value = ttest_rel(scores_a, scores_b)
  print(metric + ': ' + str(p_value))

bleu1: 1.0305774910212924e-19
bleu4: 4.6057373842315943e-26
meteor: 0.3003812624397412
rouge: 1.0048828572026799e-10
cider: 1.1183156876533103e-20


In [115]:
# Wilcoxon rank-sum test on the per-item scores, one metric at a time.
# NOTE(review): ranksums treats the two score lists as separate samples rather
# than pairs - confirm that is intended for scores computed on the same questions.
for metric, scores_a in a.items():
  scores_b = b[metric]
  _, p_value = ranksums(scores_a, scores_b)
  print(metric + ': ' + str(p_value))

bleu1: 6.557845772770955e-11
bleu4: 4.092284014679699e-09
meteor: 0.14933150153507369
rouge: 0.0025540008106719847
cider: 4.1962492413334195e-07
