In [7]:
import csv
import pandas as pd
import string
import re

In [2]:
squad_base = "/content/benchmark_context.csv"
squad_finetune = "/content/benchmark_fine_tuned.csv"
commomsense_base = "/content/benchmark_context_commonsenqa.csv"

In [83]:
squad_base_res = pd.read_csv(squad_base)

squad_ft_res = pd.read_csv(squad_finetune)

commomsense_base_res = pd.read_csv(commomsense_base)

In [84]:
def normalize_answer(text):
    if text:
        punc = string.punctuation
        text = re.sub(r"[\n\t]*", "", text)
        text = text.lower().strip()
        text = " ".join(text.split())
        return ''.join(char for char in text if char not in punc)
    else:
        return None

In [100]:
def custom_em_metrics(predict, ground_truth):
  if len(normalize_answer(predict)) > len(normalize_answer(ground_truth)):
    if normalize_answer(ground_truth) in normalize_answer(predict):
      return 1
    else:
      return 0
  else:
    return int(normalize_answer(predict) == normalize_answer(ground_truth))

def em_metrics(predict, ground_truth):
  return int(normalize_answer(predict) == normalize_answer(ground_truth))

def acc_metrics(predict, ground_truth):
  if predict:
    return int(normalize_answer(predict) in normalize_answer(ground_truth))
  else:
    return 0


In [107]:
# Benchmark for base model on Squad v2
base_em_scores = []
base_custom_em_scores = []

for index, row in squad_base_res.iterrows():
  answer = row['Answer']
  prediction = row['Prediction']
  # Base model
  em_score = em_metrics(answer, prediction)
  base_em_scores.append(em_score)

  custom_em_score = custom_em_metrics(answer, prediction)
  base_custom_em_scores.append(custom_em_score)

base_em_average = sum(base_em_scores)/len(base_em_scores)
base_custom_em_average = sum(base_custom_em_scores)/len(base_custom_em_scores)
print("EM score for base model: ", base_em_average)
print("Custom EM score for base model: ", base_custom_em_average)

EM score for base model:  0.03
Custom EM score for base model:  0.04


In [98]:
# Benchmark for ft model on Squad v2
ft_em_scores = []
ft_custom_em_scores = []

for index, row in squad_ft_res.iterrows():
  answer = row['Answer']
  prediction = row['Prediction']
  # FT model
  em_score = em_metrics(answer, prediction)
  ft_em_scores.append(em_score)

  custom_em_score = custom_em_metrics(answer, prediction)
  ft_custom_em_scores.append(custom_em_score)

base_em_average = sum(ft_em_scores)/len(ft_em_scores)
base_custom_em_average = sum(ft_custom_em_scores)/len(ft_custom_em_scores)
print("EM score for ft model: ", base_em_average)
print("Custom EM score for ft model: ", base_custom_em_average)

EM score for ft model:  0.31
Custom EM score for ft model:  0.335


In [104]:
# Accuracy on CommonsenseQA
commomsense_base_res = commomsense_base_res.fillna('z')
acc_scores = []
for index, row in commomsense_base_res.iterrows():
  answer = row['Answer key']
  prediction = row['Prediction']
  score = acc_metrics(answer, prediction)
  acc_scores.append(score)
acc = sum(acc_scores)/len(acc_scores)
acc

0.345