# Benchmark for Retro-Reader  
This is a benchmark for the retro-reader architecture using ELECTRA Large LM as the baseline. Tests are done with 100 answerable and 100 non-answerable questions sampled from our dataset.

Dependencies

In [None]:
!git clone https://github.com/ShivamIITK21/retro-reader.git

In [None]:
%cd retro-reader

In [None]:
!pip install datasets
!pip install transformers
import datasets
import transformers
import pandas as pd

Loading the Model

In [None]:
from retro_reader import constants as C
from retro_reader import RetroReader

In [None]:
config_file = "configs/inference_en_electra_large.yaml"
retro_reader = RetroReader.load(config_file=config_file)

Test prediction for understanding IO

In [None]:
# Two hand-written question/context records used to try out the inference
# call. Each record carries the "example_id", "guid", "question" and
# "context" fields; the list is wrapped in a pandas DataFrame and converted
# to a `datasets.Dataset`.
pred_examples = datasets.Dataset.from_pandas(pd.DataFrame(data = [
    {
        "example_id": "0",
        "guid": "id-01",
        "question": "What is the most popular game on twitch right now?",
        "context": "Valorant is the most popular game on twitch right now."
    },
    {
        "example_id": "1",
        "guid": "id-02",
        "question": "Which is the most popular drink brand?",
        "context": "Coca Cola is the most popular drink brand."
    }
]))

In [None]:
results = retro_reader.inference(pred_examples)

In [None]:
results

Loading the dataset

In [None]:
import csv
import requests

# load training dataset
def load_data(url=None, timeout=60):
    """Download the benchmark CSV and return it as a pandas DataFrame.

    Every CSV record becomes one DataFrame row and the columns keep their
    default integer labels (0, 1, 2, ...), because the parsed rows are
    handed to ``pd.DataFrame`` without a header.

    Args:
        url: Address of the CSV file. Defaults to the project's Google Drive
            export link when omitted.
        timeout: Seconds to wait for the server to connect / send data
            before giving up (passed straight to ``requests``).

    Returns:
        pandas.DataFrame holding the raw string cells of the CSV.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    import io

    CSV_URL = 'https://drive.google.com/u/0/uc?id=1Z-yb752A3o7b9dqrGt24XU0sl53FVqya&export=download'
    if url is None:
        url = CSV_URL

    with requests.Session() as s:
        download = s.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of parsing an error page as CSV.
        download.raise_for_status()

    decoded_content = download.content.decode('utf-8')
    # Hand the csv module an untranslated text stream (newline='') so that it
    # decides where records end: str.splitlines() would also split on
    # characters such as \x0b, \x0c or \u2028 inside a passage and would drop
    # newlines embedded in quoted fields.
    cr = csv.reader(io.StringIO(decoded_content, newline=''), delimiter=',')
    train_data = pd.DataFrame(cr)
    return train_data

In [None]:
data = load_data()

In [None]:
data

In [None]:
# Fixed random_state so the sampled benchmark subset is the same on every run.
noans = data.loc[data[4] == "False"].sample(n = 100, random_state = 42)

In [None]:
noans

In [None]:
# Fixed random_state so the sampled benchmark subset is the same on every run.
withans = data.loc[data[4] == "True"].sample(100, random_state = 42)

In [None]:
withans

Preprocessing the data

In [None]:
def transform_data(data):
  """Convert raw benchmark rows into a `datasets.Dataset` of inference records.

  Column 3 of each row is used as the question and column 2 as the context.
  Records are numbered in iteration order: "example_id" is the zero-based
  position as a string, "guid" is "id-" followed by the one-based position.
  """
  records = [
      {
          "example_id": str(position),
          "guid": "id-" + str(position + 1),
          "question": row[3],
          "context": row[2],
      }
      for position, (_, row) in enumerate(data.iterrows())
  ]
  return datasets.Dataset.from_pandas(pd.DataFrame(data = records))

In [None]:
withans_processed = transform_data(withans)

In [None]:
withans_processed

In [None]:
noans_processed = transform_data(noans)

In [None]:
noans_processed

In [None]:
def get_truth(data):
  """Collect the ground-truth answer string of every row.

  Column 5 is read as text and its first two and last two characters are
  cut off (presumably the `['` / `']` wrapper of a stringified one-element
  list - verify against the dataset).
  """
  return [row[5][2:-2] for _, row in data.iterrows()]

Inference

In [None]:
result_withans = retro_reader.inference(withans_processed)

In [None]:
result_withans

In [None]:
truth = get_truth(withans)
truth

In [None]:
result_noans = retro_reader.inference(noans_processed)

In [None]:
result_withans[0]

Metrics

In [None]:
import string, re, json, ast

def normalize_text(s):
    """Canonicalize an answer string before comparison.

    Steps, in order: lower-case, delete ASCII punctuation, replace the
    standalone words "a", "an" and "the" with a space, then collapse all
    runs of whitespace into single spaces (trimming both ends).
    """
    lowered = s.lower()
    # Deleting via a translation table drops exactly the characters in string.punctuation.
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)
    return " ".join(without_articles.split())


def compute_exact_match(prediction, truth):
    """Return 1 when prediction and truth are equal after normalization, else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0


def compute_f1(prediction, truth):
    """Token-level F1 between a predicted answer and the ground truth.

    Both strings are normalized and split on whitespace. The overlap is
    counted as a multiset intersection, so a token that occurs several
    times is credited once per matching occurrence; counting distinct
    tokens only would score identical answers with repeated words
    below 1.

    Args:
        prediction: Predicted answer text ("" means "no answer").
        truth: Reference answer text ("" means "no answer").

    Returns:
        1 or 0 when either side is empty (1 only if both are), 0 when no
        token is shared, otherwise the harmonic mean of precision and
        recall as a float.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps, per token, the smaller of the two counts.
    num_same = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

    # if there are no common tokens then f1 = 0
    if num_same == 0:
        return 0

    prec = num_same / len(pred_tokens)
    rec = num_same / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [None]:
result_withans[0]["id-1"]


In [None]:
len(truth)

In [None]:
def computef1_batch_withans(result, truth):
  """Mean F1 over a batch of answerable questions.

  The prediction for the k-th truth entry (one-based) is looked up in
  `result[0]` under the key "id-k". Division by `len(truth)` means an
  empty `truth` raises ZeroDivisionError.
  """
  scores = [
      compute_f1(result[0]["id-" + str(position + 1)], truth[position])
      for position in range(len(truth))
  ]
  return sum(scores)/len(truth)

In [None]:
computef1_batch_withans(result_withans, truth)

In [None]:
def computef1_batch_noans(result, num_examples=100):
  """Mean F1 over a batch of unanswerable questions.

  The expected answer for every example is the empty string, so an
  example scores 1 only when the prediction normalizes to nothing.

  Args:
    result: Inference output whose first element maps "id-k" keys
      (k = 1..num_examples) to predicted answer strings.
    num_examples: How many examples to score; defaults to 100, the
      sample size used in this notebook.

  Returns:
    The average F1 across the scored examples.

  Raises:
    ValueError: if `num_examples` is not positive.
  """
  if num_examples <= 0:
    raise ValueError("num_examples must be a positive integer")
  f1_sum = 0
  for i in range(1, num_examples + 1):
    # Ground truth for a non-answerable question is the empty string.
    f1_sum += compute_f1(result[0]["id-" + str(i)], "")
  return f1_sum/num_examples

In [None]:
computef1_batch_noans(result_noans)

TL;DR: Very high accuracy but also very high inference time (~10 minutes for 100 examples); we will try to optimize this.