# Evaluation

In [None]:
import os
import re
import argparse
import json
import sys

In [None]:


import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq, DataCollatorWithPadding
from datasets import Dataset, load_dataset

from pprint import pprint

from tqdm.notebook import tqdm

device = "cuda:0"# "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        tx = re.sub(r'\b(a|an|the)\b.', ' ', text)
        tx = tx.replace('pad', '').replace('s', '')
        return tx

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
        total += 1
        exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
          f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

In [None]:
# process the examples in input and target text format and the eos token at the end 
def add_eos_to_examples(example):
    example['input_text'] = 'question: %s context: %s' % (example['paragraphs'][0]['qas'][0]['question'], example['paragraphs'][0]['context'])
    example['target_text'] = '%s' % example['paragraphs'][0]['qas'][0]['answers'][0]['text']
    return example

# tokenize the examples
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input_text'], pad_to_max_length=True, max_length=512)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target_text'], pad_to_max_length=True, max_length=16)

    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }

    return encodings

# MLQA

In [None]:
model = MT5ForConditionalGeneration.from_pretrained('/ist/ist-share/scads/korn/MRC/tmp/mt5-mlqa/').to(device) 
tokenizer = MT5Tokenizer.from_pretrained('/ist/ist-share/scads/korn/MRC/tmp/mt5-mlqa/')

### Your dataset

In [None]:
data_files = {"test": "/ist/ist-share/scads/korn/datasets/qa_datasset/MLQA/dev/dev-context-ar-question-vi.json"}
test_dataset = load_dataset("json", data_files=data_files, split="test", field='data')

In [None]:
dataset_features = test_dataset.map(add_eos_to_examples, load_from_cache_file=False)
dataset_features = dataset_features.map(convert_to_features, batched=True, load_from_cache_file=False)

# set the tensor type and the columns which the dataset should return
columns = ['input_ids', 'target_ids', 'attention_mask', 'target_attention_mask']
dataset_features.set_format(type='torch', columns=columns)
dataloader = torch.utils.data.DataLoader(dataset_features, batch_size=32)

In [None]:
answers = []
for batch in tqdm(dataloader):
    outs = model.generate(input_ids=batch['input_ids'].to(device), 
                        attention_mask=batch['attention_mask'].to(device),
                        max_length=16,
                        early_stopping=True)
    outs = [tokenizer.decode(ids) for ids in outs]
    answers.extend(outs)
    
predictions = []
references = []
for ref, pred in zip(test_dataset, answers):
    predictions.append(pred)
    references.append([ref['paragraphs'][0]['qas'][0]['answers'][0]['text']])

evaluate(references, predictions)

# XORQA

In [None]:
model = MT5ForConditionalGeneration.from_pretrained('/ist/ist-share/scads/korn/MRC/tmp/mt5-xorqa/').to(device) 
tokenizer = MT5Tokenizer.from_pretrained('/ist/ist-share/scads/korn/MRC/tmp/mt5-xorqa/')

### Your dataset

In [None]:
data_files = {"test": "/ist/ist-share/scads/korn/datasets/tydi_xor_gp/gp_squad_dev_data.json"}
test_dataset = load_dataset("json", data_files=data_files, split="test", field='data')

In [None]:
dataset_features = test_dataset.map(add_eos_to_examples, load_from_cache_file=False)
dataset_features = dataset_features.map(convert_to_features, batched=True, load_from_cache_file=False)

# set the tensor type and the columns which the dataset should return
columns = ['input_ids', 'target_ids', 'attention_mask', 'target_attention_mask']
dataset_features.set_format(type='torch', columns=columns)
dataloader = torch.utils.data.DataLoader(dataset_features, batch_size=32)

In [None]:
answers = []
for batch in tqdm(dataloader):
    outs = model.generate(input_ids=batch['input_ids'].to(device), 
                        attention_mask=batch['attention_mask'].to(device),
                        max_length=16,
                        early_stopping=True)
    outs = [tokenizer.decode(ids) for ids in outs]
    answers.extend(outs)
    
predictions = []
references = []
for ref, pred in zip(test_dataset, answers):
    predictions.append(pred)
    references.append([ref['paragraphs'][0]['qas'][0]['answers'][0]['text']])

evaluate(references, predictions)

# XQuAD

In [None]:
import os
import glob

In [None]:
# final ckp
XQUAD_FINETUNED_MODEL_DIR = '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/t5-base.seq2seq.squad_hparams.bz-64.grad_acc-1.lr-1e-4.epochs-25'

# model = MT5ForConditionalGeneration.from_pretrained(XQUAD_FINETUNED_MODEL_DIR).to(device)
# tokenizer = MT5Tokenizer.from_pretrained(XQUAD_FINETUNED_MODEL_DIR)

In [8]:
XQUAD_FINETUNED_MODEL_DIRS = sorted(glob.glob(os.path.join(XQUAD_FINETUNED_MODEL_DIR, 'checkpoint-*')), key=lambda x: int(x.split('-')[-1]) )
XQUAD_FINETUNED_MODEL_DIRS

['/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/t5-base.seq2seq.squad_hparams.bz-64.grad_acc-2.lr-1e-4.epochs-10/checkpoint-684',
 '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/t5-base.seq2seq.squad_hparams.bz-64.grad_acc-2.lr-1e-4.epochs-10/checkpoint-1368',
 '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/t5-base.seq2seq.squad_hparams.bz-64.grad_acc-2.lr-1e-4.epochs-10/checkpoint-2052',
 '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/t5-base.seq2seq.squad_hparams.bz-64.grad_acc-2.lr-1e-4.epochs-10/checkpoint-2736',
 '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/t5-base.seq2seq.squad_hparams.bz-64.grad_acc-2.lr-1e-4.epochs-10/checkpoint-3420',
 '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/t5-base.seq2seq.squad_hparams.bz-64.grad_acc-2.lr-1e-4.epochs-10/checkpoint-4104',
 '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/t5-base.seq2seq.squad_hparams.bz-64.grad_acc-2.lr-1e-4.epochs-

In [9]:
# model/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/exp001_r9.1.t5-large.seq2seq.squad_hparams.bz-8.grad_acc-2.lr-1e-3.max_steps-684/

### Your dataset

In [10]:
T5_TOKENIZER_MODEL_DIR = '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/exp001_r9.1.t5-large.seq2seq.squad_hparams.bz-8.grad_acc-2.lr-1e-3.max_steps-684'

tokenizer = MT5Tokenizer.from_pretrained(T5_TOKENIZER_MODEL_DIR)


In [11]:
SQUAD_EN_DATA_DIR = '/ist/ist-share/scads/aires/CL-ReLKT/mrc_training/data/xquad/en/'

squad_en = { 
     'train': json.load(open(os.path.join(SQUAD_EN_DATA_DIR, 'train-v1.1.json')))['data'],
     'validation': json.load(open(os.path.join(SQUAD_EN_DATA_DIR, 'dev-v1.1.json')))['data']
}

In [12]:
def get_squad_answer_str(context, qas):
    context_qa_pairs = []
    for qa in qas:
        qid = qa['id']
        question = qa['question']
        answer = qa['answers'][0]['text']
        answer_start = qa['answers'][0]['answer_start']
        context_qa_pairs.append((qid, context, question, answer, answer_start))
    return context_qa_pairs

In [13]:
squad_dataset = defaultdict(lambda : dict())
for split_name in ['train', 'validation']:
    for i, item in enumerate(squad_en[split_name]):
        paragraphs = item['paragraphs']
#         print('.' ,end='')
        for j, paragraph in enumerate(paragraphs):

            context = paragraph['context']
            context_qa_pairs = get_squad_answer_str(context=context, qas=paragraph['qas'])

            for context_qa_pair in context_qa_pairs:
                qid, context, question, answer, answer_start = context_qa_pair

                qa_item = {
                    'qid': qid,
                    'question': question,
                    'context': context,
                    'answer': answer,
                    'answer_start': answer_start,
                }
                squad_dataset[split_name][qid] = qa_item
    
    print(f'Number of {split_name} examples: {len(squad_dataset[split_name]):,}')

Number of train examples: 87,599
Number of validation examples: 10,570


In [14]:
squad_dev = list(squad_dataset['validation'].values())

len(squad_dev)

10570

In [15]:
squad_dev[0]

{'qid': '56be4db0acb8001400a502ec',
 'question': 'Which NFL team represented the AFC at Super Bowl 50?',
 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'answer': 'Denver Broncos',
 'answer_start': 177}

In [16]:
# process the examples in input and target text format and the eos token at the end 
def add_eos_to_examples(example):
    result = {}
    context = example['context']
    question = example['question']
    answer = example['answer']

    result['input_text'] =  'question: %s context: %s' % (question, context)
    result['target_text'] = '%s' % answer
    
    return result

def convert_to_features(example):

    encoding = {}
    
    input_encoding = tokenizer.encode_plus(example['input_text'],
                                           pad_to_max_length=True,
                                           max_length=384, add_special_tokens=True)
    target_encoding = tokenizer.encode_plus(example['target_text'],
                                            pad_to_max_length=True,
                                            max_length=16, add_special_tokens=True)

    encoding['input_ids'] = input_encoding['input_ids'] 
    encoding['attention_mask'] = input_encoding['attention_mask'] 
    encoding['target_ids'] = target_encoding['input_ids'] 
    encoding['target_attention_mask'] =  target_encoding['attention_mask'] 

    # print(f"type(encodings['input_ids']: {type(encodings['input_ids'])}")
    return encoding

In [17]:
squad_dev_features = list(map(convert_to_features, map(add_eos_to_examples, squad_dev)))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [18]:
tokenizer.pad_token_id

0

In [19]:
references = [[item['answer']] for item in squad_dev]
len(references)

10570

### Evaluate for each ckp

In [20]:
xquad_en_scores = []
for MODEL_DIR in XQUAD_FINETUNED_MODEL_DIRS:
# MODEL_DIR = '/ist/ist-share/scads/aires/'
    print(f'MODEL_DIR: {MODEL_DIR}')
    model_ckp = MODEL_DIR.split('-')[-1]

    model = MT5ForConditionalGeneration.from_pretrained(MODEL_DIR).to(device)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer,
                                            padding=True,
                                            max_length=384,
                                            pad_to_multiple_of=8,
                                            return_tensors='pt')
    data_loader = torch.utils.data.DataLoader(squad_dev_features,
                                              batch_size=80,
                                              collate_fn=data_collator)

    predictions = []
    answers = []
    c = 0
    for i, batch in tqdm(enumerate(data_loader)):
        batch_size = len(batch['input_ids'])
        # print('batch size', len(batch['input_ids']))
        outs = model.generate(input_ids=batch['input_ids'].to(device), 
                            attention_mask=batch['attention_mask'].to(device),
                            max_length=16,
                            early_stopping=True,
                            num_beams=1)
      
        answer = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
        if c < 1:
            print(f'  refs: {references[:10]}')
            print(f'answer: {answer[:10]}')
            c+=1
        answers.extend(answer)
        # print(f'answers: {answers}')
        # break
    # break
    predictions = answers

    eval_results = evaluate(references, predictions)
    print(eval_results)
    print('\n\n')
    xquad_en_scores.append({
        'model_ckp': model_ckp,
        'model_dir': MODEL_DIR,
        **eval_results,
    })


MODEL_DIR: /ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoints/t5-base.seq2seq.squad_hparams.bz-64.grad_acc-2.lr-1e-4.epochs-10/checkpoint-684


0it [00:00, ?it/s]



  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['Denver Broncos', 'Carolina Panthers', "Levi's Stadium", 'Carolina Panthers', 'gold', 'golden anniversary', 'February 7, 2016', 'American Football Conference', 'golden anniversary', 'American Football Conference']
  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['CBS', '$5 million', 'Coldplay', 'Beyoncé and Bruno Mars', 'Super Bowl XLVII', 'CBS', '$5 million', 'Beyoncé and Bruno Mars', 'Beyoncé and Bruno Mars', 'Beyoncé and Bruno Mars']
{'exact_match': 63.01797540208136, 'f1': 78.0354734846366}



MODEL_DIR: /ist/ist-sh

0it [00:00, ?it/s]

  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['Denver Broncos', 'Carolina Panthers', 'San Francisco Bay Area', 'Carolina Panthers', 'golden', 'golden anniversary', 'February 7, 2016', 'American Football Conference', 'golden anniversary', 'American Football Conference']
  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['CBS', '$5 million', 'Coldplay', 'Beyoncé and Bruno Mars', 'Super Bowl 50', 'CBS', '$5 million', 'Beyoncé', 'Beyoncé', 'Beyoncé and Bruno Mars']
{'exact_match': 65.279091769158, 'f1': 79.88542789132555}



MODEL_DIR: /ist/ist-share/scads/aires/CL-ReLKT

0it [00:00, ?it/s]

  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['Denver Broncos', 'Carolina Panthers', "Levi's Stadium", 'Carolina Panthers', 'golden', 'golden anniversary', 'February 7, 2016', 'American Football Conference', 'golden anniversary', 'American Football Conference']
  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['CBS', '$5 million', 'Coldplay', 'Beyoncé and Bruno Mars', 'Super Bowl 50', 'CBS', '$5 million', 'Beyoncé', 'Beyoncé', 'Beyoncé']
{'exact_match': 66.27246925260171, 'f1': 80.84468842653816}



MODEL_DIR: /ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpo

0it [00:00, ?it/s]

  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['Denver Broncos', 'Carolina Panthers', 'San Francisco Bay Area', 'Carolina Panthers', 'gold', 'golden anniversary', 'February 7, 2016', 'American Football Conference', 'golden anniversary', 'American Football Conference']
  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['CBS', '$5 million', 'Coldplay', 'Beyoncé and Bruno Mars', 'Super Bowl 50', 'CBS', '$5 million', 'Beyoncé', 'Beyoncé', 'Beyoncé']
{'exact_match': 67.13339640491958, 'f1': 81.50835245888655}



MODEL_DIR: /ist/ist-share/scads/aires/CL-ReLKT/mrc_training/c

0it [00:00, ?it/s]

  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['Denver Broncos', 'Carolina Panthers', "Levi's Stadium", 'Carolina Panthers', 'gold', 'golden anniversary', 'February 7, 2016', 'American Football Conference', 'golden anniversary', 'American Football Conference']
  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['CBS', '$5 million', 'Coldplay', 'Beyoncé and Bruno Mars', 'Super Bowl 50', 'CBS', '$5 million', 'Beyoncé', 'Beyoncé', 'Beyoncé']
{'exact_match': 67.04824976348155, 'f1': 81.2873356740876}



MODEL_DIR: /ist/ist-share/scads/aires/CL-ReLKT/mrc_training/checkpoint

0it [00:00, ?it/s]

  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['Denver Broncos', 'Carolina Panthers', "Levi's Stadium in the San Francisco Bay Area at Santa Clara,", 'Denver Broncos', 'gold', 'golden anniversary', 'February 7, 2016', 'American Football Conference', 'golden anniversary', 'American Football Conference']
  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['CBS', '$5 million', 'Coldplay', 'Beyoncé and Bruno Mars', 'Super Bowl 50', 'CBS', '$5 million', 'Beyoncé', 'Beyoncé', 'Beyoncé']
{'exact_match': 67.33207190160833, 'f1': 81.53106977741842}



MODEL_DIR: /ist/ist-share/

0it [00:00, ?it/s]

  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['Denver Broncos', 'Carolina Panthers', "Levi's Stadium in the San Francisco Bay Area at Santa Clara,", 'Denver Broncos', 'gold', 'golden anniversary', 'February 7, 2016', 'American Football Conference', 'golden anniversary', 'American Football Conference']
  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['CBS', '$5 million', 'Coldplay', 'Beyoncé and Bruno Mars', 'Super Bowl 50', 'CBS', '$5 million', 'Beyoncé', 'Beyoncé', 'Beyoncé']
{'exact_match': 66.7833491012299, 'f1': 81.50182872109056}



MODEL_DIR: /ist/ist-share/s

0it [00:00, ?it/s]

  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['Denver Broncos', 'Carolina Panthers', "Levi's Stadium in the San Francisco Bay Area at Santa Clara,", 'Carolina Panthers', 'gold', 'golden anniversary', 'February 7, 2016', 'American Football Conference', 'golden anniversary', 'American Football Conference']
  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['CBS', '$5 million', 'Coldplay', 'Beyoncé and Bruno Mars', 'Super Bowl XLVII', 'CBS', '$5 million', 'Beyoncé', 'Bruno Mars', 'Beyoncé']
{'exact_match': 66.49006622516556, 'f1': 81.0863031085037}



MODEL_DIR: /ist/is

0it [00:00, ?it/s]

  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['Denver Broncos', 'Carolina Panthers', "Levi's Stadium in the San Francisco Bay Area at Santa Clara,", 'Denver Broncos', 'golden', 'golden anniversary', 'February 7, 2016', 'American Football Conference', 'golden anniversary', 'American Football Conference']
  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['CBS', '$5 million', 'Coldplay', 'Beyoncé and Bruno Mars', 'Super Bowl 50', 'CBS', '$5 million', 'Beyoncé', 'Beyoncé', 'Beyoncé']
{'exact_match': 66.66035950804162, 'f1': 81.46607822238964}



MODEL_DIR: /ist/ist-shar

0it [00:00, ?it/s]

  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['Denver Broncos', 'Carolina Panthers', "Levi's Stadium in the San Francisco Bay Area at Santa Clara,", 'Denver Broncos', 'golden', 'golden anniversary', 'February 7, 2016', 'American Football Conference', 'golden anniversary', 'American Football Conference']
  refs: [['Denver Broncos'], ['Carolina Panthers'], ['Santa Clara, California'], ['Denver Broncos'], ['gold'], ['"golden anniversary"'], ['February 7, 2016'], ['American Football Conference'], ['"golden anniversary"'], ['American Football Conference']]
answer: ['CBS', '$5 million', 'Coldplay', 'Beyoncé and Bruno Mars', 'Super Bowl 50', 'CBS', '$5 million', 'Beyoncé', 'Bruno Mars', 'Beyoncé']
{'exact_match': 66.38599810785242, 'f1': 81.36083853364462}





In [None]:
# answers
evaluate(references[:len(answers)], predictions=answers)

In [None]:
references[0:5]

In [None]:
len(predictions)

In [None]:
# predictions[:10]

In [None]:
# references[:10]

In [None]:
# evaluate(references, predictions)

In [None]:
print('\tPrediction \t|\t Groundtruth')
list(zip(predictions[:10], references[:10]))