## Imports/Prerequisites

In [1]:
import pandas as pd
from tqdm import tqdm
from evaluate import load

def load_dataset(ds_path):
    data = pd.read_csv(ds_path)
    # TODO: Clean the __or__ occurrences and save in usable format for evaluation

    return data

## Variables to be set by user

In [2]:
### Vars

# data, question_type = load_dataset("/home/wallat/temporal-llms/data/Event-focused Questions/Explicitly Time-Scoped Questions.csv"), "explicit"
# data, question_type = load_dataset("/home/wallat/temporal-llms/data/Event-focused Questions/Implicitly Time-Scoped Questions.csv"), "implicit"
# data, question_type = load_dataset("/home/wallat/temporal-llms/data/ArchivalQA/splits/ArchivalQATime/ArchivalQATime_train.csv"), "ArchivalQA_time"
data, question_type = load_dataset("/home/wallat/temporal-llms/data/templama/preprocessed/templamaQA.csv"), "TempLAMA"
# print('Dataset head: ', data.head())

# Text-davinci-003
# prediction_data, model_name = pd.read_csv("/home/wallat/temporal-llms/gpt_text-davinci-003_predictions_explicit_tuned_examples_predictions.csv", sep="\t"), "text-davinci-003"
# prediction_data, model_name = pd.read_csv("/home/wallat/temporal-llms/data/predictions/gpt_text-davinci-003_predictions_implicit_tuned_examples_predictions.csv", sep="\t"), "text-davinci-003"
prediction_data, model_name = pd.read_csv("/home/wallat/temporal-llms/data/predictions/davinci/text-davinci-003_TempLAMA_all_tuned_examples_predictions_2023_05_30T09:17.csv_ordered", sep="\t"), "text-davinci-003"
# Alpaca-7B
# prediction_data, model_name = pd.read_csv("/home/wallat/temporal-llms/alpaca-7b_predictions_explicit.csv", sep="\t"), "alpaca-7B"
# prediction_data, model_name = pd.read_csv("/home/wallat/temporal-llms/alpaca-7b_predictions_implicit.csv", sep="\t"), "alpaca-7B"

# print("Model predictions head: ", prediction_data.head())

## Evaluation

In [3]:
# Converts dataset to reference format (huggingface evaluate)
# Also splits temporal questions answers that contains multiple possible answers with __or__

dataset = []

for index in range(0, len(data)):
    row = data.iloc[index]

    answers = row['Answer']
    if "__or__" in answers:
        answers = answers.split("__or__")
    else:
        answers = [answers]
    
    dataset.append({"id": str(index), "question": row['Question'], "answers": answers, "type": question_type})

print(dataset[14])

# Convert to reference format (evaluate library)
# {'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '0'}
references = []

for ele in dataset:
    answers = ele['answers']
    answer_starts = []

    ref = {"id": str(ele["id"]), 'answers': {'answer_start': len(answers)*[0], 'text': answers}}

    references.append(ref)

print(references[14])

{'id': '14', 'question': 'David Beckham played for which team in 2013?', 'answers': ['Paris Saint-Germain'], 'type': 'TempLAMA'}
{'id': '14', 'answers': {'answer_start': [0], 'text': ['Paris Saint-Germain']}}


In [4]:
# Converts predictions to reference format

predictions = []

for index in range(0, len(prediction_data)):
    # print(index)
    row = prediction_data.iloc[index]

    predictions.append({"prediction_text": row['answer'], "id": str(row['q_id'])})

predictions[14]

{'prediction_text': 'Paris Saint-Germain.', 'id': '14'}

In [5]:
# EM/F1
squad_metric = load("squad")
results = squad_metric.compute(predictions=predictions, references=references)


# New metric "contains answer"
num_contains = 0

for pred, ref in zip(predictions, references):
    # print(pred)
    # print(ref)

    predicted_answer = pred['prediction_text'].lower()
    ref_answers = ref['answers']['text']
    ref_answers = [x.lower() for x in ref_answers]

    # print(predicted_answer)
    # print(ref_answers)

    contained = False
    for ref_answer in ref_answers:
        if ref_answer in predicted_answer:
            contained = True
    
    if contained:
        num_contains += 1

# print(f"The ground-truth answer was contained in the prediction {num_contains} times -> {num_contains/len(references)}")
results["contains"] = (num_contains/len(references)) * 100

print(f"The {model_name} model on TemporalQuestions {question_type}:\n{results}")

The text-davinci-003 model on TemporalQuestions TempLAMA:
{'exact_match': 16.289008149473265, 'f1': 30.706724000224852, 'contains': 17.563108725899422}


In [7]:
predictions[:3]

preds = []
for pred in predictions:
    preds.append(pred['prediction_text'])

refs = []
for ref in references:
    refs.append(ref['answers']['text'][0])

refs[:3]

['The Church of England', 'Mavericks', 'Buckley']

In [8]:
bertscore_metric = load("bertscore")
results = bertscore_metric.compute(predictions=preds[:3], references=refs[:3], lang="en", idf=True)

results

{'precision': [0.8026286959648132, 0.8540196418762207, 0.9622761011123657],
 'recall': [0.8769065737724304, 0.8983532190322876, 0.9246299266815186],
 'f1': [0.8381251096725464, 0.8756256699562073, 0.9430775046348572],
 'hashcode': 'roberta-large_L17_idf_version=0.3.12(hug_trans=4.29.0.dev0)'}

In [1]:
d = {'exact_match': 47.6, 'f1': 60.71807574167095, 'contains': 66.4}

d.append(results)

AttributeError: 'dict' object has no attribute 'append'

In [33]:
# Log to wandb
# import wandb

# naming format: run_name = f"{args.model_name}_{ds_name}_{question_type}_{args.prompt_name}"
# wandb.init(project="temporal-ir", name=f"{model_name}_TemporalQuestions_{question_type}_default")
# wandb.log(results)