## Imports/Prerequisites

In [25]:
import pandas as pd
from tqdm import tqdm
from evaluate import load

def load_dataset(ds_path):
    data = pd.read_csv(ds_path)
    # TODO: Clean the __or__ occurrences and save in usable format for evaluation

    return data

## Variables to be set by user

In [26]:
### Vars

data, question_type = load_dataset("/home/wallat/temporal-llms/data/Event-focused Questions/Explicitly Time-Scoped Questions.csv"), "explicit"
data, question_type = load_dataset("/home/wallat/temporal-llms/data/Event-focused Questions/Implicitly Time-Scoped Questions.csv"), "implicit"
# print('Dataset head: ', data.head())

# Text-davinci-003
# prediction_data, model_name = pd.read_csv("/home/wallat/temporal-llms/gpt_text-davinci-003_predictions_explicit_tuned_examples_predictions.csv", sep="\t"), "text-davinci-003"
prediction_data, model_name = pd.read_csv("/home/wallat/temporal-llms/gpt_text-davinci-003_predictions_implicit_tuned_examples_predictions.csv", sep="\t"), "text-davinci-003"

# Alpaca-7B
# prediction_data, model_name = pd.read_csv("/home/wallat/temporal-llms/alpaca-7b_predictions_explicit.csv", sep="\t"), "alpaca-7B"
# prediction_data, model_name = pd.read_csv("/home/wallat/temporal-llms/alpaca-7b_predictions_implicit.csv", sep="\t"), "alpaca-7B"

# print("Model predictions head: ", prediction_data.head())

## Evaluation

In [27]:
# Converts dataset to reference format (huggingface evaluate)
# Also splits temporal questions answers that contains multiple possible answers with __or__

dataset = []

for index in range(0, len(data)):
    row = data.iloc[index]

    answers = row['Answer']
    if "__or__" in answers:
        answers = answers.split("__or__")
    else:
        answers = [answers]
    
    dataset.append({"id": str(index), "question": row['Question'], "answers": answers, "type": question_type})

print(dataset[14])

# Convert to reference format (evaluate library)
# {'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '0'}
references = []

for ele in dataset:
    answers = ele['answers']
    answer_starts = []

    ref = {"id": str(ele["id"]), 'answers': {'answer_start': len(answers)*[0], 'text': answers}}

    references.append(ref)

print(references[14])

{'id': '14', 'question': 'Which province had a referendum to ask voters whether it should secede from Canada?', 'answers': ['Quebec'], 'type': 'implicit'}
{'id': '14', 'answers': {'answer_start': [0], 'text': ['Quebec']}}


In [28]:
# Converts predictions to reference format

predictions = []

for index in range(0, len(prediction_data)):
    # print(index)
    row = prediction_data.iloc[index]

    predictions.append({"prediction_text": row['answer'], "id": str(row['q_id'])})

predictions[14]

{'prediction_text': 'Quebec', 'id': '14'}

In [29]:
# EM/F1
squad_metric = load("squad")
results = squad_metric.compute(predictions=predictions, references=references)


# New metric "contains answer"
num_contains = 0

for pred, ref in zip(predictions, references):
    # print(pred)
    # print(ref)

    predicted_answer = pred['prediction_text'].lower()
    ref_answers = ref['answers']['text']
    ref_answers = [x.lower() for x in ref_answers]

    # print(predicted_answer)
    # print(ref_answers)

    contained = False
    for ref_answer in ref_answers:
        if ref_answer in predicted_answer:
            contained = True
    
    if contained:
        num_contains += 1

# print(f"The ground-truth answer was contained in the prediction {num_contains} times -> {num_contains/len(references)}")
results["contains"] = (num_contains/len(references)) * 100

print(f"The {model_name} model on TemporalQuestions {question_type}:\n{results}")

The text-davinci-003 model on TemporalQuestions implicit:
{'exact_match': 47.6, 'f1': 60.71807574167095, 'contains': 66.4}


In [30]:
# Log to wandb
import wandb

# naming format: run_name = f"{args.model_name}_{ds_name}_{question_type}_{args.prompt_name}"
wandb.init(project="temporal-ir", name=f"{model_name}_TemporalQuestions_{question_type}_default")
wandb.log(results)

VBox(children=(Label(value='0.014 MB of 0.023 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.607243…

0,1
contains,▁
exact_match,▁
f1,▁

0,1
contains,72.0
exact_match,56.4
f1,67.17504


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668996214866637, max=1.0…