## Prompt comparison results

In [1]:
import json
import os
import subprocess

import pandas as pd

from pipeline.constants import TEST_SET_PATH

# Model whose stored predictions are analysed. Keep exactly one assignment
# active; the value is interpolated into the path results/{retriever}/{model}.
# model = "code-davinci-002"
model = "text-davinci-003"
# model = "gpt-3.5-turbo"
# Retrieval setting, interpolated into the same results path. Keep exactly one
# assignment active; the commented lines are the alternative settings.
retriever = "monot5-base-msmarco-10k"
# retriever = "monot5-3b-msmarco-10k"
# retriever = "random"
# retriever = "closed-book"
# retriever = "gold"

In [2]:
# Calculate QASPER results: run qasper_evaluator.py once for every predictions
# file (*.jsonl) found under results/{retriever}/{model}.
#
# The command is passed as an argument list rather than an interpolated shell
# string, so paths containing spaces or shell metacharacters reach the
# evaluator unchanged.
for root, _dirs, files in os.walk(f"results/{retriever}/{model}"):
    for file in files:
        if not file.endswith(".jsonl"):
            continue
        predictions_path = os.path.join(root, file)
        completed = subprocess.run(
            [
                "python",
                "qasper_evaluator.py",
                "--predictions",
                predictions_path,
                "--gold",
                str(TEST_SET_PATH),
                "--text_evidence_only",
            ],
            check=False,
        )
        # Report a non-zero exit status so a failed evaluation is not mistaken
        # for a finished one; keep going with the remaining files.
        if completed.returncode != 0:
            print(
                f"qasper_evaluator.py exited with code {completed.returncode} "
                f"for {predictions_path}"
            )

In [3]:
# Gather the evaluator output files under results/{retriever}/{model} into
# results_dict. Each entry maps os.path.join(<last "/"-component of the
# directory>, <file name without its results suffix>) to the scores scaled
# by 100.
results_dict = {}
for root, _dirs, files in os.walk(f"results/{retriever}/{model}"):
    for file in files:
        if not file.endswith("results.json"):
            continue
        # Context manager so the file handle is closed right after parsing.
        with open(os.path.join(root, file)) as results_file:
            results = json.load(results_file)
        # Drop the trailing 13 characters: "results.json" (12 characters) plus
        # the single separator character in front of it.
        model_name = file[:-13]
        retrieval_name = root.split("/")[-1]
        f1_by_type = results["Answer F1 by type"]
        results_dict[os.path.join(retrieval_name, model_name)] = {
            "Extractive": f1_by_type["extractive"] * 100,
            "Abstractive": f1_by_type["abstractive"] * 100,
            "Yes/No": f1_by_type["boolean"] * 100,
            "Unanswerable": f1_by_type["none"] * 100,
            "Answer F1": results["Answer F1"] * 100,
            "Evidence F1": results["Evidence F1"] * 100,
        }

In [9]:
# code-davinci-002 results: one row per results_dict entry, scores rounded to
# one decimal, rows ordered by ascending Answer F1.
df = (
    pd.DataFrame.from_dict(results_dict, orient="index")
    .round(1)
    .sort_values(by="Answer F1")
)
df

Unnamed: 0,Extractive,Abstractive,Yes/No,Unanswerable,Answer F1,Evidence F1
code-davinci-002\qasper_zeroshot_prompt,40.0,17.1,82.8,51.7,42.3,26.8
code-davinci-002\qasper_fewshot_qc,50.5,26.3,70.7,9.0,44.6,26.8
code-davinci-002\qasper_cot_prompt,51.6,25.2,75.5,30.1,46.9,41.9
code-davinci-002\qasper_document_fewshot_prompt,54.1,27.4,80.9,11.2,48.7,26.8
code-davinci-002\qasper_fewshot_prompt,56.5,27.1,81.4,20.4,50.7,26.8


In [4]:
# text-davinci-003 results: one row per results_dict entry, scores rounded to
# one decimal, rows ordered by ascending Answer F1.
df = (
    pd.DataFrame.from_dict(results_dict, orient="index")
    .round(1)
    .sort_values(by="Answer F1")
)
df

Unnamed: 0,Extractive,Abstractive,Yes/No,Unanswerable,Answer F1,Evidence F1
text-davinci-003\qasper_fewshot_qc,40.6,22.0,44.4,61.6,38.8,26.8
text-davinci-003\qasper_zeroshot_prompt,40.7,22.4,44.9,61.0,38.9,26.8
text-davinci-003\qasper_cot_prompt,48.6,24.2,72.3,70.1,48.6,49.5
text-davinci-003\qasper_fewshot_prompt,56.6,26.9,80.1,55.6,53.5,26.8


In [26]:
# gpt-3.5-turbo results: one row per results_dict entry, scores rounded to
# one decimal, rows ordered by ascending Answer F1.
df = (
    pd.DataFrame.from_dict(results_dict, orient="index")
    .round(1)
    .sort_values(by="Answer F1")
)
df

Unnamed: 0,Extractive,Abstractive,Yes/No,Unanswerable,Answer F1,Evidence F1
gpt-3.5-turbo\qasper_zeroshot_prompt,36.7,20.9,62.4,79.6,41.1,26.8
gpt-3.5-turbo\qasper_cot_prompt,41.9,19.7,55.2,92.2,45.0,51.5
gpt-3.5-turbo\qasper_cot_prompt_user_assistant,45.2,23.0,58.4,81.0,46.0,51.6
gpt-3.5-turbo\qasper_fewshot_prompt,46.0,23.1,62.3,87.0,48.0,26.8


In [12]:
# unifiedQA results: one row per results_dict entry, scores rounded to one
# decimal, rows ordered by ascending Answer F1.
df = (
    pd.DataFrame.from_dict(results_dict, orient="index")
    .round(1)
    .sort_values(by="Answer F1")
)
df

Unnamed: 0,Extractive,Abstractive,Yes/No,Unanswerable,Answer F1,Evidence F1
unifiedqa\unifiedqa-t5-small,28.1,12.8,48.4,0.8,25.0,26.8
unifiedqa\unifiedqa-v2-t5-large-1251000,37.4,18.2,66.7,0.0,34.5,26.8
unifiedqa\unifiedqa-v2-t5-3b-1251000,43.6,18.6,71.7,2.6,38.7,26.8
unifiedqa\unifiedqa-v2-t5-11b-1251000,44.0,18.9,80.6,19.3,41.7,26.8


In [40]:
# RANDOM results (presumably the "random" retriever setting; confirm):
# scores rounded to one decimal, rows ordered by ascending Answer F1.
df = (
    pd.DataFrame.from_dict(results_dict, orient="index")
    .round(1)
    .sort_values(by="Answer F1")
)
df

Unnamed: 0,Extractive,Abstractive,Yes/No,Unanswerable,Answer F1,Evidence F1
text-davinci-003\qasper_zeroshot_prompt,0.4,0.5,17.8,98.3,18.1,100.0
text-davinci-003\qasper_fewshot_prompt,1.7,1.8,24.5,90.5,18.4,100.0


In [44]:
# QUESTION results (presumably the "closed-book" retriever setting; confirm):
# scores rounded to one decimal, rows ordered by ascending Answer F1.
df = (
    pd.DataFrame.from_dict(results_dict, orient="index")
    .round(1)
    .sort_values(by="Answer F1")
)
df

Unnamed: 0,Extractive,Abstractive,Yes/No,Unanswerable,Answer F1,Evidence F1
text-davinci-003\qasper_question_only_prompt,12.4,10.0,68.7,66.7,26.5,100.0


In [36]:
# GOLD results (presumably the "gold" retriever setting; confirm):
# scores rounded to one decimal, rows ordered by ascending Answer F1.
df = (
    pd.DataFrame.from_dict(results_dict, orient="index")
    .round(1)
    .sort_values(by="Answer F1")
)
df

Unnamed: 0,Extractive,Abstractive,Yes/No,Unanswerable,Answer F1,Evidence F1
text-davinci-003\qasper_zeroshot_prompt,47.5,24.5,39.3,97.4,47.0,100.0
text-davinci-003\qasper_fewshot_prompt,64.1,24.3,65.7,97.7,59.0,100.0


## Evidence F1 scores for different k (number of retrieved passages kept per question)

In [3]:
# Retriever whose stored passages are evaluated; keep one assignment active.
# retrieval_name = "all-mpnet-base-v2"
retrieval_name = "monoT5-3b-msmarco-10k"

# Mapping of question id -> list of passages (presumably ordered best-first;
# confirm against the code that writes this file).
with open(f'retrieval_passages/{retrieval_name}_contents.json', 'r') as f:
    retrieval_psgs = json.load(f)

# Create the output directory once, up front. exist_ok=True avoids the
# check-then-create race and makes re-running the cell safe.
passages_dir = f"results/{retrieval_name}/passages"
os.makedirs(passages_dir, exist_ok=True)

# For each k in 1..5 write {k}.jsonl containing, per question, an empty answer
# and the first k passages as predicted evidence.
for k in range(1, 6):
    predictions = [
        {
            "question_id": q_id,
            "predicted_answer": "",
            "predicted_evidence": passages[:k],
        }
        for q_id, passages in retrieval_psgs.items()
    ]
    with open(f"{passages_dir}/{k}.jsonl", "w") as out_file:
        out_file.writelines(
            json.dumps(prediction) + "\n" for prediction in predictions
        )

In [4]:
# Run qasper_evaluator.py once for every predictions file (*.jsonl) found
# under results/{retrieval_name}/passages.
#
# The command is passed as an argument list rather than an interpolated shell
# string, so paths containing spaces or shell metacharacters reach the
# evaluator unchanged.
for root, _dirs, files in os.walk(f"results/{retrieval_name}/passages"):
    for file in files:
        if not file.endswith(".jsonl"):
            continue
        predictions_path = os.path.join(root, file)
        completed = subprocess.run(
            [
                "python",
                "qasper_evaluator.py",
                "--predictions",
                predictions_path,
                "--gold",
                str(TEST_SET_PATH),
                "--text_evidence_only",
            ],
            check=False,
        )
        # Report a non-zero exit status so a failed evaluation is not mistaken
        # for a finished one; keep going with the remaining files.
        if completed.returncode != 0:
            print(
                f"qasper_evaluator.py exited with code {completed.returncode} "
                f"for {predictions_path}"
            )

In [7]:
# Gather the Evidence F1 (scaled by 100) from every evaluator output file under
# results/{retrieval_name}/passages/, keyed by
# os.path.join(retrieval_name, <file name without its results suffix>).
results_dict = {}
for root, _dirs, files in os.walk(f"results/{retrieval_name}/passages/"):
    for file in files:
        if not file.endswith("results.json"):
            continue
        # Context manager so the file handle is closed right after parsing.
        with open(os.path.join(root, file)) as results_file:
            results = json.load(results_file)
        # Drop the trailing 13 characters: "results.json" (12 characters) plus
        # the single separator character in front of it.
        model_name = file[:-13]
        results_dict[os.path.join(retrieval_name, model_name)] = {
            "Evidence F1": results["Evidence F1"] * 100,
        }

In [8]:
# Evidence F1 per k, rounded to one decimal; transposed so each k is a column.
passages_df = pd.DataFrame.from_dict(results_dict, orient="index").round(1)
passages_df.T

Unnamed: 0,monoT5-3b-msmarco-10k\1,monoT5-3b-msmarco-10k\2,monoT5-3b-msmarco-10k\3,monoT5-3b-msmarco-10k\4,monoT5-3b-msmarco-10k\5
Evidence F1,40.8,38.5,34.4,31.0,28.2


In [101]:
# Evidence F1 per k, rounded to one decimal; transposed so each k is a column.
passages_df = pd.DataFrame.from_dict(results_dict, orient="index").round(1)
passages_df.T

Unnamed: 0,monoT5-base-msmarco-10k\1,monoT5-base-msmarco-10k\2,monoT5-base-msmarco-10k\3,monoT5-base-msmarco-10k\4,monoT5-base-msmarco-10k\5
Evidence F1,37.1,35.6,32.4,29.2,26.8


In [105]:
# Evidence F1 per k, rounded to one decimal; transposed so each k is a column.
passages_df = pd.DataFrame.from_dict(results_dict, orient="index").round(1)
passages_df.T

Unnamed: 0,bm25\1,bm25\2,bm25\3,bm25\4,bm25\5
Evidence F1,21.7,23.6,23.0,22.5,21.2


In [11]:
# Evidence F1 per k, rounded to one decimal; transposed so each k is a column.
passages_df = pd.DataFrame.from_dict(results_dict, orient="index").round(1)
passages_df.T

Unnamed: 0,all-mpnet-base-v2/1,all-mpnet-base-v2/2,all-mpnet-base-v2/3,all-mpnet-base-v2/4,all-mpnet-base-v2/5
Evidence F1,17.2,19.9,19.6,19.4,18.2
