In [1]:
import pandas as pd

## Dataset with true vs. machine answers
The machine answers were generated beforehand by the GPT models Babbage and Davinci (the dataset's column names spell the latter "Devinci").

In [2]:
# Location of the pre-computed QA results (reference answers plus the
# Babbage and Davinci model answers), relative to the notebook directory.
csv = "data/test_gpt_qa_babbage_davinci_21Mar2023.csv"

# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=csv)

# Preview five randomly drawn rows; no random_state is set, so the rows shown
# differ from run to run.
df.sample(n=5)

Unnamed: 0,Context,Question,True Answer,Babbage Answer,Devinci Answer
53,"Father Joseph Carrier, C.S.C. was Director of ...",What professorship did Father Josh Carrier hol...,Professor of Chemistry and Physics,Father Josh Carrier held the professorship of ...,Professor of Chemistry and Physics.
19,The College of Engineering was established in ...,The College of Science began to offer civil en...,the 1870s,The College of Science began to offer civil en...,1870s
125,Hesburgh is also credited with transforming th...,In what year did Notre Dame have its earliest ...,1971,In 1971 Mary Ann Proctor became the first fema...,1971
126,Hesburgh is also credited with transforming th...,With what institute did Notre Dame agree to an...,Saint Mary's College,Notre Dame agreed to an exchange program with ...,Saint Mary's College
49,"Father Joseph Carrier, C.S.C. was Director of ...",What person was the Director of the Science Mu...,"Father Joseph Carrier, C.S.C.","Father Joseph Carrier, C.S.C.","Father Joseph Carrier, C.S.C."


## Unit tests of text_eval_utils

In [3]:
import text_eval_utils as tteval

In [4]:
# Positional index of the example row used for the single-answer checks below.
J = 23

# Pull the reference answer and both model answers out of row J.
a, a_babbage, a_devinci = (
    df.iloc[J][column]
    for column in ("True Answer", "Babbage Answer", "Devinci Answer")
)

# Display the three strings side by side as the cell output.
a, a_babbage, a_devinci

('U.S. News & World Report',
 'The First Year of Studies program was declared "outstanding" by U.S. News & World Report in their 2018 guide to undergraduate colleges.',
 'U.S. News & World Report.')

In [5]:
# Score the Babbage answer for the example row against the reference answer.
babbage_scores = tteval.PrecisionRecallF1(
    true_answer=a,
    machine_answer=a_babbage,
)

# Show (precision, recall, f1) as the cell output.
tuple(getattr(babbage_scores, metric) for metric in ("precision", "recall", "f1"))

(0.2608695652173913, 1.0, 0.41379310016646853)

In [6]:
# Score the Davinci answer for the example row against the reference answer.
devinci_scores = tteval.PrecisionRecallF1(
    true_answer=a,
    machine_answer=a_devinci,
)

# Show (precision, recall, f1) as the cell output.
tuple(getattr(devinci_scores, metric) for metric in ("precision", "recall", "f1"))

(1.0, 1.0, 0.999999995)

In [7]:
# Evaluate every Babbage answer against its reference answer in one call.
scores = tteval.get_batch_metrics(
    array_true_answers=df["True Answer"],
    array_machine_answers=df["Babbage Answer"],
)

# Store each metric as its own column next to the answers
# (same column order as before: Precision, Recall, F1).
for metric_key, column_suffix in (
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("f1", "F1"),
):
    df[f"Babbage Answer {column_suffix}"] = scores[metric_key]

In [8]:
# Evaluate every Davinci answer against its reference answer in one call.
scores = tteval.get_batch_metrics(
    array_true_answers=df["True Answer"],
    array_machine_answers=df["Devinci Answer"],
)

# Store each metric as its own column next to the answers
# (same column order as before: Precision, Recall, F1).
for metric_key, column_suffix in (
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("f1", "F1"),
):
    df[f"Devinci Answer {column_suffix}"] = scores[metric_key]

In [9]:
# Spot-check five randomly drawn rows, now including the per-answer metric columns.
df.sample(n=5)

Unnamed: 0,Context,Question,True Answer,Babbage Answer,Devinci Answer,Babbage Answer Precision,Babbage Answer Recall,Babbage Answer F1,Devinci Answer Precision,Devinci Answer Recall,Devinci Answer F1
103,The success of its football team made Notre Da...,"Catholic people identified with Notre Dame, wh...",the Protestant establishment,Catholics rallied around the team and listen t...,Protestantism.,0.107143,1.0,0.193548,0.0,0.0,0.0
124,Hesburgh is also credited with transforming th...,What title did Thomas Blantz have at Notre Dame?,Vice President of Student Affairs,"""Notre Dame's Vice President of Student Affairs.""",Vice President of Student Affairs,0.625,1.0,0.769231,1.0,1.0,1.0
125,Hesburgh is also credited with transforming th...,In what year did Notre Dame have its earliest ...,1971,In 1971 Mary Ann Proctor became the first fema...,1971,0.0625,1.0,0.117647,1.0,1.0,1.0
32,The Joan B. Kroc Institute for International P...,To whom was John B. Kroc married?,Ray Kroc,"Joan B. Kroc was married to John B. Kroc, the ...",John B. Kroc was married to McDonald's owner R...,0.142857,1.0,0.25,0.222222,1.0,0.363636
35,The library system of the university is divide...,What is the name of the main library at Notre ...,Theodore M. Hesburgh Library,The name of the main library at Notre Dame is ...,The Theodore M. Hesburgh Library.,0.285714,1.0,0.444444,0.8,1.0,0.888889


In [10]:
# Average each Babbage metric column over all rows.
babbage_mean_precision, babbage_mean_recall, babbage_mean_f1 = (
    df[f"Babbage Answer {metric}"].mean() for metric in ("Precision", "Recall", "F1")
)

# Build the two-line report (header, then the three means to 4 decimals) and print it.
str_out = "\n".join(
    [
        "Scores for GPT-Babbage QA:",
        f"Precision = {babbage_mean_precision:.4f}, "
        f"Recall = {babbage_mean_recall:.4f}, "
        f"F1 = {babbage_mean_f1:.4f}",
    ]
)
print(str_out)

Scores for GPT-Babbage QA:
Precision = 0.2491, Recall = 0.7285, F1 = 0.3175


In [11]:
# Average each Davinci metric column over all rows.
devinci_mean_precision, devinci_mean_recall, devinci_mean_f1 = (
    df[f"Devinci Answer {metric}"].mean() for metric in ("Precision", "Recall", "F1")
)

# Build the two-line report (header, then the three means to 4 decimals) and print it.
str_out = "\n".join(
    [
        "Scores for GPT-Devinci QA:",
        f"Precision = {devinci_mean_precision:.4f}, "
        f"Recall = {devinci_mean_recall:.4f}, "
        f"F1 = {devinci_mean_f1:.4f}",
    ]
)
print(str_out)

Scores for GPT-Devinci QA:
Precision = 0.6816, Recall = 0.8327, F1 = 0.7168
