In [1]:
import pandas as pd

## Dataset with true vs machine answers 
where machine answers were already given by GPT models Babbage, Devinci

In [2]:
csv = "data/test_gpt_qa_babbage_davinci_21Mar2023.csv"
df = pd.read_csv(csv)
df.sample(5)

Unnamed: 0,Context,Question,True Answer,Babbage Answer,Devinci Answer
67,The Review of Politics was founded in 1939 by ...,Thomas Stritch was an editor of which publican...,Review of Politics,Thomas Stritch was an editor of the Review of ...,The Review of Politics from Notre Dame.
173,The library system also includes branch librar...,In what year did the opening of a theology lib...,2015,The opening of a theology library at Notre Dam...,2015
193,"Besides its prominence in sports, Notre Dame i...",How many individual colleges are part of Notre...,four,There are four colleges that are part of Notre...,Five.
17,The College of Engineering was established in ...,Before the creation of the College of Engineer...,the College of Science,The College of Science was established in 1870...,The College of Science.
81,About 80% of undergraduates and 20% of graduat...,What amount of the graduate student body at No...,20%,The majority of the graduate students on campu...,About 20% of the graduate student body at Notr...


## Unit tests of text_eval_utils

In [3]:
import text_eval_utils as tteval

In [4]:
J = 23
a = df.iloc[J]['True Answer']
a_babbage = df.iloc[J]['Babbage Answer']
a_devinci = df.iloc[J]['Devinci Answer']

a, a_babbage, a_devinci

('U.S. News & World Report',
 'The First Year of Studies program was declared "outstanding" by U.S. News & World Report in their 2018 guide to undergraduate colleges.',
 'U.S. News & World Report.')

In [5]:
babbage_scores = tteval.PrecisionRecallF1(machine_answer=a_babbage, true_answer=a)
babbage_scores.precision, babbage_scores.recall, babbage_scores.f1

(0.2608695652173913, 1.0, 0.41379310016646853)

In [6]:
devinci_scores = tteval.PrecisionRecallF1(machine_answer=a_devinci, true_answer=a)
devinci_scores.precision, devinci_scores.recall, devinci_scores.f1

(1.0, 1.0, 0.999999995)

In [7]:
scores = tteval.get_batch_metrics(
    array_machine_answers=df["Babbage Answer"],
    array_true_answers=df['True Answer']
)

df["Babbage Answer Precision"] = scores['precision']
df["Babbage Answer Recall"] = scores['recall']
df["Babbage Answer F1"] = scores['f1']

In [8]:
scores = tteval.get_batch_metrics(
    array_machine_answers=df["Devinci Answer"],
    array_true_answers=df['True Answer']
)

df["Devinci Answer Precision"] = scores['precision']
df["Devinci Answer Recall"] = scores['recall']
df["Devinci Answer F1"] = scores['f1']

In [9]:
df.sample(5)

Unnamed: 0,Context,Question,True Answer,Babbage Answer,Devinci Answer,Babbage Answer Precision,Babbage Answer Recall,Babbage Answer F1,Devinci Answer Precision,Devinci Answer Recall,Devinci Answer F1
61,The Lobund Institute grew out of pioneering re...,In what year did Lobund at Notre Dame become a...,1950,Lobund at Notre Dame became an Institute in 1958.,1950,0.0,0.0,0.0,1.0,1.0,1.0
41,Notre Dame is known for its competitive admiss...,Where does Notre Dame rank in terms of academi...,the top 10 to 15 in the nation,Notre Dame is known for its competitive admiss...,Among the top 10 to 15 in the nation for natio...,0.175,1.0,0.297872,0.583333,1.0,0.736842
169,The College of Science was established at the ...,How many undergrad students attend the College...,"over 1,200","1,200 undergraduate students attend the Colleg...","Over 1,200 undergraduates.",0.083333,0.5,0.142857,0.333333,0.5,0.4
144,A Science Hall was built in 1883 under the dir...,After which individual was the LaFortune Cente...,Joseph LaFortune,"Joseph LaFortune, an oil executive from Tulsa,...","Joseph LaFortune, an oil executive from Tulsa,...",0.125,0.5,0.2,0.125,0.5,0.2
138,"Because of its Catholic identity, a number of ...",What structure is found on the location of the...,Basilica of the Sacred Heart,"The Old College building, which became one of ...",The current Basilica of the Sacred Heart.,0.111111,0.4,0.173913,0.714286,1.0,0.833333


In [10]:
str_out = f"Scores for GPT-Babbage QA:\n"
str_out += f"Precision = {df['Babbage Answer Recall'].mean():.4f}, Recall = {df['Babbage Answer Precision'].mean():.4f}, F1 = {df['Babbage Answer F1'].mean():.4f}"
print(str_out)

Scores for GPT-Babbage QA:
Precision = 0.7285, Recall = 0.2491, F1 = 0.3175


In [11]:
str_out = f"Scores for GPT-Devinci QA:\n"
str_out += f"Precision = {df['Devinci Answer Recall'].mean():.4f}, Recall = {df['Devinci Answer Precision'].mean():.4f}, F1 = {df['Devinci Answer F1'].mean():.4f}"
print(str_out)

Scores for GPT-Devinci QA:
Precision = 0.8327, Recall = 0.6816, F1 = 0.7168
