#### Importing Libraries

In [1]:
import rouge_score
from glob import glob
import ast
import pandas as pd
from rouge_score import rouge_scorer
import numpy as np

#### ROUGE score calculator function

In [2]:
# Shared ROUGE scorer, built once and reused by score_calculator below.
# It is configured for 'rouge1' and 'rougeL' with stemming enabled; the code
# visible in this notebook only reads the 'rougeL' entry of its results.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
def score_calculator(ground_truth, gpt3response, rouge=None):
    """Return the best ROUGE-L scores of a prediction over several references.

    Parameters
    ----------
    ground_truth : str or list of str
        Either a Python-literal string encoding a list of reference texts
        (e.g. "['ref one', 'ref two']", parsed with ``ast.literal_eval``) or
        an already-parsed list of reference strings.
    gpt3response : str
        The predicted text to score. ``None`` / NaN (what pandas yields for an
        empty CSV cell) is scored as the empty string; any other non-string
        value is converted with ``str()``.
    rouge : object, optional
        Scorer exposing ``score(target, prediction)`` that returns a mapping
        with a ``'rougeL'`` entry carrying ``precision``, ``recall`` and
        ``fmeasure`` attributes. Defaults to the module-level ``scorer``.

    Returns
    -------
    list
        ``[max_f1, max_precision, max_recall]``. Each maximum is taken
        independently over the references, so the three values may come from
        different references. ``[0, 0, 0]`` when there are no references.
    """
    active_scorer = scorer if rouge is None else rouge

    if isinstance(ground_truth, str):
        list_of_ground_truths = ast.literal_eval(ground_truth)
    else:
        list_of_ground_truths = ground_truth

    # NaN is the only value that is not equal to itself; pandas uses it for
    # missing cells, and the tokenizer would fail on a float.
    if gpt3response is None or gpt3response != gpt3response:
        prediction = ''
    elif isinstance(gpt3response, str):
        prediction = gpt3response
    else:
        prediction = str(gpt3response)

    max_f1, max_precision, max_recall = 0, 0, 0
    for gt in list_of_ground_truths:
        # Score each (reference, prediction) pair once and reuse the result
        # for all three metrics.
        rouge_l = active_scorer.score(gt, prediction)['rougeL']
        max_f1 = max(rouge_l.fmeasure, max_f1)
        max_precision = max(rouge_l.precision, max_precision)
        max_recall = max(rouge_l.recall, max_recall)
    return [max_f1, max_precision, max_recall]

#### Data Reading and Stats tables

In [8]:
def _attach_rouge_columns(frame, prediction_col, prefix):
    """Score `prediction_col` against 'label' row by row and add the
    '<prefix>_f1', '<prefix>_prec', '<prefix>_recall' columns to `frame`.

    Returns the list of the three column names that were added.
    """
    score_cols = [prefix + '_f1', prefix + '_prec', prefix + '_recall']
    scores = [score_calculator(label, prediction)
              for label, prediction in zip(frame['label'], frame[prediction_col])]
    # Explicit column names keep the assignment valid even when `scores` is empty.
    frame[score_cols] = pd.DataFrame(scores, index=frame.index, columns=score_cols)
    return score_cols


def read_process_data(base_folder, flag=None):
    """Load every CSV in a folder, score it, and summarise by entity count.

    The CSVs are expected to contain the columns 'sent', 'entity', 'label'
    and 'key', plus 'GPT3 Responses' when ``flag == 'gpt'``.

    Parameters
    ----------
    base_folder : str
        Folder whose ``*.csv`` files are concatenated.
    flag : str, optional
        When equal to ``'gpt'``, the 'GPT3 Responses' column is scored as
        well and its metrics are included in both outputs.

    Returns
    -------
    (grp, deduped_df) : tuple of DataFrame
        ``deduped_df`` holds the de-duplicated rows with the score columns,
        'sent_len' (number of single-space-separated tokens in 'sent'),
        'num_ent' (distinct entities per sentence) and 'num_ent1' (the same
        count as a string, bucketed to '5 or more' from 5 upwards).
        ``grp`` groups ``deduped_df`` by 'num_ent1' and reports the number of
        distinct sentences plus the mean of every score column and of
        'sent_len'.

    Raises
    ------
    ValueError
        If no CSV file is found in ``base_folder``.
    """
    # Sorted so that row order (and therefore which duplicate is kept) does
    # not depend on filesystem enumeration order. recursive=True is kept so
    # that a base_folder containing '**' still behaves as before.
    csv_paths = sorted(glob(base_folder + '/*.csv', recursive=True))
    if not csv_paths:
        raise ValueError('No CSV files found in {!r}'.format(base_folder))

    frame = pd.concat(
        [pd.read_csv(path, index_col=None, header=0) for path in csv_paths],
        axis=0, ignore_index=True)

    # Rows with an empty reference list cannot be scored. Copy so the column
    # assignments below act on this frame rather than on a view of the
    # concatenated one.
    frame = frame[frame['label'] != "[]"].copy()

    # Assign the filled column back: a chained `frame['key'].fillna(...,
    # inplace=True)` is not guaranteed to modify `frame`.
    frame['key'] = frame['key'].fillna('')
    baseline_cols = _attach_rouge_columns(frame, 'key', 'baseline')

    keep_cols = ['sent', 'entity', 'label', 'key']
    metric_cols = baseline_cols
    if flag == 'gpt':
        # Missing responses are scored as empty predictions.
        frame['GPT3 Responses'] = frame['GPT3 Responses'].fillna('')
        gpt_cols = _attach_rouge_columns(frame, 'GPT3 Responses', 'gpt3')
        keep_cols = keep_cols + ['GPT3 Responses'] + gpt_cols
        metric_cols = gpt_cols + baseline_cols

    deduped_df = frame[keep_cols + baseline_cols].drop_duplicates()
    deduped_df = deduped_df.assign(
        sent_len=deduped_df['sent'].apply(lambda s: len(s.split(' '))),
        # Distinct entities per sentence, broadcast back onto every row.
        num_ent=deduped_df.groupby('sent')['entity'].transform('nunique'),
    )
    deduped_df['num_ent1'] = deduped_df['num_ent'].apply(
        lambda n: '5 or more' if n >= 5 else str(n))

    # Same column order as before: sentence count, metrics, sentence length.
    agg_spec = {'sent': 'nunique'}
    agg_spec.update({col: 'mean' for col in metric_cols})
    agg_spec['sent_len'] = 'mean'
    grp = deduped_df.groupby(['num_ent1']).agg(agg_spec).reset_index()
    return grp, deduped_df

In [9]:
# Score the CSVs under ./GPT3Responses with flag='gpt', so the GPT-3 response
# columns are scored in addition to the baseline ('key') columns.
base_folder = './GPT3Responses'
group_data,frame = read_process_data(base_folder,flag = 'gpt')
# Display the per-entity-count summary table.
group_data

Unnamed: 0,num_ent1,sent,gpt3_f1,gpt3_prec,gpt3_recall,baseline_f1,baseline_prec,baseline_recall,sent_len
0,1,2567,0.358512,0.418636,0.372663,0.314365,0.445057,0.284406,27.676541
1,2,1195,0.413787,0.447036,0.47125,0.35663,0.479905,0.332139,29.016579
2,3,557,0.448314,0.471564,0.53746,0.35272,0.469555,0.327552,30.717921
3,4,103,0.341414,0.363681,0.422724,0.303974,0.408391,0.289925,42.596529
4,5 or more,55,0.409523,0.43513,0.479771,0.324413,0.485859,0.276166,53.848066


In [10]:
# Summary statistics of the per-row baseline and GPT-3 score columns.
frame[['baseline_f1','baseline_prec','baseline_recall','gpt3_f1','gpt3_prec','gpt3_recall']].describe()

Unnamed: 0,baseline_f1,baseline_prec,baseline_recall,gpt3_f1,gpt3_prec,gpt3_recall
count,8261.0,8261.0,8261.0,8261.0,8261.0,8261.0
mean,0.336379,0.461472,0.309335,0.397629,0.43725,0.448669
std,0.32428,0.421121,0.329171,0.313867,0.362869,0.357473
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.142857,0.125,0.142857
50%,0.285714,0.4,0.25,0.363636,0.333333,0.4
75%,0.571429,1.0,0.5,0.625,0.75,0.75
max,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Score the CSVs under ./Output with baseline scoring only (flag=None).
# NOTE: this rebinds base_folder, group_data and frame from the earlier
# GPT-3 cell, so later cells see the ./Output results.
base_folder = "./Output"
group_data,frame = read_process_data(base_folder,flag = None)
# Display the per-entity-count summary table.
group_data

In [None]:
# Summary statistics of the per-row baseline score columns for whichever
# `frame` was assigned most recently.
frame[['baseline_f1','baseline_prec','baseline_recall']].describe()