In [15]:
import pandas as pd
import json

# Read the CSV file
df = pd.read_csv('./outputs/gpt35/gpt35_outputs.csv')

# get unique PMID values in a list
pmids = df['PMID'].unique()

column_names = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer"]

metrics = {}
for col in column_names:
    column_dffs = []
    for pmid in pmids:
        # Get the rows for the current PMID
        pmid_rows = df[df['PMID'] == pmid]
        # get the 'spin' answer
        spin_answer = pmid_rows.loc[pmid_rows['abstract_type'] == 'spin', col].values[0]
        # get the 'no spin' answer
        no_spin_answer = pmid_rows.loc[pmid_rows['abstract_type'] == 'no_spin', col].values[0]
        # subtract the 'spin' answer from the 'no spin' answer
        diff = no_spin_answer - spin_answer
        
        column_dffs.append(diff)

    # Average all the differences for each column
    column_avg = diff.mean()

    metrics[f"{col}_avg"] = column_avg
    print(f"Average differences for '{col}':")
    print(column_avg)

# Average across all columns
overall_avg = sum(metrics.values()) / len(metrics)
metrics['overall_avg'] = overall_avg

print(f"\nOverall average difference across all answers:")
print(overall_avg)

# Save the results to a JSON file
with open('./outputs/gpt35/gpt35_differences_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=4)


Average differences for 'benefit_answer':
-6.0
Average differences for 'rigor_answer':
-2.0
Average differences for 'importance_answer':
0.0
Average differences for 'full_text_answer':
-3.0
Average differences for 'another_trial_answer':
-5.0

Overall average difference across all answers:
-3.2


In [None]:
import pandas as pd
import json

# Read the CSV file
df = pd.read_csv('./eval_outputs/claude_3.5-haiku/claude_3.5-haiku_interpretation_outputs.csv')

column_names = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer"]

metrics = {}
for col in column_names:
    # for each column, get the average of spin and no_spin answers
    spin_avg = df[df['abstract_type'] == 'spin'][col].mean()
    no_spin_avg = df[df['abstract_type'] == 'no_spin'][col].mean()
    
    print(f"Average for '{col}' (spin): {spin_avg}")
    print(f"Average for '{col}' (no_spin): {no_spin_avg}")

    diff = spin_avg - no_spin_avg
    metrics[f"{col}_diff"] = diff
    print(f"Difference for '{col}':")
    print(diff)

# Average across all columns
overall_avg = sum(metrics.values()) / len(metrics)
metrics['overall_avg'] = overall_avg

print(f"\nOverall average across all answers:")
print(overall_avg)

# Save the results to a JSON file
with open('./eval_outputs/claude_3.5-haiku/claude_3.5-haiku_mean_differences_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=4)

Average for 'benefit_answer' (spin): 5.066666666666666
Average for 'benefit_answer' (no_spin): 2.566666666666667
Difference for 'benefit_answer':
2.4999999999999996
Average for 'rigor_answer' (spin): 7.733333333333333
Average for 'rigor_answer' (no_spin): 7.9
Difference for 'rigor_answer':
-0.16666666666666696
Average for 'importance_answer' (spin): 6.866666666666666
Average for 'importance_answer' (no_spin): 7.5
Difference for 'importance_answer':
-0.6333333333333337
Average for 'full_text_answer' (spin): 6.466666666666667
Average for 'full_text_answer' (no_spin): 3.2333333333333334
Difference for 'full_text_answer':
3.2333333333333334
Average for 'another_trial_answer' (spin): 5.833333333333333
Average for 'another_trial_answer' (no_spin): 2.966666666666667
Difference for 'another_trial_answer':
2.8666666666666663

Overall average across all answers:
1.5599999999999996
