In [35]:
import json
import pandas as pd
import numpy as np

In [75]:
# Locations of the JSON eval logs read below (one path variable per model run).
gpt_4_eval_path = "logs-json/2024-11-23T00-27-58-08-00_theory-of-mind_fZviDx4z7a8UaPgMmCNmpD.json"
claude_3_sonnet_eval_path = "logs-json/2024-11-23T10-44-13-08-00_theory-of-mind_Ehd35cmCvqzUJmLrKpJTXi.json"
gemini_1_5_flash_eval_path = "logs-json/2024-11-23T10-58-18-08-00_theory-of-mind_3xrMbdT8mbuDe78pZj6n4h.json"
grok_beta_eval_path = "logs-json/2024-11-23T11-01-41-08-00_theory-of-mind_GCZsTqfGL8YygNvSP9vZSD.json"
grok_vision_beta_eval_path = "logs-json/2024-11-23T11-12-29-08-00_theory-of-mind_fBnWBgug2x4YvmxSWWep5r.json"


def _load_eval_log(path):
    """Read the JSON document at ``path`` and return it as a ``dict``.

    The file is opened with an explicit UTF-8 encoding. Without it, ``open``
    falls back to the platform's locale encoding, which can fail or mis-decode
    non-ASCII text in the logs on some systems.

    Raises whatever ``open`` / ``json.load`` raise (e.g. ``FileNotFoundError``,
    ``json.JSONDecodeError``).
    """
    with open(path, "r", encoding="utf-8") as f:
        return dict(json.load(f))


gpt_4_eval_data = _load_eval_log(gpt_4_eval_path)
claude_3_sonnet_eval_data = _load_eval_log(claude_3_sonnet_eval_path)
gemini_1_5_flash_eval_data = _load_eval_log(gemini_1_5_flash_eval_path)
grok_beta_eval_data = _load_eval_log(grok_beta_eval_path)
grok_vision_beta_eval_data = _load_eval_log(grok_vision_beta_eval_path)


In [73]:
def get_run_data(eval_data):
    """Summarise one parsed eval log as a single-row DataFrame.

    Parameters
    ----------
    eval_data : dict
        Parsed eval log. The function reads the dataset name, model and run id
        from ``eval``; the ``template`` param of the first step in ``plan``;
        the accuracy and stderr metrics of the first entry in
        ``results['scores']``; and, from ``stats``, the start/end timestamps
        plus the token usage stored under the model's name.

    Returns
    -------
    pandas.DataFrame
        One row holding the extracted fields, followed by ``upper_bound`` and
        ``lower_bound`` computed as ``accuracy +/- 1.96 * stderr``.

    Raises
    ------
    KeyError, IndexError
        If any of the nested fields listed above is missing.
    """
    eval_info = eval_data['eval']
    dataset_name = eval_info['dataset']['name']
    model_name = eval_info['model']
    template = eval_data['plan']['steps'][0]['params']['template']

    metrics = eval_data['results']['scores'][0]['metrics']
    accuracy_value = metrics['accuracy']['value']
    stderr_value = metrics['stderr']['value']

    stats = eval_data['stats']
    started_at = stats['started_at']
    completed_at = stats['completed_at']
    # Token usage is keyed by the model identifier taken from the 'eval' section.
    usage = stats['model_usage'][model_name]
    input_tokens = usage['input_tokens']
    output_tokens = usage['output_tokens']
    total_tokens = usage['total_tokens']

    # Key order here fixes the column order of the resulting frame.
    row = {
        'run_id': eval_info['run_id'],
        'dataset': dataset_name,
        'model': model_name,
        'prompt': template,
        'accuracy': accuracy_value,
        'stderr': stderr_value,
        'run_start': started_at,
        'run_end': completed_at,
        'run_input_tokens': input_tokens,
        'run_output_tokens': output_tokens,
        'run_total_tokens': total_tokens,
    }

    summary = pd.DataFrame([row])
    margin = 1.96 * summary['stderr']
    summary['upper_bound'] = summary['accuracy'] + margin
    summary['lower_bound'] = summary['accuracy'] - margin

    return summary

# Build one single-row summary per loaded eval log, then stack them.
eval_files = [
    gpt_4_eval_data,
    claude_3_sonnet_eval_data,
    gemini_1_5_flash_eval_data,
    grok_beta_eval_data,
    grok_vision_beta_eval_data
]

run_data_frames = [get_run_data(eval_data) for eval_data in eval_files]

# Keep the per-model names available; order matches eval_files above.
(
    gpt_4_run_data_df,
    claude_3_sonnet_run_data_df,
    gemini_1_5_flash_run_data_df,
    grok_beta_run_data_df,
    grok_vision_beta_run_data_df,
) = run_data_frames

all_run_data_df = pd.concat(run_data_frames, ignore_index=True)

# Persist the combined table as a JSON list of row records.
all_run_data_df.to_json('all_run_data.json', orient='records')

all_run_data_df



Unnamed: 0,run_id,dataset,model,prompt,accuracy,stderr,run_start,run_end,run_input_tokens,run_output_tokens,run_total_tokens,upper_bound,lower_bound
0,GFJUCeSDrcsJMzch5HZ5an,theory_of_mind,openai/gpt-4,{prompt},0.79,0.040936,2024-11-23T00:27:58-08:00,2024-11-23T00:40:23-08:00,75395,19493,94888,0.870235,0.709765
1,7sTJLizK2fWUmunLGZ45Bo,theory_of_mind,anthropic/claude-3-5-sonnet-latest,{prompt},0.81,0.039428,2024-11-23T10:44:13-08:00,2024-11-23T12:07:27-08:00,113609,40982,154591,0.887278,0.732722
2,XYMM64LW4txoCvQgJBoAEt,theory_of_mind,google/gemini-1.5-flash-001,{prompt},0.67,0.047258,2024-11-23T10:58:18-08:00,2024-11-23T11:36:30-08:00,116382,39693,156075,0.762626,0.577374
3,9CZwNdPp8pMQdy5cWZeLiP,theory_of_mind,grok/grok-beta,{prompt},0.82,0.038612,2024-11-23T11:01:41-08:00,2024-11-23T11:03:15-08:00,91260,33541,124801,0.89568,0.74432
4,bzJFKKPmkGZXmbDQroL6LQ,theory_of_mind,grok/grok-vision-beta,{prompt},0.8,0.040202,2024-11-23T11:12:29-08:00,2024-11-23T11:13:45-08:00,87505,30140,117645,0.878795,0.721205


In [74]:
def get_samples_data(samples, run_id):
    """Flatten the per-sample records of one run into a DataFrame.

    Parameters
    ----------
    samples : iterable of dict
        Sample records. Each must provide ``id``, ``epoch``, ``target``,
        ``input[0]['content']``, ``output['choices'][0]['message']['content']``
        and ``scores['model_graded_fact']['value']``.
    run_id : Any
        Identifier stored in the ``run_id`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``run_id, sample_id, epoch, input, target, output, score``
        plus three derived columns:

        * ``score_binary`` - 1 when ``score == 'C'``, otherwise 0;
        * ``cumulative_score`` - running sum of ``score_binary``;
        * ``cumulative_score_percentage`` - running sum divided by the number
          of rows seen so far (a running accuracy).

        An empty ``samples`` yields an empty frame with all of these columns.

    Raises
    ------
    KeyError, IndexError
        If a sample lacks one of the fields listed above.
    """
    columns = ['run_id', 'sample_id', 'epoch', 'input', 'target', 'output', 'score']

    records = [
        {
            'run_id': run_id,
            'sample_id': sample['id'],
            'epoch': sample['epoch'],
            'input': sample['input'][0]['content'],
            'target': sample['target'],
            'output': sample['output']['choices'][0]['message']['content'],
            'score': sample['scores']['model_graded_fact']['value'],
        }
        for sample in samples
    ]

    # Passing `columns` keeps the schema stable even when there are no records.
    df = pd.DataFrame(records, columns=columns)
    df['score_binary'] = (df['score'] == 'C').astype('int64')
    df['cumulative_score'] = df['score_binary'].cumsum()
    # Divide by the running row count, not by sample_id: the ids are labels and
    # only coincide with the count when they happen to be exactly 1..n in order.
    df['cumulative_score_percentage'] = df['cumulative_score'] / np.arange(1, len(df) + 1)

    return df

# One per-sample frame per loaded eval log, in the same model order used elsewhere.
(
    gpt_4_samples_data_df,
    claude_3_sonnet_samples_data_df,
    gemini_1_5_flash_samples_data_df,
    grok_beta_samples_data_df,
    grok_vision_beta_samples_data_df,
) = (
    get_samples_data(eval_data['samples'], eval_data['eval']['run_id'])
    for eval_data in (
        gpt_4_eval_data,
        claude_3_sonnet_eval_data,
        gemini_1_5_flash_eval_data,
        grok_beta_eval_data,
        grok_vision_beta_eval_data,
    )
)

# Stack the per-run frames into one table with a fresh 0..n-1 index.
all_samples_data_df = pd.concat(
    [
        gpt_4_samples_data_df,
        claude_3_sonnet_samples_data_df,
        gemini_1_5_flash_samples_data_df,
        grok_beta_samples_data_df,
        grok_vision_beta_samples_data_df,
    ],
    ignore_index=True,
)

# Persist the combined table as a JSON list of row records.
all_samples_data_df.to_json('all_samples_data.json', orient='records')

all_samples_data_df

Unnamed: 0,run_id,sample_id,epoch,input,target,output,score,score_binary,cumulative_score,cumulative_score_percentage
0,GFJUCeSDrcsJMzch5HZ5an,1,1,Jackson entered the hall. Chloe entered the ha...,bathtub,ANSWER: The boots were in the bathtub at the b...,C,1,1,1.000000
1,GFJUCeSDrcsJMzch5HZ5an,2,1,Jackson entered the hall. Chloe entered the ha...,pantry,ANSWER: Chloe will look for the boots in the p...,C,1,2,1.000000
2,GFJUCeSDrcsJMzch5HZ5an,3,1,Jackson entered the hall. Chloe entered the ha...,bathtub,ANSWER: Chloe would think that Jackson is sear...,I,0,2,0.666667
3,GFJUCeSDrcsJMzch5HZ5an,4,1,Jackson entered the hall. Chloe entered the ha...,pantry,ANSWER: The boots are really in the pantry.,C,1,3,0.750000
4,GFJUCeSDrcsJMzch5HZ5an,5,1,Jackson entered the hall. Chloe entered the ha...,bathtub,ANSWER: Jackson will look for the boots in the...,I,0,3,0.600000
...,...,...,...,...,...,...,...,...,...,...
495,bzJFKKPmkGZXmbDQroL6LQ,96,1,Ethan entered the sunroom. Mia entered the sun...,box,ANSWER: Ethan would think that Mia searches fo...,C,1,77,0.802083
496,bzJFKKPmkGZXmbDQroL6LQ,97,1,Lily entered the patio. Logan entered the pati...,crate,"ANSWER: At the beginning, the tie was in the c...",C,1,78,0.804124
497,bzJFKKPmkGZXmbDQroL6LQ,98,1,Lily entered the patio. Logan entered the pati...,bucket,ANSWER: Lily will look for the tie in the buck...,C,1,79,0.806122
498,bzJFKKPmkGZXmbDQroL6LQ,99,1,Lily entered the patio. Logan entered the pati...,bucket,ANSWER: Lily would likely think that Abigail w...,I,0,79,0.797980


In [25]:
# Re-display the combined per-run summary table.
all_run_data_df

Unnamed: 0,run_id,model,prompt,accuracy,stderr,run_start,run_end,run_input_tokens,run_output_tokens,run_total_tokens,upper_bound,lower_bound
0,GFJUCeSDrcsJMzch5HZ5an,openai/gpt-4,{prompt},0.79,0.040936,2024-11-23T00:27:58-08:00,2024-11-23T00:40:23-08:00,75395,19493,94888,0.870235,0.709765
1,7sTJLizK2fWUmunLGZ45Bo,anthropic/claude-3-5-sonnet-latest,{prompt},0.81,0.039428,2024-11-23T10:44:13-08:00,2024-11-23T12:07:27-08:00,113609,40982,154591,0.887278,0.732722
2,XYMM64LW4txoCvQgJBoAEt,google/gemini-1.5-flash-001,{prompt},0.67,0.047258,2024-11-23T10:58:18-08:00,2024-11-23T11:36:30-08:00,116382,39693,156075,0.762626,0.577374
3,9CZwNdPp8pMQdy5cWZeLiP,grok/grok-beta,{prompt},0.82,0.038612,2024-11-23T11:01:41-08:00,2024-11-23T11:03:15-08:00,91260,33541,124801,0.89568,0.74432
4,bzJFKKPmkGZXmbDQroL6LQ,grok/grok-vision-beta,{prompt},0.8,0.040202,2024-11-23T11:12:29-08:00,2024-11-23T11:13:45-08:00,87505,30140,117645,0.878795,0.721205


In [66]:
"""Unpaired_analysis"""

def compare_models(df, model1, model2):
    """Unpaired comparison of two models' accuracies.

    Treats the two accuracy estimates as independent: the standard error of
    the difference is ``sqrt(stderr1**2 + stderr2**2)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``model``, ``accuracy`` and ``stderr`` columns. When a model
        matches several rows, the first match is used.
    model1, model2 : Any
        Values matched against the ``model`` column; the difference is
        ``model1 - model2``.

    Returns
    -------
    dict
        ``diff_mean_accuracy``, ``diff_stderr``, ``upper_bound`` and
        ``lower_bound`` (difference +/- 1.96 * stderr), ``z_score``, and four
        two-sided significance flags using the cut-offs 1.645, 1.96, 2.58
        and 3.29.

    Raises
    ------
    IndexError
        If either model has no row in ``df``.
    """
    def _first_value(model, column):
        # First value of `column` among the rows belonging to `model`.
        return df[df['model'] == model][column].iloc[0]

    accuracy_1 = _first_value(model1, 'accuracy')
    accuracy_2 = _first_value(model2, 'accuracy')
    stderr_1 = _first_value(model1, 'stderr')
    stderr_2 = _first_value(model2, 'stderr')

    accuracy_gap = accuracy_1 - accuracy_2
    gap_stderr = np.sqrt(stderr_1**2 + stderr_2**2)
    margin = 1.96 * gap_stderr
    z = accuracy_gap / gap_stderr

    result = {
        'diff_mean_accuracy': accuracy_gap,
        'diff_stderr': gap_stderr,
        'upper_bound': accuracy_gap + margin,
        'lower_bound': accuracy_gap - margin,
        'z_score': z,
    }

    # Two-sided test: significant when |z| exceeds the critical value.
    critical_values = (
        ('is_significant_at_90_confidence', 1.645),
        ('is_significant_at_95_confidence', 1.96),
        ('is_significant_at_99_confidence', 2.58),
        ('is_significant_at_99_9_confidence', 3.29),
    )
    for flag_name, cutoff in critical_values:
        result[flag_name] = abs(z) > cutoff

    return result


# Example usage (compare_models returns a dict, so read fields by key):
# result = compare_models(all_run_data_df, 'openai/gpt-4', 'anthropic/claude-3-5-sonnet-latest'); result['z_score']

compare_models(all_run_data_df, 'openai/gpt-4', 'anthropic/claude-3-5-sonnet-latest')

{'diff_mean_accuracy': np.float64(-0.020000000000000018),
 'diff_stderr': np.float64(0.0568357548582143),
 'upper_bound': np.float64(0.09139807952210001),
 'lower_bound': np.float64(-0.13139807952210003),
 'z_score': np.float64(-0.35189116516342844),
 'is_significant_at_90_confidence': np.False_,
 'is_significant_at_95_confidence': np.False_,
 'is_significant_at_99_confidence': np.False_,
 'is_significant_at_99_9_confidence': np.False_}

In [45]:
# Unpaired comparison: openai/gpt-4 minus google/gemini-1.5-flash-001.
compare_models(all_run_data_df, 'openai/gpt-4', 'google/gemini-1.5-flash-001')


{'diff_mean_accuracy': np.float64(0.12),
 'upper_bound': np.float64(0.24254453735831571),
 'lower_bound': np.float64(-0.002544537358315724),
 'z_score': np.float64(1.919302198777607),
 'is_significant_at_90_confidence': np.True_,
 'is_significant_at_95_confidence': np.False_,
 'is_significant_at_99_confidence': np.False_,
 'is_significant_at_99_9_confidence': np.False_}

In [68]:
"""Paired_analysis"""
def compare_models_paired(df,df_samples, model1, model2, n_samples=100):
    """Paired comparison of two models scored on the same samples.

    The accuracy difference is taken from the run-level table ``df``. Its
    standard error is estimated from the per-sample score differences of the
    two runs, joined on ``sample_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Run-level table with ``model``, ``accuracy`` and ``run_id`` columns.
        When a model matches several rows, the first match is used.
    df_samples : pandas.DataFrame
        Sample-level table with ``run_id``, ``sample_id`` and ``score_binary``.
    model1, model2 : Any
        Values matched against ``df['model']``; every difference is
        ``model1 - model2``.
    n_samples : int, default 100
        Sample count used in the variance estimate
        (``sum_sq / (n_samples - 1) / n_samples``). It is NOT checked against
        the number of rows produced by the join, so pass the real number of
        paired samples when it is not 100.

    Returns
    -------
    dict
        ``diff_mean_accuracy``, ``diff_stderr``, ``upper_bound`` and
        ``lower_bound`` (difference +/- 1.96 * stderr), ``z_score``, and four
        two-sided significance flags using the cut-offs 1.645, 1.96, 2.58
        and 3.29.

    Raises
    ------
    IndexError
        If either model has no row in ``df``.
    """
    model1_mean_accuracy = df[df['model'] == model1]['accuracy'].iloc[0]
    model2_mean_accuracy = df[df['model'] == model2]['accuracy'].iloc[0]

    diff_mean_accuracy = model1_mean_accuracy - model2_mean_accuracy

    # Resolve each run id from its own model. Selecting both with isin() and
    # taking .unique() returns ids in df row order, which swaps the runs (and
    # flips the sign of every per-sample difference) whenever model2 is listed
    # before model1 in df.
    run1_id = df[df['model'] == model1]['run_id'].iloc[0]
    run2_id = df[df['model'] == model2]['run_id'].iloc[0]

    sample_columns = ['run_id', 'sample_id', 'score_binary']
    run1_df = df_samples[df_samples['run_id'] == run1_id][sample_columns]
    run2_df = df_samples[df_samples['run_id'] == run2_id][sample_columns]

    # Inner join: only samples scored in both runs contribute.
    merged_df = run1_df.merge(run2_df, on='sample_id', suffixes=('_run1', '_run2'))

    score_diff = merged_df['score_binary_run1'] - merged_df['score_binary_run2']
    sum_squared_diffs = ((score_diff - diff_mean_accuracy) ** 2).sum()

    multiplier = 1/(n_samples - 1)

    # Standard error of the mean paired difference.
    paired_se = ((sum_squared_diffs * multiplier) / n_samples)**0.5

    upper_bound = diff_mean_accuracy + 1.96 * paired_se
    lower_bound = diff_mean_accuracy - 1.96 * paired_se

    z_score = diff_mean_accuracy / paired_se

    # Two-sided test: significant when |z| exceeds the critical value.
    is_significant_at_90_confidence = abs(z_score) > 1.645
    is_significant_at_95_confidence = abs(z_score) > 1.96
    is_significant_at_99_confidence = abs(z_score) > 2.58
    is_significant_at_99_9_confidence = abs(z_score) > 3.29

    return {'diff_mean_accuracy': diff_mean_accuracy,
            'diff_stderr': paired_se,
            'upper_bound': upper_bound,
            'lower_bound': lower_bound,
            'z_score': z_score,
            'is_significant_at_90_confidence': is_significant_at_90_confidence,
            'is_significant_at_95_confidence': is_significant_at_95_confidence,
            'is_significant_at_99_confidence': is_significant_at_99_confidence,
            'is_significant_at_99_9_confidence': is_significant_at_99_9_confidence}

    

# Paired comparison (runs joined on sample_id): anthropic/claude-3-5-sonnet-latest minus google/gemini-1.5-flash-001.
compare_models_paired(all_run_data_df, all_samples_data_df, 'anthropic/claude-3-5-sonnet-latest', 'google/gemini-1.5-flash-001')





{'diff_mean_accuracy': np.float64(0.14),
 'diff_stderr': np.float64(0.05321957564959752),
 'upper_bound': np.float64(0.24431036827321115),
 'lower_bound': np.float64(0.03568963172678888),
 'z_score': np.float64(2.6306109789708327),
 'is_significant_at_90_confidence': np.True_,
 'is_significant_at_95_confidence': np.True_,
 'is_significant_at_99_confidence': np.True_,
 'is_significant_at_99_9_confidence': np.False_}