In [None]:
import pandas as pd
import numpy as np
from itertools import combinations

In [None]:
# Columns removed from both pilot files right after loading; none of them is
# referenced by the cells below.
DROPPED_COLUMNS = ["my_sen", "else_sen", "nice_sen", "honest_sen"]

# Load each model's pilot results and tag every row with the model that
# produced it, so the two frames can be stacked and still told apart.
cdf = (
    pd.read_csv("gpt-4o-mini_pilotdata.csv")
    .drop(columns=DROPPED_COLUMNS)
    .assign(model='gpt')
)
mdf = (
    pd.read_csv("mistral-small_pilotdata.csv")
    .drop(columns=DROPPED_COLUMNS)
    .assign(model='mistral')
)

# ignore_index=True gives the stacked frame a fresh 0..N-1 index. Keeping the
# per-file indexes would leave every row label duplicated (once per model),
# which makes label-based lookups on df_all ambiguous.
df_all = pd.concat([cdf, mdf], ignore_index=True)
df_all

Unnamed: 0,i,score,my,else,nice,honest,model
0,0,4,3,3,3,3,gpt
1,1,3,3,3,3,3,gpt
2,2,4,3,3,3,3,gpt
3,3,3,3,3,3,3,gpt
4,4,5,3,3,3,3,gpt
...,...,...,...,...,...,...,...
95,95,2,3,2,3,3,mistral
96,96,2,3,2,3,2,mistral
97,97,2,2,2,2,2,mistral
98,98,2,2,2,3,2,mistral


In [20]:
# Long format: one row per (item i, model, prompt). `score` is carried along
# as an id column, the four prompt columns are stacked into `llm_score`, and
# `abs_dev` is the absolute gap between `llm_score` and `score`.
df_alt = pd.melt(
    df_all,
    id_vars=['i', 'model', 'score'],
    value_vars=['my', 'else', 'nice', 'honest'],
    var_name='prompt',
    value_name='llm_score',
).assign(abs_dev=lambda long: (long['llm_score'] - long['score']).abs())

In [None]:
def _paired_diff_std(scores, mask_a, mask_b, comparison, kind):
    """Summarise paired differences between two slices of the long score table.

    Both slices are indexed by item id ``i`` and only ids present in both are
    compared, so each difference is taken within the same item.

    Parameters
    ----------
    scores : DataFrame with columns ``i``, ``llm_score`` and ``abs_dev``.
    mask_a, mask_b : boolean Series selecting the two groups of rows.
    comparison : label stored in the ``comparison`` field of the result.
    kind : label stored in the ``type`` field of the result.

    Returns
    -------
    dict with the labels, the number of shared ids ``n`` and the standard
    deviation (pandas' default ``Series.std``) of the paired differences in
    ``llm_score`` and in ``abs_dev``; ``None`` when the groups share no id.
    """
    side_a = scores[mask_a].set_index('i')
    side_b = scores[mask_b].set_index('i')
    common = side_a.index.intersection(side_b.index)
    if len(common) == 0:
        return None
    # Single-step .loc[rows, column] instead of chained .loc[rows][column].
    diff_llm = side_a.loc[common, 'llm_score'] - side_b.loc[common, 'llm_score']
    diff_abs = side_a.loc[common, 'abs_dev'] - side_b.loc[common, 'abs_dev']
    return {
        'comparison': comparison,
        'type': kind,
        'n': len(common),
        'std_llm': diff_llm.std(),
        'std_abs_dev': diff_abs.std()
    }


# unique() keeps order of first appearance, which fixes the row order below.
models = df_alt['model'].unique()
prompts = df_alt['prompt'].unique()

results = []

# Same model, every unordered pair of prompts.
for model in models:
    in_model = df_alt['model'] == model
    for p1, p2 in combinations(prompts, 2):
        row = _paired_diff_std(
            df_alt,
            in_model & (df_alt['prompt'] == p1),
            in_model & (df_alt['prompt'] == p2),
            f'{model}: {p1} vs {p2}',
            'within-model',
        )
        if row is not None:
            results.append(row)

# Same prompt, every unordered pair of models. The pairs come from the data
# rather than from hard-coded model names, so a renamed or additional model is
# compared instead of being silently dropped.
for prompt in prompts:
    has_prompt = df_alt['prompt'] == prompt
    for m1, m2 in combinations(models, 2):
        row = _paired_diff_std(
            df_alt,
            has_prompt & (df_alt['model'] == m1),
            has_prompt & (df_alt['model'] == m2),
            f'{prompt}: {m1} vs {m2}',
            'within-prompt',
        )
        if row is not None:
            results.append(row)


df_results = pd.DataFrame(results)

df_results


Unnamed: 0,comparison,type,n,std_llm,std_abs_dev
0,gpt: my vs else,within-model,100,0.563539,0.580491
1,gpt: my vs nice,within-model,100,0.460566,0.541229
2,gpt: my vs honest,within-model,100,0.471405,0.510891
3,gpt: else vs nice,within-model,100,0.593228,0.751833
4,gpt: else vs honest,within-model,100,0.469687,0.469687
5,gpt: nice vs honest,within-model,100,0.522233,0.72223
6,mistral: my vs else,within-model,100,0.544857,0.559491
7,mistral: my vs nice,within-model,100,0.506922,0.530294
8,mistral: my vs honest,within-model,100,0.604361,0.597638
9,mistral: else vs nice,within-model,100,0.521362,0.70288


In [22]:
# Root-mean-square of the per-comparison std_abs_dev values: square each
# standard deviation, average with equal weight per comparison, take the root.
squared_std_abs_dev = df_results["std_abs_dev"] ** 2
np.sqrt(sum(squared_std_abs_dev) / len(squared_std_abs_dev))

0.5815665885947171

In [23]:
# Root-mean-square of the per-comparison std_llm values: square each standard
# deviation, average with equal weight per comparison, take the root.
squared_std_llm = df_results["std_llm"] ** 2
np.sqrt(sum(squared_std_llm) / len(squared_std_llm))

0.5303777003706933