In [1]:
import json
import pandas as pd
import numpy as np

# Load the human-evaluation annotations; the first CSV column is used as the row index.
he_data_path = "human_evaluation/he_data.csv"
ann_df = pd.read_csv(he_data_path, index_col=0)

# Names of the five annotated dimension columns: Dim1 ... Dim5.
dims = [f"Dim{k}" for k in range(1, 6)]

# Preview the first three annotation rows.
ann_df.head(3)

Unnamed: 0,sample_index,Dim1,Dim2,Dim3,Dim4,Dim5,annotator,ED_index,chat_templates,gen_targets,prevs,emotions,gens,chat_cut,tidy_chat,model
0,0,70,82,49,44,1,1,343,"<|system|>\nYou are a friendly assistant, who ...",so young and ready to conquer the world,"It is! In it, she's just standing with a huge...",nostalgic,I can imagine how happy she must have been on ...,I found an old picture of when my oldest daugh...,Speaker: I found an old picture of when my old...,zephyr_base
1,1,70,57,70,70,1,1,274,"<|system|>\nYou are a friendly assistant, who ...","Wow, that is terrible. I hope that you are abl...","Huge dent, I can't believe someone could do th...",furious,That's really unfortunate. I'm sorry to hear t...,Someone hit my car while I was shopping. I am ...,Speaker: Someone hit my car while I was shoppi...,zephyr_base
2,2,62,11,24,14,1,1,2061,"<|system|>\nYou are a friendly assistant, who ...",It's good that you're being more proactive abo...,I'll try to do my best. i think i will also ne...,proud,It's always a good idea to consult with a doct...,I stress eat a lot. I decided i need to contro...,Speaker: I stress eat a lot. I decided i need ...,zephyr_base


In [2]:
# Average every dimension over all annotation rows that share a (model, ED_index) pair.
per_sample_groups = ann_df.groupby(["model", "ED_index"])
ann_mean_df = per_sample_groups[dims].mean().reset_index()

# Sample-level metadata (emotion, dialogue, generated text), de-duplicated across annotation rows.
metadata_columns = ["emotions", "model", "tidy_chat", "ED_index", "gens"]
sample_data_df = ann_df[metadata_columns].drop_duplicates()

# Attach the metadata to the averaged scores.
res_df = ann_mean_df.merge(sample_data_df, on=["ED_index", "model"])

# Per-model summary statistics of the averaged scores, statistics as rows.
model_summary = res_df.drop(columns='ED_index').groupby("model").describe()
model_summary.round(2).T

Unnamed: 0,model,SFT,human_targets,zephyr_base,zephyr_base_output_limit,zephyr_dpo
Dim1,count,135.0,135.0,135.0,135.0,135.0
Dim1,mean,73.33,71.89,80.38,74.31,74.91
Dim1,std,15.68,16.24,12.56,18.31,16.02
Dim1,min,23.67,14.33,27.67,5.67,35.0
Dim1,25%,64.33,64.33,73.83,64.67,64.33
Dim1,50%,77.33,75.0,82.67,78.67,76.33
Dim1,75%,83.67,82.5,89.0,86.83,87.33
Dim1,max,100.0,98.33,100.0,100.0,100.0
Dim2,count,135.0,135.0,135.0,135.0,135.0
Dim2,mean,63.06,60.27,75.96,68.23,63.11


### Effect sizes of the score differences

With 135 samples per model, the sample sizes are large enough for the central limit theorem to justify treating the sample means as approximately normally distributed.

In [3]:
def cohend(d1, d2):
    """Return Cohen's d for two samples using the pooled standard deviation.

    Parameters
    ----------
    d1, d2 : sequences of numbers (anything accepted by ``len``,
        ``np.mean`` and ``np.var``).

    Returns
    -------
    float
        ``(mean(d1) - mean(d2)) / s_pooled``, where ``s_pooled`` combines the
        two unbiased (``ddof=1``) sample variances weighted by their degrees
        of freedom. Positive when ``d1`` has the larger mean.
    """
    n1, n2 = len(d1), len(d2)
    var1 = np.var(d1, ddof=1)
    var2 = np.var(d2, ddof=1)
    # Pooled variance: degrees-of-freedom-weighted average of both variances.
    pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
    mean_diff = np.mean(d1) - np.mean(d2)
    return mean_diff / np.sqrt(pooled_var)

In [4]:
from scipy.stats import ttest_rel, f_oneway


models = ['zephyr_base', 'zephyr_dpo', 'zephyr_base_output_limit', 'SFT', 'human_targets']

# ttest_rel pairs observations by POSITION, so the two score vectors must be
# aligned on the same samples. Build one wide table per dimension
# (rows = ED_index, columns = model) so the pairing is explicit instead of
# relying on the row order of res_df. pivot raises if a (ED_index, model)
# pair occurs more than once, which would also invalidate the pairing.
scores_by_dim = {
    dim: res_df.pivot(index="ED_index", columns="model", values=dim)
    for dim in dims[:-1]  # the last entry of dims is excluded from the t-tests
}

t_test_sig_res = []

for model_orig in models:
    for dim in dims[:-1]:
        dim_scores = scores_by_dim[dim]
        for model in models:
            if model == model_orig:
                # A model compared with itself has all-zero differences, so the
                # test is undefined. Record NaN (removed by dropna below) so the
                # row numbering of the result table stays stable.
                t_test_sig_res.append([dim, model, model_orig, np.nan, np.nan])
                continue

            # Keep only samples scored for both models, aligned on ED_index.
            paired = dim_scores[[model, model_orig]].dropna()
            ann_A = paired[model]
            ann_B = paired[model_orig]

            # One-sided paired test: H1 is "model scores higher than model_orig".
            p_res = ttest_rel(ann_A, ann_B, alternative='greater').pvalue
            effsize_d = cohend(ann_A, ann_B)
            t_test_sig_res.append([dim, model, model_orig, p_res, effsize_d])

effsize_df = pd.DataFrame(t_test_sig_res, columns=['dim', 'model_greater_score', 'model', 'p', 'effect_size']).dropna()

# Show comparisons with p < 0.05, largest effect first within each dimension.
# NOTE(review): the p-values are not corrected for multiple comparisons.
effsize_df[effsize_df.p < 0.05].sort_values(["dim", "effect_size"], ascending=False).round({'p':5, 'effect_size': 2})

Unnamed: 0,dim,model_greater_score,model,p,effect_size
75,Dim4,zephyr_base,SFT,1e-05,0.53
95,Dim4,zephyr_base,human_targets,4e-05,0.47
55,Dim4,zephyr_base,zephyr_base_output_limit,5e-05,0.45
35,Dim4,zephyr_base,zephyr_dpo,0.00962,0.3
76,Dim4,zephyr_dpo,SFT,0.02322,0.22
90,Dim3,zephyr_base,human_targets,3e-05,0.48
70,Dim3,zephyr_base,SFT,0.00013,0.42
91,Dim3,zephyr_dpo,human_targets,0.00491,0.32
50,Dim3,zephyr_base,zephyr_base_output_limit,0.00372,0.3
71,Dim3,zephyr_dpo,SFT,0.01041,0.26


### Relation of length, emotion and the final score

In [5]:
from scipy.stats import pearsonr
from functools import reduce


# Copy of res_df with an extra 'len' column: character count of each generated text.
res_x = res_df.assign(len=res_df.gens.str.len())

In [6]:
# Number of dimensions (Dim1 .. Dim4) correlated with the generation length.
continuous_dim_count = 4

# res_dim[k] holds one [correlation, p-value, emotion] row per emotion for Dim{k+1}.
res_dim = []
for dim_no in range(1, continuous_dim_count + 1):
    dim_col = f'Dim{dim_no}'
    emotion_rows = []
    for emotion in res_x.emotions.unique():
        emotion_subset = res_x.loc[res_x.emotions == emotion]
        # Pearson correlation between text length and score within this emotion.
        corr = pearsonr(emotion_subset['len'], emotion_subset[dim_col])
        emotion_rows.append([corr.statistic, corr.pvalue, emotion])
    res_dim.append(emotion_rows)

In [7]:
by_emotion_model = res_x.groupby(['emotions', 'model'])

# Mean generation length per emotion, one column per model.
gen_len_df = by_emotion_model['len'].mean().unstack('model').add_prefix('gen_length_')

corr_frames = []   # per-dimension correlation tables, Dim1 first
score_frames = []  # per-dimension mean-score tables (emotion x model), Dim1 first
for dim_no in range(1, continuous_dim_count + 1):
    corr_frames.append(
        pd.DataFrame(res_dim[dim_no - 1], columns=[f'correlation_dim{dim_no}', f'p_dim{dim_no}', 'emotions'])
    )
    score_frames.append(
        by_emotion_model[f'Dim{dim_no}'].mean().unstack('model').add_prefix(f'dim{dim_no}_score_')
    )

# Column layout: correlations (highest dimension first), then lengths, then scores.
em_corr_arr = corr_frames[::-1] + [gen_len_df] + score_frames

# Join every table on the emotion label, left to right.
em_corr_df = em_corr_arr[0]
for frame in em_corr_arr[1:]:
    em_corr_df = pd.merge(em_corr_df, frame, on=["emotions"])

em_corr_df = em_corr_df.round(2).sort_values(by="gen_length_zephyr_base", ascending=False)

In [8]:
# Display the per-emotion table of correlations, generation lengths and mean scores.
em_corr_df

Unnamed: 0,correlation_dim4,p_dim4,emotions,correlation_dim3,p_dim3,correlation_dim2,p_dim2,correlation_dim1,p_dim1,gen_length_SFT,...,dim3_score_SFT,dim3_score_human_targets,dim3_score_zephyr_base,dim3_score_zephyr_base_output_limit,dim3_score_zephyr_dpo,dim4_score_SFT,dim4_score_human_targets,dim4_score_zephyr_base,dim4_score_zephyr_base_output_limit,dim4_score_zephyr_dpo
0,0.02,0.93,guilty,-0.16,0.51,0.14,0.56,-0.01,0.96,38.0,...,76.58,59.58,67.58,64.58,78.67,75.75,55.25,72.0,54.0,77.33
4,-0.18,0.44,apprehensive,-0.36,0.12,0.2,0.41,-0.21,0.36,155.25,...,60.83,72.58,72.25,62.67,74.58,46.5,63.33,68.08,50.92,61.08
17,0.13,0.59,impressed,-0.35,0.14,0.11,0.64,-0.07,0.77,50.75,...,80.0,70.08,68.83,72.42,68.33,71.42,59.0,75.08,59.58,68.92
24,0.04,0.87,lonely,-0.06,0.8,0.05,0.83,-0.11,0.65,40.0,...,59.67,68.33,66.67,71.17,79.58,49.92,61.83,60.08,66.5,73.75
20,0.06,0.79,sad,0.36,0.08,0.41,0.04,0.45,0.02,67.8,...,76.53,70.27,84.73,62.87,69.6,74.33,67.33,68.4,51.93,68.53
21,0.4,0.08,angry,0.53,0.02,0.56,0.01,0.39,0.09,40.75,...,64.42,65.92,82.25,80.17,60.58,64.08,70.33,70.25,72.17,58.0
18,0.3,0.2,devastated,0.28,0.23,0.23,0.32,0.18,0.44,68.5,...,71.5,64.0,81.42,71.42,71.42,67.58,35.33,76.33,61.33,57.33
10,0.55,0.01,confident,0.03,0.9,0.34,0.14,0.43,0.06,39.75,...,65.67,72.58,78.83,69.25,65.5,56.17,81.58,86.75,75.75,64.75
5,-0.01,0.98,caring,-0.1,0.69,0.11,0.65,-0.14,0.55,47.25,...,66.08,65.75,75.92,75.08,56.08,57.17,59.58,74.75,75.0,47.0
23,0.29,0.22,joyful,-0.03,0.9,0.25,0.29,0.16,0.51,43.0,...,82.83,59.08,74.58,86.58,68.5,78.83,54.75,79.17,71.42,56.0


In [9]:
# Same table, ordered by the Dim1 length-score correlation, largest first.
em_corr_df.sort_values(by="correlation_dim1",ascending=False)

Unnamed: 0,correlation_dim4,p_dim4,emotions,correlation_dim3,p_dim3,correlation_dim2,p_dim2,correlation_dim1,p_dim1,gen_length_SFT,...,dim3_score_SFT,dim3_score_human_targets,dim3_score_zephyr_base,dim3_score_zephyr_base_output_limit,dim3_score_zephyr_dpo,dim4_score_SFT,dim4_score_human_targets,dim4_score_zephyr_base,dim4_score_zephyr_base_output_limit,dim4_score_zephyr_dpo
20,0.06,0.79,sad,0.36,0.08,0.41,0.04,0.45,0.02,67.8,...,76.53,70.27,84.73,62.87,69.6,74.33,67.33,68.4,51.93,68.53
19,0.3,0.19,terrified,0.3,0.2,0.73,0.0,0.45,0.05,38.25,...,58.33,52.17,81.92,67.5,64.33,57.5,62.17,80.33,55.83,56.42
10,0.55,0.01,confident,0.03,0.9,0.34,0.14,0.43,0.06,39.75,...,65.67,72.58,78.83,69.25,65.5,56.17,81.58,86.75,75.75,64.75
13,0.32,0.17,hopeful,0.21,0.38,0.36,0.12,0.4,0.08,43.5,...,67.92,78.33,82.08,80.42,63.75,62.0,69.17,74.25,82.42,68.0
21,0.4,0.08,angry,0.53,0.02,0.56,0.01,0.39,0.09,40.75,...,64.42,65.92,82.25,80.17,60.58,64.08,70.33,70.25,72.17,58.0
25,0.25,0.3,afraid,0.32,0.17,0.43,0.06,0.37,0.11,53.75,...,68.33,60.42,79.33,67.42,79.42,57.08,65.58,65.75,61.33,71.0
2,0.27,0.25,trusting,0.29,0.21,0.55,0.01,0.31,0.18,39.5,...,60.33,51.08,69.67,70.17,86.67,55.08,48.58,67.67,58.75,64.42
9,0.27,0.26,jealous,0.17,0.48,0.39,0.09,0.3,0.2,55.75,...,60.83,64.33,67.33,77.25,59.33,54.08,47.0,59.0,68.67,74.08
1,0.34,0.04,surprised,0.23,0.18,0.24,0.17,0.25,0.16,34.86,...,63.57,68.38,80.19,75.52,73.67,53.86,62.71,72.62,69.1,72.86
27,0.44,0.05,faithful,0.07,0.76,0.39,0.09,0.24,0.31,55.5,...,77.17,66.0,79.83,80.92,65.17,68.5,68.0,90.17,88.92,50.08


In [10]:
# Dimension number, p-value cut-off and model to inspect.
d = 1
# NOTE(review): 0.5 is an unusually loose cut-off for a p-value; confirm that 0.05 was not intended.
alpha = 0.5
model = "zephyr_base"

corr_col = f"correlation_dim{d}"
p_col = f"p_dim{d}"
shown_columns = [corr_col, p_col, "emotions", f"dim{d}_score_{model}"]

# Emotions whose (rounded) p-value is below alpha, strongest correlation first.
below_alpha = em_corr_df[p_col] < alpha
em_corr_df.loc[below_alpha, shown_columns].sort_values(by=corr_col, ascending=False)

Unnamed: 0,correlation_dim1,p_dim1,emotions,dim1_score_zephyr_base
20,0.45,0.02,sad,87.53
19,0.45,0.05,terrified,83.42
10,0.43,0.06,confident,89.67
13,0.4,0.08,hopeful,88.5
21,0.39,0.09,angry,82.5
25,0.37,0.11,afraid,81.33
2,0.31,0.18,trusting,80.17
9,0.3,0.2,jealous,73.17
1,0.25,0.16,surprised,81.43
27,0.24,0.31,faithful,88.33
