In [1]:
import pandas as pd
import os
from scipy import stats
import numpy as np
from collections import defaultdict
from tabulate import tabulate


results_dir = '../data/results/empathy_eval_results/'
files = ['preds_dlrxxx_zephyr-7b-sft-full122_specificity.txt', 'preds_dlr1e6_zephyr-7b-sft-full122_d211_specificity.txt', 'preds_dlrxxx_zephyr-7b-sft-full_specificity.txt']

In [2]:
df1 = pd.read_csv(os.path.join(results_dir, 'preds_dlrxxx_zephyr-7b-sft-full122_specificity.txt'), sep='~')
df2 = pd.read_csv(os.path.join(results_dir, 'preds_dlr1e6_zephyr-7b-sft-full122_d211_specificity.txt'), sep='~')
df3 = pd.read_csv(os.path.join(results_dir, 'preds_dlrxxx_zephyr-7b-sft-full_specificity.txt'), sep='~')
human_df = pd.read_csv(os.path.join(results_dir, "human_specificity.txt"), sep='~')

In [3]:
specificity_df = pd.concat([df[['specificity']].rename(columns={'specificity':f"df{i+1}"}) for i, df in enumerate([df1,df2,df3,human_df])], axis=1)
specificity_df

Unnamed: 0,df1,df2,df3,df4
0,0.183983,0.374588,0.329744,0.303654
1,0.198119,0.247147,0.258448,0.220178
2,0.244100,0.298339,0.325729,0.349959
3,0.215458,0.294622,0.266352,0.305621
4,0.217794,0.218387,0.273491,0.340205
...,...,...,...,...
2535,0.216682,0.278673,0.227164,0.187269
2536,0.243179,0.366589,0.362967,0.280253
2537,0.247419,0.259978,0.298492,0.409763
2538,0.245652,0.369529,0.207603,0.347666


In [4]:
# from tabulate import tabulate

# print(tabulate(specificity_df.var(), headers=['model', 'var'], floatfmt=".3f", showindex=False, tablefmt='outline'))

In [5]:

sig_thresh = .05
sig_table = defaultdict(lambda:[])

for col in ['df1','df2','df3']:
    sig_table['model'].append(col)
    res = stats.ttest_ind(specificity_df[col], specificity_df['df4'], equal_var=False, permutations=10000, random_state=34)
    sig_table['t'].append(res.statistic)
    sig_table['pval'].append(res.pvalue)

sig_table = pd.DataFrame(sig_table)
print(tabulate(sig_table, headers=sig_table.columns, floatfmt=".3f", showindex=False, tablefmt='outline'))




+---------+---------+--------+
| model   |       t |   pval |
| df1     | -46.746 |  0.000 |
| df2     |  -1.950 |  0.052 |
| df3     |   1.712 |  0.087 |
+---------+---------+--------+


In [6]:
sig_table = defaultdict(lambda:[])

for model in ['df1','df2','df3']:
    for model2 in ['df1','df2','df3']:
        if model == model2:
            continue
        if f"{model} vs. {model2}" not in sig_table['pair']:
            sig_table['pair'].append(f"{model} vs. {model2}")

        sig_table['model'].append(col)
        res = stats.ttest_ind(specificity_df[model], specificity_df[model2], equal_var=False, permutations=10000, random_state=34)
        sig_table['t'].append(res.statistic)
        sig_table['pval'].append(res.pvalue)

sig_table = pd.DataFrame(sig_table)
print(tabulate(sig_table, headers=sig_table.columns, floatfmt=".3f", showindex=False, tablefmt='outline'))


+-------------+---------+---------+--------+
| pair        | model   |       t |   pval |
| df1 vs. df2 | df3     | -47.265 |  0.000 |
| df1 vs. df3 | df3     | -58.005 |  0.000 |
| df2 vs. df1 | df3     |  47.265 |  0.000 |
| df2 vs. df3 | df3     |  -4.037 |  0.000 |
| df3 vs. df1 | df3     |  58.005 |  0.000 |
| df3 vs. df2 | df3     |   4.037 |  0.000 |
+-------------+---------+---------+--------+


### Deal with outliers

we'll take things within z-score -3 to 3

In [7]:
z_thresh = 3

outlier_idx = {}
vals_no_outliers = {}

for col in specificity_df.columns:
    zscores = np.abs(stats.zscore(specificity_df[col]))
    outlier_idx = zscores[zscores > z_thresh].index

    vals_no_outliers[col] = specificity_df[~specificity_df.index.isin(outlier_idx)][col]

vals_no_outliers

{'df1': 0       0.183983
 1       0.198119
 2       0.244100
 3       0.215458
 4       0.217794
           ...   
 2534    0.219939
 2535    0.216682
 2536    0.243179
 2537    0.247419
 2538    0.245652
 Name: df1, Length: 2515, dtype: float64,
 'df2': 0       0.374588
 1       0.247147
 2       0.298339
 3       0.294622
 4       0.218387
           ...   
 2535    0.278673
 2536    0.366589
 2537    0.259978
 2538    0.369529
 2539    0.289049
 Name: df2, Length: 2521, dtype: float64,
 'df3': 0       0.329744
 1       0.258448
 2       0.325729
 3       0.266352
 4       0.273491
           ...   
 2535    0.227164
 2536    0.362967
 2537    0.298492
 2538    0.207603
 2539    0.375508
 Name: df3, Length: 2525, dtype: float64,
 'df4': 0       0.303654
 1       0.220178
 2       0.349959
 3       0.305621
 4       0.340205
           ...   
 2535    0.187269
 2536    0.280253
 2537    0.409763
 2538    0.347666
 2539    0.249721
 Name: df4, Length: 2521, dtype: float64}

In [9]:
for k, v in vals_no_outliers.items():
    print(k, v.mean(), v.std())

df1 0.22081065184317236 0.03960645562232687
df2 0.29098130154711577 0.05499875313281079
df3 0.2983261773305056 0.04824000052674165
df4 0.2943573531544037 0.06090785533093696


t-test without outliers

In [27]:
# spec_df1_no_outliers = df1[~df1.index.isin(df1_outliers)].specificity
# spec_df2_no_outliers = df2[~df2.index.isin(df2_outliers)].specificity
# spec_df3_no_outliers = df3[~df3.index.isin(df3_outliers)].specificity

print("variances")
print("w/ outliers", specificity_df['df1'].var(),specificity_df['df2'].var(), specificity_df['df2'].var())
print("w/o outliers", vals_no_outliers['df1'].var(),vals_no_outliers['df2'].var(), vals_no_outliers['df2'].var())

variances
w/ outliers 0.0019368058122441576 0.0036972536837367046 0.0036972536837367046
w/o outliers 0.0015686713269633475 0.0030248628461638644 0.0030248628461638644


In [28]:

sig_thresh = .05
sig_table = defaultdict(lambda:[])

for col in ['df1','df2','df3']:
    sig_table['model'].append(col)
    res = stats.ttest_ind(vals_no_outliers[col], vals_no_outliers['df4'], equal_var=False, permutations=10000, random_state=34)
    sig_table['t'].append(res.statistic)
    sig_table['pval'].append(res.pvalue)

sig_table = pd.DataFrame(sig_table)
print(tabulate(sig_table, headers=sig_table.columns, floatfmt=".3f", showindex=False, tablefmt='outline'))

+---------+---------+--------+
| model   |       t |   pval |
| df1     | -50.809 |  0.000 |
| df2     |  -2.066 |  0.039 |
| df3     |   2.566 |  0.012 |
+---------+---------+--------+


In [32]:
sig_table = defaultdict(lambda:[])

for model in ['df1','df2','df3']:
    for model2 in ['df1','df2','df3']:
        if model == model2:
            continue
        if f"{model} vs. {model2}" not in sig_table['pair']:
            sig_table['pair'].append(f"{model} vs. {model2}")

        # sig_table['model'].append(col)
        res = stats.ttest_ind(vals_no_outliers[model], vals_no_outliers[model2], equal_var=False, permutations=10000, random_state=34)
        sig_table['t'].append(res.statistic)
        sig_table['pval'].append(res.pvalue)

sig_table = pd.DataFrame(sig_table)
print(tabulate(sig_table, headers=sig_table.columns, floatfmt=".3f", showindex=False, tablefmt='outline'))

+-------------+---------+--------+
| pair        |       t |   pval |
| df1 vs. df2 | -51.963 |  0.000 |
| df1 vs. df3 | -62.356 |  0.000 |
| df2 vs. df1 |  51.963 |  0.000 |
| df2 vs. df3 |  -5.043 |  0.000 |
| df3 vs. df1 |  62.356 |  0.000 |
| df3 vs. df2 |   5.043 |  0.000 |
+-------------+---------+--------+
