In [35]:
import pandas as pd
import os
from scipy import stats
import numpy as np
from collections import defaultdict


USE_ABSOLUTE = False # this is because values closer to 0 are desired according to Lee et al

results_dir = '../data/results/empathy_eval_results/'
files = ['preds_dlrxxx_zephyr-7b-sft-full122_vad.txt', 'preds_dlr1e6_zephyr-7b-sft-full122_d211_vad.txt', 'preds_dlrxxx_zephyr-7b-sft-full_vad.txt']

In [36]:
df1 = pd.read_csv(os.path.join(results_dir, files[0]), sep='~')
df2 = pd.read_csv(os.path.join(results_dir, files[1]), sep='~')
df3 = pd.read_csv(os.path.join(results_dir, files[2]), sep='~')
human_df = pd.read_csv(os.path.join(results_dir, "human_vad.txt"), sep='~')

### set vad metrics you want to look at. Lee et al are concerned with "diff" 

In [37]:
vad_metrics = list(human_df.columns[6:])

# filtering to the diff ones
vad_metrics = [vm for vm in vad_metrics if "diff" in vm]
vad_metrics

['diff_max_v',
 'diff_mean_v',
 'diff_max_a',
 'diff_mean_a',
 'diff_max_d',
 'diff_mean_d',
 'diff_max_intensity']

In [38]:
df_collection = {}

for metric in vad_metrics:
    df_collection[metric] = pd.concat([df[[metric]].rename(columns={metric:f"df{i+1}"}) for i, df in enumerate([df1,df2,df3,human_df])], axis=1)
df_collection['diff_max_v']

Unnamed: 0,df1,df2,df3,df4
0,-0.295,0.449,-0.295,-0.159
1,-0.427,-0.334,-0.354,-0.427
2,0.002,-0.039,-0.029,0.057
3,0.000,-0.223,-0.076,-0.117
4,0.010,0.011,0.009,0.009
...,...,...,...,...
2535,-0.104,-0.065,0.063,-0.115
2536,-0.118,0.060,-0.007,0.063
2537,-0.229,0.028,-0.180,-0.146
2538,-0.823,-0.672,-0.656,-0.786


In [39]:
if USE_ABSOLUTE:
    for metric, df in df_collection.items():
        df_collection[metric] = df.abs()

In [40]:
from tabulate import tabulate

print("Variance")
variance_table = defaultdict(lambda:[])

for metric, df in df_collection.items():
    metric_variance = df.var()

    variance_table['metric'].append(metric)
    for k,v in metric_variance.items():
        variance_table[k].append(v)
    
variance_table = pd.DataFrame(variance_table)
variance_table['max var dominance'] = variance_table[['df1','df2','df3','df4']].max(axis=1, numeric_only=True) / variance_table[['df1','df2','df3','df4']].min(axis=1, numeric_only=True)
print(tabulate(variance_table, headers=variance_table.columns, floatfmt=".3f", showindex=False, tablefmt='outline'))

Variance
+--------------------+-------+-------+-------+-------+---------------------+
| metric             |   df1 |   df2 |   df3 |   df4 |   max var dominance |
| diff_max_v         | 0.041 | 0.043 | 0.025 | 0.052 |               2.102 |
| diff_mean_v        | 0.010 | 0.009 | 0.006 | 0.011 |               1.841 |
| diff_max_a         | 0.052 | 0.046 | 0.033 | 0.054 |               1.627 |
| diff_mean_a        | 0.005 | 0.005 | 0.003 | 0.006 |               1.685 |
| diff_max_d         | 0.031 | 0.033 | 0.023 | 0.041 |               1.748 |
| diff_mean_d        | 0.007 | 0.006 | 0.004 | 0.007 |               1.667 |
| diff_max_intensity | 0.148 | 0.129 | 0.092 | 0.139 |               1.607 |
+--------------------+-------+-------+-------+-------+---------------------+


In [41]:
print("Mean")
mean_table = defaultdict(lambda:[])

for metric, df in df_collection.items():
    metric_mean = df.mean()

    mean_table['metric'].append(metric)
    for k,v in metric_mean.items():
        mean_table[k].append(v)
    
mean_table = pd.DataFrame(mean_table)
# mean_table['max var dominance'] = mean_table[['df1','df2','df3','df4']].max(axis=1, numeric_only=True) / mean_table[['df1','df2','df3','df4']].min(axis=1, numeric_only=True)
print(tabulate(mean_table, headers=mean_table.columns, floatfmt=".3f", showindex=False, tablefmt='outline'))

Mean
+--------------------+--------+--------+--------+--------+
| metric             |    df1 |    df2 |    df3 |    df4 |
| diff_max_v         | -0.047 | -0.011 | -0.091 | -0.000 |
| diff_mean_v        | -0.052 | -0.032 | -0.065 | -0.035 |
| diff_max_a         |  0.032 |  0.010 | -0.082 |  0.024 |
| diff_mean_a        | -0.019 | -0.021 | -0.035 | -0.019 |
| diff_max_d         | -0.026 |  0.005 | -0.091 |  0.006 |
| diff_mean_d        | -0.040 | -0.025 | -0.048 | -0.026 |
| diff_max_intensity |  0.014 |  0.037 | -0.136 |  0.025 |
+--------------------+--------+--------+--------+--------+


In [42]:
print("Mean+STD")
mean_table = defaultdict(lambda:[])

for metric, df in df_collection.items():
    metric_mean = df.mean()
    metric_std = df.std()
    if 'mean' in metric:
        continue

    mean_table['metric'].append(metric)
    for k,v in metric_mean.items():
        std = df[k].std()
        table_entry = f"{v:.2f}\pm{std:.2f}"
        mean_table[k].append(table_entry)
    
mean_table = pd.DataFrame(mean_table)
# mean_table['max var dominance'] = mean_table[['df1','df2','df3','df4']].max(axis=1, numeric_only=True) / mean_table[['df1','df2','df3','df4']].min(axis=1, numeric_only=True)
print(tabulate(mean_table, headers=mean_table.columns, floatfmt=".2f", showindex=False, tablefmt='outline'))

Mean+STD
+--------------------+--------------+--------------+--------------+--------------+
| metric             | df1          | df2          | df3          | df4          |
| diff_max_v         | -0.05\pm0.20 | -0.01\pm0.21 | -0.09\pm0.16 | -0.00\pm0.23 |
| diff_max_a         | 0.03\pm0.23  | 0.01\pm0.21  | -0.08\pm0.18 | 0.02\pm0.23  |
| diff_max_d         | -0.03\pm0.18 | 0.00\pm0.18  | -0.09\pm0.15 | 0.01\pm0.20  |
| diff_max_intensity | 0.01\pm0.38  | 0.04\pm0.36  | -0.14\pm0.30 | 0.02\pm0.37  |
+--------------------+--------------+--------------+--------------+--------------+


In [43]:
mean_table.T

Unnamed: 0,0,1,2,3
metric,diff_max_v,diff_max_a,diff_max_d,diff_max_intensity
df1,-0.05\pm0.20,0.03\pm0.23,-0.03\pm0.18,0.01\pm0.38
df2,-0.01\pm0.21,0.01\pm0.21,0.00\pm0.18,0.04\pm0.36
df3,-0.09\pm0.16,-0.08\pm0.18,-0.09\pm0.15,-0.14\pm0.30
df4,-0.00\pm0.23,0.02\pm0.23,0.01\pm0.20,0.02\pm0.37


In [12]:
df['df1'].std()

0.2551127547574506

In [10]:
metric_std

df1    0.255113
df2    0.239498
df3    0.226604
df4    0.246856
dtype: float64

In [8]:
print("Median")
median_table = defaultdict(lambda:[])

for metric, df in df_collection.items():
    metric_median = df.median()

    median_table['metric'].append(metric)
    for k,v in metric_median.items():
        median_table[k].append(v)
    
median_table = pd.DataFrame(median_table)
# median_table['max var dominance'] = median_table[['df1','df2','df3','df4']].max(axis=1, numeric_only=True) / median_table[['df1','df2','df3','df4']].min(axis=1, numeric_only=True)
print(tabulate(median_table, headers=median_table.columns, floatfmt=".3f", showindex=False, tablefmt='outline'))

Median
+--------------------+-------+-------+-------+-------+
| metric             |   df1 |   df2 |   df3 |   df4 |
| diff_max_v         | 0.084 | 0.083 | 0.070 | 0.084 |
| diff_mean_v        | 0.073 | 0.065 | 0.072 | 0.069 |
| diff_max_a         | 0.145 | 0.128 | 0.115 | 0.138 |
| diff_mean_a        | 0.044 | 0.045 | 0.046 | 0.047 |
| diff_max_d         | 0.089 | 0.096 | 0.095 | 0.106 |
| diff_mean_d        | 0.059 | 0.053 | 0.056 | 0.055 |
| diff_max_intensity | 0.196 | 0.196 | 0.164 | 0.203 |
+--------------------+-------+-------+-------+-------+


In [9]:
print("Std")
std_table = defaultdict(lambda:[])

for metric, df in df_collection.items():
    metric_std = df.std()

    std_table['metric'].append(metric)
    for k,v in metric_std.items():
        std_table[k].append(v)
    
std_table = pd.DataFrame(std_table)
# std_table['max var dominance'] = std_table[['df1','df2','df3','df4']].max(axis=1, numeric_only=True) / std_table[['df1','df2','df3','df4']].min(axis=1, numeric_only=True)
print(tabulate(std_table, headers=std_table.columns, floatfmt=".3f", showindex=False, tablefmt='outline'))

Std
+--------------------+-------+-------+-------+-------+
| metric             |   df1 |   df2 |   df3 |   df4 |
| diff_max_v         | 0.160 | 0.159 | 0.142 | 0.178 |
| diff_mean_v        | 0.069 | 0.062 | 0.059 | 0.070 |
| diff_max_a         | 0.148 | 0.142 | 0.133 | 0.154 |
| diff_mean_a        | 0.048 | 0.047 | 0.042 | 0.051 |
| diff_max_d         | 0.126 | 0.128 | 0.125 | 0.143 |
| diff_mean_d        | 0.057 | 0.052 | 0.050 | 0.058 |
| diff_max_intensity | 0.255 | 0.239 | 0.227 | 0.247 |
+--------------------+-------+-------+-------+-------+


### Sig test, model v human

In [10]:
df

Unnamed: 0,df1,df2,df3,df4
0,0.021,0.667,0.114,0.058
1,0.438,0.547,0.484,0.438
2,0.055,0.125,0.070,0.195
3,0.000,0.206,0.315,0.061
4,0.195,0.875,0.316,0.336
...,...,...,...,...
2535,0.680,0.531,0.203,0.617
2536,0.500,0.297,0.352,0.281
2537,0.680,0.000,0.469,0.609
2538,0.047,0.030,0.727,0.133


In [13]:
from tqdm import tqdm

print("Model vs. human significance test")
sig_thresh = 1
sig_table = defaultdict(lambda:[])
pval_table = defaultdict(lambda:[])
sig_table['model'] = ['df1', 'df2', 'df3']
pval_table['model'] = ['df1', 'df2', 'df3']

for metric, df in tqdm(df_collection.items(), desc='Running sig tests', total=len(df_collection)):

    for model in ['df1', 'df2', 'df3']:
        
        res = stats.ttest_ind(df[model], df['df4'], equal_var=False, permutations=10000, random_state=34)
        if res.pvalue <= sig_thresh:
            sig_table[metric].append(res.statistic)
        else:
            sig_table[metric].append(None)
        pval_table[metric].append(res.pvalue)

      
sig_table = pd.DataFrame(sig_table)
pval_table = pd.DataFrame(pval_table)
# sig_table['max var dominance'] = sig_table[['df1','df2','df3','df4']].max(axis=1, numeric_only=True) / sig_table[['df1','df2','df3','df4']].min(axis=1, numeric_only=True)
print(tabulate(sig_table, headers=sig_table.columns, floatfmt=".3f", showindex=False, tablefmt='outline'))
print(tabulate(pval_table, headers=pval_table.columns, floatfmt=".5f", showindex=False, tablefmt='outline'))

Model vs. human significance test


Running sig tests: 100%|██████████| 7/7 [00:29<00:00,  4.21s/it]

+---------+--------------+---------------+--------------+---------------+--------------+---------------+----------------------+
| model   |   diff_max_v |   diff_mean_v |   diff_max_a |   diff_mean_a |   diff_max_d |   diff_mean_d |   diff_max_intensity |
| df1     |       -1.946 |         1.714 |        0.238 |        -2.573 |       -4.398 |         1.356 |                0.988 |
| df2     |       -2.374 |        -3.549 |       -3.679 |        -1.938 |       -3.510 |        -3.389 |               -1.576 |
| df3     |       -6.365 |        -1.714 |       -6.704 |        -3.943 |       -4.255 |        -2.166 |               -5.671 |
+---------+--------------+---------------+--------------+---------------+--------------+---------------+----------------------+
+---------+--------------+---------------+--------------+---------------+--------------+---------------+----------------------+
| model   |   diff_max_v |   diff_mean_v |   diff_max_a |   diff_mean_a |   diff_max_d |   diff_mean_d |




In [16]:
from tqdm import tqdm

print("Model vs. human significance test")
sig_thresh = .05
sig_table = defaultdict(lambda:[])
pval_table = defaultdict(lambda:[])
# sig_table['model'] = ['df1', 'df2', 'df3']
# pval_table['model'] = ['df1', 'df2', 'df3']

for metric, df in tqdm(df_collection.items(), desc='Running sig tests', total=len(df_collection)):

    for model in ['df1', 'df2', 'df3']:
        for model2 in ['df1', 'df2', 'df3']:
            if model == model2:
                continue
            if f"{model} vs. {model2}" not in sig_table['pair']:
                sig_table['pair'].append(f"{model} vs. {model2}")
                pval_table['pair'].append(f"{model} vs. {model2}")


        
            res = stats.ttest_ind(df[model], df[model2], equal_var=False, permutations=10000, random_state=34)
            if res.pvalue <= sig_thresh:
                sig_table[metric].append(res.statistic)
            else:
                sig_table[metric].append(None)
            pval_table[metric].append(res.pvalue)

      
sig_table = pd.DataFrame(sig_table)
pval_table = pd.DataFrame(pval_table)
# sig_table['max var dominance'] = sig_table[['df1','df2','df3','df4']].max(axis=1, numeric_only=True) / sig_table[['df1','df2','df3','df4']].min(axis=1, numeric_only=True)
print(tabulate(sig_table, headers=sig_table.columns, floatfmt=".3f", showindex=False, tablefmt='outline'))
print(tabulate(pval_table, headers=pval_table.columns, floatfmt=".5f", showindex=False, tablefmt='outline'))

Model vs. human significance test


Running sig tests: 100%|██████████| 7/7 [00:55<00:00,  7.88s/it]

+-------------+--------------+---------------+--------------+---------------+--------------+---------------+----------------------+
| pair        |   diff_max_v |   diff_mean_v |   diff_max_a |   diff_mean_a | diff_max_d   |   diff_mean_d |   diff_max_intensity |
| df1 vs. df2 |      nan     |         5.409 |        4.006 |       nan     |              |         4.850 |                2.552 |
| df1 vs. df3 |        4.613 |         3.592 |        7.113 |       nan     |              |         3.640 |                6.598 |
| df2 vs. df1 |      nan     |        -5.409 |       -4.006 |       nan     |              |        -4.850 |               -2.552 |
| df2 vs. df3 |        4.144 |        -2.045 |        3.044 |         2.011 |              |       nan     |                4.120 |
| df3 vs. df1 |       -4.613 |        -3.592 |       -7.113 |       nan     |              |        -3.640 |               -6.598 |
| df3 vs. df2 |       -4.144 |         2.045 |       -3.044 |        -2.011 




+-------------+--------------+---------------+--------------+---------------+--------------+---------------+----------------------+
| pair        |   diff_max_v |   diff_mean_v |   diff_max_a |   diff_mean_a |   diff_max_d |   diff_mean_d |   diff_max_intensity |
+=============+==============+===============+==============+===============+==============+===============+======================+
| df1 vs. df2 |        0.451 |         5.409 |        4.006 |        -0.690 |       -0.916 |         4.850 |                2.552 |
| df1 vs. df3 |        4.613 |         3.592 |        7.113 |         1.254 |       -0.179 |         3.640 |                6.598 |
| df2 vs. df3 |        4.144 |        -2.045 |        3.044 |         2.011 |        0.745 |        -1.361 |                4.120 |
+-------------+--------------+---------------+--------------+---------------+--------------+---------------+----------------------+
+-------------+--------------+---------------+--------------+---------------+--------------+---------------+----------------------+
| pair        |   diff_max_v |   diff_mean_v |   diff_max_a |   diff_mean_a |   diff_max_d |   diff_mean_d |   diff_max_intensity |
+=============+==============+===============+==============+===============+==============+===============+======================+
| df1 vs. df2 |      0.64554 |       0.00010 |      0.00010 |       0.49785 |      0.35906 |       0.00010 |              0.01000 |
| df1 vs. df3 |      0.00010 |       0.00060 |      0.00010 |       0.21158 |      0.86291 |       0.00060 |              0.00010 |
| df2 vs. df3 |      0.00030 |       0.03870 |      0.00330 |       0.04330 |      0.44046 |       0.16988 |              0.00020 |
+-------------+--------------+---------------+--------------+---------------+--------------+---------------+----------------------+

In [46]:
2538/2540
806/2540

0.3173228346456693

In [45]:
2927 / 20074

0.1458105011457607

In [47]:
2085 / 9437

0.2209388576878245

In [48]:
2529/2540

0.9956692913385827

In [49]:
5339 / 29080

0.18359697386519944

In [50]:
788/2540

0.3102362204724409