In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import ttest_ind
from sklearn.preprocessing import StandardScaler
from statsmodels.iolib.summary2 import summary_col
from statsmodels.stats.multicomp import pairwise_tukeyhsd


In [2]:
data_dir = Path() / "results"
figs_dir = Path() / "figs"
figs_dir.mkdir(exist_ok=True)

# Statistics

In [3]:
cronbach_df = pd.read_csv(data_dir / "cronbach.csv")
cronbach_df_simple = cronbach_df.query("scale in ['MFQ', 'MFV']")

## Condition T-test

In [4]:
# claude ttest
ttest_ind(
    cronbach_df_simple.query("agent == 'Claude 2.1' and condition == 'qv'")["alpha"].values,
    cronbach_df_simple.query("agent == 'Claude 2.1' and condition == 'vq'")["alpha"].values,
)

# old value: Ttest_indResult(statistic=0.054352978972766676, pvalue=0.9569116669285772)

TtestResult(statistic=-0.44920858127742386, pvalue=0.6576752597589356, df=22.0)

In [5]:
cronbach_df_simple.query("agent == 'Claude 2.1' and condition == 'qv'").shape[0] + cronbach_df_simple.query("agent == 'Claude 2.1' and condition == 'vq'").shape[0]

24

In [6]:
cronbach_df_simple# gpt-4 ttest
ttest_ind(
    cronbach_df_simple.query("agent == 'GPT-4' and condition == 'qv'")["alpha"].values,
    cronbach_df_simple.query("agent == 'GPT-4' and condition == 'vq'")["alpha"].values,
)
# old value: Ttest_indResult(statistic=2.4887215445023743, pvalue=0.016869329019227434)

TtestResult(statistic=2.858629170799804, pvalue=0.009130833612339734, df=22.0)

In [7]:
cronbach_df_simple.query("agent == 'GPT-4' and condition == 'qv'").shape[0] + cronbach_df_simple.query("agent == 'GPT-4' and condition == 'vq'").shape[0]

24

Calculating Cohen-D for GPT-4

In [8]:
def cohen_d(x, y):
    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    return (np.mean(x) - np.mean(y)) / np.sqrt(
        ((nx - 1) * np.std(x, ddof=1) ** 2 + (ny - 1) * np.std(y, ddof=1) ** 2) / dof
    )

cohen_d(
    cronbach_df_simple.query("agent == 'GPT-4' and condition == 'qv'")["alpha"].values,
    cronbach_df_simple.query("agent == 'GPT-4' and condition == 'vq'")["alpha"].values,
)


1.1670304720491502

## ANOVA for agent

In [9]:
cronbach_overall_simple = cronbach_df_simple.query("condition == 'overall'")

In [10]:
formula = "alpha ~ C(agent, Treatment('Human'))"
model = smf.ols(formula, cronbach_overall_simple).fit()
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                                sum_sq    df         F    PR(>F)
C(agent, Treatment('Human'))  0.049612   2.0  0.896107  0.417855
Residual                      0.913508  33.0       NaN       NaN


In [11]:
cronbach_overall_simple.groupby(["agent", "scale"])["alpha"].mean()

agent       scale
Claude 2.1  MFQ      0.464288
            MFV      0.911348
GPT-4       MFQ      0.814661
            MFV      0.720451
Human       MFQ      0.735493
            MFV      0.872140
Name: alpha, dtype: float64

In [12]:
model.summary2()

  dat = dat.applymap(lambda x: _formatter(x, float_format))


0,1,2,3
Model:,OLS,Adj. R-squared:,-0.006
Dependent Variable:,alpha,AIC:,-24.0998
Date:,2024-07-24 19:23,BIC:,-19.3492
No. Observations:,36,Log-Likelihood:,15.05
Df Model:,2,F-statistic:,0.8961
Df Residuals:,33,Prob (F-statistic):,0.418
R-squared:,0.052,Scale:,0.027682

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.8152,0.0480,16.9730,0.0000,0.7175,0.9129
"C(agent, Treatment('Human'))[T.Claude 2.1]",-0.0901,0.0679,-1.3269,0.1936,-0.2283,0.0481
"C(agent, Treatment('Human'))[T.GPT-4]",-0.0555,0.0679,-0.8171,0.4198,-0.1937,0.0827

0,1,2,3
Omnibus:,5.646,Durbin-Watson:,2.032
Prob(Omnibus):,0.059,Jarque-Bera (JB):,4.626
Skew:,-0.87,Prob(JB):,0.099
Kurtosis:,3.244,Condition No.:,4.0


## ANOVA AGENT * Instrument

In [13]:
formula_scale = "alpha ~ C(agent, Treatment('Human')) * C(scale)"
model_scale = smf.ols(formula_scale, cronbach_overall_simple).fit()
aov_table_scale = sm.stats.anova_lm(model_scale, typ=2)
round(aov_table_scale, 3)

Unnamed: 0,sum_sq,df,F,PR(>F)
"C(agent, Treatment('Human'))",0.05,2.0,2.974,0.066
C(scale),0.233,1.0,27.929,0.0
"C(agent, Treatment('Human')):C(scale)",0.43,2.0,25.796,0.0
Residual,0.25,30.0,,


In [14]:
res = pairwise_tukeyhsd(cronbach_df_simple["alpha"], cronbach_df_simple["agent"])
print(res)

  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
  group1   group2 meandiff p-adj   lower  upper  reject
-------------------------------------------------------
Claude 2.1  GPT-4  -0.0305 0.8232 -0.1528 0.0918  False
Claude 2.1  Human   0.1036 0.3308 -0.0694 0.2765  False
     GPT-4  Human    0.134 0.1601 -0.0389  0.307  False
-------------------------------------------------------


In [15]:
model_scale.summary()

0,1,2,3
Dep. Variable:,alpha,R-squared:,0.74
Model:,OLS,Adj. R-squared:,0.697
Method:,Least Squares,F-statistic:,17.09
Date:,"Wed, 24 Jul 2024",Prob (F-statistic):,5.38e-08
Time:,19:23:15,Log-Likelihood:,38.359
No. Observations:,36,AIC:,-64.72
Df Residuals:,30,BIC:,-55.22
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7355,0.041,18.008,0.000,0.652,0.819
"C(agent, Treatment('Human'))[T.Claude 2.1]",-0.2712,0.058,-4.695,0.000,-0.389,-0.153
"C(agent, Treatment('Human'))[T.GPT-4]",0.0792,0.058,1.371,0.181,-0.039,0.197
C(scale)[T.MFV],0.1366,0.053,2.555,0.016,0.027,0.246
"C(agent, Treatment('Human'))[T.Claude 2.1]:C(scale)[T.MFV]",0.3104,0.076,4.105,0.000,0.156,0.465
"C(agent, Treatment('Human'))[T.GPT-4]:C(scale)[T.MFV]",-0.2309,0.076,-3.053,0.005,-0.385,-0.076

0,1,2,3
Omnibus:,13.177,Durbin-Watson:,1.356
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.251
Skew:,-0.937,Prob(JB):,0.000109
Kurtosis:,5.942,Cond. No.,10.7


## AGENT * FOUNDATION

In [16]:
formula_found = "alpha ~ C(agent, Treatment('Human')) * C(foundation)"
model_found = smf.ols(formula_found, cronbach_df_simple.query("foundation != 'Liberty'")).fit()

aov_table_found = sm.stats.anova_lm(model_found, typ=2)
round(aov_table_found, 3)

Unnamed: 0,sum_sq,df,F,PR(>F)
"C(agent, Treatment('Human'))",0.185,2.0,2.234,0.117
C(foundation),1.004,7.0,3.461,0.004
"C(agent, Treatment('Human')):C(foundation)",0.37,14.0,0.637,0.822
Residual,2.197,53.0,,


# Regression models

In [17]:
df = pd.read_csv(data_dir / "foundation_cond_agg.csv")

# renaming columns to use same terminology
df.rename(
    columns={
        "MFQ_harm": "MFQ_Care",
        "MFQ_ingroup": "MFQ_Loyalty",
        "MFQ_authority": "MFQ_Authority",
        "MFQ_purity": "MFQ_Purity",
        "MFQ_fairness": "MFQ_Fairness",
    },
    inplace=True,
)

df.head(2)

Unnamed: 0,agent,id,condition,MFQ_Care,MFQ_Fairness,MFQ_Loyalty,MFQ_Authority,MFQ_Purity,MFV_Authority,MFV_Loyalty,MFV_Fairness,MFV_Purity,MFV_Care (p),MFV_Liberty,MFV_Care (e)
0,Claude 2.1,0,qv,3.5,4.5,1.666667,1.666667,1.5,1.9,1.733333,2.666667,3.571429,3.222222,2.428571,2.272727
1,Claude 2.1,2,qv,3.666667,4.833333,2.0,2.0,2.0,2.4,3.333333,3.0,3.571429,3.666667,3.0,2.272727


Applying StandardScaler

In [18]:
scaler = StandardScaler()

# apply on columns starting with MFQ or MFV
df.loc[
    :, df.columns.str.startswith("MFQ") | df.columns.str.startswith("MFV")
] = scaler.fit_transform(
    df.loc[:, df.columns.str.startswith("MFQ") | df.columns.str.startswith("MFV")]
)

In [19]:
# rename columns to standardize under care, loyalty etc

formula = "Q('{}') ~ MFQ_Authority + MFQ_Care + MFQ_Fairness + MFQ_Loyalty + MFQ_Purity"

dependents = [x for x in df.columns if x.startswith("MFV_")]
dependents.sort()

In [20]:
df.columns

Index(['agent', 'id', 'condition', 'MFQ_Care', 'MFQ_Fairness', 'MFQ_Loyalty',
       'MFQ_Authority', 'MFQ_Purity', 'MFV_Authority', 'MFV_Loyalty',
       'MFV_Fairness', 'MFV_Purity', 'MFV_Care (p)', 'MFV_Liberty',
       'MFV_Care (e)'],
      dtype='object')

## Models for Claude

In [21]:
models = list()

for i in dependents:
    mod = smf.ols(
        formula=formula.format(i),
        data=df.query("agent == 'Claude 2.1'"),
    )
    res = mod.fit()
    models.append(res)

output = summary_col(models, stars=True,)

output

  dat = dat.applymap(lambda x: _formatter(x, float_format))


0,1,2,3,4,5,6,7
,Q('MFV_Authority'),Q('MFV_Care (e)'),Q('MFV_Care (p)'),Q('MFV_Fairness'),Q('MFV_Liberty'),Q('MFV_Loyalty'),Q('MFV_Purity')
Intercept,-0.7213***,-0.3880**,-0.8084***,-0.8465***,-0.3510**,0.2278,-0.5073***
,(0.1359),(0.1762),(0.1210),(0.0972),(0.1574),(0.1837),(0.1494)
MFQ_Authority,0.0017,0.1277,0.0598,-0.0021,0.0416,-0.0045,0.0871
,(0.1339),(0.1736),(0.1192),(0.0957),(0.1551),(0.1810),(0.1472)
MFQ_Care,-0.0068,0.0993,0.1072,0.0680,0.1828,0.0762,0.2333*
,(0.1206),(0.1564),(0.1074),(0.0862),(0.1397),(0.1630),(0.1326)
MFQ_Fairness,0.0282,-0.1197,-0.0494,-0.0278,-0.0391,-0.0690,-0.0986
,(0.0937),(0.1215),(0.0834),(0.0670),(0.1086),(0.1267),(0.1030)
MFQ_Loyalty,0.0236,-0.1061,-0.0764,0.0066,0.0733,0.1978,-0.0368


## GPT-4

In [22]:
models = list()

for i in dependents:
    mod = smf.ols(
        formula=formula.format(i),
        data=df.query("agent == 'GPT-4' and condition == 'vq'"),
    )
    res = mod.fit()
    models.append(res)

output = summary_col(models, stars=True)

output

  dat = dat.applymap(lambda x: _formatter(x, float_format))


0,1,2,3,4,5,6,7
,Q('MFV_Authority'),Q('MFV_Care (e)'),Q('MFV_Care (p)'),Q('MFV_Fairness'),Q('MFV_Liberty'),Q('MFV_Loyalty'),Q('MFV_Purity')
Intercept,0.7994***,0.8964***,1.0151***,1.0735***,0.6966***,-0.1073,0.9770***
,(0.2049),(0.2532),(0.0670),(0.1220),(0.2170),(0.3575),(0.1673)
MFQ_Authority,0.0611,-0.0503,-0.0081,-0.0822,0.0440,-0.1383,-0.0126
,(0.1270),(0.1570),(0.0415),(0.0756),(0.1345),(0.2216),(0.1037)
MFQ_Care,0.0011,0.0941,-0.0644,-0.0165,-0.0576,0.2598,-0.0886
,(0.1890),(0.2336),(0.0618),(0.1126),(0.2002),(0.3297),(0.1543)
MFQ_Fairness,-0.1310,-0.1722,-0.0327,-0.0440,0.0939,-0.2090,-0.0429
,(0.1396),(0.1725),(0.0456),(0.0831),(0.1478),(0.2435),(0.1140)
MFQ_Loyalty,0.0007,-0.0206,-0.0814*,-0.0025,0.2130,0.4343*,-0.0500


In [23]:
models = list()

for i in dependents:
    mod = smf.ols(
        formula=formula.format(i),
        data=df.query("agent == 'GPT-4' and condition == 'qv'"),
    )
    res = mod.fit()
    models.append(res)

output_qv = summary_col(models, stars=True)

output_qv

  dat = dat.applymap(lambda x: _formatter(x, float_format))


0,1,2,3,4,5,6,7
,Q('MFV_Authority'),Q('MFV_Care (e)'),Q('MFV_Care (p)'),Q('MFV_Fairness'),Q('MFV_Liberty'),Q('MFV_Loyalty'),Q('MFV_Purity')
Intercept,0.5219***,0.2535**,0.7084***,0.6677***,0.3899***,0.0203,0.5902***
,(0.1039),(0.1202),(0.0550),(0.0590),(0.1133),(0.1728),(0.0935)
MFQ_Authority,-0.0775,-0.1993*,-0.0220,-0.1086*,-0.3727***,-0.1714,-0.0181
,(0.1020),(0.1180),(0.0540),(0.0580),(0.1113),(0.1697),(0.0918)
MFQ_Care,0.1437,0.1887,0.0881,0.1273,0.0416,-0.0301,-0.0230
,(0.1367),(0.1582),(0.0723),(0.0777),(0.1492),(0.2275),(0.1231)
MFQ_Fairness,-0.1232,-0.0874,-0.0404,-0.0413,0.0628,-0.0544,0.0990
,(0.0872),(0.1009),(0.0461),(0.0496),(0.0951),(0.1451),(0.0785)
MFQ_Loyalty,0.0739,0.0306,-0.0185,-0.0210,0.1431,0.2800*,-0.0443


In [24]:
models = list()

for i in dependents:
    mod = smf.ols(
        formula=formula.format(i),
        data=df.query("agent == 'GPT-4' and condition == 'vq'"),
    )
    res = mod.fit()
    models.append(res)

output_vq = summary_col(models, stars=True)

output_vq

  dat = dat.applymap(lambda x: _formatter(x, float_format))


0,1,2,3,4,5,6,7
,Q('MFV_Authority'),Q('MFV_Care (e)'),Q('MFV_Care (p)'),Q('MFV_Fairness'),Q('MFV_Liberty'),Q('MFV_Loyalty'),Q('MFV_Purity')
Intercept,0.7994***,0.8964***,1.0151***,1.0735***,0.6966***,-0.1073,0.9770***
,(0.2049),(0.2532),(0.0670),(0.1220),(0.2170),(0.3575),(0.1673)
MFQ_Authority,0.0611,-0.0503,-0.0081,-0.0822,0.0440,-0.1383,-0.0126
,(0.1270),(0.1570),(0.0415),(0.0756),(0.1345),(0.2216),(0.1037)
MFQ_Care,0.0011,0.0941,-0.0644,-0.0165,-0.0576,0.2598,-0.0886
,(0.1890),(0.2336),(0.0618),(0.1126),(0.2002),(0.3297),(0.1543)
MFQ_Fairness,-0.1310,-0.1722,-0.0327,-0.0440,0.0939,-0.2090,-0.0429
,(0.1396),(0.1725),(0.0456),(0.0831),(0.1478),(0.2435),(0.1140)
MFQ_Loyalty,0.0007,-0.0206,-0.0814*,-0.0025,0.2130,0.4343*,-0.0500


# Statistics - Including MFQ Divisions

## Condition T-test

In [25]:
cronbach_df_parts = cronbach_df.query("scale != 'MFQ'")

In [26]:
# claude ttest
ttest_ind(
    cronbach_df_parts.query("agent == 'Claude 2.1' and condition == 'qv'")["alpha"].values,
    cronbach_df_parts.query("agent == 'Claude 2.1' and condition == 'vq'")["alpha"].values,
)

# old value: Ttest_indResult(statistic=0.054352978972766676, pvalue=0.9569116669285772)

TtestResult(statistic=-0.34953516655422673, pvalue=0.7289774841350276, df=32.0)

In [27]:
# gpt-4 ttest
ttest_ind(
    cronbach_df_parts.query("agent == 'GPT-4' and condition == 'qv'")["alpha"].values,
    cronbach_df_parts.query("agent == 'GPT-4' and condition == 'vq'")["alpha"].values,
)
# old value: Ttest_indResult(statistic=2.4887215445023743, pvalue=0.016869329019227434)

TtestResult(statistic=3.88904317108902, pvalue=0.00047785239263291085, df=32.0)

Calculating Cohen-D for GPT-4

In [28]:
def cohen_d(x, y):
    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    return (np.mean(x) - np.mean(y)) / np.sqrt(
        ((nx - 1) * np.std(x, ddof=1) ** 2 + (ny - 1) * np.std(y, ddof=1) ** 2) / dof
    )

cohen_d(
    cronbach_df_parts.query("agent == 'GPT-4' and condition == 'qv'")["alpha"].values,
    cronbach_df_parts.query("agent == 'GPT-4' and condition == 'vq'")["alpha"].values,
)

# previous 0.7503777791625594

1.3339308027998054

## ANOVA for agent

In [29]:
formula = "alpha ~ C(agent, Treatment('Human'))"
model = smf.ols(formula, cronbach_df_parts).fit()
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                                sum_sq     df         F    PR(>F)
C(agent, Treatment('Human'))  0.294712    2.0  2.086264  0.128784
Residual                      8.193269  116.0       NaN       NaN


In [30]:
cronbach_df_parts.groupby("agent")["alpha"].mean()

agent
Claude 2.1    0.581551
GPT-4         0.639082
Human         0.730455
Name: alpha, dtype: float64

In [31]:
model.summary2()

  dat = dat.applymap(lambda x: _formatter(x, float_format))


0,1,2,3
Model:,OLS,Adj. R-squared:,0.018
Dependent Variable:,alpha,AIC:,25.2859
Date:,2024-07-24 19:23,BIC:,33.6233
No. Observations:,119,Log-Likelihood:,-9.643
Df Model:,2,F-statistic:,2.086
Df Residuals:,116,Prob (F-statistic):,0.129
R-squared:,0.035,Scale:,0.070632

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.7305,0.0645,11.3323,0.0000,0.6028,0.8581
"C(agent, Treatment('Human'))[T.Claude 2.1]",-0.1489,0.0744,-2.0006,0.0478,-0.2963,-0.0015
"C(agent, Treatment('Human'))[T.GPT-4]",-0.0914,0.0744,-1.2276,0.2221,-0.2388,0.0560

0,1,2,3
Omnibus:,8.931,Durbin-Watson:,1.312
Prob(Omnibus):,0.011,Jarque-Bera (JB):,9.593
Skew:,-0.68,Prob(JB):,0.008
Kurtosis:,2.71,Condition No.:,6.0


In [32]:
formula_scale = "alpha ~ C(agent, Treatment('Human')) * C(scale)"
model_scale = smf.ols(formula_scale, cronbach_df_parts).fit()
aov_table_scale = sm.stats.anova_lm(model_scale, typ=2)
aov_table_scale

Unnamed: 0,sum_sq,df,F,PR(>F)
"C(agent, Treatment('Human'))",0.294712,2.0,4.202603,0.01742625
C(scale),3.306461,2.0,47.150171,1.628537e-15
"C(agent, Treatment('Human')):C(scale)",1.02987,4.0,7.342979,2.79909e-05
Residual,3.856939,110.0,,


In [33]:
formula_found = "alpha ~ C(agent, Treatment('Human')) * C(foundation)"
model_found = smf.ols(formula_found, cronbach_df_parts).fit()
aov_table_found = sm.stats.anova_lm(model_found, typ=2)
aov_table_found

Unnamed: 0,sum_sq,df,F,PR(>F)
"C(agent, Treatment('Human'))",0.294712,2.0,2.60631,0.07925
C(foundation),2.300915,8.0,5.087076,3e-05
"C(agent, Treatment('Human')):C(foundation)",0.690835,16.0,0.763681,0.721429
Residual,5.201519,92.0,,


In [34]:
t = pairwise_tukeyhsd(cronbach_df_parts["alpha"], cronbach_df_parts["agent"])
print(t)

  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
  group1   group2 meandiff p-adj   lower  upper  reject
-------------------------------------------------------
Claude 2.1  GPT-4   0.0575 0.5202 -0.0674 0.1825  False
Claude 2.1  Human   0.1489 0.1166 -0.0278 0.3256  False
     GPT-4  Human   0.0914 0.4394 -0.0853 0.2681  False
-------------------------------------------------------


When adjusting for multiple comparisons, the effect of agent is not significant. Furthermore, LLMs mean was slightliy higher than that of humans.

Only Part 1

In [35]:
cronbach_mfv_mfq1 = cronbach_df_parts.query("scale !=  'MFQ - Part 2'")
cronbach_mfv_mfq2 = cronbach_df_parts.query("scale !=  'MFV - Part 1'")

In [36]:
formula_scale = "alpha ~ C(agent, Treatment('Human')) * C(scale)"
model_scale = smf.ols(formula_scale, cronbach_mfv_mfq1).fit()
aov_table_scale = sm.stats.anova_lm(model_scale, typ=2)
aov_table_scale

Unnamed: 0,sum_sq,df,F,PR(>F)
"C(agent, Treatment('Human'))",0.114321,2.0,2.206175,0.1169472
C(scale),0.915659,1.0,35.341079,7.359875e-08
"C(agent, Treatment('Human')):C(scale)",0.798543,2.0,15.41041,2.290454e-06
Residual,2.020918,78.0,,


In [37]:
print(pairwise_tukeyhsd(cronbach_mfv_mfq1["alpha"], cronbach_mfv_mfq1["agent"]))

  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
  group1   group2 meandiff p-adj   lower  upper  reject
-------------------------------------------------------
Claude 2.1  GPT-4  -0.0029 0.9982 -0.1238 0.1179  False
Claude 2.1  Human   0.1039 0.3197  -0.067 0.2748  False
     GPT-4  Human   0.1068    0.3 -0.0641 0.2777  False
-------------------------------------------------------


In [38]:
formula_scale = "alpha ~ C(agent, Treatment('Human')) * C(scale)"
model_scale = smf.ols(formula_scale, cronbach_mfv_mfq2).fit()
aov_table_scale = sm.stats.anova_lm(model_scale, typ=2)
aov_table_scale

Unnamed: 0,sum_sq,df,F,PR(>F)
"C(agent, Treatment('Human'))",0.294712,2.0,4.202603,0.01742625
C(scale),3.306461,2.0,47.150171,1.628537e-15
"C(agent, Treatment('Human')):C(scale)",1.02987,4.0,7.342979,2.79909e-05
Residual,3.856939,110.0,,
