# Reliability analysis


## Packages and Helpers

In [12]:
# packages
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
from utils import load_dataframes
from scipy.stats import pearsonr
import random
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm
import pingouin as pg
import seaborn as sns

## Read Processed Data

In [13]:
# Load the processed item-level simulation results.
# Every simulation condition is read twice: the main file and its
# "*_without_turn" counterpart (presumably the scores before reverse-keyed
# items are flipped -- TODO confirm against the preprocessing notebook).

# Random simulation
random_all_data, random_all_data_without_turn = (
    pd.read_csv(csv_path)
    for csv_path in (
        "processed_data/items_per_LLM_random_simulation.csv",
        "processed_data/sim_rand_without_turn.csv",  # items_per_LLM_random_simulation_without_turn
    )
)

# Semi-random simulation
semi_random_all_data, semi_random_all_data_without_turn = (
    pd.read_csv(csv_path)
    for csv_path in (
        "processed_data/items_per_LLM_semi_random_simulation.csv",
        "processed_data/sim_semi_rand_without_turn.csv",  # items_per_LLM_semi_random_simulation_without_turn
    )
)

# Non-random answer-tendency simulation
non_rand_answer_options_data, non_rand_answer_options_data_without_turn = (
    pd.read_csv(csv_path)
    for csv_path in (
        "processed_data/sim_non_rand_answer_tendencies.csv",
        "processed_data/sim_non_rand_answer_tendencies_without_turn.csv",
    )
)

## Normalize item scores where tasks have different scales on different items
- AUDIT, FTND, GABS, PG

In [14]:
def normalize_per_scale(data):
    """Min-max normalize the score columns of selected scales to [0, 1].

    For each of the AUDIT, FTND, GABS and PG scales, the ``score`` and
    ``score_top_n`` columns are rescaled with the minimum and maximum observed
    over all rows belonging to that scale. Rows of every other experiment are
    left untouched.

    NOTE(review): the minimum/maximum are pooled over *all* items of a scale,
    not computed per item. If the goal is to put items with different response
    ranges on a common footing, a per-item normalization may be needed --
    confirm the intended behavior.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data with an ``experiment`` column and the score columns
        ``score`` and ``score_top_n``. A score column that is absent is skipped.

    Returns
    -------
    pandas.DataFrame
        A normalized copy of ``data``; the input frame is not modified.
    """
    scales_to_normalize = ["AUDIT scale", "FTND scale", "GABS scale", "PG scale"]
    score_columns = ["score", "score_top_n"]

    data = data.copy()

    for scale in scales_to_normalize:
        mask = data["experiment"] == scale

        if not mask.any():
            continue  # skip if scale not present

        for column in score_columns:
            if column not in data.columns:
                continue

            # Normalized values are fractional: make the column float up front
            # instead of relying on implicit upcasting during assignment.
            if not pd.api.types.is_float_dtype(data[column]):
                data[column] = data[column].astype(float)

            values = data.loc[mask, column]
            lowest = values.min()
            value_range = values.max() - lowest

            if pd.isna(value_range):
                continue  # no observed values for this scale/column

            if value_range == 0:
                # All observed values are identical: plain min-max would be 0/0
                # and turn the whole scale into NaN. Map them to 0 instead.
                data.loc[mask, column] = values - lowest
            else:
                data.loc[mask, column] = (values - lowest) / value_range

    return data

# Apply the per-scale normalization to every loaded data set, replacing each
# frame with its normalized copy.
(
    random_all_data,
    random_all_data_without_turn,
    semi_random_all_data,
    semi_random_all_data_without_turn,
    non_rand_answer_options_data,
    non_rand_answer_options_data_without_turn,
) = map(
    normalize_per_scale,
    (
        random_all_data,
        random_all_data_without_turn,
        semi_random_all_data,
        semi_random_all_data_without_turn,
        non_rand_answer_options_data,
        non_rand_answer_options_data_without_turn,
    ),
)


## Cronbach's Alpha

In [15]:
# function:
def _cronbach_alpha_or_none(item_data, score):
    """Return ``(alpha, ci)`` for long-format rows, or ``(None, (None, None))``.

    The rows are pivoted to a model x item table (duplicate model/item pairs
    are combined by ``pivot_table``'s default aggregation). Alpha is only
    computed when the table has at least two items and at least two models.
    """
    df_wide = item_data.pivot_table(index="model", columns="item", values=score)
    n_models, n_items = df_wide.shape
    if n_items > 1 and n_models > 1:
        return pg.cronbach_alpha(df_wide)
    return None, (None, None)


def compute_cronbach_alpha(data, score = "score"):
    """Compute Cronbach's alpha per experiment, or per domain where available.

    Experiments whose ``category`` column contains at least one non-missing
    value are split by ``category`` and get one alpha per domain. The
    "SOEP scale" and experiments without categories get a single alpha with
    ``domain == "total"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data with the columns ``experiment``, ``category``,
        ``model``, ``item`` and the column named by ``score``.
    score : str, default "score"
        Name of the column holding the item scores.

    Returns
    -------
    pandas.DataFrame
        One row per experiment/domain with the columns ``experiment``,
        ``domain``, ``alpha`` and ``alpha_CI``. ``alpha`` is ``None`` and
        ``alpha_CI`` is ``(None, None)`` when fewer than two items or fewer
        than two models are available.
    """
    results = []

    # loop through experiments
    for exp, exp_data in data.groupby("experiment"):

        # check whether this experiment has subcategories
        if exp_data["category"].notna().any() and exp != "SOEP scale":
            # one alpha per category (domain)
            groups = exp_data.groupby("category")
        else:
            # one alpha for the whole experiment (no subdomain)
            groups = [("total", exp_data)]

        for domain, domain_data in groups:
            alpha, ci = _cronbach_alpha_or_none(domain_data, score)
            results.append({
                "experiment": exp,
                "domain": domain,
                "alpha": alpha,
                "alpha_CI": ci
            })

    # convert to DataFrame
    return pd.DataFrame(results)



In [16]:
# ---- Cronbach's alpha per domain, or directly per scale when the scale has no subdomains ----
# Computed on the "score" column for every data set.
alpha_df_random = compute_cronbach_alpha(random_all_data)
alpha_df_random_without_turn = compute_cronbach_alpha(random_all_data_without_turn)
alpha_df_semi_random = compute_cronbach_alpha(semi_random_all_data)
alpha_df_semi_random_without_turn = compute_cronbach_alpha(semi_random_all_data_without_turn)
alpha_df_non_rand_answer_options = compute_cronbach_alpha(non_rand_answer_options_data)
alpha_df_non_rand_answer_options_without_turn = compute_cronbach_alpha(non_rand_answer_options_data_without_turn)

# ---- Same analysis on the "score_top_n" column ----
alpha_df_random_top_n = compute_cronbach_alpha(random_all_data, score = "score_top_n")
alpha_df_semi_random_top_n = compute_cronbach_alpha(semi_random_all_data, score = "score_top_n")
alpha_df_non_rand_answer_options_top_n = compute_cronbach_alpha(non_rand_answer_options_data, score = "score_top_n")


def _merge_reversed_and_raw(alpha_reversed, alpha_raw):
    """Join two alpha tables on (experiment, domain).

    Columns of the first table get the suffix ``_reversed``, columns of the
    second table the suffix ``_raw``. Only experiment/domain pairs present in
    both tables are kept (inner join).
    """
    return alpha_reversed.merge(
        alpha_raw,
        on=["experiment", "domain"],
        how="inner",
        suffixes=("_reversed", "_raw")
    )


# One comparison table per simulation: main data vs. its "*_without_turn" variant
alpha_df = _merge_reversed_and_raw(alpha_df_random, alpha_df_random_without_turn)
alpha_df_semi_r = _merge_reversed_and_raw(alpha_df_semi_random, alpha_df_semi_random_without_turn)
alpha_df_non_r = _merge_reversed_and_raw(
    alpha_df_non_rand_answer_options,
    alpha_df_non_rand_answer_options_without_turn,
)

display(alpha_df)
display(alpha_df_semi_r)
display(alpha_df_non_r)

Unnamed: 0,experiment,domain,alpha_reversed,alpha_CI_reversed,alpha_raw,alpha_CI_raw
0,AUDIT scale,total,0.185702,"[-0.213, 0.495]",0.16035,"[-0.251, 0.479]"
1,BARRAT scale,BISa,0.085305,"[-0.377, 0.436]",-0.573209,"[-1.368, 0.03]"
2,BARRAT scale,BISm,0.188272,"[-0.209, 0.496]",0.042444,"[-0.426, 0.406]"
3,BARRAT scale,BISn,0.014077,"[-0.469, 0.388]",0.199341,"[-0.193, 0.503]"
4,BART task,total,0.127238,"[-0.29, 0.459]",0.127238,"[-0.29, 0.459]"
5,CARE scale,CAREa,0.083517,"[-0.373, 0.434]",0.083517,"[-0.373, 0.434]"
6,CARE scale,CAREs,0.095543,"[-0.38, 0.447]",0.095543,"[-0.38, 0.447]"
7,CARE scale,CAREw,0.230793,"[-0.21, 0.539]",0.230793,"[-0.21, 0.539]"
8,CCT task,total,-0.12424,"[-0.697, 0.33]",-0.12424,"[-0.697, 0.33]"
9,DAST scale,total,-0.198741,"[-0.765, 0.251]",0.167027,"[-0.227, 0.479]"


Unnamed: 0,experiment,domain,alpha_reversed,alpha_CI_reversed,alpha_raw,alpha_CI_raw
0,AUDIT scale,total,0.12509,"[-0.303, 0.457]",-0.280847,"[-0.908, 0.205]"
1,BARRAT scale,BISa,-0.043888,"[-0.571, 0.357]",0.166357,"[-0.255, 0.486]"
2,BARRAT scale,BISm,-0.241641,"[-0.849, 0.23]",-0.206726,"[-0.797, 0.251]"
3,BARRAT scale,BISn,0.200345,"[-0.191, 0.504]",-0.314574,"[-0.958, 0.184]"
4,BART task,total,0.127238,"[-0.29, 0.459]",0.161147,"[-0.24, 0.48]"
5,CARE scale,CAREa,-0.01677,"[-0.524, 0.372]",-0.01677,"[-0.524, 0.372]"
6,CARE scale,CAREs,-0.040084,"[-0.586, 0.364]",-0.040084,"[-0.586, 0.364]"
7,CARE scale,CAREw,-0.23721,"[-0.945, 0.258]",-0.23721,"[-0.945, 0.258]"
8,CCT task,total,-0.12424,"[-0.697, 0.33]",-0.123541,"[-0.696, 0.33]"
9,DAST scale,total,-0.286199,"[-0.894, 0.196]",0.174552,"[-0.216, 0.484]"


Unnamed: 0,experiment,domain,alpha_reversed,alpha_CI_reversed,alpha_raw,alpha_CI_raw
0,AUDIT scale,total,0.884494,"[0.828, 0.928]",0.953454,"[0.931, 0.971]"
1,BARRAT scale,BISa,0.971975,"[0.958, 0.983]",0.990053,"[0.985, 0.994]"
2,BARRAT scale,BISm,0.937814,"[0.907, 0.961]",0.991035,"[0.987, 0.994]"
3,BARRAT scale,BISn,0.885266,"[0.829, 0.929]",0.998668,"[0.998, 0.999]"
4,BART task,total,0.982769,"[0.975, 0.989]",0.982769,"[0.975, 0.989]"
5,CARE scale,CAREa,0.891735,"[0.838, 0.933]",0.891735,"[0.838, 0.933]"
6,CARE scale,CAREs,0.599966,"[0.39, 0.755]",0.599966,"[0.39, 0.755]"
7,CARE scale,CAREw,0.779589,"[0.653, 0.868]",0.779589,"[0.653, 0.868]"
8,CCT task,total,0.984221,"[0.976, 0.991]",0.984221,"[0.976, 0.991]"
9,DAST scale,total,0.04305,"[-0.409, 0.402]",0.999941,"[1.0, 1.0]"


In [17]:
# Persist the alpha comparison table of the non-random answer-tendency
# simulation (row index is not written).
alpha_output_path = "processed_data/alpha_df_simulation_3.csv"
alpha_df_non_r.to_csv(alpha_output_path, index=False)


## Check inter-item correlations (non-random answer-tendency data, SStas domain)

In [18]:
#df_wide.describe()

In [19]:
# Inter-item correlation matrix for the "SStas" category of the non-random
# answer-tendency data: one row per model, one column per item.
is_sstas = non_rand_answer_options_data["category"] == "SStas"
df_wide = non_rand_answer_options_data.loc[is_sstas].pivot_table(
    index="model", columns="item", values="score"
)

# Order the item columns by their numeric value.
sorted_cols = sorted(df_wide.columns, key=int)
df_wide = df_wide.loc[:, sorted_cols]

correlations = df_wide.corr()
correlations

item,3,11,16,17,20,21,23,28,38,40
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,1.0,-0.697319,0.820694,0.743744,-0.684607,-0.724496,0.751678,0.82376,-0.663179,-0.769016
11,-0.697319,1.0,-0.658476,-0.785659,0.725821,0.693214,-0.638508,-0.768482,0.684901,0.734145
16,0.820694,-0.658476,1.0,0.789442,-0.7269,-0.696952,0.818881,0.828781,-0.678776,-0.751506
17,0.743744,-0.785659,0.789442,1.0,-0.658834,-0.754506,0.807979,0.779418,-0.618769,-0.780702
20,-0.684607,0.725821,-0.7269,-0.658834,1.0,0.710066,-0.670887,-0.719684,0.630122,0.712596
21,-0.724496,0.693214,-0.696952,-0.754506,0.710066,1.0,-0.830385,-0.735699,0.680906,0.749584
23,0.751678,-0.638508,0.818881,0.807979,-0.670887,-0.830385,1.0,0.759244,-0.694234,-0.783794
28,0.82376,-0.768482,0.828781,0.779418,-0.719684,-0.735699,0.759244,1.0,-0.714527,-0.739118
38,-0.663179,0.684901,-0.678776,-0.618769,0.630122,0.680906,-0.694234,-0.714527,1.0,0.629332
40,-0.769016,0.734145,-0.751506,-0.780702,0.712596,0.749584,-0.783794,-0.739118,0.629332,1.0


## Split-half Reliability

In [20]:
def split_half_reliability(df_items, n_splits=100, seed=None):
    """Compute average split-half reliability (Spearman-Brown corrected).

    The item columns are randomly split into two halves ``n_splits`` times.
    For each split the row sums of both halves are correlated (Pearson) and
    the correlation is stepped up with the Spearman-Brown formula
    ``2r / (1 + r)``.

    Parameters
    ----------
    df_items : pandas.DataFrame
        Wide table with one column per item and one row per respondent.
    n_splits : int, default 100
        Number of random splits to average over.
    seed : int or None, default None
        Seed for a dedicated random generator, making the splits
        reproducible. With ``None`` NumPy's global random state is used, so
        a prior ``np.random.seed(...)`` call still controls the result.

    Returns
    -------
    tuple
        ``(mean, sd)`` of the corrected coefficients over all usable splits,
        or ``(None, None)`` when there are fewer than two items, fewer than
        two rows, or no split yields a usable correlation.
    """
    n_rows, k = df_items.shape

    # A correlation needs at least two items (one per half) and two rows.
    if k < 2 or n_rows < 2:
        return None, None

    if seed is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(seed).permutation

    cols = df_items.columns.to_numpy()
    results = []

    for _ in range(n_splits):
        shuffled = permute(cols)
        half1 = shuffled[:k//2]
        half2 = shuffled[k//2:]

        s1 = df_items[half1].sum(axis=1)
        s2 = df_items[half2].sum(axis=1)

        r, _pvalue = pearsonr(s1, s2)

        # Skip undefined correlations (e.g. a constant half) and r == -1
        # (up to rounding), where the Spearman-Brown denominator is zero and
        # the result would be -inf or an astronomically large number.
        if not np.isfinite(r) or 1.0 + r < 1e-12:
            continue

        results.append((2 * r) / (1 + r))  # Spearman-Brown correction

    if len(results) == 0:
        return None, None

    return np.mean(results), np.std(results)


In [21]:
def compute_split_half_reliability(data, score="score", n_splits=100):
    """Compute split-half reliability per experiment, or per domain where available.

    Experiments whose ``category`` column contains at least one non-missing
    value are split by ``category`` and evaluated per domain. The
    "SOEP scale" and experiments without categories are evaluated as a whole
    with ``domain == "total"``. Each subset is pivoted to a model x item table
    and passed to ``split_half_reliability``.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data with the columns ``experiment``, ``category``,
        ``model``, ``item`` and the column named by ``score``.
    score : str, default "score"
        Name of the column holding the item scores.
    n_splits : int, default 100
        Number of random splits, forwarded to ``split_half_reliability``.

    Returns
    -------
    pandas.DataFrame
        One row per experiment/domain with the columns ``experiment``,
        ``domain``, ``split_half_mean`` and ``split_half_sd``.
    """
    rows = []  # final list of results

    for exp, exp_data in data.groupby("experiment"):

        if exp_data["category"].notna().any() and exp != "SOEP scale":
            # case 1: experiments with domains
            groups = exp_data.groupby("category")
        else:
            # case 2: experiments without domains
            groups = [("total", exp_data)]

        for domain, domain_data in groups:
            df_wide = domain_data.pivot_table(
                index="model", columns="item", values=score
            )

            mean_rsb, sd_rsb = split_half_reliability(df_wide, n_splits=n_splits)

            rows.append({
                "experiment": exp,
                "domain": domain,
                "split_half_mean": mean_rsb,
                "split_half_sd": sd_rsb
            })

    return pd.DataFrame(rows)


In [22]:
# Split-half reliability for every simulation, for the main data and its
# "*_without_turn" variant.
split_half_reliability_random = compute_split_half_reliability(random_all_data)
split_half_reliability_random_without_turn = compute_split_half_reliability(random_all_data_without_turn)
split_half_reliability_semi_random = compute_split_half_reliability(semi_random_all_data)
split_half_reliability_semi_random_without_turn = compute_split_half_reliability(semi_random_all_data_without_turn)
split_half_non_rand_answer_options = compute_split_half_reliability(non_rand_answer_options_data)
split_half_non_rand_answer_options_without_turn = compute_split_half_reliability(non_rand_answer_options_data_without_turn)

# All tables are inner-joined on the same key columns.
join_spec = {"on": ["experiment", "domain"], "how": "inner"}

# Random vs. semi-random, separately for the main and the "*_without_turn" data
split_half_rel_df = pd.merge(
    split_half_reliability_random,
    split_half_reliability_semi_random,
    suffixes=("_random", "_semi_random"),
    **join_spec,
)
split_half_rel_df_without_flipping = pd.merge(
    split_half_reliability_random_without_turn,
    split_half_reliability_semi_random_without_turn,
    suffixes=("_random", "_semi_random"),
    **join_spec,
)

# Combine both: columns from the "*_without_turn" data get "_without_flipping"
split_half_rel_df = pd.merge(
    split_half_rel_df,
    split_half_rel_df_without_flipping,
    suffixes=("", "_without_flipping"),
    **join_spec,
)

# Non-random answer-tendency simulation: main vs. "*_without_turn" data
split_half_rel_df_non_rand = pd.merge(
    split_half_non_rand_answer_options,
    split_half_non_rand_answer_options_without_turn,
    suffixes=("_flipped", "_without_flipping"),
    **join_spec,
)

display(split_half_rel_df)
display(split_half_rel_df_non_rand)

Unnamed: 0,experiment,domain,split_half_mean_random,split_half_sd_random,split_half_mean_semi_random,split_half_sd_semi_random,split_half_mean_random_without_flipping,split_half_sd_random_without_flipping,split_half_mean_semi_random_without_flipping,split_half_sd_semi_random_without_flipping
0,AUDIT scale,total,0.206398,0.172379,0.114025,0.166167,0.148732,0.18379,-0.267115,0.217087
1,BARRAT scale,BISa,0.083259,0.162406,-0.074241,0.217125,-0.563398,0.297389,0.156403,0.172076
2,BARRAT scale,BISm,0.190307,0.165645,-0.26057,0.247546,0.056639,0.137655,-0.226816,0.22638
3,BARRAT scale,BISn,0.014742,0.181207,0.199838,0.152181,0.17064,0.157918,-0.285077,0.259461
4,BART task,total,0.162261,0.187428,0.161032,0.176206,0.145849,0.180121,0.17831,0.173646
5,CARE scale,CAREa,0.081165,0.114312,-0.060584,0.148806,0.105522,0.140225,-0.016985,0.15397
6,CARE scale,CAREs,0.05698,0.22573,-0.176075,0.304678,-0.03557,0.244558,-0.154652,0.294295
7,CARE scale,CAREw,0.136314,0.349639,-0.241266,0.129772,0.182407,0.347396,-0.280513,0.14477
8,CCT task,total,-0.159638,0.280984,-0.099889,0.253375,-0.120775,0.292288,-0.115386,0.237988
9,DAST scale,total,-0.193259,0.19806,-0.324535,0.289947,0.177465,0.162436,0.184572,0.15164


Unnamed: 0,experiment,domain,split_half_mean_flipped,split_half_sd_flipped,split_half_mean_without_flipping,split_half_sd_without_flipping
0,AUDIT scale,total,0.952829,0.060694,0.978346,0.017157
1,BARRAT scale,BISa,0.993315,0.00351,0.995649,0.004316
2,BARRAT scale,BISm,0.991724,0.004979,0.996422,0.004142
3,BARRAT scale,BISn,0.988099,0.008538,0.999382,0.000477
4,BART task,total,0.99727,0.000703,0.997443,0.000565
5,CARE scale,CAREa,0.952501,0.02795,0.945,0.035006
6,CARE scale,CAREs,0.864827,0.054576,0.866564,0.054491
7,CARE scale,CAREw,0.903475,0.047487,0.897754,0.050899
8,CCT task,total,0.999355,0.000593,0.99918,0.001001
9,DAST scale,total,-0.005978,0.74325,0.999947,2.1e-05
