# Reliability analysis


## Packages and Helpers

In [2]:
# packages
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
from utils import load_dataframes
from scipy.stats import pearsonr
import random
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm
import pingouin as pg
import seaborn as sns

## Read Processed Data

In [3]:
# Load the item-level simulation results for both conditions (random and
# semi-random) from the processed-data folder.
# Later cells read the columns "experiment", "category", "model", "item",
# "score" and "score_top_n" from these frames.
random_all_data = pd.read_csv("processed_data/items_per_LLM_random_simulation.csv")
semi_random_all_data = pd.read_csv("processed_data/items_per_LLM_semi_random_simulation.csv")


## Normalize item scores where tasks have different scales on different items
- AUDIT, FTND, GABS, PG

In [4]:
def normalize_per_scale(data, scales=None, columns=("score", "score_top_n")):
    """Min-max normalize score columns within selected experiments.

    For every experiment named in ``scales``, each column in ``columns`` is
    rescaled to the range [0, 1] using the minimum and maximum observed over
    all rows of that experiment. Rows of other experiments are left untouched.

    NOTE(review): the minimum/maximum are taken over the whole experiment,
    not per item - confirm this is the intended normalization.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an "experiment" column; columns listed in ``columns``
        that are absent from the frame are skipped.
    scales : iterable of str, optional
        Experiment names to normalize. Defaults to the AUDIT, FTND, GABS and
        PG scales.
    columns : iterable of str, optional
        Score columns to normalize. Defaults to "score" and "score_top_n".

    Returns
    -------
    pandas.DataFrame
        A normalized copy; the input frame is not modified.
    """
    if scales is None:
        scales = ["AUDIT scale", "FTND scale", "GABS scale", "PG scale"]

    data = data.copy()
    present = [column for column in columns if column in data.columns]

    for scale in scales:
        mask = data["experiment"] == scale

        if not mask.any():
            continue  # skip if scale not present

        for column in present:
            # Normalized values are fractional, so make sure the column can
            # hold floats before writing into a subset of its rows.
            if not pd.api.types.is_float_dtype(data[column]):
                data[column] = data[column].astype(float)

            values = data.loc[mask, column]
            low = values.min()
            span = values.max() - low

            if pd.isna(span):
                continue  # only missing values for this scale: nothing to do

            if span == 0:
                # Constant scores: min-max scaling is undefined (0 / 0), so
                # map them to 0.0 instead of turning every value into NaN.
                data.loc[mask, column] = values - low
            else:
                data.loc[mask, column] = (values - low) / span

    return data

# Apply the per-scale min-max normalization to both simulation datasets,
# rebinding each name to the normalized copy returned by the function.
random_all_data = normalize_per_scale(random_all_data)
semi_random_all_data = normalize_per_scale(semi_random_all_data)


## Cronbach's Alpha

In [5]:
# function:
def compute_cronbach_alpha(data, score="score"):
    """Compute Cronbach's alpha per experiment, or per domain where available.

    An experiment is split by its "category" column when at least one
    category value is present and the experiment is not "SOEP scale";
    otherwise the experiment is analysed as a whole under the domain label
    "total". Each subset is pivoted to a model-by-item table of ``score``
    (pivot_table's default aggregation) and passed to ``pg.cronbach_alpha``.
    Subsets with fewer than two item columns get ``None`` for alpha and
    ``(None, None)`` for the confidence interval.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with "experiment", "category", "model", "item"
        and the column named by ``score``.
    score : str
        Name of the value column to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per (experiment, domain) with columns "experiment",
        "domain", "alpha" and "alpha_CI".
    """
    records = []

    for exp_name, exp_rows in data.groupby("experiment"):
        # Decide once whether this experiment is analysed per domain or as a
        # single unit, then run the same computation on every subset.
        if exp_rows["category"].notna().any() and exp_name != "SOEP scale":
            subsets = [
                (domain, rows) for domain, rows in exp_rows.groupby("category")
            ]
        else:
            subsets = [("total", exp_rows)]

        for label, rows in subsets:
            wide = rows.pivot_table(index="model", columns="item", values=score)

            # Alpha needs at least two items.
            if wide.shape[1] > 1:
                alpha, ci = pg.cronbach_alpha(wide)
            else:
                alpha, ci = None, (None, None)

            records.append({
                "experiment": exp_name,
                "domain": label,
                "alpha": alpha,
                "alpha_CI": ci
            })

    return pd.DataFrame(records)



In [6]:
# ---- Cronbach's alpha on "score": per domain, or per scale when the scale has no subdomains ----
alpha_df_random = compute_cronbach_alpha(random_all_data)
alpha_df_semi_random = compute_cronbach_alpha(semi_random_all_data)


# ---- Same analysis on the "score_top_n" column (these results are not merged or displayed in this cell) ----
alpha_df_random_top_n= compute_cronbach_alpha(random_all_data, score = "score_top_n")
alpha_df_semi_random_top_n = compute_cronbach_alpha(semi_random_all_data, score = "score_top_n")



# Put the "score"-based results of both conditions side by side.
# The inner join keeps only (experiment, domain) pairs present in both tables;
# the remaining columns get the suffixes "_random" / "_semi_random".
alpha_df = alpha_df_random.merge(
    alpha_df_semi_random,
    on=["experiment", "domain"],
    how="inner",
    suffixes=("_random", "_semi_random")
)

display(alpha_df)

Unnamed: 0,experiment,domain,alpha_random,alpha_CI_random,alpha_semi_random,alpha_CI_semi_random
0,AUDIT scale,total,0.185702,"[-0.213, 0.495]",0.12509,"[-0.303, 0.457]"
1,BARRAT scale,BISa,0.085305,"[-0.377, 0.436]",-0.043888,"[-0.571, 0.357]"
2,BARRAT scale,BISm,0.188272,"[-0.209, 0.496]",-0.241641,"[-0.849, 0.23]"
3,BARRAT scale,BISn,0.014077,"[-0.469, 0.388]",0.200345,"[-0.191, 0.504]"
4,BART task,total,0.127238,"[-0.29, 0.459]",0.127238,"[-0.29, 0.459]"
5,CARE scale,CAREa,0.083517,"[-0.373, 0.434]",-0.01677,"[-0.524, 0.372]"
6,CARE scale,CAREs,0.095543,"[-0.38, 0.447]",-0.040084,"[-0.586, 0.364]"
7,CARE scale,CAREw,0.230793,"[-0.21, 0.539]",-0.23721,"[-0.945, 0.258]"
8,CCT task,total,-0.12424,"[-0.697, 0.33]",-0.12424,"[-0.697, 0.33]"
9,DAST scale,total,-0.198741,"[-0.765, 0.251]",-0.286199,"[-0.894, 0.196]"


## check correlations of semi-random data

In [12]:
# Model-by-item table of scores for the "SStas" category of the semi-random data.
df_wide = semi_random_all_data[
    semi_random_all_data["category"].eq("SStas")
].pivot_table(index="model", columns="item", values="score")

# Order the item columns by their integer value.
sorted_cols = sorted(df_wide.columns, key=int)
df_wide = df_wide[sorted_cols]

# Pairwise correlations between items across models (shown as the cell output).
correlations = df_wide.corr()
correlations

item,3,11,16,17,20,21,23,28,38,40
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,1.0,-0.01252,0.140112,-0.062011,-0.011576,-0.086047,0.095625,0.10538,0.094918,0.085055
11,-0.01252,1.0,-0.010423,0.097531,-0.035798,0.329148,-0.162559,0.016272,0.202274,-0.025511
16,0.140112,-0.010423,1.0,-0.05513,0.143889,-0.057841,0.241962,0.016759,0.104992,0.278738
17,-0.062011,0.097531,-0.05513,1.0,-0.151385,0.311947,-0.246507,-0.178638,-0.025775,0.135513
20,-0.011576,-0.035798,0.143889,-0.151385,1.0,0.145135,0.197401,-0.04706,-0.140025,0.139164
21,-0.086047,0.329148,-0.057841,0.311947,0.145135,1.0,0.019932,-0.153011,0.393763,0.024843
23,0.095625,-0.162559,0.241962,-0.246507,0.197401,0.019932,1.0,0.231315,-0.029584,0.07481
28,0.10538,0.016272,0.016759,-0.178638,-0.04706,-0.153011,0.231315,1.0,-0.143196,-0.054858
38,0.094918,0.202274,0.104992,-0.025775,-0.140025,0.393763,-0.029584,-0.143196,1.0,0.11431
40,0.085055,-0.025511,0.278738,0.135513,0.139164,0.024843,0.07481,-0.054858,0.11431,1.0


## Split-half Reliability

In [8]:
def split_half_reliability(df_items, n_splits=100, random_state=None):
    """Compute average split-half reliability (Spearman-Brown corrected).

    The item columns are randomly divided into two halves ``n_splits``
    times. For each split the row-wise sums of the two halves are
    correlated (Pearson r) and the Spearman-Brown correction
    ``2r / (1 + r)`` is applied. With an odd number of items the second
    half holds one item more than the first.

    Splits are excluded from the average when the correlation is not finite
    (e.g. a half with constant sums) or is numerically -1, where the
    correction would divide by zero.

    Parameters
    ----------
    df_items : pandas.DataFrame
        Wide table with one column per item and one row per respondent.
    n_splits : int
        Number of random splits to average over.
    random_state : int, numpy.random.SeedSequence, BitGenerator or Generator, optional
        Seed passed to ``numpy.random.default_rng`` for reproducible splits.
        When ``None`` (default) NumPy's global random state is used.

    Returns
    -------
    tuple
        ``(mean, standard deviation)`` of the corrected coefficients, or
        ``(None, None)`` when there are fewer than two items, fewer than two
        rows, or no split produced a usable correlation.
    """
    n_rows, k = df_items.shape

    # Two items are needed to form halves, two rows to compute a correlation.
    if k < 2 or n_rows < 2:
        return None, None

    # Work on a copy of the labels so shuffling never touches the frame.
    labels = np.array(df_items.columns)
    if random_state is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(random_state).permutation

    half_size = k // 2
    estimates = []

    for _ in range(n_splits):
        shuffled = permute(labels)

        first_sum = df_items[shuffled[:half_size]].sum(axis=1)
        second_sum = df_items[shuffled[half_size:]].sum(axis=1)

        r, _p_value = pearsonr(first_sum, second_sum)

        # Spearman-Brown is undefined at r == -1 (division by zero) and for
        # non-finite correlations; such splits are left out.
        if not np.isfinite(r) or np.isclose(r, -1.0):
            continue

        estimates.append((2 * r) / (1 + r))

    if not estimates:
        return None, None

    return np.mean(estimates), np.std(estimates)


In [9]:
def compute_split_half_reliability(data, score="score"):
    """Compute split-half reliability per experiment, or per domain.

    An experiment is split by its "category" column when at least one
    category value is present and the experiment is not "SOEP scale";
    otherwise it is analysed as a whole under the domain label "total".
    Each subset is pivoted to a model-by-item table of ``score`` and handed
    to ``split_half_reliability``.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with "experiment", "category", "model", "item"
        and the column named by ``score``.
    score : str
        Name of the value column to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per (experiment, domain) with columns "experiment",
        "domain", "split_half_mean" and "split_half_sd".
    """
    records = []

    for exp_name, exp_rows in data.groupby("experiment"):
        # Experiments with domains are analysed domain by domain; all others
        # as one unit labelled "total".
        if exp_rows["category"].notna().any() and exp_name != "SOEP scale":
            subsets = [
                (domain, rows) for domain, rows in exp_rows.groupby("category")
            ]
        else:
            subsets = [("total", exp_rows)]

        for label, rows in subsets:
            wide = rows.pivot_table(index="model", columns="item", values=score)
            mean_rsb, sd_rsb = split_half_reliability(wide)

            records.append({
                "experiment": exp_name,
                "domain": label,
                "split_half_mean": mean_rsb,
                "split_half_sd": sd_rsb
            })

    return pd.DataFrame(records)


In [10]:
# Split-half reliability on the default "score" column for both conditions.
# NOTE: the splits are drawn at random inside split_half_reliability, so the
# numbers can change between runs.
split_half_reliability_random = compute_split_half_reliability(random_all_data)
split_half_reliability_semi_random = compute_split_half_reliability(semi_random_all_data)


# Merge: put both conditions side by side. The inner join keeps only
# (experiment, domain) pairs present in both tables; the remaining columns
# get the suffixes "_random" / "_semi_random".
split_half_rel_df = split_half_reliability_random.merge(
    split_half_reliability_semi_random,
    on=["experiment", "domain"],
    how="inner",
    suffixes=("_random", "_semi_random")
)

display(split_half_rel_df)

Unnamed: 0,experiment,domain,split_half_mean_random,split_half_sd_random,split_half_mean_semi_random,split_half_sd_semi_random
0,AUDIT scale,total,0.212674,0.152972,0.106516,0.151107
1,BARRAT scale,BISa,0.080925,0.181273,-0.032229,0.195448
2,BARRAT scale,BISm,0.189912,0.139958,-0.237304,0.255033
3,BARRAT scale,BISn,0.037454,0.188662,0.205056,0.134008
4,BART task,total,0.143625,0.180235,0.132501,0.199176
5,CARE scale,CAREa,0.097934,0.13917,-0.050637,0.164518
6,CARE scale,CAREs,0.013978,0.244015,-0.116499,0.230064
7,CARE scale,CAREw,0.20405,0.336567,-0.270672,0.142958
8,CCT task,total,-0.119842,0.283792,-0.091091,0.241151
9,DAST scale,total,-0.204588,0.204216,-0.291597,0.290414
