# Reliability analysis


## Packages and Helpers

In [1]:
# packages
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
from utils import load_dataframes
from scipy.stats import pearsonr
import random
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm
import pingouin as pg
import seaborn as sns

## Read Processed Data

In [2]:
# Item-level scores, one CSV per variant. The merge further down labels
# `all_data` as the processed (reversed) variant and `all_data_unchanged`
# as the raw variant.
all_data = pd.read_csv("processed_data/items_per_LLM.csv")
all_data_unchanged = pd.read_csv("processed_data/no_change_item_data.csv")


## Normalize item scores where tasks have different scales on different items
- AUDIT, FTND, GABS, PG

In [3]:
def normalize_per_scale(
    data,
    scales=("AUDIT scale", "FTND scale", "GABS scale", "PG scale"),
    columns=("score", "score_top_n"),
):
    """Min-max rescale score columns to [0, 1] within selected experiments.

    For every experiment named in `scales` that occurs in
    ``data["experiment"]``, each column in `columns` is rescaled with the
    minimum and maximum observed over all rows of that experiment. Rows of
    other experiments are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``"experiment"`` column and every column listed in
        `columns`. The input is not modified.
    scales : iterable of str, optional
        Experiment names to normalize. Names that do not occur are skipped.
    columns : iterable of str, optional
        Score columns to rescale.

    Returns
    -------
    pandas.DataFrame
        A copy of `data` with the selected values rescaled.

    Raises
    ------
    KeyError
        If ``"experiment"`` or one of `columns` is missing while a scale
        to normalize is present.
    """
    # NOTE(review): min and max are taken over ALL items of a scale, not per
    # item. If the intent is to equalize items with different response ranges,
    # a per-item normalization may be needed - confirm.
    data = data.copy()

    for scale in scales:
        mask = data["experiment"] == scale
        if not mask.any():
            continue  # scale not present in this frame

        for column in columns:
            # Rescaled values are fractional; make the column float up front
            # instead of relying on implicit upcasting during assignment.
            if data[column].dtype.kind != "f":
                data[column] = data[column].astype(float)

            values = data.loc[mask, column]
            low = values.min()
            span = values.max() - low

            if pd.isna(span):
                continue  # only missing values for this scale: nothing to do
            if span == 0:
                # Constant column: map to 0.0 rather than producing 0/0 = NaN.
                data.loc[mask, column] = values - low
            else:
                data.loc[mask, column] = (values - low) / span

    return data

# Apply the same rescaling to both the raw and the processed item data.
all_data_unchanged = normalize_per_scale(all_data_unchanged)
all_data = normalize_per_scale(all_data)


## Cronbach's Alpha

In [4]:
# function:
def _cronbach_alpha_row(exp, domain, frame, score):
    """Build one result record with Cronbach's alpha for the rows in `frame`."""
    # Wide layout: one row per model, one column per item.
    df_wide = frame.pivot_table(index="model", columns="item", values=score)

    if df_wide.shape[1] > 1:
        alpha, ci = pg.cronbach_alpha(df_wide)
    else:
        # Alpha is not computed for fewer than two items.
        alpha, ci = None, (None, None)

    return {"experiment": exp, "domain": domain, "alpha": alpha, "alpha_CI": ci}


def compute_cronbach_alpha(data=None, score="score"):
    """Compute Cronbach's alpha per experiment, or per domain where available.

    An experiment whose ``"category"`` column has at least one non-missing
    value is split by category and alpha is computed per category. All other
    experiments, and ``"SOEP scale"`` regardless of its categories, get a
    single alpha labelled ``"total"``.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Long-format item data with the columns ``"experiment"``,
        ``"category"``, ``"model"``, ``"item"`` and the column named by
        `score`. Defaults to the module-level ``all_data``, looked up at call
        time so that re-loaded or re-normalized data is picked up.
    score : str, optional
        Name of the column holding the item scores.

    Returns
    -------
    pandas.DataFrame
        Columns ``experiment``, ``domain``, ``alpha`` and ``alpha_CI``; one
        row per experiment/domain. ``alpha`` is ``None`` and ``alpha_CI`` is
        ``(None, None)`` where fewer than two items are available. The columns
        are present even when `data` contains no rows.
    """
    if data is None:
        data = all_data

    results = []
    for exp, exp_data in data.groupby("experiment"):
        has_domains = exp_data["category"].notna().any() and exp != "SOEP scale"
        if has_domains:
            results.extend(
                _cronbach_alpha_row(exp, domain, domain_data, score)
                for domain, domain_data in exp_data.groupby("category")
            )
        else:
            results.append(_cronbach_alpha_row(exp, "total", exp_data, score))

    return pd.DataFrame(
        results, columns=["experiment", "domain", "alpha", "alpha_CI"]
    )



In [5]:
# Lookup table from the processed data folder; the output displayed below
# shows one row per experiment with a `pctReversed` column.
reverse_item_map = pd.read_csv("processed_data/reverse_item_map.csv")

In [6]:
reverse_item_map

Unnamed: 0,expriment,pctReversed
0,AUDIT scale,0.0
1,BARRAT scale,0.366667
2,CARE scale,0.0
3,DAST scale,0.9
4,Dm scale,0.0
5,DOSPERT scale,0.0
6,FTND scale,0.857143
7,GABS scale,0.0
8,PG scale,1.0
9,PRI scale,0.0


In [7]:
# ---- Cronbach's alpha per domain, or per scale when the scale has no subdomains ----
# Both frames are passed explicitly so the result does not depend on whatever
# the function's default happens to be bound to.
reversed_alpha_df = compute_cronbach_alpha(data=all_data)
raw_alpha_df = compute_cronbach_alpha(data=all_data_unchanged)

# Merge raw and processed (reversed) reliability
alpha_df = reversed_alpha_df.merge(
    raw_alpha_df,
    on=["experiment", "domain"],
    how="inner",
    suffixes=("_reversed", "_raw"),
)

# Compare on float copies so that missing alphas (None) become NaN.
alpha_reversed = alpha_df["alpha_reversed"].astype(float)
alpha_raw = alpha_df["alpha_raw"].astype(float)

# A missing alpha compares False against everything; label such rows
# explicitly instead of letting them fall through to "smaller".
alpha_df["reversed_higher"] = np.select(
    [
        alpha_reversed.isna() | alpha_raw.isna(),
        alpha_reversed > alpha_raw,
        alpha_reversed == alpha_raw,
    ],
    ["undefined", "greater", "equal"],
    default="smaller",
)


display(alpha_df)

Unnamed: 0,experiment,domain,alpha_reversed,alpha_CI_reversed,alpha_raw,alpha_CI_raw,reversed_higher
0,AUDIT scale,total,0.739466,"[0.612, 0.838]",0.934789,"[0.903, 0.96]",smaller
1,BARRAT scale,BISa,0.830566,"[0.745, 0.896]",0.973389,"[0.96, 0.984]",smaller
2,BARRAT scale,BISm,0.830683,"[0.748, 0.895]",0.965052,"[0.948, 0.978]",smaller
3,BARRAT scale,BISn,0.761134,"[0.644, 0.852]",0.975071,"[0.963, 0.985]",smaller
4,BART task,total,0.984716,"[0.977, 0.991]",0.984716,"[0.977, 0.991]",equal
5,CARE scale,CAREa,0.856412,"[0.785, 0.911]",0.856412,"[0.785, 0.911]",equal
6,CARE scale,CAREs,0.651253,"[0.468, 0.787]",0.651253,"[0.468, 0.787]",equal
7,CARE scale,CAREw,0.961134,"[0.939, 0.977]",0.961134,"[0.939, 0.977]",equal
8,CCT task,total,0.984565,"[0.977, 0.991]",0.984565,"[0.977, 0.991]",equal
9,DAST scale,total,0.503859,"[0.269, 0.69]",0.975666,"[0.964, 0.985]",smaller


In [8]:
#alpha_df.to_csv('processed_data/alpha_df.csv', index=False)


In [9]:
# Inter-item correlations for the "SStas" category in the raw data:
# rows are models, columns are items ordered by their integer id.
df_wide = (
    all_data_unchanged
    .loc[all_data_unchanged["category"] == "SStas"]
    .pivot_table(index="model", columns="item", values="score")
)
sorted_cols = sorted(df_wide.columns, key=int)
df_wide = df_wide.loc[:, sorted_cols]
correlations = df_wide.corr()
correlations

item,3,11,16,17,20,21,23,28,38,40
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,1.0,0.717201,0.733039,0.778608,0.675498,0.617488,0.633817,0.649909,0.653454,0.612725
11,0.717201,1.0,0.921395,0.888452,0.932651,0.914646,0.89053,0.8954,0.915741,0.869689
16,0.733039,0.921395,1.0,0.951411,0.947995,0.935176,0.948993,0.950785,0.920992,0.893678
17,0.778608,0.888452,0.951411,1.0,0.8901,0.905979,0.925401,0.918085,0.881976,0.895245
20,0.675498,0.932651,0.947995,0.8901,1.0,0.946281,0.940997,0.923952,0.943848,0.886171
21,0.617488,0.914646,0.935176,0.905979,0.946281,1.0,0.952226,0.943892,0.949189,0.898264
23,0.633817,0.89053,0.948993,0.925401,0.940997,0.952226,1.0,0.970109,0.95221,0.940389
28,0.649909,0.8954,0.950785,0.918085,0.923952,0.943892,0.970109,1.0,0.954122,0.926062
38,0.653454,0.915741,0.920992,0.881976,0.943848,0.949189,0.95221,0.954122,1.0,0.94048
40,0.612725,0.869689,0.893678,0.895245,0.886171,0.898264,0.940389,0.926062,0.94048,1.0


In [11]:
# Same inter-item correlation table for the "SStas" category, this time on
# the processed (reversed) data held in `all_data`.
df_wide = (
    all_data
    .loc[all_data["category"] == "SStas"]
    .pivot_table(index="model", columns="item", values="score")
)
sorted_cols = sorted(df_wide.columns, key=int)
df_wide = df_wide.loc[:, sorted_cols]
correlations = df_wide.corr()
correlations

item,3,11,16,17,20,21,23,28,38,40
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,1.0,-0.729243,0.688903,0.654979,-0.673039,-0.6646,0.676292,0.690711,-0.680533,-0.542127
11,-0.729243,1.0,-0.808251,-0.779816,0.905891,0.899884,-0.827549,-0.816227,0.843309,0.823005
16,0.688903,-0.808251,1.0,0.921573,-0.828692,-0.857443,0.894406,0.863095,-0.78202,-0.751256
17,0.654979,-0.779816,0.921573,1.0,-0.849727,-0.874727,0.896755,0.883736,-0.839475,-0.772117
20,-0.673039,0.905891,-0.828692,-0.849727,1.0,0.950158,-0.847524,-0.830336,0.886952,0.824691
21,-0.6646,0.899884,-0.857443,-0.874727,0.950158,1.0,-0.876728,-0.857444,0.862651,0.80607
23,0.676292,-0.827549,0.894406,0.896755,-0.847524,-0.876728,1.0,0.942083,-0.864413,-0.827152
28,0.690711,-0.816227,0.863095,0.883736,-0.830336,-0.857444,0.942083,1.0,-0.936918,-0.858659
38,-0.680533,0.843309,-0.78202,-0.839475,0.886952,0.862651,-0.864413,-0.936918,1.0,0.911143
40,-0.542127,0.823005,-0.751256,-0.772117,0.824691,0.80607,-0.827152,-0.858659,0.911143,1.0


## Split-half Reliability

In [13]:
def split_half_reliability(df_items, n_splits=100, seed=None):
    """Average Spearman-Brown corrected split-half reliability.

    The item columns are randomly split into two halves `n_splits` times. For
    each split the row-wise sums of both halves are correlated (Pearson r) and
    corrected with ``2 * r / (1 + r)``.

    Parameters
    ----------
    df_items : pandas.DataFrame
        Wide table with one column per item and one row per respondent.
    n_splits : int, optional
        Number of random splits to average over.
    seed : int or None, optional
        If given, splits are drawn from ``np.random.default_rng(seed)`` and are
        reproducible. If ``None``, NumPy's global random state is used.

    Returns
    -------
    tuple
        ``(mean, std)`` of the corrected coefficients over all usable splits,
        or ``(None, None)`` when there are fewer than two items, fewer than
        two rows, or no split produced a usable correlation.
    """
    n_items = df_items.shape[1]

    # Two items are needed to form halves; two rows to correlate them
    # (pearsonr raises on shorter input).
    if n_items < 2 or df_items.shape[0] < 2:
        return None, None

    rng = np.random if seed is None else np.random.default_rng(seed)
    columns = df_items.columns.to_numpy()
    estimates = []

    for _ in range(n_splits):
        shuffled = rng.permutation(columns)
        # With an odd number of items the second half holds one item more.
        first_half = shuffled[: n_items // 2]
        second_half = shuffled[n_items // 2:]

        sum_first = df_items[first_half].sum(axis=1)
        sum_second = df_items[second_half].sum(axis=1)

        r, _p_value = pearsonr(sum_first, sum_second)

        # Skip undefined correlations, and r = -1 where the Spearman-Brown
        # denominator (1 + r) vanishes and the estimate would be infinite.
        if not np.isfinite(r) or np.isclose(r, -1.0):
            continue

        estimates.append((2 * r) / (1 + r))  # Spearman-Brown correction

    if not estimates:
        return None, None

    return np.mean(estimates), np.std(estimates)


In [14]:
def compute_split_half_reliability(score="score", data=None, n_splits=100):
    """Compute split-half reliability per experiment, or per domain where available.

    An experiment whose ``"category"`` column has at least one non-missing
    value is split by category and evaluated per category. All other
    experiments, and ``"SOEP scale"`` regardless of its categories, are
    evaluated as a whole and labelled ``"total"``.

    Parameters
    ----------
    score : str, optional
        Name of the column holding the item scores.
    data : pandas.DataFrame, optional
        Long-format item data with the columns ``"experiment"``,
        ``"category"``, ``"model"``, ``"item"`` and the column named by
        `score`. Defaults to the module-level ``all_data``, looked up at call
        time.
    n_splits : int, optional
        Number of random splits, forwarded to ``split_half_reliability``.

    Returns
    -------
    pandas.DataFrame
        Columns ``experiment``, ``domain``, ``split_half_mean`` and
        ``split_half_sd``; one row per experiment/domain. The columns are
        present even when `data` contains no rows.
    """
    if data is None:
        data = all_data

    rows = []
    for exp, exp_data in data.groupby("experiment"):
        has_domains = exp_data["category"].notna().any() and exp != "SOEP scale"
        if has_domains:
            # case 1: one entry per domain
            groups = list(exp_data.groupby("category"))
        else:
            # case 2: the experiment as a whole
            groups = [("total", exp_data)]

        for domain, group_data in groups:
            # Wide layout: one row per model, one column per item.
            df_wide = group_data.pivot_table(
                index="model", columns="item", values=score
            )
            mean_rsb, sd_rsb = split_half_reliability(df_wide, n_splits=n_splits)
            rows.append({
                "experiment": exp,
                "domain": domain,
                "split_half_mean": mean_rsb,
                "split_half_sd": sd_rsb,
            })

    return pd.DataFrame(
        rows,
        columns=["experiment", "domain", "split_half_mean", "split_half_sd"],
    )


In [15]:
# Store the table under its own name: assigning it to `split_half_reliability`
# would overwrite the function of that name, so this cell (and
# `compute_split_half_reliability`) could not be run a second time.
split_half_df = compute_split_half_reliability()
split_half_df

Unnamed: 0,experiment,domain,split_half_mean,split_half_sd
0,AUDIT scale,total,0.773428,0.083633
1,BARRAT scale,BISa,0.829147,0.113833
2,BARRAT scale,BISm,0.852145,0.081903
3,BARRAT scale,BISn,0.811933,0.149981
4,BART task,total,0.989044,0.005156
5,CARE scale,CAREa,0.99029,0.009931
6,CARE scale,CAREs,0.977032,0.019734
7,CARE scale,CAREw,0.996171,0.000734
8,CCT task,total,0.993916,0.003635
9,DAST scale,total,0.635505,0.145202


## Visualize Distributions per model

In [16]:
# # Visualise distribution (focus on variance) per model of logprobs

# #model_names = all_data['model'].unique()
# task_names = all_data['experiment'].unique()

# #for model_name in model_names:
# for task_name in task_names:
#     df_plot = all_data[
#         #(all_data['model'] == model_name) &
#         (all_data['experiment'] == task_name)
#     ]
    
#     plt.figure(figsize=(10, 5))
#     sns.histplot(data=df_plot, x="score", kde=True, bins=30)  
#     plt.title(f'Distribution of score for all models, for task: {task_name}')
#     plt.xlabel('Score')
#     plt.ylabel('Count')
#     plt.show()