# Reliability analysis


## Packages and Helpers

In [2]:
# packages
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
from utils import load_dataframes
from scipy.stats import pearsonr
import random
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm
import pingouin as pg
import seaborn as sns

## Read Processed Data

In [3]:
# Load the overall per-item table (path is relative to the working directory).
# Later cells read its "experiment", "category", "model", "item", "score"
# and "score_top_n" columns.
all_data = pd.read_csv("processed_data/items_per_LLM.csv")

In [4]:
# Summary statistics of the numeric columns, as a quick sanity check of the load.
all_data.describe()

Unnamed: 0,score,score_top_n,H_score,score_expected
count,21192.0,21192.0,368.0,304.0
mean,4.098468,3.453543,0.399521,0.497405
std,8.854362,7.427143,0.114482,0.181776
min,8.5e-05,0.0,0.169929,0.18207
25%,0.3456,0.328374,0.30342,0.331352
50%,1.351195,1.039493,0.395055,0.514704
75%,2.61871,2.589177,0.491583,0.646394
max,86.832153,91.283178,0.726647,0.794977


## Normalize item scores where tasks have different scales on different items
- AUDIT, FTND, GABS, PG

In [5]:
def normalize_per_scale(data, scales=None, columns=("score", "score_top_n")):
    """Min-max normalize score columns within selected experiments.

    For every experiment listed in ``scales`` the values of each column in
    ``columns`` are rescaled to ``[0, 1]`` using the minimum and maximum
    observed over *all* rows of that experiment (pooled across items and
    models, not item by item). Rows of other experiments are left untouched.

    NOTE(review): because the same shift and scale are applied to every item
    of an experiment, this does not equalize items that use different
    response ranges. If that is the goal, the min/max would have to be taken
    per item -- confirm the intended behavior.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``"experiment"`` column and every column named in
        ``columns``.
    scales : iterable of str, optional
        Experiment names to normalize. Defaults to the AUDIT, FTND, GABS and
        PG scales.
    columns : iterable of str, optional
        Columns to normalize. Defaults to ``"score"`` and ``"score_top_n"``.

    Returns
    -------
    pandas.DataFrame
        A normalized copy; the input frame is not modified.
    """
    if scales is None:
        scales = ["AUDIT scale", "FTND scale", "GABS scale", "PG scale"]

    data = data.copy()

    for scale in scales:
        mask = data["experiment"] == scale

        if not mask.any():
            continue  # experiment not present in this frame

        for column in columns:
            values = data.loc[mask, column]
            lowest = values.min()
            value_range = values.max() - lowest

            if value_range == 0:
                # All values identical: dividing would give 0/0 = NaN for
                # every row, so map the constant to 0.0 instead.
                data.loc[mask, column] = values - lowest
            else:
                data.loc[mask, column] = (values - lowest) / value_range

    return data

# Rebind all_data to the normalized copy returned by normalize_per_scale
# (the function works on a copy, so the frame passed in is not modified in place).
all_data = normalize_per_scale(all_data)


In [6]:
# task_names = all_data['experiment'].unique()
# model_names = all_data["model"].unique()
# for model_name in model_names:
#     #for task_name in task_names:
#     sub_df = all_data[
#         (all_data['model'] == model_name) &
#         (all_data['experiment'] == "DFD task")
#     ]
#     print(sub_df)

## Cronbach's Alpha

In [7]:
# function:
def _cronbach_alpha_or_none(df_wide):
    """Return ``(alpha, ci)`` for a models x items table, or ``(None, (None, None))``.

    ``pg.cronbach_alpha`` needs at least two rows and two columns; smaller
    tables are reported as missing instead of raising.
    """
    n_models, n_items = df_wide.shape
    if n_models < 2 or n_items < 2:
        return None, (None, None)
    alpha, ci = pg.cronbach_alpha(df_wide)
    return alpha, ci


def compute_cronbach_alpha(score="score", data=None):
    """Compute Cronbach's alpha per experiment, or per domain where one exists.

    Each experiment is pivoted to a wide table (rows = ``model``, columns =
    ``item``, cells = ``score``; duplicate model/item pairs are averaged by
    ``pivot_table``). Experiments that have at least one non-missing
    ``category`` are evaluated once per category, except "SOEP scale",
    "DFE task" and "DFD task", which are always evaluated as a whole.

    Parameters
    ----------
    score : str, optional
        Name of the value column to analyse (e.g. ``"score"`` or
        ``"score_top_n"``).
    data : pandas.DataFrame, optional
        Long-format table with ``experiment``, ``category``, ``model``,
        ``item`` and the ``score`` column. Defaults to the module-level
        ``all_data``.

    Returns
    -------
    pandas.DataFrame
        One row per experiment/domain with columns ``experiment``,
        ``domain`` (``"total"`` when not split), ``alpha`` and ``alpha_CI``
        (the confidence interval returned by ``pg.cronbach_alpha``). Alpha is
        ``None`` when the wide table has fewer than two models or items.
    """
    if data is None:
        data = all_data

    # Experiments that carry a category column but are analysed as one unit.
    analysed_as_whole = {"SOEP scale", "DFE task", "DFD task"}

    results = []
    for exp, exp_data in data.groupby("experiment"):
        split_by_domain = (
            exp not in analysed_as_whole
            and exp_data["category"].notna().any()
        )
        if split_by_domain:
            groups = exp_data.groupby("category")
        else:
            groups = [("total", exp_data)]  # no subdomain

        for domain, domain_data in groups:
            df_wide = domain_data.pivot_table(
                index="model", columns="item", values=score
            )
            alpha, ci = _cronbach_alpha_or_none(df_wide)
            results.append({
                "experiment": exp,
                "domain": domain,
                "alpha": alpha,
                "alpha_CI": ci,
            })

    return pd.DataFrame(results)



In [8]:
# ---- Cronbach's alpha on the full score: per domain, or per scale when the scale has no subdomains ----
alpha_df = compute_cronbach_alpha(score="score")

# ---- Same computation on the top-n score ----
alpha_top_n_df = compute_cronbach_alpha(score="score_top_n")

# Join both result tables on experiment/domain. The overlapping columns
# (alpha, alpha_CI) are kept from both sides and told apart by suffix.
alpha_df = pd.merge(
    alpha_df,
    alpha_top_n_df,
    how="inner",
    on=["experiment", "domain"],
    suffixes=("_total", "_top_n"),
)

display(alpha_df)

Unnamed: 0,experiment,domain,alpha_total,alpha_CI_total,alpha_top_n,alpha_CI_top_n
0,AUDIT scale,total,0.739466,"[0.612, 0.838]",0.51678,"[0.28, 0.7]"
1,BARRAT scale,BISa,0.830566,"[0.745, 0.896]",0.752462,"[0.627, 0.847]"
2,BARRAT scale,BISm,0.830683,"[0.748, 0.895]",0.807216,"[0.713, 0.88]"
3,BARRAT scale,BISn,0.761134,"[0.644, 0.852]",0.663099,"[0.498, 0.791]"
4,BART task,total,0.984716,"[0.977, 0.991]",0.984364,"[0.977, 0.99]"
5,CARE scale,CAREa,0.856412,"[0.785, 0.911]",0.892074,"[0.838, 0.933]"
6,CARE scale,CAREs,0.651253,"[0.468, 0.787]",0.688656,"[0.525, 0.81]"
7,CARE scale,CAREw,0.961134,"[0.939, 0.977]",0.938256,"[0.903, 0.963]"
8,CCT task,total,0.984565,"[0.977, 0.991]",0.978183,"[0.967, 0.987]"
9,DAST scale,total,0.503859,"[0.269, 0.69]",0.661407,"[0.501, 0.788]"


In [9]:
# Wide table for the SSSV scale: one row per model, one column per item.
df_wide = (
    all_data.loc[all_data["experiment"] == "SSSV scale"]
    .pivot_table(index="model", columns="item", values="score")
)

# Pairwise correlations between the item columns (computed across models)
correlations = df_wide.corr()
correlations

item,1,10,11,12,13,14,15,16,17,18,...,37,38,39,4,40,5,6,7,8,9
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,-0.439478,-0.318267,-0.347741,-0.362829,0.354461,-0.399569,0.388234,0.360987,0.382929,...,-0.23781,-0.262865,0.309494,-0.405364,-0.160062,0.382824,0.314669,-0.358463,0.342325,0.461295
10,-0.439478,1.0,0.898186,0.853564,0.879618,-0.760255,0.791943,-0.784391,-0.785396,-0.801917,...,0.759073,0.776037,-0.742454,0.794849,0.747631,-0.846534,-0.6407,0.869368,-0.887547,-0.79196
11,-0.318267,0.898186,1.0,0.936018,0.915164,-0.81809,0.799309,-0.808251,-0.779816,-0.843631,...,0.831988,0.843309,-0.807113,0.804825,0.823005,-0.855859,-0.635323,0.918935,-0.906649,-0.748025
12,-0.347741,0.853564,0.936018,1.0,0.932862,-0.809991,0.79656,-0.827584,-0.822668,-0.850113,...,0.788125,0.803162,-0.813523,0.831989,0.728204,-0.890867,-0.641771,0.897522,-0.895607,-0.808869
13,-0.362829,0.879618,0.915164,0.932862,1.0,-0.836941,0.814626,-0.881659,-0.881895,-0.87692,...,0.826594,0.808033,-0.804864,0.786509,0.753738,-0.868388,-0.631477,0.852524,-0.899261,-0.796922
14,0.354461,-0.760255,-0.81809,-0.809991,-0.836941,1.0,-0.922981,0.864413,0.853226,0.882115,...,-0.724178,-0.7262,0.792875,-0.618898,-0.731117,0.808473,0.810245,-0.849322,0.893626,0.775908
15,-0.399569,0.791943,0.799309,0.79656,0.814626,-0.922981,1.0,-0.908495,-0.877428,-0.927076,...,0.739983,0.773045,-0.841908,0.631086,0.757815,-0.809848,-0.786602,0.793443,-0.840172,-0.769698
16,0.388234,-0.784391,-0.808251,-0.827584,-0.881659,0.864413,-0.908495,1.0,0.921573,0.945004,...,-0.798446,-0.78202,0.820441,-0.670593,-0.751256,0.801017,0.607856,-0.744413,0.799285,0.80379
17,0.360987,-0.785396,-0.779816,-0.822668,-0.881895,0.853226,-0.877428,0.921573,1.0,0.894362,...,-0.838639,-0.839475,0.853664,-0.681369,-0.772117,0.838283,0.734168,-0.718967,0.818954,0.81653
18,0.382929,-0.801917,-0.843631,-0.850113,-0.87692,0.882115,-0.927076,0.945004,0.894362,1.0,...,-0.803617,-0.803889,0.871005,-0.665139,-0.784393,0.839895,0.651378,-0.795157,0.851014,0.735784


## Split-half Reliability

In [10]:
def split_half_reliability(df_items, n_splits=100, seed=None):
    """Compute average split-half reliability (Spearman-Brown corrected).

    The item columns are randomly divided into two halves ``n_splits`` times.
    For each split the row-wise sums of both halves are correlated (Pearson)
    and the correlation is stepped up with the Spearman-Brown formula
    ``2r / (1 + r)``.

    Parameters
    ----------
    df_items : pandas.DataFrame
        Wide table with one column per item; rows are the units being scored.
    n_splits : int, optional
        Number of random splits to average over.
    seed : int, optional
        Seed for a dedicated random generator, making the splits
        reproducible. When omitted, NumPy's global random state is used.

    Returns
    -------
    tuple
        ``(mean, std)`` of the corrected coefficients over all usable splits,
        or ``(None, None)`` when there are fewer than two items, fewer than
        two rows, or no split produced a usable correlation.
    """
    n_rows, k = df_items.shape

    # Pearson's r is undefined for fewer than two observations and a split
    # needs at least one item per half.
    if k < 2 or n_rows < 2:
        return None, None

    rng = np.random if seed is None else np.random.default_rng(seed)
    cols = np.asarray(df_items.columns)
    half_size = k // 2  # with odd k the second half has one more item
    results = []

    for _ in range(n_splits):
        shuffled = rng.permutation(cols)

        s1 = df_items[shuffled[:half_size]].sum(axis=1)
        s2 = df_items[shuffled[half_size:]].sum(axis=1)

        r, _p_value = pearsonr(s1, s2)

        # Skip undefined correlations (e.g. a constant half) and r == -1,
        # where the Spearman-Brown denominator (1 + r) is zero.
        if not np.isfinite(r) or r <= -1:
            continue

        results.append((2 * r) / (1 + r))  # Spearman-Brown correction

    if not results:
        return None, None

    return np.mean(results), np.std(results)


In [11]:
def compute_split_half_reliability(score="score"):
    """Split-half reliability per experiment, or per domain where one exists.

    Reads the module-level ``all_data``. Every experiment is pivoted to a
    wide table (rows = ``model``, columns = ``item``, cells = ``score``) and
    handed to ``split_half_reliability``. Experiments with at least one
    non-missing ``category`` are evaluated once per category, except
    "SOEP scale", "DFE task" and "DFD task", which are evaluated as a whole.

    Returns a DataFrame with the columns ``experiment``, ``domain``
    (``"total"`` when not split), ``split_half_mean`` and ``split_half_sd``.
    """
    never_split = ("SOEP scale", "DFE task", "DFD task")
    records = []

    for experiment, frame in all_data.groupby("experiment"):
        has_domains = (
            frame["category"].notna().any()
            and experiment not in never_split
        )

        # One (label, sub-frame) pair per domain, or a single "total" pair.
        if has_domains:
            parts = frame.groupby("category")
        else:
            parts = [("total", frame)]

        for label, part in parts:
            wide = part.pivot_table(index="model", columns="item", values=score)
            mean_rsb, sd_rsb = split_half_reliability(wide)
            records.append(
                {
                    "experiment": experiment,
                    "domain": label,
                    "split_half_mean": mean_rsb,
                    "split_half_sd": sd_rsb,
                }
            )

    return pd.DataFrame(records)


In [12]:
# NOTE(review): this rebinds the name `split_half_reliability` from the helper
# function defined above to the resulting DataFrame. compute_split_half_reliability()
# looks that name up at call time, so calling it again after this cell fails with
# "'DataFrame' object is not callable" until the function's cell is re-run.
# Consider storing the result under a distinct name (e.g. `split_half_df`).
split_half_reliability = compute_split_half_reliability()
split_half_reliability

Unnamed: 0,experiment,domain,split_half_mean,split_half_sd
0,AUDIT scale,total,0.779844,0.088202
1,BARRAT scale,BISa,0.849953,0.108182
2,BARRAT scale,BISm,0.858285,0.075932
3,BARRAT scale,BISn,0.786535,0.165846
4,BART task,total,0.988027,0.005113
5,CARE scale,CAREa,0.991127,0.007742
6,CARE scale,CAREs,0.974611,0.02107
7,CARE scale,CAREw,0.996159,0.000725
8,CCT task,total,0.994392,0.003379
9,DAST scale,total,0.627089,0.17817


## Visualize Distributions per model

In [13]:
# # Visualise distribution (focus on variance) per model of logprobs

# #model_names = all_data['model'].unique()
# task_names = all_data['experiment'].unique()

# #for model_name in model_names:
# for task_name in task_names:
#     df_plot = all_data[
#         #(all_data['model'] == model_name) &
#         (all_data['experiment'] == task_name)
#     ]
    
#     plt.figure(figsize=(10, 5))
#     sns.histplot(data=df_plot, x="score", kde=True, bins=30)  
#     plt.title(f'Distribution of score for all models, for task: {task_name}')
#     plt.xlabel('Score')
#     plt.ylabel('Count')
#     plt.show()