# Reliability analysis


## Packages and Helpers

In [1]:
# packages
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
from utils import load_dataframes
from scipy.stats import pearsonr
import random
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm
import pingouin as pg
import seaborn as sns

## Read Processed Data

In [2]:

# Read the processed per-item scores (one row per experiment/model/item)
# into the notebook-wide frame used by every cell below.
all_data = pd.read_csv(filepath_or_buffer="processed_data/items_per_LLM.csv")

In [3]:
# Inspect the rows whose experiment label is exactly "LOT task".
all_data.loc[all_data["experiment"].eq("LOT task")]

Unnamed: 0,experiment,model,item,score,score_top_n,category,reverse_coded


In [5]:
# Inspect the rows whose category label is exactly "Investment".
all_data.loc[all_data["category"].eq("Investment")]

Unnamed: 0,experiment,model,item,score,score_top_n,category,reverse_coded
4560,DOSPERT scale,Apertus-70B-Instruct-2509,7,1.542266,1.000000,Investment,
4571,DOSPERT scale,Apertus-70B-Instruct-2509,18,1.239807,1.000000,Investment,
4577,DOSPERT scale,Apertus-70B-Instruct-2509,24,1.725146,3.189641,Investment,
4583,DOSPERT scale,Apertus-70B-Instruct-2509,30,1.493840,2.195549,Investment,
4600,DOSPERT scale,Apertus-8B-Instruct-2509,7,1.409900,1.000000,Investment,
...,...,...,...,...,...,...,...
6343,DOSPERT scale,granite-3.3-8b-instruct,30,1.503841,1.000000,Investment,
6360,DOSPERT scale,zephyr-7b-beta,7,1.383534,1.000000,Investment,
6371,DOSPERT scale,zephyr-7b-beta,18,1.107743,1.000000,Investment,
6377,DOSPERT scale,zephyr-7b-beta,24,2.474859,3.798271,Investment,


## Cronbach's Alpha

In [9]:
# function:
def _alpha_for_subset(subset, score):
    """Return ``(alpha, ci)`` for one group of long-format rows.

    The rows are pivoted to a models x items table of ``score`` values.
    When fewer than two item columns result, alpha is not computed and
    ``(None, (None, None))`` is returned instead.
    """
    # pivot_table aggregates duplicate (model, item) rows (mean by default).
    df_wide = subset.pivot_table(index="model", columns="item", values=score)
    if df_wide.shape[1] > 1:
        return pg.cronbach_alpha(df_wide)
    return None, (None, None)


def compute_cronbach_alpha(score="score", data=None):
    """Compute Cronbach's alpha per domain, or per scale without domains.

    Parameters
    ----------
    score : str
        Name of the column holding the item scores, e.g. ``"score"`` or
        ``"score_top_n"``.
    data : pandas.DataFrame, optional
        Long-format frame with the columns ``experiment``, ``category``,
        ``model``, ``item`` and the ``score`` column. Defaults to the
        notebook-level ``all_data`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per (experiment, domain) with the columns ``experiment``,
        ``domain``, ``alpha`` and ``alpha_CI``. ``domain`` is ``None`` for
        experiments treated as a single scale. ``alpha`` is ``None`` and
        ``alpha_CI`` is ``(None, None)`` when a group has fewer than two
        items. The columns are present even when ``data`` is empty.
    """
    if data is None:
        data = all_data

    results = []
    for exp, exp_data in data.groupby("experiment"):
        # An experiment is split by category when at least one category is
        # set; "SOEP scale" is always treated as a single scale.
        has_domains = exp_data["category"].notna().any() and exp != "SOEP scale"
        if has_domains:
            groups = exp_data.groupby("category")
        else:
            groups = [(None, exp_data)]

        for domain, domain_data in groups:
            alpha, ci = _alpha_for_subset(domain_data, score)
            results.append({
                "experiment": exp,
                "domain": domain,
                "alpha": alpha,
                "alpha_CI": ci,
            })

    # Fixing the columns keeps the result usable (e.g. as a merge key
    # source) even when no rows were produced.
    return pd.DataFrame(
        results, columns=["experiment", "domain", "alpha", "alpha_CI"]
    )



In [12]:
# ---- Cronbach's alpha for both score variants ----
# Each call yields one row per domain, or one row per scale when the scale
# has no subdomains.
alpha_df = compute_cronbach_alpha(score="score")
alpha_top_n_df = compute_cronbach_alpha(score="score_top_n")

# Put the two variants side by side: rows are matched on
# (experiment, domain), and the alpha / alpha_CI columns receive the
# "_total" and "_top_n" suffixes respectively.
alpha_df = pd.merge(
    alpha_df,
    alpha_top_n_df,
    how="inner",
    on=["experiment", "domain"],
    suffixes=("_total", "_top_n"),
)

display(alpha_df)

Unnamed: 0,experiment,domain,alpha_total,alpha_CI_total,alpha_top_n,alpha_CI_top_n
0,AUDIT scale,,0.734676,"[0.605, 0.835]",0.50218,"[0.258, 0.691]"
1,BARRAT scale,BISa,0.552201,"[0.326, 0.724]",0.808261,"[0.711, 0.882]"
2,BARRAT scale,BISm,0.732315,"[0.601, 0.834]",0.865809,"[0.8, 0.917]"
3,BARRAT scale,BISn,0.534991,"[0.307, 0.711]",0.873693,"[0.812, 0.922]"
4,CARE scale,CAREa,0.856412,"[0.785, 0.911]",0.894163,"[0.841, 0.935]"
5,CARE scale,CAREs,0.651253,"[0.468, 0.787]",0.67201,"[0.5, 0.8]"
6,CARE scale,CAREw,0.961134,"[0.939, 0.977]",0.857061,"[0.775, 0.914]"
7,DAST scale,,0.768345,"[0.659, 0.855]",0.897304,"[0.849, 0.936]"
8,DOSPERT scale,Ethical,0.884977,"[0.827, 0.929]",0.801875,"[0.702, 0.878]"
9,DOSPERT scale,Gambling,0.779437,"[0.653, 0.868]",0.524531,"[0.252, 0.715]"


## Split-half Reliability

In [None]:
# ---- Reliability on the top-n score, per domain or directly per scale ----
# NOTE(review): despite the "Split-half" heading, this cell computes
# Cronbach's alpha on "score_top_n"; no item split is performed. The logic
# is the same as compute_cronbach_alpha(score="score_top_n"), so the shared
# function is called instead of repeating its loop inline.
# TODO: replace with an actual split-half estimate once the split design
# is decided.
#
# NOTE: this rebinds alpha_df, replacing the merged total/top-n table built
# in the Cronbach's alpha section.
alpha_df = compute_cronbach_alpha(score="score_top_n")
display(alpha_df)




## Visualize Distributions per model

In [None]:
# Visualise the distribution (focus on variance) of the "score" column,
# drawing one histogram with a KDE overlay per (model, task) pair.
# NOTE(review): the figure title says "logprob_predicted" while the plotted
# column is "score" — confirm which label is intended.

model_names = all_data['model'].unique()
task_names = all_data['experiment'].unique()

for model_name in model_names:
    # Filter by model once instead of re-scanning every row for each task.
    model_rows = all_data[all_data['model'] == model_name]

    for task_name in task_names:
        df_plot = model_rows[model_rows['experiment'] == task_name]

        # Not every model necessarily has rows for every task; skip those
        # pairs rather than emitting an empty figure.
        if df_plot.empty:
            continue

        fig, ax = plt.subplots(figsize=(10, 5))
        sns.histplot(data=df_plot, x="score", kde=True, bins=30, ax=ax)
        ax.set_title(f'Distribution of logprob_predicted for model: {model_name}, for task: {task_name}')
        ax.set_xlabel('Score')
        ax.set_ylabel('Count')
        plt.show()
        # Release the figure so open figures do not accumulate across the
        # model x task loop.
        plt.close(fig)