In [9]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, norm
from itertools import combinations
from collections import Counter
from math import log2

In [13]:
#cleaning
def load_and_normalize(path):
    df = pd.read_csv(path, encoding="latin1")

    # Clean column names
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    for col in [c for c in df.columns if c.startswith("rating_")]:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Extract IDs safely
    df["item_id"] = df["sentence_id"].astype(str).str.split(".").str[0]
    df["frame_id"] = df["sentence_id"].astype(str).str.split(".").str[1]

    # Reshape into long format
    records = []
    for _, row in df.iterrows():
        for model in ["llama", "qwen", "gpt_oss", "mistral"]:
            records.append({
                "item_id": row["item_id"],
                "frame_id": row["frame_id"],
                "model_id": model,
                "rating": row[f"rating_{model}"],
                "response": row.get(f"response_{model}", None)
            })
    return pd.DataFrame(records)

def polarity(r):
    if r <= 2: return "D"
    if r == 3: return "N"
    return "A"

def entropy_from_counts(counts):
    total = sum(counts.values())
    return -sum((c/total) * log2(c/total) for c in counts.values() if c > 0)


#metrc
def compute_drift(df):
    return df.groupby(["model_id", "item_id"])["rating"].agg(
        lambda g: g.max() - g.min()
    ).reset_index(name="drift")

def compute_flip(df):
    def flip_fn(g):
        pols = set(g.apply(polarity))
        return int("D" in pols and "A" in pols)
    return df.groupby(["model_id", "item_id"])["rating"].apply(flip_fn).reset_index(name="flip")

def compute_entropy(df):
    def ent_fn(g):
        cats = g.apply(polarity)
        return entropy_from_counts(Counter(cats))
    return df.groupby(["model_id", "item_id"])["rating"].apply(ent_fn).reset_index(name="entropy")

# stats
def pivot_metric(metric_df, metric_name):
    return metric_df.pivot(index="item_id", columns="model_id", values=metric_name)

def wilcoxon_pairwise(metric_matrix, metric_name):
    results = []
    models = metric_matrix.columns
    N = len(metric_matrix)

    for m1, m2 in combinations(models, 2):
        stat, p = wilcoxon(metric_matrix[m1], metric_matrix[m2])
        z = norm.ppf(p/2) * (-1 if stat > (N*(N+1)/4) else 1)
        r = abs(z) / np.sqrt(N)
        results.append({
            "Metric": metric_name, "Model 1": m1, "Model 2": m2,
            "W": stat, "p": p, "effect_size_r": r
        })

    df = pd.DataFrame(results)
    df["p_adj"] = df["p"] * len(list(combinations(models, 2)))  # bonferroni
    return df

In [16]:
#main
def run_full_analysis(df):
    drift_df = compute_drift(df)
    flip_df = compute_flip(df)
    entropy_df = compute_entropy(df)

    drift_matrix = pivot_metric(drift_df, "drift")
    flip_matrix = pivot_metric(flip_df, "flip")
    entropy_matrix = pivot_metric(entropy_df, "entropy")

    drift_stats = wilcoxon_pairwise(drift_matrix, "Drift")
    flip_stats = wilcoxon_pairwise(flip_matrix, "Polarity Flip")
    entropy_stats = wilcoxon_pairwise(entropy_matrix, "Entropy")

    return pd.concat([drift_stats, flip_stats, entropy_stats])
if __name__ == "__main__":
    df_long = load_and_normalize("./output.csv")
    stats = run_full_analysis(df_long)
    print(stats)


          Metric  Model 1  Model 2      W         p  effect_size_r     p_adj
0          Drift  gpt_oss    llama  252.0  0.119987       0.219886  0.719920
1          Drift  gpt_oss  mistral  169.5  0.283251       0.151751  1.699507
2          Drift  gpt_oss     qwen  196.0  0.866312       0.023808  5.197871
3          Drift    llama  mistral   77.0  0.002444       0.428537  0.014663
4          Drift    llama     qwen  120.5  0.084784       0.243751  0.508705
5          Drift  mistral     qwen  170.0  0.277142       0.153691  1.662853
0  Polarity Flip  gpt_oss    llama   14.0  0.012555       0.353009  0.075330
1  Polarity Flip  gpt_oss  mistral   12.0  0.705457       0.053452  4.232742
2  Polarity Flip  gpt_oss     qwen    8.0  0.256839       0.160357  1.541036
3  Polarity Flip    llama  mistral    5.5  0.011412       0.357771  0.068472
4  Polarity Flip    llama     qwen   11.0  0.057780       0.268328  0.346677
5  Polarity Flip  mistral     qwen    7.0  0.414216       0.115470  2.485297