In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats
import matplotlib.pyplot as plt
import os
from scipy.stats import friedmanchisquare, wilcoxon

# Jonckheere–Terpstra might not exist on older SciPy; handle gracefully
try:
    from scipy.stats import jonckheere_terpstra
    HAS_JT = True
except Exception:
    HAS_JT = False

from scipy.stats import ConstantInputWarning
import warnings
warnings.filterwarnings("ignore", category=ConstantInputWarning)

In [2]:
# =====================================================================
#  IO + PREP
# =====================================================================

# Column names of the H2 metrics. Every analysis and plotting function in this
# notebook iterates over this list and skips a metric whose column is absent
# from the loaded sheet, so the order here also fixes the order of the output.
H2_METRICS = [
    "lexical_query_overlap",
    "semantic_similarity_to_CQ",
    "semantic_soft_coverage_to_CQ",
    "semantic_diversity_score",
]

In [3]:
def load_and_clean(path: Path):
    """Read the two H2 sheets from the workbook and fill down the KG/Model labels.

    The sheets "H2-AM" and "H2-AM&C" are read in that order (edit the names
    in ``sheet_names`` below if the workbook uses different ones). In each
    sheet, the ``KG`` and ``Model`` columns — when present — are forward-filled
    so that blank label cells inherit the last label above them.

    Returns
    -------
    (h2, h2_c) : tuple of DataFrame
        The "H2-AM" sheet and the "H2-AM&C" sheet, respectively.
    """
    sheet_names = ("H2-AM", "H2-AM&C")
    label_columns = ("KG", "Model")

    frames = tuple(pd.read_excel(path, sheet_name=name) for name in sheet_names)

    for frame in frames:
        for label in label_columns:
            if label not in frame.columns:
                continue
            frame[label] = frame[label].ffill()

    h2, h2_c = frames
    return h2, h2_c

In [4]:
def prepare_h2_with_complexity(h2_c: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``h2_c`` with an ordinal ``ComplexityLevel`` column.

    ``Complexity`` labels are mapped Simple -> 1, Moderate -> 2, Complex -> 3;
    any other label becomes NaN. The input frame is not modified.
    """
    level_by_label = {"Simple": 1, "Moderate": 2, "Complex": 3}
    return h2_c.assign(ComplexityLevel=h2_c["Complexity"].map(level_by_label))

In [5]:
# =====================================================================
#  ANALYSIS FUNCTIONS
# =====================================================================

def analyze_h2(h2_c_clean: pd.DataFrame):
    """Print the core H2 analyses and return per-(KG, Model) Spearman correlations.

    Sections printed, in order:
      1. Kruskal–Wallis test of each metric across the ``Complexity`` groups.
      2. Spearman correlation of ``ComplexityLevel`` vs each metric, computed
         separately for every (KG, Model) pair.
      3. count / mean / median of those rho values per metric.
      4. One-sided sign test per metric (are negative rhos more common?).
      5. Mann–Whitney U comparison of Big vs Small KG per metric.

    Sections 3 and 4 are skipped when no correlation could be computed;
    section 5 does not depend on the correlations and always runs.

    Parameters
    ----------
    h2_c_clean : pd.DataFrame
        Must contain ``KG``, ``Model``, ``Complexity`` and ``ComplexityLevel``;
        metrics from the module-level ``H2_METRICS`` that are missing as
        columns are skipped.

    Returns
    -------
    pd.DataFrame
        Columns ``KG``, ``Model``, ``metric``, ``rho``, ``p``; empty (but with
        those columns) when no usable correlation exists.
    """
    print("\n=== H2 metrics analysis ===\n")

    # ----- Global Kruskal–Wallis per metric across complexity -----
    print("Global Kruskal–Wallis tests across Complexity for H2 metrics:")
    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            print(f"  {metric:28s} [SKIP: column not found]")
            continue

        valid = h2_c_clean.dropna(subset=[metric])
        groups = [g[metric].values for _, g in valid.groupby("Complexity")]
        if len(groups) >= 2:
            stat, p = stats.kruskal(*groups)
            print(f"  {metric:28s} H = {stat:.3f}, p = {p:.4f}")
        else:
            print(f"  {metric:28s} [SKIP: not enough groups]")
    print()

    # ----- Per (KG, Model) Spearman correlations vs Complexity -----
    per_pair = []
    for (kg, model), grp in h2_c_clean.groupby(["KG", "Model"]):
        for metric in H2_METRICS:
            if metric not in grp.columns:
                continue
            # Drop rows lacking either variable: a NaN ComplexityLevel (unmapped
            # Complexity label) would otherwise turn rho into NaN and discard
            # the whole pair.
            g = grp.dropna(subset=[metric, "ComplexityLevel"])
            if g.shape[0] < 2 or g["ComplexityLevel"].nunique() < 2:
                continue
            r, p = stats.spearmanr(g["ComplexityLevel"], g[metric])
            if not np.isnan(r):
                per_pair.append(
                    {"KG": kg, "Model": model, "metric": metric, "rho": r, "p": p}
                )

    # Explicit columns keep the schema stable even when nothing was computed.
    per_pair_df = pd.DataFrame(per_pair, columns=["KG", "Model", "metric", "rho", "p"])

    if per_pair_df.empty:
        print("No usable per-(KG,Model) correlations for H2 metrics.\n")
    else:
        print("Per (KG, Model) Spearman correlations for H2 metrics (first few rows):")
        print(per_pair_df.head(20).to_string(index=False))
        print()

        # ----- Summary of correlations by metric -----
        summary_corr = per_pair_df.groupby("metric")["rho"].agg(["count", "mean", "median"])
        print("Summary of Spearman rho per metric (H2):")
        print(summary_corr.to_string())
        print()

        # ----- Sign tests: negative vs positive correlations -----
        print("Sign tests for H2 metrics (is negative trend more common than positive?):")
        for metric, grp in per_pair_df.groupby("metric"):
            neg = (grp["rho"] < 0).sum()
            pos = (grp["rho"] > 0).sum()
            n = neg + pos  # exact zeros are excluded from the sign test
            if n > 0:
                p_val = stats.binomtest(neg, n, 0.5, alternative="greater").pvalue
                print(f"  {metric:28s} neg={neg:2d}, pos={pos:2d}, n={n:2d}, p={p_val:.4f}")
            else:
                print(f"  {metric:28s} no non-zero correlations")
        print()

    # ----- KG-level comparison: Big vs Small -----
    # Independent of the correlations above, so it runs even when they are empty.
    print("Big vs Small KG comparison on H2 metrics (Mann–Whitney U):")
    small_big_results = []
    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue
        big_vals = h2_c_clean[h2_c_clean["KG"] == "Big"][metric].dropna()
        small_vals = h2_c_clean[h2_c_clean["KG"] == "Small"][metric].dropna()
        if len(big_vals) > 0 and len(small_vals) > 0:
            u_stat, p = stats.mannwhitneyu(big_vals, small_vals, alternative="two-sided")
            small_big_results.append(
                {
                    "metric": metric,
                    "Big_mean": big_vals.mean(),
                    "Small_mean": small_vals.mean(),
                    "U": u_stat,
                    "p": p,
                    "n_big": len(big_vals),
                    "n_small": len(small_vals),
                }
            )

    if small_big_results:
        kg_df = pd.DataFrame(small_big_results)
        print(kg_df.to_string(index=False))
        print()
    else:
        print("Not enough data for Big vs Small KG comparison.\n")

    return per_pair_df

In [6]:
# =====================================================================
#  PLOTTING FUNCTIONS
# =====================================================================

def plot_h2_boxplots(h2_c_clean: pd.DataFrame):
    """Save one boxplot per H2 metric, with one box per Complexity level.

    Boxes are drawn in ordinal order (Simple, Moderate, Complex); any other
    ``Complexity`` label found in the data is appended after those in groupby
    order. Missing metric values are dropped before plotting, matching the
    other boxplot helpers in this notebook.

    Files are written to ``OUT_DIR / "boxplot_H2_<metric>.png"``; metrics whose
    column is absent, or that have no Complexity groups at all, are skipped.
    """
    ordinal_levels = ["Simple", "Moderate", "Complex"]

    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue

        # NaNs must be removed: boxplot statistics computed on an array that
        # contains NaN are themselves NaN, which leaves the box blank.
        values_by_level = {
            name: g[metric].dropna().values
            for name, g in h2_c_clean.groupby("Complexity")
        }
        labels = [lvl for lvl in ordinal_levels if lvl in values_by_level]
        labels += [name for name in values_by_level if name not in ordinal_levels]
        if not labels:
            continue

        plt.figure()
        plt.boxplot([values_by_level[name] for name in labels])
        plt.xticks(range(1, len(labels) + 1), labels)
        plt.ylabel(metric)
        plt.title(f"{metric} by Complexity")
        out_path = OUT_DIR / f"boxplot_H2_{metric}.png"
        plt.savefig(out_path, bbox_inches="tight")
        plt.close()

def plot_h2_trendlines(h2_c_clean: pd.DataFrame):
    """Save one line chart per H2 metric: metric value against ComplexityLevel.

    Each (KG, Model) pair contributes one line, built from that pair's rows
    sorted by ``ComplexityLevel``. A pair is left out when it has fewer than
    two rows or when all of its metric values are NaN. Output goes to
    ``OUT_DIR / "trend_H2_<metric>.png"``.
    """
    tick_positions = [1, 2, 3]
    tick_names = ["Simple", "Moderate", "Complex"]

    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue

        plt.figure()
        for (kg, model), pair_rows in h2_c_clean.groupby(["KG", "Model"]):
            ordered = pair_rows.sort_values("ComplexityLevel")
            levels = ordered["ComplexityLevel"].values
            scores = ordered[metric].values
            if len(levels) < 2 or np.all(np.isnan(scores)):
                continue
            plt.plot(levels, scores, marker="o", label=f"{kg}-{model}")

        plt.xticks(tick_positions, tick_names)
        plt.xlabel("Complexity")
        plt.ylabel(metric)
        plt.title(f"{metric} vs Complexity (per KG–Model)")
        # Legend sits outside the axes, to the right.
        plt.legend(fontsize="x-small", bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.savefig(OUT_DIR / f"trend_H2_{metric}.png", bbox_inches="tight")
        plt.close()

def plot_h2_rho_heatmap(per_pair_df: pd.DataFrame):
    """Save a heatmap of Spearman rho with (KG, Model) rows and metric columns.

    Parameters
    ----------
    per_pair_df : pd.DataFrame
        Rows with columns ``KG``, ``Model``, ``metric``, ``rho`` (as returned
        by ``analyze_h2``). Nothing is drawn when the frame is empty.

    Notes
    -----
    * Cells with no correlation are masked and left blank, not drawn as 0.0,
      so "not computed" cannot be mistaken for "no correlation".
    * The colour scale is fixed to [-1, 1] with a diverging colormap, so that
      0 is the neutral colour and figures are comparable between runs.

    The image is written to ``OUT_DIR / "heatmap_H2_rho.png"``.
    """
    if per_pair_df.empty:
        return

    pivot_data = []
    row_labels = []
    for (kg, model), grp in per_pair_df.groupby(["KG", "Model"]):
        row_labels.append(f"{kg}-{model}")
        row = []
        for m in H2_METRICS:
            sub = grp[grp["metric"] == m]
            # First matching rho for this metric, NaN when the pair has none.
            row.append(sub["rho"].iloc[0] if not sub.empty else np.nan)
        pivot_data.append(row)

    data = np.array(pivot_data, dtype=float)

    plt.figure()
    # Masked entries are rendered with the colormap's "bad" colour (blank).
    data_display = np.ma.masked_invalid(data)

    im = plt.imshow(data_display, aspect="auto", cmap="coolwarm", vmin=-1.0, vmax=1.0)
    plt.colorbar(im, label="Spearman rho")
    plt.yticks(range(len(row_labels)), row_labels, fontsize="x-small")
    plt.xticks(range(len(H2_METRICS)), H2_METRICS, rotation=45, ha="right")
    plt.title("Spearman rho (Complexity vs H2 metrics)\nper KG–Model")
    out_path = OUT_DIR / "heatmap_H2_rho.png"
    plt.savefig(out_path, bbox_inches="tight")
    plt.close()

In [7]:
def plot_h2_boxplots_by_kg(h2_c_clean: pd.DataFrame):
    """Save one boxplot per H2 metric comparing KGs, pooled over complexities.

    Boxes follow groupby order unless the KG labels are exactly a subset of
    {"Small", "Big"}, in which case they are drawn Small first, then Big.
    Missing metric values are dropped.
    Saves: boxplot_H2_<metric>_by_KG.png (inside OUT_DIR)
    """
    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue

        plt.figure()

        values_by_kg = {
            kg_name: kg_rows[metric].dropna().values
            for kg_name, kg_rows in h2_c_clean.groupby("KG")
        }
        labels = list(values_by_kg)

        # Prefer the fixed Small -> Big order when those are the only KGs present.
        preferred = [kg for kg in ("Small", "Big") if kg in labels]
        if preferred and set(preferred) == set(labels):
            labels = preferred

        plt.boxplot([values_by_kg[kg] for kg in labels])
        plt.xticks(range(1, len(labels) + 1), labels)
        plt.ylabel(metric)
        plt.title(f"{metric} by KG (pooled)")
        plt.savefig(OUT_DIR / f"boxplot_H2_{metric}_by_KG.png", bbox_inches="tight")
        plt.close()


def plot_h2_boxplots_by_kg_within_complexity(h2_c_clean: pd.DataFrame):
    """Save, per metric and per Complexity level, a Small-vs-Big KG boxplot.

    Only the KG labels "Small" and "Big" are plotted (in that order); a
    complexity level containing neither produces no file. Missing metric
    values are dropped.
    Saves: boxplot_H2_<metric>_by_KG_within_<Complexity>.png (inside OUT_DIR)
    """
    kg_order = ("Small", "Big")

    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue

        for comp, comp_rows in h2_c_clean.groupby("Complexity"):
            kgs_present = comp_rows["KG"].unique()
            labels = [kg for kg in kg_order if kg in kgs_present]
            if not labels:
                continue

            samples = [
                comp_rows[comp_rows["KG"] == kg][metric].dropna().values
                for kg in labels
            ]

            plt.figure()
            plt.boxplot(samples)
            plt.xticks(range(1, len(labels) + 1), labels)
            plt.ylabel(metric)
            plt.title(f"{metric} by KG within {comp}")
            plt.savefig(
                OUT_DIR / f"boxplot_H2_{metric}_by_KG_within_{comp}.png",
                bbox_inches="tight",
            )
            plt.close()


In [8]:
# =====================================================================
#  EXTRA STATS (mirrors your H1 extras)
# =====================================================================

def _cliffs_delta(x: np.ndarray, y: np.ndarray) -> float:
    """Cliff's delta effect size for two independent samples."""
    x = np.asarray(x)
    y = np.asarray(y)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    if len(x) == 0 or len(y) == 0:
        return np.nan

    gt = 0
    lt = 0
    for xi in x:
        gt += np.sum(xi > y)
        lt += np.sum(xi < y)
    return (gt - lt) / (len(x) * len(y))

def h2_kg_effect_stats(h2_c_clean: pd.DataFrame):
    """Tabulate the Big-vs-Small KG effect on every H2 metric.

    Two families of rows are produced, in this order:
      1) scope "overall": Big vs Small pooled across complexities;
      2) scope "within_complexity": Big vs Small inside Simple, Moderate and
         Complex, one level at a time.
    Each row holds the group means and sizes, the two-sided Mann–Whitney U
    statistic with its p-value, and Cliff's delta (Big relative to Small).
    A comparison is skipped when either KG has no non-missing values.

    The table is printed and written to
    ``OUT_DIR / "kg_effect_H2_mannwhitney_cliffsdelta.csv"``.

    Returns
    -------
    pd.DataFrame or None
        The table, or None when no comparison could be made.
    """
    print("\n=== EXTRA: H2 KG effect summary (Big vs Small) ===\n")

    def _compare(frame, scope, complexity_label, metric):
        """Build one result row for ``metric`` in ``frame``, or None if a KG is empty."""
        big = frame[frame["KG"] == "Big"][metric].dropna().values
        small = frame[frame["KG"] == "Small"][metric].dropna().values
        if len(big) == 0 or len(small) == 0:
            return None
        u_stat, p = stats.mannwhitneyu(big, small, alternative="two-sided")
        return {
            "scope": scope,
            "Complexity": complexity_label,
            "metric": metric,
            "Big_mean": np.mean(big),
            "Small_mean": np.mean(small),
            "n_big": len(big),
            "n_small": len(small),
            "U": u_stat,
            "p": p,
            "cliffs_delta(Big-Small)": _cliffs_delta(big, small),
        }

    available = [m for m in H2_METRICS if m in h2_c_clean.columns]

    # (A) pooled over all complexities
    candidates = [_compare(h2_c_clean, "overall", "ALL", m) for m in available]

    # (B) one complexity level at a time
    for level in ("Simple", "Moderate", "Complex"):
        level_rows = h2_c_clean[h2_c_clean["Complexity"] == level]
        candidates.extend(
            _compare(level_rows, "within_complexity", level, m) for m in available
        )

    rows = [row for row in candidates if row is not None]
    if not rows:
        print("Not enough data to compute KG effect stats.\n")
        return None

    kg_effect_df = pd.DataFrame(rows)
    print(kg_effect_df.to_string(index=False))
    print()

    out_path = OUT_DIR / "kg_effect_H2_mannwhitney_cliffsdelta.csv"
    kg_effect_df.to_csv(out_path, index=False)
    print(f"Saved KG effect stats to: {out_path}\n")

    return kg_effect_df

def summarize_model_sensitivity_h2(per_pair_df: pd.DataFrame):
    """Pivot per-pair rho values into a (KG, Model) x metric sensitivity table.

    Columns follow the order of H2_METRICS (restricted to metrics present in
    ``per_pair_df``), plus ``mean_rho_across_H2`` holding the row mean. The
    table is printed sorted by that mean (ascending) and saved, unsorted, to
    ``OUT_DIR / "model_complexity_sensitivity_H2.csv"``.

    Returns the unsorted table, or None when ``per_pair_df`` is empty.
    """
    if per_pair_df.empty:
        print("No per-pair H2 correlations to summarize.\n")
        return None

    metrics_present = per_pair_df["metric"].unique()
    column_order = [m for m in H2_METRICS if m in metrics_present]

    pivot = per_pair_df.pivot_table(index=["KG", "Model"], columns="metric", values="rho")
    pivot = pivot.reindex(columns=column_order)
    pivot["mean_rho_across_H2"] = pivot.mean(axis=1)

    ranked = pivot.sort_values("mean_rho_across_H2")
    print("\n=== Model complexity sensitivity (mean Spearman rho across H2 metrics) ===")
    print("(More negative = stronger decrease as complexity increases)\n")
    print(ranked.to_string())
    print()

    out_path = OUT_DIR / "model_complexity_sensitivity_H2.csv"
    pivot.to_csv(out_path)
    print(f"Saved model sensitivity table to: {out_path}\n")

    return pivot

def effect_size_simple_vs_complex_h2(h2_c_clean: pd.DataFrame):
    """Print Cohen's d (Simple minus Complex, pooled SD) for each H2 metric.

    A metric is reported as skipped when its column is absent or when either
    group has fewer than two non-missing values. d is NaN when the pooled
    standard deviation is not positive.
    """
    print("\n=== Simple vs Complex effect sizes (H2 metrics) ===\n")

    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            print(f"{metric:28s}: column not found.")
            continue

        is_simple = h2_c_clean["Complexity"] == "Simple"
        is_complex = h2_c_clean["Complexity"] == "Complex"
        simple = h2_c_clean[is_simple][metric].dropna()
        complex_ = h2_c_clean[is_complex][metric].dropna()

        n1, n2 = len(simple), len(complex_)
        if n1 < 2 or n2 < 2:
            print(f"{metric:28s}: not enough data for effect size.")
            continue

        mean_simple, mean_complex = simple.mean(), complex_.mean()

        # Pooled SD from the two sample SDs (ddof=1), weighted by degrees of freedom.
        ss_simple = (n1 - 1) * simple.std(ddof=1) ** 2
        ss_complex = (n2 - 1) * complex_.std(ddof=1) ** 2
        pooled_sd = np.sqrt((ss_simple + ss_complex) / (n1 + n2 - 2))

        if pooled_sd > 0:
            d = (mean_simple - mean_complex) / pooled_sd
        else:
            d = np.nan

        print(
            f"{metric:28s}: mean_Simple={mean_simple:.3f}, mean_Complex={mean_complex:.3f}, "
            f"Cohen_d={d:.3f} (n1={n1}, n2={n2})"
        )
    print()

def rank_models_by_complexity_h2(h2_c_clean: pd.DataFrame):
    """Rank models by mean metric value inside each Complexity level.

    For every H2 metric, the per-Model mean (rows from all KGs together) is
    computed within each Complexity group. Rankings for Simple, Moderate and
    Complex are printed in descending order of the mean. One CSV per metric
    and one combined CSV are written into OUT_DIR.

    Returns the combined long-format table (columns Model, mean_value,
    Complexity, metric), or None when no metric produced any rows.
    """
    print("\n=== Model rankings by H2 metrics for each Complexity ===\n")

    per_metric_tables = []
    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue

        pieces = [
            comp_rows.groupby("Model")[metric]
            .mean()
            .rename("mean_value")
            .reset_index()
            .assign(Complexity=comp, metric=metric)
            for comp, comp_rows in h2_c_clean.groupby("Complexity")
        ]
        if not pieces:
            continue

        ranking_df = pd.concat(pieces, ignore_index=True)
        per_metric_tables.append(ranking_df)

        for level in ("Simple", "Moderate", "Complex"):
            level_ranking = ranking_df[ranking_df["Complexity"] == level]
            level_ranking = level_ranking.sort_values("mean_value", ascending=False)
            print(f"--- Metric: {metric} | {level} ---")
            print(level_ranking.to_string(index=False))
            print()

        out_path = OUT_DIR / f"model_rankings_by_complexity_{metric}.csv"
        ranking_df.to_csv(out_path, index=False)
        print(f"Saved rankings for {metric} to: {out_path}\n")

    if not per_metric_tables:
        print("No data to rank models.\n")
        return None

    combined = pd.concat(per_metric_tables, ignore_index=True)
    out_path = OUT_DIR / "model_rankings_by_complexity_H2_ALL.csv"
    combined.to_csv(out_path, index=False)
    print(f"Saved combined rankings to: {out_path}\n")
    return combined

# =====================================================================
#  EXTRA TESTS (mirrors your H1 tests)
# =====================================================================

def friedman_test_h2(h2_c_clean: pd.DataFrame):
    """Print a Friedman test across complexity levels for each H2 metric.

    Blocks are the (KG, Model) pairs; the three repeated measures are the
    pair's mean metric value at Simple, Moderate and Complex (``pivot_table``
    averages duplicate rows). Only pairs with all three values are used, and
    at least three such pairs are required.

    A metric is reported as lacking data — instead of raising ``KeyError`` —
    when one of the three complexity columns is absent from the pivot (for
    example when the metric is entirely missing at one level).
    """
    print("\n=== EXTRA: Friedman Test (Repeated-Measures across models) [H2] ===\n")

    levels = ["Simple", "Moderate", "Complex"]

    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue

        pivot = h2_c_clean.pivot_table(
            index=["KG", "Model"],
            columns="Complexity",
            values=metric
        )

        # Guard before dropna(subset=...): it raises KeyError on absent columns.
        if any(level not in pivot.columns for level in levels):
            print(f"{metric:28s}: not enough (KG,Model) with all three complexities.")
            continue

        pivot = pivot.dropna(subset=levels)
        if pivot.shape[0] < 3:
            print(f"{metric:28s}: not enough (KG,Model) with all three complexities.")
            continue

        S = pivot["Simple"].values
        M = pivot["Moderate"].values
        C = pivot["Complex"].values

        stat, p = friedmanchisquare(S, M, C)
        print(f"{metric:28s}: Friedman χ² = {stat:.3f}, p = {p:.4f}")
    print()

def jt_test_h2(h2_c_clean: pd.DataFrame):
    """
    Jonckheere–Terpstra trend test over the ordered groups
    Simple -> Moderate -> Complex, one test per H2 metric.

    Does nothing beyond printing a notice when the module-level flag HAS_JT
    is False, i.e. when ``jonckheere_terpstra`` could not be imported.
    NOTE(review): confirm that the installed SciPy really exposes
    ``scipy.stats.jonckheere_terpstra`` and accepts this call signature.
    """
    print("\n=== EXTRA: Jonckheere–Terpstra Trend Test [H2] ===\n")

    if not HAS_JT:
        print("SciPy version does not have jonckheere_terpstra; skipping this test.\n")
        return

    ordered_levels = ("Simple", "Moderate", "Complex")

    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue

        groups = [
            h2_c_clean[h2_c_clean["Complexity"] == level][metric].dropna()
            for level in ordered_levels
        ]
        if not all(len(sample) > 0 for sample in groups):
            print(f"{metric:28s}: insufficient data for JT test.")
            continue

        # The alternative encodes the H2 hypothesis direction: keep "decreasing"
        # for a downward trend, switch to alternative="increasing" otherwise.
        jt_stat, p = jonckheere_terpstra(groups, alternative="decreasing")
        print(f"{metric:28s}: JT = {jt_stat:.3f}, p = {p:.4f}")
    print()

def kendalls_w_h2(h2_c_clean: pd.DataFrame):
    """Print Kendall's W (coefficient of concordance) for each H2 metric.

    Each (KG, Model) pair acts as a rater that ranks the three complexity
    levels by its mean metric value. With n raters, k ranked items and rank
    sums R_j:

        S = sum_j (R_j - n (k + 1) / 2) ** 2
        W = 12 S / (n**2 * (k**3 - k))

    so W lies in [0, 1] (1 = every pair orders the levels identically).
    No tie correction is applied; tied values receive average ranks.

    Only the Simple / Moderate / Complex columns are ranked, and a metric is
    reported as insufficient — instead of raising ``KeyError`` — when one of
    those columns is absent from the pivot or fewer than three complete pairs
    remain.
    """
    print("\n=== EXTRA: Kendall's W (agreement across complexity levels) [H2] ===\n")

    levels = ["Simple", "Moderate", "Complex"]

    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue

        pivot = h2_c_clean.pivot_table(
            index=["KG", "Model"],
            columns="Complexity",
            values=metric
        )

        # Guard before dropna(subset=...): it raises KeyError on absent columns.
        if any(level not in pivot.columns for level in levels):
            print(f"{metric:28s}: insufficient models for Kendall's W.")
            continue

        # Restrict to the three levels so stray Complexity labels cannot add
        # NaN-filled columns to the ranking.
        pivot = pivot[levels].dropna(subset=levels)
        if pivot.shape[0] < 3:
            print(f"{metric:28s}: insufficient models for Kendall's W.")
            continue

        ranks = pivot.rank(axis=1, ascending=True).values
        n, k = ranks.shape  # n raters (pairs), k ranked items (levels)

        R_j = np.sum(ranks, axis=0)
        R_bar = n * (k + 1) / 2.0

        S = np.sum((R_j - R_bar) ** 2)
        # Denominator is n^2 (k^3 - k); using (k^2 - 1) would inflate W by a
        # factor of k (giving 3.0 for perfect agreement with k = 3).
        W = 12 * S / (n**2 * (k**3 - k))

        print(f"{metric:28s}: Kendall's W = {W:.3f}")
    print()

def pairwise_wilcoxon_h2(h2_c_clean: pd.DataFrame):
    """Print paired Wilcoxon signed-rank tests between complexity levels.

    For each H2 metric, the (KG, Model) pairs that have a value at all three
    levels form the paired sample, and the three contrasts Simple–Moderate,
    Moderate–Complex and Simple–Complex are tested. P-values are not
    corrected for multiple comparisons.

    A metric is reported as lacking data — instead of raising ``KeyError`` —
    when a complexity column is absent from the pivot. A contrast for which
    ``wilcoxon`` raises ``ValueError`` is printed with ``nan`` statistics so
    the remaining metrics are still processed.
    """
    print("\n=== EXTRA: Pairwise Wilcoxon tests (Simple/Moderate/Complex) [H2] ===\n")

    levels = ["Simple", "Moderate", "Complex"]

    def _paired_wilcoxon(a, b):
        """Return (statistic, p) of wilcoxon(a, b), or (nan, nan) if it cannot be computed."""
        try:
            stat, p = wilcoxon(a, b)
        except ValueError:
            # Some SciPy releases raise when every paired difference is zero.
            return np.nan, np.nan
        return stat, p

    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue

        pivot = h2_c_clean.pivot_table(
            index=["KG", "Model"],
            columns="Complexity",
            values=metric
        )

        # Guard before dropna(subset=...): it raises KeyError on absent columns.
        if any(level not in pivot.columns for level in levels):
            print(f"{metric:28s}: not enough paired (KG,Model) rows.")
            continue

        pivot = pivot.dropna(subset=levels)
        if pivot.shape[0] < 3:
            print(f"{metric:28s}: not enough paired (KG,Model) rows.")
            continue

        S = pivot["Simple"]
        M = pivot["Moderate"]
        C = pivot["Complex"]

        stat_SM, p_SM = _paired_wilcoxon(S, M)
        stat_MC, p_MC = _paired_wilcoxon(M, C)
        stat_SC, p_SC = _paired_wilcoxon(S, C)

        print(f"{metric}:")
        print(f"  Simple vs Moderate : W = {stat_SM:.3f}, p = {p_SM:.4f}")
        print(f"  Moderate vs Complex: W = {stat_MC:.3f}, p = {p_MC:.4f}")
        print(f"  Simple vs Complex  : W = {stat_SC:.3f}, p = {p_SC:.4f}\n")

def kg_complexity_interaction_tests_h2(h2_c_clean: pd.DataFrame):
    """Print KG × Complexity analyses for each H2 metric, paired at Model level.

    The data are pivoted to one row per Model and one column per
    (KG, Complexity) cell (duplicates averaged by ``pivot_table``). A metric
    is skipped when any of the six Small/Big × Simple/Moderate/Complex columns
    is absent. For the remaining metrics three analyses are printed:

      1) Big vs Small at each Complexity (paired Wilcoxon over models that
         have both values; at least 2 models required);
      2) Complexity effect within each KG (Friedman over models that have all
         three levels; at least 2 models required);
      3) Drop comparisons: for Simple→Moderate, Moderate→Complex and
         Simple→Complex, the per-model drop in Small vs the drop in Big
         (paired Wilcoxon over models that have all six cells).

    A Wilcoxon contrast for which SciPy raises ``ValueError`` is printed with
    ``nan`` statistics rather than aborting the remaining analyses.
    """
    print("\n=== EXTRA: KG × Complexity analyses for H2 metrics ===\n")

    complexities = ["Simple", "Moderate", "Complex"]
    kgs = ["Small", "Big"]
    six_cells = [(kg, comp) for kg in kgs for comp in complexities]
    # (label, level the drop starts from, level the drop ends at)
    drop_contrasts = [
        ("Simple→Moderate", "Simple", "Moderate"),
        ("Moderate→Complex", "Moderate", "Complex"),
        ("Simple→Complex", "Simple", "Complex"),
    ]

    def _paired_wilcoxon(a, b):
        """Return (statistic, p) of wilcoxon(a, b), or (nan, nan) if it cannot be computed."""
        try:
            stat, p = wilcoxon(a, b)
        except ValueError:
            # Some SciPy releases raise when every paired difference is zero,
            # which is plausible with the very small model-level samples here.
            return np.nan, np.nan
        return stat, p

    for metric in H2_METRICS:
        if metric not in h2_c_clean.columns:
            continue

        print(f"\n--- Metric: {metric} ---")

        pivot = h2_c_clean.pivot_table(
            index="Model",
            columns=["KG", "Complexity"],
            values=metric
        )

        missing_cols = [cell for cell in six_cells if cell not in pivot.columns]
        if missing_cols:
            print(f"  Skipping {metric}: missing cells {missing_cols}")
            continue

        # (1) Big vs Small at each complexity
        print("  Big vs Small KG at each Complexity (Wilcoxon, Model-level):")
        for comp in complexities:
            small_vals = pivot[("Small", comp)].dropna()
            big_vals = pivot[("Big", comp)].dropna()

            # Pair only the models measured on both KGs at this level.
            common_index = small_vals.index.intersection(big_vals.index)
            if len(common_index) < 2:
                print(f"    {comp:9s}: not enough paired models.")
                continue

            s = small_vals.loc[common_index]
            b = big_vals.loc[common_index]

            stat, p = _paired_wilcoxon(s, b)
            print(
                f"    {comp:9s}: Small_mean={s.mean():.3f}, Big_mean={b.mean():.3f}, "
                f"W={stat:.3f}, p={p:.4f}"
            )

        # (2) Complexity effect within each KG
        print("  Complexity effect within each KG (Friedman, Model-level):")
        for kg in kgs:
            sub = pivot[kg].copy().dropna(subset=complexities)
            if sub.shape[0] < 2:
                print(f"    {kg:5s}: not enough complete models for Friedman.")
                continue
            S = sub["Simple"].values
            M = sub["Moderate"].values
            C = sub["Complex"].values
            stat, p = friedmanchisquare(S, M, C)
            print(f"    {kg:5s}: Friedman χ²={stat:.3f}, p={p:.4f}")

        # (3) Drop comparisons
        print("  KG × Complexity (drops comparison, Small vs Big; Wilcoxon):")

        # Keep only the models that have a value in all six cells.
        complete = pivot[six_cells].dropna()
        if complete.shape[0] < 2:
            print("    Not enough models with all six cells (Small/Big × Simple/Moderate/Complex).")
            continue

        for label, start, end in drop_contrasts:
            drop_small = complete[("Small", start)] - complete[("Small", end)]
            drop_big = complete[("Big", start)] - complete[("Big", end)]
            stat, p = _paired_wilcoxon(drop_small, drop_big)
            print(
                f"    {label} drop: mean_drop_Small={drop_small.mean():.3f}, "
                f"mean_drop_Big={drop_big.mean():.3f}, W={stat:.3f}, p={p:.4f}"
            )

    print()

In [None]:
# ========= CONFIG =========
# Driver cell: sets the configuration, loads the workbook, then runs every
# analysis, plot and extra test defined in the cells above, in order.
# EXCEL_PATH = Path("all_models_consolidated_results.xlsx")  # change if needed
EXCEL_PATH = Path("balanced_models_consolidated_results.xlsx")  # change if needed

# Total # of CQs and per-complexity split (given by you)
# NOTE(review): TOTAL_CQS and CQS_PER_COMPLEXITY are not referenced by any code
# visible in this notebook section — confirm whether they are still needed.
TOTAL_CQS = 33
CQS_PER_COMPLEXITY = {"Simple": 12, "Moderate": 11, "Complex": 10}
# OUT_DIR is read as a module-level global by the plotting and CSV-writing
# functions above, so it must be defined before any of them is called.
OUT_DIR = Path("analysis_plots2")  # folder to store PNGs
OUT_DIR.mkdir(exist_ok=True)
# ==========================

print(f"Loading data from: {EXCEL_PATH}")

# Only the "H2-AM&C" sheet (h2_c) is used below; h2 is loaded but not analysed here.
h2, h2_c = load_and_clean(EXCEL_PATH)
h2_c_clean = prepare_h2_with_complexity(h2_c)

# Analyses
per_pair_df = analyze_h2(h2_c_clean)

# Extra analyses
model_sensitivity = summarize_model_sensitivity_h2(per_pair_df)
effect_size_simple_vs_complex_h2(h2_c_clean)
rankings = rank_models_by_complexity_h2(h2_c_clean)

# Plots
print(f"Saving plots to: {OUT_DIR.resolve()}")
plot_h2_boxplots(h2_c_clean)
plot_h2_trendlines(h2_c_clean)
plot_h2_rho_heatmap(per_pair_df)
plot_h2_boxplots_by_kg(h2_c_clean)
plot_h2_boxplots_by_kg_within_complexity(h2_c_clean)
# Not a plot: prints the Big-vs-Small KG effect table and writes it to CSV.
kg_effect_table = h2_kg_effect_stats(h2_c_clean)
print("Done generating H2 plots.")

# Extra tests to strengthen evidence (mirrors your H1 extras)
friedman_test_h2(h2_c_clean)
jt_test_h2(h2_c_clean)
kendalls_w_h2(h2_c_clean)
pairwise_wilcoxon_h2(h2_c_clean)
kg_complexity_interaction_tests_h2(h2_c_clean)

Loading data from: consolidated_results_filter_only4.xlsx

=== H2 metrics analysis ===

Global Kruskal–Wallis tests across Complexity for H2 metrics:
  lexical_query_overlap        H = 5.016, p = 0.0814
  semantic_similarity_to_CQ    H = 1.903, p = 0.3862
  semantic_soft_coverage_to_CQ H = 0.161, p = 0.9225
  semantic_diversity_score     H = 0.860, p = 0.6505

Per (KG, Model) Spearman correlations for H2 metrics (first few rows):
   KG       Model                       metric       rho        p
  Big      Claude        lexical_query_overlap -0.500000 0.666667
  Big      Claude    semantic_similarity_to_CQ  0.500000 0.666667
  Big      Claude semantic_soft_coverage_to_CQ  0.500000 0.666667
  Big      Claude     semantic_diversity_score  0.500000 0.666667
  Big Deepseek R1        lexical_query_overlap -0.866025 0.333333
  Big Deepseek R1    semantic_similarity_to_CQ  0.500000 0.666667
  Big Deepseek R1 semantic_soft_coverage_to_CQ -0.500000 0.666667
  Big Deepseek R1     semantic_diversi

