In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from statsmodels.formula.api import mixedlm
from scipy.stats import wilcoxon, ttest_rel
from itertools import combinations

from scipy.stats import ConstantInputWarning
import warnings
warnings.filterwarnings("ignore", category=ConstantInputWarning)

In [2]:
def detect_factors_and_metrics(df):
    """
    Clean a sheet:
      - drop all-NaN rows
      - strip column names
      - forward-fill factor columns (KG, Model, Complexity)
      - cast factors to categorical
      - detect numeric metric columns automatically
    """
    df = df.dropna(how="all").copy()
    df.columns = [str(c).strip() for c in df.columns]

    factor_candidates = ["KG", "Model", "Complexity"]
    factors = [c for c in factor_candidates if c in df.columns]

    # Forward-fill factor columns (your sheets have KG/Model only on first row of a block)
    for f in factors:
        df[f] = df[f].ffill()
        df[f] = df[f].astype("category")

    # Metrics = numeric columns that are not factors and not totally NaN
    metric_cols = []
    for c in df.columns:
        if c in factors:
            continue
        if pd.api.types.is_numeric_dtype(df[c]) and df[c].notna().any():
            metric_cols.append(c)

    return df, factors, metric_cols

In [3]:
def run_anova_and_posthoc(df, factors, metric, sheet_name):
    """Run ANOVA, Tukey and Wilcoxon tests for one metric on one sheet."""
    data = df.dropna(subset=[metric]).copy()

    print("\n" + "=" * 120)
    print(f"SHEET: {sheet_name} | METRIC: {metric}")
    print("=" * 120)

    # If metric is constant, no point in testing anything
    if data[metric].nunique() < 2:
        print("Metric is constant across rows; skipping statistical tests.")
        return

    has_kg = "KG" in factors
    has_complexity = "Complexity" in factors
    has_model = "Model" in factors

    # -------------------- ANOVA / mixed model --------------------
    anova_done = False

    if has_kg or has_complexity:
        try:
            # Build formula depending on which factors exist
            if has_kg and has_complexity:
                # 2-way with interaction, plus Model as a blocking factor if present
                formula = f"{metric} ~ C(KG) * C(Complexity)"
                if has_model:
                    formula += " + C(Model)"
            elif has_kg:
                formula = f"{metric} ~ C(KG)"
                if has_model:
                    formula += " + C(Model)"
            else:  # only complexity
                formula = f"{metric} ~ C(Complexity)"
                if has_model:
                    formula += " + C(Model)"

            print(f"Attempting ANOVA with formula: {formula}")
            model = smf.ols(formula, data=data).fit()
            anova_table = anova_lm(model, typ=2)

            # Add eta-squared effect size
            ss_total = anova_table["sum_sq"].sum()
            anova_table["eta_sq"] = anova_table["sum_sq"] / ss_total

            print("\nANOVA table:")
            print(anova_table)
            anova_done = True

        except Exception as e:
            print("\n⚠️ ANOVA failed, attempting mixed-effects model instead.")
            print(f"Reason: {e}")

            # Mixed model: KG as fixed effect, Model as random effect (if both present)
            if has_kg and has_model:
                try:
                    print("\nMixed-effects model: metric ~ KG with random intercept per Model")
                    mx = mixedlm(f"{metric} ~ C(KG)", data=data, groups=data["Model"])
                    res = mx.fit()
                    print(res.summary())
                except Exception as e2:
                    print("Mixed-effects model also failed:", e2)
            else:
                print("No appropriate structure for mixed-effects model; skipping.")

    else:
        print("No KG or Complexity columns found; skipping ANOVA/mixed.")

    # -------------------- Tukey HSD: KG --------------------
    if has_kg and data["KG"].nunique() > 1:
        try:
            print("\nPost-hoc: KG (Tukey HSD)")
            tuk_kg = pairwise_tukeyhsd(endog=data[metric],
                                       groups=data["KG"],
                                       alpha=0.05)
            print(tuk_kg)
        except Exception as e:
            print("Tukey for KG failed:", e)

    # -------------------- Tukey HSD: Complexity --------------------
    if has_complexity and data["Complexity"].nunique() > 1:
        try:
            print("\nPost-hoc: Complexity (Tukey HSD)")
            tuk_c = pairwise_tukeyhsd(endog=data[metric],
                                      groups=data["Complexity"],
                                      alpha=0.05)
            print(tuk_c)
        except Exception as e:
            print("Tukey for Complexity failed:", e)

        # KG × Complexity combos
        if has_kg:
            try:
                print("\nPost-hoc: KG × Complexity combinations (Tukey HSD)")
                combos = data["KG"].astype(str) + "_" + data["Complexity"].astype(str)
                tuk_kc = pairwise_tukeyhsd(endog=data[metric],
                                           groups=combos,
                                           alpha=0.05)
                print(tuk_kc)
            except Exception as e:
                print("Tukey for KG×Complexity combos failed:", e)

    # -------------------- Wilcoxon: KG (paired across models) --------------------
    # Here we treat each model as a subject, compare Big vs Small (or two KG levels)
    if has_kg and has_model:
        wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)

        if wide_kg.shape[1] >= 2:
            # If more than 2 levels (not in your current file, but just in case),
            # compare them pairwise.
            levels = list(wide_kg.columns)
            print("\nWilcoxon signed-rank tests for KG (pairwise across models):")
            for a, b in combinations(levels, 2):
                sub = wide_kg[[a, b]].dropna()
                if len(sub) > 1:
                    try:
                        w_stat, w_p = wilcoxon(sub[a], sub[b])
                        print(f"  {a} vs {b}: n = {len(sub)}, W = {w_stat:.4f}, p = {w_p:.4g}")
                        # Optional: paired t-test for reference
                        t_stat, t_p = ttest_rel(sub[a], sub[b])
                        print(f"    (paired t-test: t = {t_stat:.4f}, p = {t_p:.4g})")
                    except Exception as e:
                        print(f"  {a} vs {b}: Wilcoxon failed:", e)
                else:
                    print(f"  {a} vs {b}: not enough paired data, skipped.")
        else:
            print("\nWilcoxon KG: skipped (KG does not have at least 2 levels).")

    # -------------------- Wilcoxon: Complexity (pairwise across models) --------------------
    if has_complexity and has_model:
        wide_c = data.pivot_table(index="Model",
                                  columns="Complexity",
                                  values=metric)

        levels = [lvl for lvl in wide_c.columns if wide_c[lvl].notna().any()]
        if len(levels) >= 2:
            print("\nWilcoxon signed-rank tests for Complexity (pairwise across models):")
            for a, b in combinations(levels, 2):
                sub = wide_c[[a, b]].dropna()
                if len(sub) > 1:
                    try:
                        w_stat, w_p = wilcoxon(sub[a], sub[b])
                        print(f"  {a} vs {b}: n = {len(sub)}, W = {w_stat:.4f}, p = {w_p:.4g}")
                        # Optional: paired t-test for reference
                        t_stat, t_p = ttest_rel(sub[a], sub[b])
                        print(f"    (paired t-test: t = {t_stat:.4f}, p = {t_p:.4g})")
                    except Exception as e:
                        print(f"  {a} vs {b}: Wilcoxon failed:", e)
                else:
                    print(f"  {a} vs {b}: not enough paired data, skipped.")
        else:
            print("\nWilcoxon Complexity: skipped (fewer than 2 complexity levels with data).")

In [10]:
def analyze_excel(file_path):
    xls = pd.ExcelFile(file_path)
    print(f"Found sheets: {xls.sheet_names}")
    

    # for sheet in xls.sheet_names:
    for sheet in ['H1-AM&C']:
        print("\n" + "#" * 120)
        print(f"ANALYZING SHEET: {sheet}")
        print("#" * 120)

        df_raw = xls.parse(sheet)
        df, factors, metric_cols = detect_factors_and_metrics(df_raw)

        print(f"Detected factor columns: {factors}")
        print(f"Detected metric columns: {metric_cols}")

        if not metric_cols:
            print("No numeric metrics detected, skipping sheet.")
            continue

        for metric in metric_cols:
            run_anova_and_posthoc(df, factors, metric, sheet)

In [None]:
# ---------------- CONFIG ----------------
EXCEL_FILE = "all_models_consolidated_results.xlsx" 
# ----------------------------------------

In [12]:
analyze_excel(EXCEL_FILE)

Found sheets: ['Screen', 'All_Screen', 'Screen with C', 'H1-AM', 'H1-AM&C', 'H2-AM', 'H2-AM&C']

########################################################################################################################
ANALYZING SHEET: H1-AM&C
########################################################################################################################
Detected factor columns: ['KG', 'Model', 'Complexity']
Detected metric columns: ['rows', 'vars', 'latency_p50_ms', 'latency_p95_ms', 'latency_mean_ms', 'lexical_query_overlap', 'semantic_similarity_to_CQ', 'semantic_soft_coverage_to_CQ', 'tuple_cohesion', 'determinism_score', 'satisfiability_binding_score', 'h1_overall', 'semantic_diversity_score', 'syntax_ok_rate', 'satisfiable_rate', 'deterministic_rate']

SHEET: H1-AM&C | METRIC: rows
Attempting ANOVA with formula: rows ~ C(KG) * C(Complexity) + C(Model)

ANOVA table:
                           sum_sq    df         F    PR(>F)    eta_sq
C(KG)                3.846991e+07   1.0

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1   group2  meandiff p-adj   lower   upper  reject
--------------------------------------------------------
 Complex Moderate  -1.5265 0.0258 -2.8844 -0.1687   True
 Complex   Simple  -1.1795 0.0964 -2.5373  0.1783  False
Moderate   Simple   0.3471 0.7976 -1.0108  1.7049  False
--------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate  -1.9608 0.1895 -4.4986  0.577  False
   Big_Complex     Big_Simple   -1.749 0.2892 -4.2868 0.7888  False
   Big_Complex  Small_Complex  -0.6453 0.9623 -3.1831 1.8925  False
   Big_Complex Small_Moderate  -1.7376 0.2955 -4.2754 0.8002  False
   Big_Complex   Small_Simple  -1.2553 0.6256 -3.7931 1.2825  False
  Big

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


     Multiple Comparison of Means - Tukey HSD, FWER=0.05      
 group1   group2   meandiff p-adj    lower      upper   reject
--------------------------------------------------------------
 Complex Moderate    5.8034    1.0 -1646.3449 1657.9517  False
 Complex   Simple -345.8477 0.8587  -1997.996 1306.3006  False
Moderate   Simple -351.6511 0.8544 -2003.7994 1300.4972  False
--------------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
            Multiple Comparison of Means - Tukey HSD, FWER=0.05            
    group1         group2      meandiff  p-adj    lower      upper   reject
---------------------------------------------------------------------------
   Big_Complex   Big_Moderate  -128.8981    1.0 -3089.5754 2831.7791  False
   Big_Complex     Big_Simple   525.9198 0.9922 -2434.7574 3486.5971  False
   Big_Complex  Small_Complex   598.6767  0.986 -2362.0005 3559.3539  False
   Big_Complex Small_Moderate   739.1817 0.9651 -22

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


     Multiple Comparison of Means - Tukey HSD, FWER=0.05      
 group1   group2   meandiff p-adj    lower      upper   reject
--------------------------------------------------------------
 Complex Moderate   81.4663 0.9926 -1682.7558 1845.6884  False
 Complex   Simple -355.1034 0.8686 -2119.3255 1409.1187  False
Moderate   Simple -436.5697 0.8088 -2200.7918 1327.6524  False
--------------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
            Multiple Comparison of Means - Tukey HSD, FWER=0.05            
    group1         group2      meandiff  p-adj    lower      upper   reject
---------------------------------------------------------------------------
   Big_Complex   Big_Moderate   -80.4589    1.0 -3259.1613 3098.2434  False
   Big_Complex     Big_Simple   529.8409 0.9942 -2648.8614 3708.5432  False
   Big_Complex  Small_Complex   578.5026 0.9913 -2600.1997 3757.2049  False
   Big_Complex Small_Moderate   821.8941 0.9596 -23

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


     Multiple Comparison of Means - Tukey HSD, FWER=0.05      
 group1   group2   meandiff p-adj    lower      upper   reject
--------------------------------------------------------------
 Complex Moderate   37.5456 0.9982 -1625.5119 1700.6032  False
 Complex   Simple -357.8335 0.8514  -2020.891 1305.2241  False
Moderate   Simple -395.3791  0.822 -2058.4367 1267.6785  False
--------------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
            Multiple Comparison of Means - Tukey HSD, FWER=0.05            
    group1         group2      meandiff  p-adj    lower      upper   reject
---------------------------------------------------------------------------
   Big_Complex   Big_Moderate   -97.0208    1.0 -3082.0378 2887.9962  False
   Big_Complex     Big_Simple   508.3981 0.9936 -2476.6189 3493.4151  False
   Big_Complex  Small_Complex    608.565 0.9854  -2376.452  3593.582  False
   Big_Complex Small_Moderate    780.677 0.9576   -

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
 Complex Moderate   0.0276 0.0792 -0.0028 0.0579  False
 Complex   Simple   0.0021 0.9833 -0.0282 0.0324  False
Moderate   Simple  -0.0255 0.1106 -0.0558 0.0049  False
-------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate   0.0251 0.6669  -0.028 0.0782  False
   Big_Complex     Big_Simple   0.0132 0.9661 -0.0399 0.0663  False
   Big_Complex  Small_Complex   0.0204 0.8221 -0.0327 0.0735  False
   Big_Complex Small_Moderate   0.0504 0.0685 -0.0027 0.1035  False
   Big_Complex   Small_Simple   0.0114 0.9816 -0.0417 0.0645  False
  Big_Modera

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
 Complex Moderate  -0.0921 0.4306 -0.2759 0.0916  False
 Complex   Simple   0.0026 0.9993 -0.1812 0.1863  False
Moderate   Simple   0.0947 0.4115 -0.0891 0.2784  False
-------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate  -0.1825 0.5233 -0.5153 0.1503  False
   Big_Complex     Big_Simple  -0.0163    1.0 -0.3491 0.3165  False
   Big_Complex  Small_Complex  -0.0222 0.9999  -0.355 0.3106  False
   Big_Complex Small_Moderate   -0.024 0.9999 -0.3567 0.3088  False
   Big_Complex   Small_Simple  -0.0008    1.0 -0.3336  0.332  False
  Big_Modera

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
 Complex Moderate  -0.0998 0.4243 -0.2972 0.0975  False
 Complex   Simple  -0.0006    1.0  -0.198 0.1968  False
Moderate   Simple   0.0993 0.4284 -0.0981 0.2966  False
-------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate  -0.1886 0.5596 -0.5449 0.1678  False
   Big_Complex     Big_Simple    0.003    1.0 -0.3534 0.3593  False
   Big_Complex  Small_Complex  -0.0008    1.0 -0.3572 0.3555  False
   Big_Complex Small_Moderate  -0.0119    1.0 -0.3683 0.3444  False
   Big_Complex   Small_Simple   -0.005    1.0 -0.3613 0.3514  False
  Big_Modera

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
 Complex Moderate  -0.1865 0.1269 -0.4166 0.0437  False
 Complex   Simple  -0.0784 0.6713 -0.3086 0.1517  False
Moderate   Simple    0.108  0.476 -0.1222 0.3382  False
-------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate  -0.2668 0.3519 -0.6794 0.1457  False
   Big_Complex     Big_Simple  -0.0341 0.9998 -0.4466 0.3785  False
   Big_Complex  Small_Complex  -0.0945 0.9757 -0.5071  0.318  False
   Big_Complex Small_Moderate  -0.2006 0.6415 -0.6132 0.2119  False
   Big_Complex   Small_Simple  -0.2173 0.5643 -0.6299 0.1952  False
  Big_Modera

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
 Complex Moderate   -0.125 0.4522 -0.3823 0.1323  False
 Complex   Simple      0.0    1.0 -0.2573 0.2573  False
Moderate   Simple    0.125 0.4522 -0.1323 0.3823  False
-------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate    -0.25 0.5297 -0.7087 0.2087  False
   Big_Complex     Big_Simple      0.0    1.0 -0.4587 0.4587  False
   Big_Complex  Small_Complex      0.0    1.0 -0.4587 0.4587  False
   Big_Complex Small_Moderate      0.0    1.0 -0.4587 0.4587  False
   Big_Complex   Small_Simple      0.0    1.0 -0.4587 0.4587  False
  Big_Modera

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",



Wilcoxon signed-rank tests for Complexity (pairwise across models):
  Complex vs Moderate: n = 4, W = 0.0000, p = 0.3173
    (paired t-test: t = 1.0000, p = 0.391)
  Complex vs Simple: Wilcoxon failed: zero_method 'wilcox' and 'pratt' do not work if x - y is zero for all elements.
  Moderate vs Simple: n = 4, W = 0.0000, p = 0.3173
    (paired t-test: t = -1.0000, p = 0.391)

SHEET: H1-AM&C | METRIC: satisfiability_binding_score
Attempting ANOVA with formula: satisfiability_binding_score ~ C(KG) * C(Complexity) + C(Model)

ANOVA table:
                       sum_sq    df         F    PR(>F)    eta_sq
C(KG)                0.056733   1.0  1.066019  0.318207  0.044827
C(Complexity)        0.226912   2.0  2.131868  0.153155  0.179295
C(Model)             0.071379   3.0  0.447079  0.722971  0.056401
C(KG):C(Complexity)  0.112269   2.0  1.054784  0.372728  0.088710
Residual             0.798287  15.0       NaN       NaN  0.630767

Post-hoc: KG (Tukey HSD)
Multiple Comparison of Means - Tuke

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
 Complex Moderate  -0.0255 0.9668 -0.2845 0.2335  False
 Complex   Simple   0.1165 0.5044 -0.1425 0.3755  False
Moderate   Simple    0.142  0.368  -0.117  0.401  False
-------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate  -0.1596   0.87 -0.6154 0.2963  False
   Big_Complex     Big_Simple   0.1184 0.9588 -0.3375 0.5743  False
   Big_Complex  Small_Complex   0.0021    1.0 -0.4537  0.458  False
   Big_Complex Small_Moderate   0.1108 0.9688 -0.3451 0.5666  False
   Big_Complex   Small_Simple   0.1168 0.9611 -0.3391 0.5727  False
  Big_Modera

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
 Complex Moderate   0.0329 0.9224  -0.183 0.2488  False
 Complex   Simple   0.0505 0.8273 -0.1654 0.2664  False
Moderate   Simple   0.0176  0.977 -0.1983 0.2335  False
-------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate  -0.0426 0.9907 -0.2732 0.1881  False
   Big_Complex     Big_Simple  -0.0751 0.8999 -0.3057 0.1555  False
   Big_Complex  Small_Complex   0.1119 0.6439 -0.1188 0.3425  False
   Big_Complex Small_Moderate   0.2202 0.0663 -0.0105 0.4508  False
   Big_Complex   Small_Simple   0.2879 0.0099  0.0573 0.5186   True
  Big_Modera

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
 Complex Moderate   0.0148 0.9945 -0.3591 0.3886  False
 Complex   Simple      0.1 0.7808 -0.2739 0.4739  False
Moderate   Simple   0.0852  0.835 -0.2886 0.4591  False
-------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate  -0.0659 0.9996 -0.7723 0.6404  False
   Big_Complex     Big_Simple    0.025    1.0 -0.6814 0.7314  False
   Big_Complex  Small_Complex   -0.025    1.0 -0.7314 0.6814  False
   Big_Complex Small_Moderate   0.0705 0.9995 -0.6359 0.7768  False
   Big_Complex   Small_Simple     0.15 0.9826 -0.5564 0.8564  False
  Big_Modera

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
 Complex Moderate   -0.008 0.9986 -0.4043 0.3884  False
 Complex   Simple   0.1208  0.726 -0.2755 0.5172  False
Moderate   Simple   0.1288 0.6956 -0.2676 0.5251  False
-------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate  -0.0818 0.9992 -0.8289 0.6652  False
   Big_Complex     Big_Simple   -0.025    1.0 -0.7721 0.7221  False
   Big_Complex  Small_Complex   -0.125 0.9941 -0.8721 0.6221  False
   Big_Complex Small_Moderate  -0.0591 0.9998 -0.8061  0.688  False
   Big_Complex   Small_Simple   0.1417 0.9895 -0.6054 0.8887  False
  Big_Modera

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",


  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
 Complex Moderate   0.0148 0.9945 -0.3591 0.3886  False
 Complex   Simple      0.1 0.7808 -0.2739 0.4739  False
Moderate   Simple   0.0852  0.835 -0.2886 0.4591  False
-------------------------------------------------------

Post-hoc: KG × Complexity combinations (Tukey HSD)
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1         group2     meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------
   Big_Complex   Big_Moderate  -0.0659 0.9996 -0.7723 0.6404  False
   Big_Complex     Big_Simple    0.025    1.0 -0.6814 0.7314  False
   Big_Complex  Small_Complex   -0.025    1.0 -0.7314 0.6814  False
   Big_Complex Small_Moderate   0.0705 0.9995 -0.6359 0.7768  False
   Big_Complex   Small_Simple     0.15 0.9826 -0.5564 0.8564  False
  Big_Modera

  wide_kg = data.pivot_table(index="Model", columns="KG", values=metric)
  wide_c = data.pivot_table(index="Model",
