In [19]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, cvxpy as cp
from datasets import load_dataset
import seaborn as sns
import tqdm


In [10]:
MODELS = [
'chatgpt-4o-latest-20250326','claude-3-5-haiku-20241022','claude-3-5-sonnet-20241022',
'claude-3-7-sonnet-20250219','claude-3-7-sonnet-20250219-thinking-32k','claude-opus-4-20250514',
'claude-opus-4-20250514-thinking-16k','claude-sonnet-4-20250514','claude-sonnet-4-20250514-thinking-32k',
'deepseek-r1-0528','deepseek-v3-0324','gemini-2.0-flash-001','gemini-2.5-flash','gemini-2.5-pro',
'gemma-3-27b-it','gpt-4.1-mini-2025-04-14','llama-4-maverick-03-26-experimental',
'llama-4-maverick-17b-128e-instruct','mistral-medium-2505','o3-2025-04-16','o4-mini-2025-04-16',
'qwen3-235b-a22b-no-thinking','qwen3-30b-a3b'
]

N_BOOT = 200


In [11]:
def load_df():
    ds = load_dataset("lmarena-ai/arena-human-preference-140k", split="train")
    df = ds.to_pandas()
    keep = ['model_a','model_b','winner','language','timestamp','is_code','category_tag']
    return df[keep].copy()

def top_languages(df, k=4):
    vc = df["language"].value_counts(dropna=False)
    # Exclude "und" from the top k languages
    vc_filtered = vc[vc.index != "und"]
    langs = vc_filtered.head(k).index.tolist()
    return langs, vc

def build_margins(df, languages, models, alpha=1.0):
    m = len(models); K = len(languages)
    idx = {name:i for i,name in enumerate(models)}
    df = df[df["language"].isin(languages)].copy()
    df = df[df["model_a"].isin(models) & df["model_b"].isin(models)].copy()
    counts = df["language"].value_counts().reindex(languages).fillna(0).astype(int)
    tot = int(counts.sum())
    if tot == 0: raise ValueError("No rows after filtering; check languages/models.")
    w0 = (counts/tot).to_numpy(float)

    df = df.copy()
    df["idx_a"] = df["model_a"].map(idx)
    df["idx_b"] = df["model_b"].map(idx)
    df = df.dropna(subset=["idx_a", "idx_b"]).copy()
    
    M_list = []
    for lang in languages:
        rows = df[df["language"]==lang]
        if len(rows) == 0:
            M_list.append(np.zeros((m,m), float))
            continue
        win = np.zeros((m,m), float)
        idx_a = rows["idx_a"].values.astype(int)
        idx_b = rows["idx_b"].values.astype(int)
        winner = rows["winner"].values
        mask_a = (winner == "model_a")
        np.add.at(win, (idx_a[mask_a], idx_b[mask_a]), 1.0)
        mask_b = (winner == "model_b")
        np.add.at(win, (idx_b[mask_b], idx_a[mask_b]), 1.0)

        M = np.zeros((m,m), float)
        for i in range(m):
            for j in range(i+1, m):
                tot_ij = win[i,j] + win[j,i]
                if tot_ij > 0:
                    mij = (win[i,j] - win[j,i]) / (tot_ij + 2.0*alpha)
                else:
                    mij = 0.0
                M[i,j] = mij; M[j,i] = -mij
        M_list.append(M)
    return M_list, w0, counts, df

In [12]:
def solve_drml_tv(M_list, w0, rho, solvers=("GUROBI","MOSEK","GLPK","ECOS")):
    K = len(M_list); m = M_list[0].shape[0]
    p = cp.Variable(m, nonneg=True); t = cp.Variable()
    mu = cp.Variable(m); lam = cp.Variable(m, nonneg=True)
    gamma = cp.Variable((m,K))
    cons = [cp.sum(p)==1]
    for a in range(m):
        cons += [t <= mu[a] - 2.0*rho*lam[a] + w0 @ gamma[a,:]]
        for k in range(K):
            Mk = M_list[k]
            cons += [mu[a] + gamma[a,k] <= p @ Mk[:,a],
                     gamma[a,k] <= lam[a],
                     gamma[a,k] >= -lam[a]]
    prob = cp.Problem(cp.Maximize(t), cons)
    last = None
    for s in solvers:
        try:
            prob.solve(solver=getattr(cp,s), verbose=False)
            if prob.status in ("optimal","optimal_inaccurate"): break
        except Exception as e:
            last = e
    if prob.status not in ("optimal","optimal_inaccurate"):
        raise RuntimeError(f"LP not solved. status={prob.status}, last={last}")
    pval = np.array(p.value).reshape(-1)
    pval[pval<0]=0
    if pval.sum()>0: pval /= pval.sum()
    return pval, float(t.value)


In [13]:
def per_group_winrate(p, M_list):
    M_stack = np.stack(M_list, axis=0)  # (K, m, m)
    return per_group_winrate_vectorized(p, M_stack)

def per_group_winrate_vectorized(p, M_stack):

    pM = np.einsum('i,kij->kj', p, M_stack) 
    margins = np.min(pM, axis=1)  
    wr = 0.5 * (1.0 + margins)
    return wr

def bootstrap_df(df_sub, languages, seed):
    rng = np.random.default_rng(seed)
    parts = []
    for lang in languages:
        part = df_sub[df_sub["language"]==lang]
        n = len(part)
        if n == 0: continue
        idx = rng.integers(0, n, size=n)
        parts.append(part.iloc[idx])
    return pd.concat(parts, ignore_index=True)


In [14]:
def bootstrap_margins(df, models, n_boot=200, seed=0, alpha=1.0):
    languages, _ = top_languages(df, k=4)
    M0, w0, counts, df_sub = build_margins(df, languages, models, alpha=alpha)
    print("languages:", languages)
    print("counts:", dict(zip(languages, counts.tolist())))
    print("w0:", dict(zip(languages, w0.tolist())))
    boot_M = []
    for b in range(n_boot):
        df_b = bootstrap_df(df_sub, languages, seed + 100000*b)
        Mb, _, _, _ = build_margins(df_b, languages, models, alpha=alpha)
        boot_M.append(Mb)
        if (b+1) % max(1, n_boot//10) == 0: print(f"boot {b+1}/{n_boot}")
    return languages, w0, M0, boot_M

def compute_from_bootstrap(languages, w0, boot_M, rhos):
    K = len(languages)
    boot_wr = {(rho,k): [] for rho in rhos for k in range(K)}
    boot_wr_overall = {rho: [] for rho in rhos}
    boot_p = {rho: [] for rho in rhos}
    boot_v = {rho: [] for rho in rhos}
    n_boot = len(boot_M)
    w0_arr = np.array(w0, float)
    for b, Mb in enumerate(boot_M):
        Mb_stack = np.stack(Mb, axis=0)
        for rho in rhos:
            p, v = solve_drml_tv(Mb, w0, rho)
            boot_p[rho].append(p)
            boot_v[rho].append(v)  
            wr = per_group_winrate_vectorized(p, Mb_stack)
            for k in range(K): boot_wr[(rho,k)].append(float(wr[k]))
            M_pooled = np.einsum('k,kij->ij', w0_arr, Mb_stack)
            overall_margin = float(np.min(p @ M_pooled))
            overall_wr = 0.5 * (1 + overall_margin)
            boot_wr_overall[rho].append(float(overall_wr))
        if (b+1) % max(1, n_boot//10) == 0: print(f"compute {b+1}/{n_boot}")
    rows = []
    for rho in rhos:
        for k, lang in enumerate(languages):
            arr = np.array(boot_wr[(rho,k)], float)
            se = float(arr.std() / np.sqrt(len(arr)))
            rows.append({"rho": rho, "group": lang, "boot_mean": float(arr.mean()), "boot_se": se})
        arr_overall = np.array(boot_wr_overall[rho], float)
        se_overall = float(arr_overall.std() / np.sqrt(len(arr_overall)))
        rows.append({"rho": rho, "group": "overall", "boot_mean": float(arr_overall.mean()), "boot_se": se_overall})
    ci = pd.DataFrame(rows).sort_values(["group","rho"])
    return ci, boot_p, boot_v

In [15]:
df = load_df()
rhos = np.linspace(0.0, 1.0, 11)
print("Generating bootstrap margin matrices...")
languages, w0, M0, boot_M = bootstrap_margins(df, MODELS, n_boot=N_BOOT, seed=1, alpha=1)

Generating bootstrap margin matrices...
languages: ['en', 'pl', 'ru', 'zh']
counts: {'en': 26172, 'pl': 4842, 'ru': 3530, 'zh': 2565}
w0: {'en': 0.7052736532916544, 'pl': 0.13048047643428817, 'ru': 0.09512517179120968, 'zh': 0.06912069848284783}
boot 20/200
boot 40/200
boot 60/200
boot 80/200
boot 100/200
boot 120/200
boot 140/200
boot 160/200
boot 180/200
boot 200/200


In [16]:
print("Computing results from bootstrap margin matrices...")
ci, boot_p, boot_v = compute_from_bootstrap(languages, w0, boot_M, rhos)

Computing results from bootstrap margin matrices...
compute 20/200
compute 40/200
compute 60/200
compute 80/200
compute 100/200
compute 120/200
compute 140/200
compute 160/200
compute 180/200
compute 200/200


In [17]:
def plot_group_curves(ci, languages, out_pdf="group_winrate_vs_rho.pdf"):
    lang_names = {'en': 'English', 'pl': 'Polish', 'ru': 'Russian', 'zh': 'Chinese', 'overall': 'Overall'}
    
    sns.set_style("whitegrid")
    sns.set_palette("husl")

    plt.rcParams.update({
        "font.size": 11,
        "axes.labelsize": 12,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
        "legend.fontsize": 10,
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "font.family": "sans-serif",
        "axes.linewidth": 0.8,
        "grid.linewidth": 0.5,
        "xtick.major.width": 0.6,
        "ytick.major.width": 0.6,
    })
    
    fig, ax = plt.subplots(figsize=(6, 4))
    
    desired_order = ['overall', 'en', 'pl', 'zh', 'ru']
    colors = {'overall': '#2C3E50', 'en': '#E74C3C', 'pl': '#3498DB', 'zh': '#F39C12', 'ru': '#27AE60'}
    
    for group in desired_order:
        if group == 'overall':
            d = ci[ci["group"] == "overall"].sort_values("rho")
            if len(d) > 0:
                x = d["rho"].to_numpy(float)
                y = d["boot_mean"].to_numpy(float) * 100.0
                se = d["boot_se"].to_numpy(float) * 100.0
                ax.plot(x, y, marker="o", linewidth=1.2, markersize=5, 
                       label="Overall", linestyle="--", color=colors['overall'], alpha=0.9)
                ax.fill_between(x, y - se, y + se, alpha=0.15, color=colors['overall'])
        elif group in languages:
            d = ci[ci["group"] == group].sort_values("rho")
            if len(d) > 0:
                x = d["rho"].to_numpy(float)
                y = d["boot_mean"].to_numpy(float) * 100.0
                se = d["boot_se"].to_numpy(float) * 100.0
                lang_label = lang_names.get(group, group)
                ax.plot(x, y, marker="o", linewidth=1.2, markersize=5, 
                       label=lang_label, color=colors[group], alpha=0.9)
                ax.fill_between(x, y - se, y + se, alpha=0.15, color=colors[group])
    
    ax.set_xlabel(r"$\rho$", fontsize=12, labelpad=6)
    ax.set_ylabel("Win Rate (%)", fontsize=12, labelpad=6)
    
    ax.grid(True, alpha=0.25, linestyle='-', linewidth=0.4)
    ax.set_axisbelow(True)

    ax.legend(frameon=True, loc='best', framealpha=0.95, facecolor='white', 
             edgecolor='lightgray', borderpad=0.8, labelspacing=0.6)

    fig.tight_layout()
    fig.savefig(out_pdf, bbox_inches="tight", dpi=300)
    plt.close(fig)
    print("saved:", out_pdf)

In [20]:
plot_group_curves(ci, languages, out_pdf="group_winrate_vs_rho.pdf")

saved: group_winrate_vs_rho.pdf


In [None]:
def bootstrap_train_test_splits(df, languages, models, train_frac=0.8, n_boot=100, seed=0):
    df_filtered = df[df["language"].isin(languages)].copy()
    df_filtered = df_filtered[df_filtered["model_a"].isin(models) & df_filtered["model_b"].isin(models)].copy()
    
    splits = []
    for b in range(n_boot):
        rng = np.random.default_rng(seed + 100000*b)
        n_total = len(df_filtered)
        boot_idx = rng.integers(0, n_total, size=n_total)
        df_boot = df_filtered.iloc[boot_idx].reset_index(drop=True)
        
        df_train_list = []
        df_test_list = []
        for lang in languages:
            lang_df = df_boot[df_boot["language"] == lang]
            n = len(lang_df)
            if n == 0:
                continue
            n_train = int(train_frac * n)
            idx = np.arange(n)
            rng.shuffle(idx)
            train_idx = idx[:n_train]
            test_idx = idx[n_train:]
            df_train_list.append(lang_df.iloc[train_idx])
            df_test_list.append(lang_df.iloc[test_idx])
        if len(df_train_list) == 0:
            continue
        df_train = pd.concat(df_train_list, ignore_index=True)
        df_test = pd.concat(df_test_list, ignore_index=True)
        splits.append((df_train, df_test))
        if (b+1) % max(1, n_boot//10) == 0: print(f"boot {b+1}/{n_boot}")
    return splits

def compute_generalization(splits, languages, models, rhos, alpha=1.0):
    K = len(languages)
    boot_wr_train = {(rho, lang): [] for rho in rhos for lang in languages + ["overall"]}
    boot_wr_test = {(rho, lang): [] for rho in rhos for lang in languages + ["overall"]}
    boot_wr_gap = {(rho, lang): [] for rho in rhos for lang in languages + ["overall"]}
    
    for df_train, df_test in splits:
        M_train, w0_train, _, _ = build_margins(df_train, languages, models, alpha=alpha)
        M_test, w0_test, _, _ = build_margins(df_test, languages, models, alpha=alpha)
        
        M_train_stack = np.stack(M_train, axis=0)
        M_test_stack = np.stack(M_test, axis=0)
        w0_train_arr = np.array(w0_train, float)
        w0_test_arr = np.array(w0_test, float)
        
        for rho in rhos:
            p_rho, _ = solve_drml_tv(M_train, w0_train, rho)
            wr_train = per_group_winrate_vectorized(p_rho, M_train_stack)
            wr_test = per_group_winrate_vectorized(p_rho, M_test_stack)
            for k, lang in enumerate(languages):
                boot_wr_train[(rho, lang)].append(float(wr_train[k]))
                boot_wr_test[(rho, lang)].append(float(wr_test[k]))
                boot_wr_gap[(rho, lang)].append(float(wr_test[k] - wr_train[k]))
            
            M_pooled_test = np.einsum('k,kij->ij', w0_test_arr, M_test_stack)
            v_test_overall = float(np.min(p_rho @ M_pooled_test))
            wr_test_overall = 0.5 * (1 + v_test_overall)
            M_pooled_train = np.einsum('k,kij->ij', w0_train_arr, M_train_stack)
            v_train_overall = float(np.min(p_rho @ M_pooled_train))
            wr_train_overall = 0.5 * (1 + v_train_overall)
            boot_wr_train[(rho, "overall")].append(float(wr_train_overall))
            boot_wr_test[(rho, "overall")].append(float(wr_test_overall))
            boot_wr_gap[(rho, "overall")].append(float(wr_test_overall - wr_train_overall))
    
    gap_rows = []
    for rho in rhos:
        for lang in languages + ["overall"]:
            arr_train = np.array(boot_wr_train[(rho, lang)], float)
            arr_test = np.array(boot_wr_test[(rho, lang)], float)
            arr_gap = np.array(boot_wr_gap[(rho, lang)], float)
            if len(arr_train) > 0:
                gap_rows.append({
                    "rho": rho,
                    "language": lang,
                    "wr_train_mean": float(arr_train.mean()),
                    "wr_train_se": float(arr_train.std() / np.sqrt(len(arr_train))),
                    "wr_test_mean": float(arr_test.mean()),
                    "wr_test_se": float(arr_test.std() / np.sqrt(len(arr_test))),
                    "wr_gap_mean": float(arr_gap.mean()),
                    "wr_gap_se": float(arr_gap.std() / np.sqrt(len(arr_gap)))
                })
    gap_df = pd.DataFrame(gap_rows)
    return gap_df, boot_wr_train, boot_wr_test, boot_wr_gap

In [29]:
print("Generating bootstrap train/test splits...")
splits = bootstrap_train_test_splits(df, languages, MODELS, train_frac=0.8, n_boot=N_BOOT, seed=42)

Generating bootstrap train/test splits...
boot 20/200
boot 40/200
boot 60/200
boot 80/200
boot 100/200
boot 120/200
boot 140/200
boot 160/200
boot 180/200
boot 200/200


In [30]:
print("Computing generalization gaps from splits...")
gap_df, _, _, _ = compute_generalization(splits, languages, MODELS, rhos, alpha=1.0)

Computing generalization gaps from splits...


In [35]:
def plot_generalization(gap_df, languages, metric="gap", out_pdf=None):
    lang_names = {'en': 'English', 'pl': 'Polish', 'ru': 'Russian', 'zh': 'Chinese', 'overall': 'Overall'}

    if out_pdf is None:
        out_pdf = f"generalization_{metric}.pdf"

    sns.set_style("whitegrid")
    sns.set_palette("husl")

    plt.rcParams.update({
        "font.size": 11,
        "axes.labelsize": 12,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
        "legend.fontsize": 10,
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "font.family": "sans-serif",
        "axes.linewidth": 0.8,
        "grid.linewidth": 0.5,
        "xtick.major.width": 0.6,
        "ytick.major.width": 0.6,
    })
    
    fig, ax = plt.subplots(figsize=(6, 4))
    
    desired_order = ['overall', 'en', 'pl', 'zh', 'ru']
    colors = {'overall': '#2C3E50', 'en': '#E74C3C', 'pl': '#3498DB', 'zh': '#F39C12', 'ru': '#27AE60'}
    
    if metric == "train":
        mean_col = "wr_train_mean"
        se_col = "wr_train_se"
        ylabel = "Win rate on training data (%)"
        title_suffix = "Train"
    elif metric == "test":
        mean_col = "wr_test_mean"
        se_col = "wr_test_se"
        ylabel = "Win rate on held out votes (%)"
        title_suffix = "Test"
    elif metric == "gap":
        mean_col = "wr_gap_mean"
        se_col = "wr_gap_se"
        ylabel = "Win rate generalization gap (%)"
        title_suffix = "Gap"
    else:
        raise ValueError("metric must be 'train', 'test', or 'gap'")
    
    if metric == "gap":
        d = gap_df[gap_df["language"] == "overall"].sort_values("rho")
        if len(d) > 0:
            x = d["rho"].to_numpy(float)
            y = -d[mean_col].to_numpy(float) * 100.0
            se = d[se_col].to_numpy(float) * 100.0
            ax.plot(x, y, marker="o", linewidth=1.2, markersize=5,
                   linestyle="-", color=colors['overall'], alpha=0.9)
            ax.fill_between(x, y - se, y + se, alpha=0.15, color=colors['overall'])
    else:
        for group in desired_order:
                    d = gap_df[gap_df["language"] == group].sort_values("rho")
                    if len(d) > 0:
                        x = d["rho"].to_numpy(float)
                        y = d[mean_col].to_numpy(float) * 100.0
                        se = d[se_col].to_numpy(float) * 100.0
                        label = lang_names.get(group, group)
                        linestyle = "--" if group == 'overall' else "-"
                        ax.plot(x, y, marker="o", linewidth=1.2, markersize=5,
                            label=label, linestyle=linestyle, color=colors[group], alpha=0.9)
                        ax.fill_between(x, y - se, y + se, alpha=0.15, color=colors[group])
        
        ax.legend(frameon=True, loc='best', framealpha=0.95, facecolor='white', 
                 edgecolor='lightgray', borderpad=0.8, labelspacing=0.6)
    
    ax.set_xlabel(r"$\rho$", fontsize=12, labelpad=6)
    ax.set_ylabel(ylabel, fontsize=12, labelpad=6)
    
    ax.grid(True, alpha=0.25, linestyle='-', linewidth=0.4)
    ax.set_axisbelow(True)
    
    fig.tight_layout()
    fig.savefig(out_pdf, bbox_inches="tight", dpi=300)
    plt.close(fig)
    print("saved:", out_pdf)

def print_generalization_gaps(gap_df, languages, rhos):
    lang_names = {'en': 'English', 'pl': 'Polish', 'ru': 'Russian', 'zh': 'Chinese', 'overall': 'Overall'}
    gap_df = gap_df.copy()
    gap_df["formatted"] = gap_df.apply(
        lambda row: f"{row['wr_gap_mean']:.4f} ± {row['wr_gap_se']:.4f}", axis=1
    )
    table = gap_df.pivot(index="language", columns="rho", values="formatted")
    all_langs = languages + ["overall"]
    table = table.reindex(all_langs)
    table.index = [lang_names.get(lang, lang) for lang in table.index]
    display(table)
    return table

In [37]:
print_generalization_gaps(gap_df, languages, rhos)
plot_generalization(gap_df, languages, "gap", out_pdf="generalization.pdf")
plot_generalization(gap_df, languages, "test", out_pdf="winrate_heldout.pdf")

rho,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
English,-0.0774 ± 0.0047,-0.0735 ± 0.0041,-0.0633 ± 0.0036,-0.0553 ± 0.0032,-0.0467 ± 0.0030,-0.0420 ± 0.0028,-0.0365 ± 0.0027,-0.0333 ± 0.0028,-0.0305 ± 0.0027,-0.0285 ± 0.0026,-0.0279 ± 0.0026
Polish,-0.0631 ± 0.0079,-0.0595 ± 0.0070,-0.0541 ± 0.0066,-0.0488 ± 0.0056,-0.0473 ± 0.0048,-0.0480 ± 0.0043,-0.0505 ± 0.0039,-0.0503 ± 0.0035,-0.0501 ± 0.0032,-0.0488 ± 0.0031,-0.0485 ± 0.0031
Russian,0.0062 ± 0.0065,0.0013 ± 0.0061,-0.0042 ± 0.0055,-0.0077 ± 0.0051,-0.0144 ± 0.0046,-0.0238 ± 0.0040,-0.0330 ± 0.0035,-0.0423 ± 0.0033,-0.0504 ± 0.0031,-0.0574 ± 0.0029,-0.0577 ± 0.0029
Chinese,0.0102 ± 0.0066,0.0035 ± 0.0064,-0.0052 ± 0.0056,-0.0154 ± 0.0045,-0.0216 ± 0.0039,-0.0264 ± 0.0037,-0.0306 ± 0.0034,-0.0317 ± 0.0031,-0.0329 ± 0.0030,-0.0350 ± 0.0028,-0.0362 ± 0.0028
Overall,-0.0597 ± 0.0033,-0.0528 ± 0.0029,-0.0457 ± 0.0027,-0.0390 ± 0.0024,-0.0324 ± 0.0023,-0.0289 ± 0.0021,-0.0254 ± 0.0020,-0.0225 ± 0.0020,-0.0196 ± 0.0020,-0.0186 ± 0.0020,-0.0182 ± 0.0020


saved: generalization.pdf
saved: winrate_heldout.pdf


In [38]:
def print_boot_p(boot_p, models, rhos, thresh=0.02):
    rows = []
    for rho in rhos:
        P = np.vstack(boot_p[rho])
        mean = P.mean(axis=0)
        se = P.std(axis=0) / np.sqrt(P.shape[0])
        for i, name in enumerate(models):
            rows.append({"rho": rho, "model": name, "mean": mean[i], "se": se[i]})
    wdf = pd.DataFrame(rows)
    keep = wdf.groupby("model")["mean"].max()
    keep = keep[keep > thresh].index
    wdf = wdf[wdf["model"].isin(keep)].copy()
    wdf["mean%"] = (100*wdf["mean"]).round(2)
    wdf["se%"] = (100*wdf["se"]).round(2)
    
    wdf["formatted"] = wdf.apply(lambda row: f"{row['mean%']:.2f} ± {row['se%']:.2f}", axis=1)
    
    table = wdf.pivot(index="model", columns="rho", values="formatted")
    
    rho0_means = wdf[wdf["rho"] == rhos[0]].set_index("model")["mean"].sort_values(ascending=False)
    table = table.reindex(rho0_means.index)
    
    print(f"(shown if mean> {100*thresh:.1f}% for some rho)")
    display(table)
    return wdf

wdf = print_boot_p(boot_p, MODELS, rhos, thresh=0.02)

(shown if mean> 2.0% for some rho)


rho,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
gemini-2.5-pro,72.97 ± 2.56,69.73 ± 2.16,63.62 ± 1.87,57.76 ± 1.57,51.86 ± 1.33,46.65 ± 1.20,42.21 ± 1.10,39.11 ± 1.02,35.95 ± 0.98,34.64 ± 0.97,34.60 ± 0.96
chatgpt-4o-latest-20250326,11.53 ± 1.76,12.16 ± 1.49,12.25 ± 1.20,11.64 ± 1.04,10.51 ± 0.92,9.80 ± 0.88,9.66 ± 0.86,9.07 ± 0.79,8.69 ± 0.75,8.40 ± 0.73,8.33 ± 0.72
deepseek-r1-0528,10.11 ± 1.48,11.35 ± 1.26,12.87 ± 1.11,13.39 ± 0.95,13.47 ± 0.91,13.13 ± 0.87,12.88 ± 0.81,12.37 ± 0.78,12.21 ± 0.77,12.19 ± 0.77,12.16 ± 0.76
llama-4-maverick-03-26-experimental,2.73 ± 0.71,3.46 ± 0.70,5.31 ± 0.72,7.78 ± 0.77,10.65 ± 0.82,12.82 ± 0.86,14.25 ± 0.82,15.43 ± 0.83,15.88 ± 0.82,15.95 ± 0.82,16.01 ± 0.82
o3-2025-04-16,2.40 ± 0.61,2.91 ± 0.54,5.15 ± 0.65,7.92 ± 0.77,11.00 ± 0.90,13.56 ± 0.95,15.16 ± 0.98,16.11 ± 1.00,16.43 ± 0.95,15.84 ± 0.93,15.66 ± 0.93
gemini-2.5-flash,0.09 ± 0.06,0.16 ± 0.08,0.26 ± 0.11,0.66 ± 0.20,1.04 ± 0.25,1.74 ± 0.31,2.58 ± 0.38,3.64 ± 0.45,5.04 ± 0.52,5.57 ± 0.55,5.66 ± 0.55
qwen3-235b-a22b-no-thinking,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.10 ± 0.04,0.50 ± 0.16,0.87 ± 0.22,1.41 ± 0.27,1.80 ± 0.27,2.41 ± 0.35,2.86 ± 0.36,2.90 ± 0.36


In [39]:
def plot_model_performance_grid(wdf, out_pdf="model_performance_grid.pdf"):
    sns.set_style("white")
    
    plt.rcParams.update({
        "font.size": 13,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 13,
        "legend.fontsize": 13,
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "font.family": "sans-serif",
        "axes.linewidth": 0.8,
        "grid.linewidth": 0.4,
        "xtick.major.width": 0.6,
        "ytick.major.width": 0,
    })
    
    model_names = {
        'gemini-2.5-pro': 'Gemini 2.5 Pro',
        'chatgpt-4o-latest-20250326': 'ChatGPT 4o',
        'deepseek-r1-0528': 'DeepSeek R1',
        'llama-4-maverick-03-26-experimental': 'Llama 4 Maverick',
        'o3-2025-04-16': 'o3',
        'gemini-2.5-flash': 'Gemini 2.5 Flash',
        'qwen3-235b-a22b-no-thinking': 'Qwen 3 235B',
    }
    
    colors = {
        'gemini-2.5-pro': '#E74C3C',
        'chatgpt-4o-latest-20250326': '#3498DB',
        'deepseek-r1-0528': '#F39C12',
        'llama-4-maverick-03-26-experimental': '#27AE60',
        'o3-2025-04-16': '#9B59B6',
        'gemini-2.5-flash': '#1ABC9C',
        'qwen3-235b-a22b-no-thinking': '#E67E22',
    }
    
    fig, axes = plt.subplots(1, 4, figsize=(18, 5))
    axes = axes.flatten()
    
    rhos_to_plot = [0.0, 0.4, 0.7, 1.0]
    models_ordered = wdf[np.isclose(wdf["rho"], 0.0)].sort_values("mean", ascending=True)["model"].values
    
    for idx, rho in enumerate(rhos_to_plot):
        ax = axes[idx]
        
        data = wdf[np.isclose(wdf["rho"], rho)].copy()
        data = data.set_index("model").reindex(models_ordered).reset_index()
        data = data.dropna()
        
        bar_spacing = 0.65
        y_pos = np.arange(len(data)) * bar_spacing
        bar_colors = [colors.get(m, '#95A5A6') for m in data["model"]]
        
        ax.barh(y_pos, data["mean%"], xerr=data["se%"], 
               color=bar_colors,
               error_kw={'elinewidth': 1.2, 'capsize': 2.5, 'alpha': 0.7},
               height=0.5 * bar_spacing, alpha=0.85, edgecolor='white', linewidth=0.5)
        
        ax.set_xlim(0, 75)
        ax.set_xticks([0, 20, 40, 60, 80])
        ax.set_axisbelow(True)
        ax.grid(True, axis='x', alpha=0.25, linestyle='-', linewidth=0.5, color='gray')
        ax.grid(True, axis='y', alpha=0.15, linestyle='-', linewidth=0.3, color='gray')
        
        if idx == 0:
            ax.set_yticks(y_pos)
            clean_names = [model_names.get(m, m) for m in data["model"]]
            ax.set_yticklabels(clean_names, fontsize=13)
        else:
            ax.set_yticks([])
        
        ax.set_xlabel("Probability (%)", fontsize=15, labelpad=4)
        ax.set_xticklabels(['0', '20', '40', '60', '80'], fontsize=13)
        
        rho_label = f'ρ = {rho:.1f}'
        ax.text(0.95, 0.1, rho_label, transform=ax.transAxes,
               fontsize=18, fontweight='bold', ha='right', va='bottom',
               bbox=dict(boxstyle='round,pad=0.4', facecolor='white', 
                        edgecolor='lightgray', alpha=0.8, linewidth=0.5))
        
        ax.spines['left'].set_linewidth(0.8)
        ax.spines['bottom'].set_linewidth(0.8)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
    
    fig.tight_layout(pad=1.2)
    fig.savefig(out_pdf, bbox_inches="tight", dpi=300)
    plt.close(fig)
    print(f"saved: {out_pdf}")

plot_model_performance_grid(wdf, out_pdf="model_performance_grid.pdf")

saved: model_performance_grid.pdf
