In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, cvxpy as cp
from datasets import load_dataset
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODELS = ['gemini-2.0-flash-grounding', 'gemini-2.5-flash-preview-04-17-grounding', 'gemini-2.5-pro-exp-03-25-grounding', 'gemini-2.5-pro-exp-03-25-wo-search', 'gpt-4o-mini-search-preview', 'gpt-4o-search-preview', 'gpt-4o-search-preview-high', 'gpt-4o-search-preview-high-loc', 'sonar', 'sonar-pro', 'sonar-pro-high', 'sonar-reasoning', 'sonar-reasoning-pro-high']
GROUP_BY = "primary_intent"  # Options: "language" or "primary_intent"
N_BOOT = 200

In [3]:
def load_df(group_by=GROUP_BY):
    ds = load_dataset("lmarena-ai/search-arena-24k", split="test")
    df = ds.to_pandas()
    keep = ['model_a','model_b','winner','languages','timestamp', 'primary_intent']
    df = df[keep].copy()
    # Filter rows where languages is a list with exactly one element
    df = df[df["languages"].apply(lambda x: isinstance(x, (list, tuple, np.ndarray)) and len(x) == 1)].copy()
    def extract_lang(x):
        if isinstance(x, (list, tuple, np.ndarray)) and len(x) == 1:
            val = x[0]
            if isinstance(val, np.ndarray):
                val = val.item() if val.size == 1 else str(val[0])
            return str(val)
        return str(x)
    df["language"] = df["languages"].apply(extract_lang)
    if group_by == "language":
        df["group"] = df["language"]
    elif group_by == "primary_intent":
        df["group"] = df["primary_intent"]
    else:
        raise ValueError(f"Unknown group_by: {group_by}. Must be 'language' or 'primary_intent'")
    return df

def top_groups(df, group_col="group", k=4):
    vc = df[group_col].value_counts(dropna=False)
    groups = vc.head(k).index.tolist()
    print(groups)
    return groups, vc

def top_languages(df, k=4):
    return top_groups(df, "language", k)


def build_margins(df, groups, models, alpha=1.0, group_col="group"):
    m = len(models); K = len(groups)
    idx = {name:i for i,name in enumerate(models)}
    df = df[df[group_col].isin(groups)].copy()
    df = df[df["model_a"].isin(models) & df["model_b"].isin(models)].copy()
    counts = df[group_col].value_counts().reindex(groups).fillna(0).astype(int)
    tot = int(counts.sum())
    if tot == 0: raise ValueError("No rows after filtering; check groups/models.")
    w0 = (counts/tot).to_numpy(float)
    M_list = []
    df = df.copy()
    df["idx_a"] = df["model_a"].map(idx)
    df["idx_b"] = df["model_b"].map(idx)
    df = df.dropna(subset=["idx_a", "idx_b"]).copy()
    
    M_list = []
    for grp in groups:
        rows = df[df[group_col]==grp]
        if len(rows) == 0:
            M_list.append(np.zeros((m,m), float))
            continue
        win = np.zeros((m,m), float)
        idx_a = rows["idx_a"].values.astype(int)
        idx_b = rows["idx_b"].values.astype(int)
        winner = rows["winner"].values
        mask_a = (winner == "model_a")
        np.add.at(win, (idx_a[mask_a], idx_b[mask_a]), 1.0)
        mask_b = (winner == "model_b")
        np.add.at(win, (idx_b[mask_b], idx_a[mask_b]), 1.0)

        M = np.zeros((m,m), float)
        for i in range(m):
            for j in range(i+1, m):
                tot_ij = win[i,j] + win[j,i]
                if tot_ij > 0:
                    mij = (win[i,j] - win[j,i]) / (tot_ij + 2.0*alpha)
                else:
                    mij = 0.0
                M[i,j] = mij; M[j,i] = -mij
        M_list.append(M)
    return M_list, w0, counts, df

In [4]:
def solve_drml_tv(M_list, w0, rho, solvers=("GUROBI","MOSEK","GLPK","ECOS")):
    K = len(M_list); m = M_list[0].shape[0]
    p = cp.Variable(m, nonneg=True); t = cp.Variable()
    mu = cp.Variable(m); lam = cp.Variable(m, nonneg=True)
    gamma = cp.Variable((m,K))
    cons = [cp.sum(p)==1]
    for a in range(m):
        cons += [t <= mu[a] - 2.0*rho*lam[a] + w0 @ gamma[a,:]]
        for k in range(K):
            Mk = M_list[k]
            cons += [mu[a] + gamma[a,k] <= p @ Mk[:,a],
                     gamma[a,k] <= lam[a],
                     gamma[a,k] >= -lam[a]]
    prob = cp.Problem(cp.Maximize(t), cons)
    last = None
    for s in solvers:
        try:
            prob.solve(solver=getattr(cp,s), verbose=False)
            if prob.status in ("optimal","optimal_inaccurate"): break
        except Exception as e:
            last = e
    if prob.status not in ("optimal","optimal_inaccurate"):
        raise RuntimeError(f"LP not solved. status={prob.status}, last={last}")
    pval = np.array(p.value).reshape(-1)
    pval[pval<0]=0
    if pval.sum()>0: pval /= pval.sum()
    return pval, float(t.value)


In [5]:
def per_group_winrate(p, M_list):
    M_stack = np.stack(M_list, axis=0)  # (K, m, m)
    return per_group_winrate_vectorized(p, M_stack)

def per_group_winrate_vectorized(p, M_stack):
    pM = np.einsum('i,kij->kj', p, M_stack)
    margins = np.min(pM, axis=1)
    wr = 0.5 * (1.0 + margins)
    return wr

def bootstrap_df(df_sub, groups, seed, group_col="group"):
    rng = np.random.default_rng(seed)
    parts = []
    for grp in groups:
        part = df_sub[df_sub[group_col]==grp]
        n = len(part)
        if n == 0: continue
        idx = rng.integers(0, n, size=n)
        parts.append(part.iloc[idx])
    return pd.concat(parts, ignore_index=True)


In [6]:
def bootstrap_margins(df, models, n_boot=200, seed=0, alpha=1.0, group_by=GROUP_BY):
    if group_by == "language":
        df["group"] = df["language"]
        groups, _ = top_groups(df, "group", k=4)
    elif group_by == "primary_intent":
        df["group"] = df["primary_intent"]
        groups, _ = top_groups(df, "group", k=4)
    else:
        raise ValueError(f"Unknown group_by: {group_by}")

    M0, w0, counts, df_sub = build_margins(df, groups, models, alpha=alpha, group_col="group")
    print("groups:", groups)
    print("counts:", dict(zip(groups, counts.tolist())))
    print("w0:", dict(zip(groups, w0.tolist())))
    boot_M = []
    for b in range(n_boot):
        df_b = bootstrap_df(df_sub, groups, seed + 100000*b, group_col="group")
        Mb, _, _, _ = build_margins(df_b, groups, models, alpha=alpha, group_col="group")
        boot_M.append(Mb)
        if (b+1) % max(1, n_boot//10) == 0: print(f"boot {b+1}/{n_boot}")
    return groups, w0, M0, boot_M

def compute_from_bootstrap(groups, w0, boot_M, rhos):
    K = len(groups)
    boot_wr = {(rho,k): [] for rho in rhos for k in range(K)}
    boot_wr_overall = {rho: [] for rho in rhos}
    boot_p = {rho: [] for rho in rhos}
    boot_v = {rho: [] for rho in rhos}
    n_boot = len(boot_M)
    w0_arr = np.array(w0, float)
    for b, Mb in enumerate(boot_M):
        Mb_stack = np.stack(Mb, axis=0)
        for rho in rhos:
            p, v = solve_drml_tv(Mb, w0, rho)
            boot_p[rho].append(p)
            boot_v[rho].append(v)  
            wr = per_group_winrate_vectorized(p, Mb_stack)
            for k in range(K): boot_wr[(rho,k)].append(float(wr[k]))
            M_pooled = np.einsum('k,kij->ij', w0_arr, Mb_stack)
            overall_margin = float(np.min(p @ M_pooled))
            overall_wr = 0.5 * (1 + overall_margin)
            boot_wr_overall[rho].append(float(overall_wr))
        if (b+1) % max(1, n_boot//10) == 0: print(f"compute {b+1}/{n_boot}")
    rows = []
    for rho in rhos:
        for k, lang in enumerate(groups):
            arr = np.array(boot_wr[(rho,k)], float)
            se = float(arr.std() / np.sqrt(len(arr)))
            rows.append({"rho": rho, "group": lang, "boot_mean": float(arr.mean()), "boot_se": se})
        arr_overall = np.array(boot_wr_overall[rho], float)
        se_overall = float(arr_overall.std() / np.sqrt(len(arr_overall)))
        rows.append({"rho": rho, "group": "overall", "boot_mean": float(arr_overall.mean()), "boot_se": se_overall})
    ci = pd.DataFrame(rows).sort_values(["group","rho"])
    return ci, boot_p, boot_v

In [7]:
df = load_df(group_by=GROUP_BY)
rhos = np.linspace(0.0, 1.0, 11)
print("Generating bootstrap margin matrices...")
groups, w0, M0, boot_M = bootstrap_margins(df, MODELS, n_boot=N_BOOT, seed=1, alpha=1.0, group_by=GROUP_BY)
print("Computing results from bootstrap margin matrices...")
ci, boot_p, boot_v = compute_from_bootstrap(groups, w0, boot_M, rhos)

Generating bootstrap margin matrices...
['Factual Lookup', 'Info Synthesis', 'Recommendation', 'Analysis']
groups: ['Factual Lookup', 'Info Synthesis', 'Recommendation', 'Analysis']
counts: {'Factual Lookup': 4269, 'Info Synthesis': 3931, 'Recommendation': 2441, 'Analysis': 2348}
w0: {'Factual Lookup': 0.3286627146046655, 'Info Synthesis': 0.30264069597351606, 'Recommendation': 0.18792824697821234, 'Analysis': 0.18076834244360612}
boot 20/200
boot 40/200
boot 60/200
boot 80/200
boot 100/200
boot 120/200
boot 140/200
boot 160/200
boot 180/200
boot 200/200
Computing results from bootstrap margin matrices...
compute 20/200
compute 40/200
compute 60/200
compute 80/200
compute 100/200
compute 120/200
compute 140/200
compute 160/200
compute 180/200
compute 200/200


In [8]:
def plot_group_curves(ci, groups, out_pdf=None):
    if out_pdf is None:
        out_pdf = f"group_winrate_vs_rho_{GROUP_BY}.pdf"
    sns.set_style("whitegrid")
    sns.set_palette("husl")
    
    plt.rcParams.update({
        "font.size": 11,
        "axes.labelsize": 12,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
        "legend.fontsize": 10,
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "font.family": "sans-serif",
        "axes.linewidth": 0.8,
        "grid.linewidth": 0.5,
        "xtick.major.width": 0.6,
        "ytick.major.width": 0.6,
    })
    
    fig, ax = plt.subplots(figsize=(6, 4))
    
    default_colors = ['#E74C3C', '#3498DB', '#F39C12', '#27AE60', '#9B59B6', '#1ABC9C', '#E67E22']
    colors = {'overall': '#2C3E50'}
    for i, grp in enumerate(groups):
        if grp not in colors:
            colors[grp] = default_colors[i % len(default_colors)]
    
    desired_order = ['overall'] + list(groups)
    
    for group in desired_order:
        if group == 'overall':
            d = ci[ci["group"] == "overall"].sort_values("rho")
            if len(d) > 0:
                x = d["rho"].to_numpy(float)
                y = d["boot_mean"].to_numpy(float)
                se = d["boot_se"].to_numpy(float)
                ax.plot(x, y, marker="o", linewidth=1.2, markersize=5, 
                       label="Overall", linestyle="--", color=colors['overall'], alpha=0.9)
                ax.fill_between(x, y - se, y + se, alpha=0.15, color=colors['overall'])
        elif group in groups:
            d = ci[ci["group"] == group].sort_values("rho")
            if len(d) > 0:
                x = d["rho"].to_numpy(float)
                y = d["boot_mean"].to_numpy(float)
                se = d["boot_se"].to_numpy(float)
                ax.plot(x, y, marker="o", linewidth=1.2, markersize=5, 
                       label=group, color=colors.get(group, '#95A5A6'), alpha=0.9)
                ax.fill_between(x, y - se, y + se, alpha=0.15, color=colors.get(group, '#95A5A6'))
    
    ax.set_xlabel(r"$\rho$", fontsize=12, labelpad=6)
    ax.set_ylabel("Win Rate (%)", fontsize=12, labelpad=6)
    
    ax.grid(True, alpha=0.25, linestyle='-', linewidth=0.4)
    ax.set_axisbelow(True)
    
    ax.legend(frameon=True, loc='best', framealpha=0.95, facecolor='white', 
             edgecolor='lightgray', borderpad=0.8, labelspacing=0.6)
    
    fig.tight_layout()
    fig.savefig(out_pdf, bbox_inches="tight", dpi=300)
    plt.close(fig)
    print("saved:", out_pdf)

In [9]:
plot_group_curves(ci, groups)

saved: group_winrate_vs_rho_primary_intent.pdf


In [13]:
def bootstrap_train_test_splits(
    df,
    groups,
    models,
    train_frac=0.8,
    n_boot=100,
    seed=0,
    group_col="group",
):
    df_filtered = df[df[group_col].isin(groups)].copy()
    df_filtered = df_filtered[
        df_filtered["model_a"].isin(models) & df_filtered["model_b"].isin(models)
    ].copy()

    splits = []
    for b in range(n_boot):
        rng = np.random.default_rng(seed + 100000 * b)
        n_total = len(df_filtered)
        boot_idx = rng.integers(0, n_total, size=n_total)
        df_boot = df_filtered.iloc[boot_idx].reset_index(drop=True)

        df_train_list = []
        df_test_list = []
        for grp in groups:
            grp_df = df_boot[df_boot[group_col] == grp]
            n = len(grp_df)
            if n == 0:
                continue
            n_train = int(train_frac * n)
            idx = np.arange(n)
            rng.shuffle(idx)
            train_idx = idx[:n_train]
            test_idx = idx[n_train:]
            df_train_list.append(grp_df.iloc[train_idx])
            df_test_list.append(grp_df.iloc[test_idx])

        if len(df_train_list) == 0:
            continue

        df_train = pd.concat(df_train_list, ignore_index=True)
        df_test = pd.concat(df_test_list, ignore_index=True)
        splits.append((df_train, df_test))

        if (b + 1) % max(1, n_boot // 10) == 0:
            print(f"boot {b+1}/{n_boot}")

    return splits


def compute_generalization(
    splits,
    groups,
    models,
    rhos,
    alpha=1.0,
    group_col="group",
):
    K = len(groups)

    # store train/test/gap for each (rho, group) and "overall"
    boot_wr_train = {(rho, grp): [] for rho in rhos for grp in groups + ["overall"]}
    boot_wr_test  = {(rho, grp): [] for rho in rhos for grp in groups + ["overall"]}
    boot_wr_gap   = {(rho, grp): [] for rho in rhos for grp in groups + ["overall"]}

    for df_train, df_test in splits:
        M_train, w0_train, _, _ = build_margins(
            df_train, groups, models, alpha=alpha, group_col=group_col
        )
        M_test, w0_test, _, _ = build_margins(
            df_test, groups, models, alpha=alpha, group_col=group_col
        )

        M_train_stack = np.stack(M_train, axis=0)
        M_test_stack = np.stack(M_test, axis=0)
        w0_train_arr = np.array(w0_train, float)
        w0_test_arr = np.array(w0_test, float)

        for rho in rhos:
            p_rho, _ = solve_drml_tv(M_train, w0_train, rho)

            # per-group train/test winrates under p_rho
            wr_train = per_group_winrate_vectorized(p_rho, M_train_stack)
            wr_test = per_group_winrate_vectorized(p_rho, M_test_stack)

            for k, grp in enumerate(groups):
                boot_wr_train[(rho, grp)].append(float(wr_train[k]))
                boot_wr_test[(rho, grp)].append(float(wr_test[k]))
                boot_wr_gap[(rho, grp)].append(float(wr_test[k] - wr_train[k]))

            # overall (pooled by w0) train/test winrates
            M_pooled_train = np.einsum("k,kij->ij", w0_train_arr, M_train_stack)
            v_train_overall = float(np.min(p_rho @ M_pooled_train))
            wr_train_overall = 0.5 * (1 + v_train_overall)

            M_pooled_test = np.einsum("k,kij->ij", w0_test_arr, M_test_stack)
            v_test_overall = float(np.min(p_rho @ M_pooled_test))
            wr_test_overall = 0.5 * (1 + v_test_overall)

            boot_wr_train[(rho, "overall")].append(float(wr_train_overall))
            boot_wr_test[(rho, "overall")].append(float(wr_test_overall))
            boot_wr_gap[(rho, "overall")].append(float(wr_test_overall - wr_train_overall))

    # summarize means + SEs
    rows = []
    for rho in rhos:
        for grp in groups + ["overall"]:
            arr_train = np.array(boot_wr_train[(rho, grp)], float)
            arr_test = np.array(boot_wr_test[(rho, grp)], float)
            arr_gap = np.array(boot_wr_gap[(rho, grp)], float)

            if len(arr_train) == 0:
                continue

            n = len(arr_train)
            rows.append(
                {
                    "rho": rho,
                    "group": grp,
                    "wr_train_mean": float(arr_train.mean()),
                    "wr_train_se": float(arr_train.std(ddof=0) / np.sqrt(n)),
                    "wr_test_mean": float(arr_test.mean()),
                    "wr_test_se": float(arr_test.std(ddof=0) / np.sqrt(n)),
                    "wr_gap_mean": float(arr_gap.mean()),
                    "wr_gap_se": float(arr_gap.std(ddof=0) / np.sqrt(n)),
                }
            )

    gap_df = pd.DataFrame(rows)
    return gap_df, boot_wr_train, boot_wr_test, boot_wr_gap


In [44]:
print("Generating bootstrap train/test splits...")
splits = bootstrap_train_test_splits(df, groups, MODELS, train_frac=0.8, n_boot=N_BOOT, seed=42, group_col="group")
print("Computing generalization gaps from splits...")
gap_df, _, _, _ = compute_generalization(splits, groups, MODELS, rhos, alpha=1.0, group_col="group")

Generating bootstrap train/test splits...
boot 20/200
boot 40/200
boot 60/200
boot 80/200
boot 100/200
boot 120/200
boot 140/200
boot 160/200
boot 180/200
boot 200/200
Computing generalization gaps from splits...


In [35]:
def plot_generalization(gap_df, groups, metric="wr", split="gap", group_by="group", out_pdf=None):
    if out_pdf is None:
        out_pdf = f"{metric}_{split}_by_{group_by}.pdf"
    sns.set_style("white")
    plt.rcParams.update({"font.size":13,"axes.labelsize":14,"xtick.labelsize":12,"ytick.labelsize":13,"legend.fontsize":13,"pdf.fonttype":42,"ps.fonttype":42,"font.family":"sans-serif","axes.linewidth":0.8,"grid.linewidth":0.4,"xtick.major.width":0.6,"ytick.major.width":0})
    colors={"overall":"#2C3E50"}
    default_colors=["#E74C3C","#3498DB","#F39C12","#27AE60","#9B59B6","#1ABC9C","#E67E22"]
    for i,grp in enumerate(groups):
        if grp not in colors: colors[grp]=default_colors[i%len(default_colors)]
    y_mean=f"{metric}_{split}_mean"
    y_se=f"{metric}_{split}_se"
    fig,ax=plt.subplots(1,1,figsize=(6,4))
    if split=="gap":
        d=gap_df[gap_df["group"]=="overall"].sort_values("rho")
        if len(d)>0:
            x=d["rho"].to_numpy(float); y=-d[y_mean].to_numpy(float) * 100.0; se=d[y_se].to_numpy(float) * 100.0
            ax.plot(x,y,marker="o",linewidth=1.2,markersize=5,linestyle="-",color=colors["overall"],alpha=0.9,label="Overall")
            ax.fill_between(x,y-se,y+se,alpha=0.15,color=colors["overall"])
            ax.set_ylabel("Win rate generalization gap (%)" if metric=="wr" else metric,fontsize=14,labelpad=4)
        #     if np.any(np.isfinite(y)): ax.axhline(float(np.nanmin(y)),linestyle="--",linewidth=1.0,color=colors["overall"],alpha=0.8,label="min overall")
        # ax.legend(frameon=True,loc="best",framealpha=0.95,facecolor="white",edgecolor="lightgray",borderpad=0.8,labelspacing=0.6)
    else:
        order=["overall"]+list(groups)
        for grp in order:
            d=gap_df[gap_df["group"]==grp].sort_values("rho")
            if len(d)==0: continue
            x=d["rho"].to_numpy(float); y=d[y_mean].to_numpy(float) * 100.0; se=d[y_se].to_numpy(float) * 100.0
            ls="--" if grp=="overall" else "-"
            ax.plot(x,y,marker="o",linewidth=1.2,markersize=5,linestyle=ls,color=colors.get(grp,"#95A5A6"),alpha=0.9,label=grp.title())
            ax.fill_between(x,y-se,y+se,alpha=0.15,color=colors.get(grp,"#95A5A6"))
            if split=="test":
                ax.set_ylabel("Win rate on held out votes (%)" if metric=="wr" else metric,fontsize=14,labelpad=4)
            else:
                ax.set_ylabel("Win rate (%)" if metric=="wr" else metric,fontsize=14,labelpad=4)
        ax.legend(frameon=True,loc="best",framealpha=0.95,facecolor="white",edgecolor="lightgray",borderpad=0.8,labelspacing=0.6, fontsize=10)
    ax.set_xlabel(r"$\rho$",fontsize=15,labelpad=4)
    ax.set_axisbelow(True)
    ax.grid(True,alpha=0.25,linestyle="-",linewidth=0.5,color="gray")
    ax.grid(True,axis="y",alpha=0.15,linestyle="-",linewidth=0.3,color="gray")
    ax.spines["left"].set_linewidth(0.8); ax.spines["bottom"].set_linewidth(0.8); ax.spines["top"].set_visible(False); ax.spines["right"].set_visible(False)
    fig.tight_layout(pad=1.0)
    fig.savefig(out_pdf,bbox_inches="tight",dpi=300)
    plt.close(fig)
    print(f"saved: {out_pdf}")

def print_generalization_table(gap_df, groups, rhos, metric="wr", split="gap"):
    df=gap_df.copy()
    mean_col=f"{metric}_{split}_mean"; se_col=f"{metric}_{split}_se"
    df["formatted"]=df.apply(lambda row: f"{row[mean_col]:.4f} ± {row[se_col]:.4f}",axis=1)
    table=df.pivot(index="group",columns="rho",values="formatted")
    table=table.reindex(groups+["overall"])
    display(table)
    return table

In [33]:
print_generalization_table(gap_df, groups, rhos, metric="wr", split="train")
print_generalization_table(gap_df, groups, rhos, metric="wr", split="test")
print_generalization_table(gap_df, groups, rhos, metric="wr", split="gap")

rho,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Factual Lookup,0.4085 ± 0.0183,0.4266 ± 0.0099,0.4303 ± 0.0085,0.4398 ± 0.0083,0.4442 ± 0.0072,0.4462 ± 0.0071,0.4450 ± 0.0060,0.4413 ± 0.0050,0.4384 ± 0.0052,0.4381 ± 0.0052,0.4381 ± 0.0052
Info Synthesis,0.4268 ± 0.0149,0.4444 ± 0.0106,0.4508 ± 0.0076,0.4476 ± 0.0054,0.4469 ± 0.0040,0.4483 ± 0.0047,0.4459 ± 0.0058,0.4417 ± 0.0055,0.4368 ± 0.0049,0.4364 ± 0.0049,0.4364 ± 0.0049
Recommendation,0.3165 ± 0.0214,0.3529 ± 0.0153,0.3615 ± 0.0139,0.3821 ± 0.0125,0.3992 ± 0.0097,0.4153 ± 0.0074,0.4217 ± 0.0060,0.4277 ± 0.0051,0.4334 ± 0.0044,0.4340 ± 0.0043,0.4340 ± 0.0043
Analysis,0.4120 ± 0.0176,0.4174 ± 0.0121,0.4291 ± 0.0080,0.4367 ± 0.0044,0.4345 ± 0.0045,0.4374 ± 0.0040,0.4396 ± 0.0043,0.4356 ± 0.0039,0.4348 ± 0.0043,0.4348 ± 0.0044,0.4348 ± 0.0044
overall,0.5000 ± 0.0000,0.4950 ± 0.0011,0.4864 ± 0.0026,0.4791 ± 0.0036,0.4759 ± 0.0045,0.4737 ± 0.0039,0.4713 ± 0.0034,0.4666 ± 0.0035,0.4635 ± 0.0040,0.4638 ± 0.0041,0.4638 ± 0.0041


rho,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Factual Lookup,0.3753 ± 0.0128,0.4029 ± 0.0133,0.4050 ± 0.0118,0.4167 ± 0.0110,0.4176 ± 0.0106,0.4183 ± 0.0102,0.4166 ± 0.0100,0.4145 ± 0.0096,0.4010 ± 0.0161,0.4005 ± 0.0161,0.4005 ± 0.0161
Info Synthesis,0.3852 ± 0.0126,0.3970 ± 0.0106,0.4020 ± 0.0107,0.4156 ± 0.0137,0.4151 ± 0.0146,0.4151 ± 0.0113,0.4142 ± 0.0105,0.4139 ± 0.0086,0.4196 ± 0.0093,0.4185 ± 0.0098,0.4185 ± 0.0098
Recommendation,0.2791 ± 0.0192,0.2995 ± 0.0178,0.3175 ± 0.0189,0.3309 ± 0.0186,0.3400 ± 0.0154,0.3541 ± 0.0145,0.3608 ± 0.0144,0.3648 ± 0.0136,0.3670 ± 0.0129,0.3661 ± 0.0127,0.3661 ± 0.0127
Analysis,0.3550 ± 0.0244,0.3848 ± 0.0192,0.3965 ± 0.0159,0.4013 ± 0.0163,0.3968 ± 0.0163,0.3859 ± 0.0150,0.3873 ± 0.0140,0.3897 ± 0.0144,0.3963 ± 0.0148,0.3960 ± 0.0149,0.3960 ± 0.0149
overall,0.4570 ± 0.0075,0.4629 ± 0.0071,0.4602 ± 0.0075,0.4662 ± 0.0044,0.4673 ± 0.0039,0.4673 ± 0.0029,0.4661 ± 0.0023,0.4630 ± 0.0032,0.4603 ± 0.0038,0.4598 ± 0.0040,0.4598 ± 0.0040


rho,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Factual Lookup,-0.0332 ± 0.0248,-0.0237 ± 0.0183,-0.0253 ± 0.0113,-0.0230 ± 0.0129,-0.0266 ± 0.0144,-0.0278 ± 0.0145,-0.0284 ± 0.0133,-0.0268 ± 0.0122,-0.0375 ± 0.0176,-0.0376 ± 0.0177,-0.0376 ± 0.0177
Info Synthesis,-0.0416 ± 0.0161,-0.0474 ± 0.0130,-0.0488 ± 0.0130,-0.0320 ± 0.0154,-0.0318 ± 0.0167,-0.0333 ± 0.0144,-0.0317 ± 0.0141,-0.0278 ± 0.0127,-0.0172 ± 0.0118,-0.0179 ± 0.0122,-0.0179 ± 0.0122
Recommendation,-0.0374 ± 0.0211,-0.0534 ± 0.0135,-0.0440 ± 0.0170,-0.0512 ± 0.0183,-0.0593 ± 0.0172,-0.0612 ± 0.0151,-0.0609 ± 0.0143,-0.0629 ± 0.0140,-0.0664 ± 0.0134,-0.0678 ± 0.0132,-0.0678 ± 0.0132
Analysis,-0.0571 ± 0.0199,-0.0326 ± 0.0199,-0.0326 ± 0.0159,-0.0354 ± 0.0166,-0.0377 ± 0.0169,-0.0515 ± 0.0163,-0.0523 ± 0.0153,-0.0459 ± 0.0140,-0.0386 ± 0.0135,-0.0388 ± 0.0136,-0.0388 ± 0.0136
overall,-0.0430 ± 0.0075,-0.0321 ± 0.0066,-0.0262 ± 0.0069,-0.0130 ± 0.0040,-0.0085 ± 0.0045,-0.0064 ± 0.0047,-0.0052 ± 0.0049,-0.0037 ± 0.0048,-0.0032 ± 0.0049,-0.0040 ± 0.0052,-0.0040 ± 0.0052


rho,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Factual Lookup,-0.0332 ± 0.0248,-0.0237 ± 0.0183,-0.0253 ± 0.0113,-0.0230 ± 0.0129,-0.0266 ± 0.0144,-0.0278 ± 0.0145,-0.0284 ± 0.0133,-0.0268 ± 0.0122,-0.0375 ± 0.0176,-0.0376 ± 0.0177,-0.0376 ± 0.0177
Info Synthesis,-0.0416 ± 0.0161,-0.0474 ± 0.0130,-0.0488 ± 0.0130,-0.0320 ± 0.0154,-0.0318 ± 0.0167,-0.0333 ± 0.0144,-0.0317 ± 0.0141,-0.0278 ± 0.0127,-0.0172 ± 0.0118,-0.0179 ± 0.0122,-0.0179 ± 0.0122
Recommendation,-0.0374 ± 0.0211,-0.0534 ± 0.0135,-0.0440 ± 0.0170,-0.0512 ± 0.0183,-0.0593 ± 0.0172,-0.0612 ± 0.0151,-0.0609 ± 0.0143,-0.0629 ± 0.0140,-0.0664 ± 0.0134,-0.0678 ± 0.0132,-0.0678 ± 0.0132
Analysis,-0.0571 ± 0.0199,-0.0326 ± 0.0199,-0.0326 ± 0.0159,-0.0354 ± 0.0166,-0.0377 ± 0.0169,-0.0515 ± 0.0163,-0.0523 ± 0.0153,-0.0459 ± 0.0140,-0.0386 ± 0.0135,-0.0388 ± 0.0136,-0.0388 ± 0.0136
overall,-0.0430 ± 0.0075,-0.0321 ± 0.0066,-0.0262 ± 0.0069,-0.0130 ± 0.0040,-0.0085 ± 0.0045,-0.0064 ± 0.0047,-0.0052 ± 0.0049,-0.0037 ± 0.0048,-0.0032 ± 0.0049,-0.0040 ± 0.0052,-0.0040 ± 0.0052


In [36]:
plot_generalization(gap_df, groups, metric="wr", split="train", group_by=GROUP_BY, out_pdf=f"wr_train_by_{GROUP_BY}.pdf")
plot_generalization(gap_df, groups, metric="wr", split="test", group_by=GROUP_BY, out_pdf=f"wr_test_by_{GROUP_BY}.pdf")
plot_generalization(gap_df, groups, metric="wr", split="gap", group_by=GROUP_BY, out_pdf=f"wr_gap_by_{GROUP_BY}.pdf")

saved: wr_train_by_primary_intent.pdf
saved: wr_test_by_primary_intent.pdf
saved: wr_gap_by_primary_intent.pdf


In [37]:
def print_boot_p(boot_p, models, rhos, thresh=0.02):
    rows = []
    for rho in rhos:
        P = np.vstack(boot_p[rho])
        mean = P.mean(axis=0)
        se = P.std(axis=0) / np.sqrt(P.shape[0])
        for i, name in enumerate(models):
            rows.append({"rho": rho, "model": name, "mean": mean[i], "se": se[i]})
    wdf = pd.DataFrame(rows)
    keep = wdf.groupby("model")["mean"].max()
    keep = keep[keep > thresh].index
    wdf = wdf[wdf["model"].isin(keep)].copy()
    wdf["mean%"] = (100*wdf["mean"]).round(2)
    wdf["se%"] = (100*wdf["se"]).round(2)
    
    wdf["formatted"] = wdf.apply(lambda row: f"{row['mean%']:.2f} ± {row['se%']:.2f}", axis=1)
    
    table = wdf.pivot(index="model", columns="rho", values="formatted")
    
    rho0_means = wdf[wdf["rho"] == rhos[0]].set_index("model")["mean"].sort_values(ascending=False)
    table = table.reindex(rho0_means.index)
    
    print(f"(shown if mean> {100*thresh:.1f}% for some rho)")
    display(table)
    return wdf

wdf = print_boot_p(boot_p, MODELS, rhos, thresh=0.02)


(shown if mean> 2.0% for some rho)


rho,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
sonar-reasoning-pro-high,48.74 ± 2.92,45.74 ± 2.18,42.04 ± 1.61,35.89 ± 1.35,33.07 ± 1.20,31.70 ± 1.14,31.09 ± 1.09,31.92 ± 1.08,32.15 ± 1.08,32.30 ± 1.07,32.30 ± 1.07
gemini-2.5-pro-exp-03-25-grounding,34.15 ± 2.67,34.72 ± 2.13,32.52 ± 1.78,31.71 ± 1.60,30.82 ± 1.48,29.01 ± 1.36,28.11 ± 1.27,26.58 ± 1.21,25.75 ± 1.21,25.57 ± 1.21,25.57 ± 1.21
sonar-pro-high,9.23 ± 1.51,11.53 ± 1.20,14.67 ± 0.99,17.26 ± 0.88,18.47 ± 0.83,19.11 ± 0.82,19.56 ± 0.82,19.65 ± 0.81,20.08 ± 0.81,20.20 ± 0.82,20.20 ± 0.82
sonar,5.27 ± 0.94,5.86 ± 0.84,7.43 ± 0.83,9.10 ± 0.84,10.02 ± 0.80,10.98 ± 0.80,10.98 ± 0.78,10.62 ± 0.76,9.90 ± 0.76,9.75 ± 0.75,9.75 ± 0.75
sonar-reasoning,1.86 ± 0.40,1.56 ± 0.28,2.07 ± 0.35,3.43 ± 0.47,3.37 ± 0.47,3.10 ± 0.42,2.62 ± 0.38,2.50 ± 0.38,2.54 ± 0.38,2.53 ± 0.37,2.53 ± 0.37
sonar-pro,0.38 ± 0.18,0.47 ± 0.16,0.92 ± 0.24,1.39 ± 0.30,1.97 ± 0.34,2.50 ± 0.36,2.73 ± 0.38,2.72 ± 0.37,2.79 ± 0.40,2.76 ± 0.40,2.76 ± 0.40
gemini-2.5-pro-exp-03-25-wo-search,0.23 ± 0.12,0.06 ± 0.05,0.22 ± 0.10,0.88 ± 0.21,1.57 ± 0.29,2.39 ± 0.36,3.22 ± 0.41,3.81 ± 0.43,4.37 ± 0.48,4.41 ± 0.48,4.41 ± 0.48
gemini-2.5-flash-preview-04-17-grounding,0.14 ± 0.14,0.06 ± 0.06,0.13 ± 0.08,0.34 ± 0.11,0.72 ± 0.18,1.19 ± 0.26,1.63 ± 0.30,2.09 ± 0.34,2.26 ± 0.35,2.31 ± 0.35,2.31 ± 0.35


In [42]:
def plot_model_performance_grid(wdf, out_pdf=None):
    if out_pdf is None:
        out_pdf = f"model_performance_grid_{GROUP_BY}.pdf"
    sns.set_style("white")
    plt.rcParams.update({"font.size":10,"axes.labelsize":11,"xtick.labelsize":9,"ytick.labelsize":10,"legend.fontsize":10,"pdf.fonttype":42,"ps.fonttype":42,"font.family":"sans-serif","axes.linewidth":0.8,"grid.linewidth":0.4,"xtick.major.width":0.6,"ytick.major.width":0})
    rhos_to_plot=[0.0,0.4,0.7,1.0]
    model_names={'gemini-2.0-flash-grounding':'Gemini 2.0 Flash','gemini-2.5-flash-preview-04-17-grounding':'Gemini 2.5 Flash','gemini-2.5-pro-exp-03-25-grounding':'Gemini 2.5 Pro','gemini-2.5-pro-exp-03-25-wo-search':'Gemini 2.5 Pro (no search)','gpt-4o-mini-search-preview':'GPT-4o Mini','gpt-4o-search-preview':'GPT-4o','gpt-4o-search-preview-high':'GPT-4o High','gpt-4o-search-preview-high-loc':'GPT-4o High Loc','sonar':'Sonar','sonar-pro':'Sonar Pro','sonar-pro-high':'Sonar Pro High','sonar-reasoning':'Sonar Reasoning','sonar-reasoning-pro-high':'Sonar Reasoning Pro High'}
    colors={'sonar-reasoning-pro-high':'#E74C3C','gemini-2.5-pro-exp-03-25-grounding':'#3498DB','sonar-pro-high':'#F39C12','sonar':'#27AE60','sonar-pro':'#9B59B6','sonar-reasoning':'#1ABC9C','gemini-2.5-pro-exp-03-25-wo-search':'#E67E22','gemini-2.5-flash-preview-04-17-grounding':'#E91E63'}
    fig,axes=plt.subplots(1,4,figsize=(18,5)); axes=axes.flatten()
    models_ordered=wdf[np.isclose(wdf["rho"],0.0)].sort_values("mean",ascending=True)["model"].values
    for idx,rho in enumerate(rhos_to_plot):
        ax=axes[idx]
        data=wdf[np.isclose(wdf["rho"],rho)].copy()
        data=data.set_index("model").reindex(models_ordered).reset_index().dropna()
        bar_spacing=0.65
        y_pos=np.arange(len(data))*bar_spacing
        bar_colors=[colors.get(m,"#95A5A6") for m in data["model"]]
        ax.barh(y_pos,data["mean%"],xerr=data["se%"],color=bar_colors,error_kw={"elinewidth":1.2,"capsize":2.5,"alpha":0.7},height=0.5*bar_spacing,alpha=0.85,edgecolor="white",linewidth=0.5)
        ax.set_xlim(0,75); ax.set_xticks([0,20,40,60,80])
        ax.set_axisbelow(True)
        ax.grid(True,axis="x",alpha=0.25,linestyle="-",linewidth=0.5,color="gray")
        ax.grid(True,axis="y",alpha=0.15,linestyle="-",linewidth=0.3,color="gray")
        if idx==0:
            ax.set_yticks(y_pos)
            ax.set_yticklabels([model_names.get(m,m) for m in data["model"]],fontsize=12)
        else:
            ax.set_yticks(y_pos)
            ax.set_yticklabels([])
        ax.set_xlabel("Probability (%)",fontsize=14,labelpad=4)
        ax.set_xticklabels(["0","20","40","60","80"],fontsize=14)
        ax.text(0.95,0.1,f"ρ = {rho:.1f}",transform=ax.transAxes,fontsize=16,fontweight="bold",ha="right",va="bottom",bbox=dict(boxstyle="round,pad=0.4",facecolor="white",edgecolor="lightgray",alpha=0.8,linewidth=0.5))
        ax.spines["left"].set_linewidth(0.8); ax.spines["bottom"].set_linewidth(0.8); ax.spines["top"].set_visible(False); ax.spines["right"].set_visible(False)
    fig.tight_layout(pad=1.2)
    fig.savefig(out_pdf,bbox_inches="tight",dpi=300)
    plt.close(fig)
    print(f"saved: {out_pdf}")


In [43]:
plot_model_performance_grid(wdf)

saved: model_performance_grid_primary_intent.pdf
