# 0) Clean image performance

In [66]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Clean-baseline result CSV for each model.
files = {
    "internvl": "internvl_clean_baseline.csv",
    "lingshu": "lingshu_clean_baseline.csv",
    "medgemma": "medgemma_clean_baseline.csv",
    "qwen3": "qwen3_clean_baseline.csv",
}

# Values of the "dataset" column that are scored separately.
modalities = ["MRI", "OCT", "Xray"]

# One dict of metrics per (model, modality) pair.
results = []

for model, path in files.items():
    df = pd.read_csv(path)

    # Normalise types: strings, blanks and other unparseable values become NaN.
    df["gt_binary"] = pd.to_numeric(df["gt_binary"], errors="coerce")
    df["pred_binary"] = pd.to_numeric(df["pred_binary"], errors="coerce")
    df["is_correct"] = pd.to_numeric(df["is_correct"], errors="coerce")

    for mod in modalities:
        sub = df[df["dataset"] == mod].copy()
        if sub.empty:
            continue

        # Drop rows whose ground truth or prediction is NaN.
        sub = sub.dropna(subset=["gt_binary", "pred_binary"])
        if sub.empty:
            # Every row of this modality was unscoreable. Skip it rather than
            # hand empty arrays to the sklearn scorers, whose result on empty
            # input is undefined (NaN or an error depending on the version).
            continue

        y_true = sub["gt_binary"].astype(int)
        y_pred = sub["pred_binary"].astype(int)

        results.append({
            "model": model,
            "dataset": mod,
            # Number of rows that were actually scored (after the NaN drop).
            "n_samples": len(sub),
            "accuracy": accuracy_score(y_true, y_pred),
            # zero_division=0: report 0 instead of warning when a denominator is 0.
            "precision": precision_score(y_true, y_pred, zero_division=0),
            "recall": recall_score(y_true, y_pred, zero_division=0),
            "f1": f1_score(y_true, y_pred, zero_division=0),
            # Sanity check: if is_correct is populated correctly, this value
            # should equal "accuracy" above.
            "is_correct_mean": sub["is_correct"].mean(),
        })

results_df = pd.DataFrame(results)

# Metrics indexed by (model, dataset). There is one result row per pair, so the
# default mean aggregation of pivot_table leaves the values unchanged.
pivot = (
    results_df
    .pivot_table(
        index=["model", "dataset"],
        values=["n_samples", "accuracy", "precision", "recall", "f1"],
    )
    .sort_index()
)

# pivot_table orders value columns alphabetically; restore the reading order.
pivot = pivot[["n_samples", "accuracy", "precision", "recall", "f1"]]

from IPython.display import display

metrics = ["n_samples", "accuracy", "precision", "recall", "f1"]
dataset_order = ["MRI", "OCT", "Xray"]

# Scores are shown with four decimals, the sample count as a whole number.
column_formats = {score: "{:.4f}" for score in ("accuracy", "precision", "recall", "f1")}
column_formats["n_samples"] = "{:.0f}"

# One styled table per model, in the order the models first appear in results_df.
for model, model_rows in results_df.groupby("model", sort=False):
    print(f"\nü§ñ {model.upper()} ‚Äî Clean baseline")

    # Rows follow dataset_order; a dataset without results shows up as NaN.
    per_dataset = (
        model_rows
        .set_index("dataset")
        .loc[:, metrics]
        .reindex(dataset_order)
    )

    styled = per_dataset.style.format(column_formats).background_gradient(subset=["f1"])
    display(styled)



ü§ñ INTERNVL ‚Äî Clean baseline


Unnamed: 0_level_0,n_samples,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,20,0.8,1.0,0.6,0.75
OCT,20,0.7,1.0,0.4,0.5714
Xray,20,0.75,0.8571,0.6,0.7059



ü§ñ LINGSHU ‚Äî Clean baseline


Unnamed: 0_level_0,n_samples,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,20,1.0,1.0,1.0,1.0
OCT,20,0.7,1.0,0.4,0.5714
Xray,20,0.9,1.0,0.8,0.8889



ü§ñ MEDGEMMA ‚Äî Clean baseline


Unnamed: 0_level_0,n_samples,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,20,0.85,0.7692,1.0,0.8696
OCT,20,0.5,0.0,0.0,0.0
Xray,20,0.85,1.0,0.7,0.8235



ü§ñ QWEN3 ‚Äî Clean baseline


Unnamed: 0_level_0,n_samples,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,20,0.8,1.0,0.6,0.75
OCT,20,0.75,0.7273,0.8,0.7619
Xray,20,0.4,0.3333,0.2,0.25


# 1) Weak image performance

In [75]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pathlib import Path

models = ["internvl", "lingshu", "medgemma", "qwen3"]

# Prompt variants; each result file is named "<model>_<prompt>.csv".
prompts = [
    "baseline-prompts",
    "disease-cue-reason-1-prompts",
    "disease_cue-1-prompts",
    "artefact-disease-cue-2-prompts",
    "artefact_disease_cue-1-prompts",
    "artefact-disease-cue-reason-1-prompts",
]

modalities = ["MRI", "OCT", "Xray"]

# One record per (model, prompt, modality) that has weak-severity rows.
results = []

for model in models:
    for prompt in prompts:
        csv_path = Path(f"{model}_{prompt}.csv")
        if not csv_path.exists():
            # Not every model/prompt combination has a result file.
            continue

        # Keep only the weak-severity rows.
        weak = pd.read_csv(csv_path)
        weak = weak.loc[weak["severity"] == "weak"].copy()

        # Convert to numbers; strings, blanks and odd values become NaN.
        for label_col in ("binarylabel", "pred_binary"):
            weak[label_col] = pd.to_numeric(weak[label_col], errors="coerce")

        for mod in modalities:
            rows = weak.loc[weak["dataset"] == mod]
            if rows.empty:
                continue

            # Only rows with both a label and a prediction can be scored.
            scoreable = rows.dropna(subset=["binarylabel", "pred_binary"])

            record = {
                "model": model,
                "prompt": prompt,
                "dataset": mod,
                "n_total": len(rows),
                "n_valid": len(scoreable),
                "n_gt_nan": int(rows["binarylabel"].isna().sum()),
                "n_pred_nan": int(rows["pred_binary"].isna().sum()),
            }

            if scoreable.empty:
                # Nothing to score: keep the bookkeeping row, leave metrics blank.
                record.update(accuracy=None, precision=None, recall=None, f1=None)
            else:
                y_true = scoreable["binarylabel"].astype(int)
                y_pred = scoreable["pred_binary"].astype(int)
                record.update(
                    accuracy=accuracy_score(y_true, y_pred),
                    precision=precision_score(y_true, y_pred, zero_division=0),
                    recall=recall_score(y_true, y_pred, zero_division=0),
                    f1=f1_score(y_true, y_pred, zero_division=0),
                )

            results.append(record)

results_df = pd.DataFrame(results)

# Metrics indexed by (model, prompt, dataset); each key has a single row, so
# "first" simply carries the value over.
pivot = results_df.pivot_table(
    index=["model", "prompt", "dataset"],
    values=["n_valid", "accuracy", "precision", "recall", "f1"],
    aggfunc="first",
).sort_index()

from IPython.display import display

model_order = ["internvl", "lingshu", "medgemma", "qwen3"]
dataset_order = ["MRI", "OCT", "Xray"]
metrics = ["n_valid", "accuracy", "precision", "recall", "f1"]

# Only these prompts are printed in this cell, in this order.
prompt_order = [
    "baseline-prompts",
    "disease_cue-1-prompts",
    "artefact_disease_cue-1-prompts",
    "artefact-disease-cue-2-prompts",
]


for prompt in prompt_order:
    prompt_rows = results_df.loc[results_df["prompt"] == prompt]
    if prompt_rows.empty:
        continue

    # Section banner: prompt name followed by a 60-character rule.
    rule = "‚îÄ" * 60
    print(f"\nüßæ PROMPT: {prompt}\n{rule}")

    for model in model_order:
        model_rows = prompt_rows.loc[prompt_rows["model"] == model]
        if model_rows.empty:
            continue

        # One row per dataset in dataset_order; missing datasets become NaN rows.
        per_dataset = (
            model_rows
            .set_index("dataset")
            .loc[:, metrics]
            .reindex(dataset_order)
        )

        print(f"\nü§ñ Model: {model}")
        display(per_dataset.round(4))



üßæ PROMPT: baseline-prompts
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

ü§ñ Model: internvl


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.77,1.0,0.54,0.7013
OCT,100,0.55,1.0,0.1,0.1818
Xray,100,0.63,0.9333,0.28,0.4308



ü§ñ Model: lingshu


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.89,0.9535,0.82,0.8817
OCT,100,0.64,1.0,0.28,0.4375
Xray,100,0.69,0.6418,0.86,0.735



ü§ñ Model: medgemma


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.85,0.8889,0.8,0.8421
OCT,100,0.5,0.0,0.0,0.0
Xray,100,0.56,1.0,0.12,0.2143



ü§ñ Model: qwen3


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.66,1.0,0.32,0.4848
OCT,100,0.54,0.75,0.12,0.2069
Xray,100,0.56,1.0,0.12,0.2143



üßæ PROMPT: disease_cue-1-prompts
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

ü§ñ Model: internvl


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.72,1.0,0.44,0.6111
OCT,100,0.55,1.0,0.1,0.1818
Xray,100,0.67,0.9474,0.36,0.5217



ü§ñ Model: lingshu


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.88,0.975,0.78,0.8667
OCT,100,0.59,1.0,0.18,0.3051
Xray,100,0.71,0.7442,0.64,0.6882



ü§ñ Model: medgemma


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.86,0.8462,0.88,0.8627
OCT,100,0.51,1.0,0.02,0.0392
Xray,100,0.62,1.0,0.24,0.3871



ü§ñ Model: qwen3


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.68,1.0,0.36,0.5294
OCT,100,0.53,0.6667,0.12,0.2034
Xray,100,0.57,1.0,0.14,0.2456



üßæ PROMPT: artefact_disease_cue-1-prompts
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

ü§ñ Model: internvl


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.68,1.0,0.36,0.5294
OCT,100,0.52,1.0,0.04,0.0769
Xray,100,0.61,1.0,0.22,0.3607



ü§ñ Model: lingshu


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.84,0.9722,0.7,0.814
OCT,100,0.59,1.0,0.18,0.3051
Xray,100,0.71,0.7442,0.64,0.6882



ü§ñ Model: medgemma


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.86,0.8462,0.88,0.8627
OCT,100,0.5,0.0,0.0,0.0
Xray,100,0.62,1.0,0.24,0.3871



ü§ñ Model: qwen3


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.62,1.0,0.24,0.3871
OCT,100,0.52,0.75,0.06,0.1111
Xray,100,0.52,1.0,0.04,0.0769



üßæ PROMPT: artefact-disease-cue-2-prompts
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

ü§ñ Model: internvl


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.69,1.0,0.38,0.5507
OCT,100,0.52,1.0,0.04,0.0769
Xray,100,0.61,1.0,0.22,0.3607



ü§ñ Model: lingshu


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.85,0.973,0.72,0.8276
OCT,100,0.59,1.0,0.18,0.3051
Xray,100,0.71,0.7442,0.64,0.6882



ü§ñ Model: medgemma


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.86,0.8462,0.88,0.8627
OCT,100,0.5,0.0,0.0,0.0
Xray,100,0.61,1.0,0.22,0.3607



ü§ñ Model: qwen3


Unnamed: 0_level_0,n_valid,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MRI,100,0.62,1.0,0.24,0.3871
OCT,100,0.5,0.0,0.0,0.0
Xray,100,0.52,1.0,0.04,0.0769


In [76]:
from IPython.display import display

model_order = ["internvl", "lingshu", "medgemma", "qwen3"]
dataset_order = ["MRI", "OCT", "Xray"]
metrics = ["accuracy", "precision", "recall", "f1"]

prompt_order = [
    "baseline-prompts",
    "disease-cue-reason-1-prompts",
    "disease_cue-1-prompts",
    "artefact-disease-cue-2-prompts",
    "artefact_disease_cue-1-prompts",
    "artefact-disease-cue-reason-1-prompts",
]

# Column order of every displayed table: sample count first, then the scores.
table_columns = ["n_valid"] + metrics

# One combined table per prompt, built from results_df (computed in an earlier cell).
for prompt in prompt_order:
    df_p = results_df[results_df["prompt"] == prompt].copy()
    if df_p.empty:
        continue

    print(f"\nüßæ PROMPT: {prompt}")
    print("‚îÄ" * 70)

    # (model, dataset) pairs become the rows.
    tbl = (
        df_p
        .pivot_table(
            index=["model", "dataset"],
            values=table_columns,
            aggfunc="first",
        )
        .reindex(
            # Fixed row order; pairs without results appear as NaN rows.
            index=pd.MultiIndex.from_product(
                [model_order, dataset_order],
                names=["model", "dataset"]
            ),
            # pivot_table returns the value columns alphabetically (and can drop
            # all-NaN ones); put them back in the intended order so "f1" is
            # always present for the gradient below.
            columns=table_columns,
        )
    )

    display(
        tbl.round(4)
        .style
        .background_gradient(subset=["f1"])
        .set_caption(f"Weak performance ‚Äî Prompt: {prompt}")
    )



üßæ PROMPT: baseline-prompts
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,n_valid,precision,recall
model,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
internvl,MRI,0.77,0.7013,100,1.0,0.54
internvl,OCT,0.55,0.1818,100,1.0,0.1
internvl,Xray,0.63,0.4308,100,0.9333,0.28
lingshu,MRI,0.89,0.8817,100,0.9535,0.82
lingshu,OCT,0.64,0.4375,100,1.0,0.28
lingshu,Xray,0.69,0.735,100,0.6418,0.86
medgemma,MRI,0.85,0.8421,100,0.8889,0.8
medgemma,OCT,0.5,0.0,100,0.0,0.0
medgemma,Xray,0.56,0.2143,100,1.0,0.12
qwen3,MRI,0.66,0.4848,100,1.0,0.32



üßæ PROMPT: disease-cue-reason-1-prompts
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,n_valid,precision,recall
model,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
internvl,MRI,0.56,0.2143,100,1.0,0.12
internvl,OCT,0.5,0.0,100,0.0,0.0
internvl,Xray,0.52,0.0769,100,1.0,0.04
lingshu,MRI,0.83,0.8,100,0.9714,0.68
lingshu,OCT,0.59,0.3051,100,1.0,0.18
lingshu,Xray,0.7,0.6809,100,0.7273,0.64
medgemma,MRI,0.81,0.7711,100,0.9697,0.64
medgemma,OCT,0.5,0.0,100,0.0,0.0
medgemma,Xray,0.53,0.1132,100,1.0,0.06
qwen3,MRI,0.59,0.3051,100,1.0,0.18



üßæ PROMPT: disease_cue-1-prompts
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,n_valid,precision,recall
model,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
internvl,MRI,0.72,0.6111,100,1.0,0.44
internvl,OCT,0.55,0.1818,100,1.0,0.1
internvl,Xray,0.67,0.5217,100,0.9474,0.36
lingshu,MRI,0.88,0.8667,100,0.975,0.78
lingshu,OCT,0.59,0.3051,100,1.0,0.18
lingshu,Xray,0.71,0.6882,100,0.7442,0.64
medgemma,MRI,0.86,0.8627,100,0.8462,0.88
medgemma,OCT,0.51,0.0392,100,1.0,0.02
medgemma,Xray,0.62,0.3871,100,1.0,0.24
qwen3,MRI,0.68,0.5294,100,1.0,0.36



üßæ PROMPT: artefact-disease-cue-2-prompts
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,n_valid,precision,recall
model,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
internvl,MRI,0.69,0.5507,100,1.0,0.38
internvl,OCT,0.52,0.0769,100,1.0,0.04
internvl,Xray,0.61,0.3607,100,1.0,0.22
lingshu,MRI,0.85,0.8276,100,0.973,0.72
lingshu,OCT,0.59,0.3051,100,1.0,0.18
lingshu,Xray,0.71,0.6882,100,0.7442,0.64
medgemma,MRI,0.86,0.8627,100,0.8462,0.88
medgemma,OCT,0.5,0.0,100,0.0,0.0
medgemma,Xray,0.61,0.3607,100,1.0,0.22
qwen3,MRI,0.62,0.3871,100,1.0,0.24



üßæ PROMPT: artefact_disease_cue-1-prompts
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,n_valid,precision,recall
model,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
internvl,MRI,0.68,0.5294,100,1.0,0.36
internvl,OCT,0.52,0.0769,100,1.0,0.04
internvl,Xray,0.61,0.3607,100,1.0,0.22
lingshu,MRI,0.84,0.814,100,0.9722,0.7
lingshu,OCT,0.59,0.3051,100,1.0,0.18
lingshu,Xray,0.71,0.6882,100,0.7442,0.64
medgemma,MRI,0.86,0.8627,100,0.8462,0.88
medgemma,OCT,0.5,0.0,100,0.0,0.0
medgemma,Xray,0.62,0.3871,100,1.0,0.24
qwen3,MRI,0.62,0.3871,100,1.0,0.24



üßæ PROMPT: artefact-disease-cue-reason-1-prompts
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,n_valid,precision,recall
model,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
internvl,MRI,0.56,0.2143,100,1.0,0.12
internvl,OCT,0.5,0.0,100,0.0,0.0
internvl,Xray,0.51,0.0392,100,1.0,0.02
lingshu,MRI,0.82,0.7857,100,0.9706,0.66
lingshu,OCT,0.58,0.2759,100,1.0,0.16
lingshu,Xray,0.7,0.6739,100,0.7381,0.62
medgemma,MRI,0.81,0.7711,100,0.9697,0.64
medgemma,OCT,0.5,0.0,100,0.0,0.0
medgemma,Xray,0.52,0.0769,100,1.0,0.04
qwen3,MRI,0.53,0.1132,100,1.0,0.06


In [77]:
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from IPython.display import display

# =========================
# 0) Settings
# =========================
models = ["internvl", "lingshu", "medgemma", "qwen3"]

# Prompt variants; weak/strong result files are named "<model>_<prompt>.csv".
prompts = [
    "baseline-prompts",
    "disease-cue-reason-1-prompts",
    "disease_cue-1-prompts",
    "artefact-disease-cue-2-prompts",
    "artefact_disease_cue-1-prompts",
    "artefact-disease-cue-reason-1-prompts",
]

# The X-ray dataset is always spelled "Xray" in this analysis.
datasets = ["MRI", "OCT", "Xray"]

# Metric used to pick the best prompt (f1 recommended).
BEST_METRIC = "f1"

# Clean-baseline CSV of every model: "<model>_clean_baseline.csv".
clean_files = {name: f"{name}_clean_baseline.csv" for name in models}

def norm_dataset(s: pd.Series) -> pd.Series:
    """Return a copy of ``s`` with every spelling of "xray" rewritten to "Xray".

    A value counts as a spelling of "xray" when it equals "xray" after
    surrounding whitespace is stripped and case is ignored (for example
    "XRAY", "XRay", "xray" or " Xray "). All other values, including missing
    ones, are returned untouched.

    Parameters
    ----------
    s : pd.Series
        Dataset names, typically the "dataset" column of a result CSV.

    Returns
    -------
    pd.Series
        New Series with the same index; ``s`` itself is not modified.
    """
    # Compare on a stringified, trimmed, case-folded view so that non-string
    # entries (NaN, None, numbers) simply fail the match instead of raising.
    is_xray = s.astype(str).str.strip().str.casefold() == "xray"
    return s.mask(is_xray, "Xray")

# =========================
# 1) Clean metrics
# =========================
clean_rows = []
for model, path in clean_files.items():
    if not Path(path).exists():
        print(f"‚ö†Ô∏è missing clean file: {path}")
        continue

    # Load, unify the dataset spelling and coerce both label columns to numbers
    # (anything unparseable becomes NaN).
    df = pd.read_csv(path).assign(
        dataset=lambda frame: norm_dataset(frame["dataset"]),
        gt_binary=lambda frame: pd.to_numeric(frame["gt_binary"], errors="coerce"),
        pred_binary=lambda frame: pd.to_numeric(frame["pred_binary"], errors="coerce"),
    )

    # Only rows with both a ground truth and a prediction can be scored.
    scoreable = df.dropna(subset=["gt_binary", "pred_binary"])

    for ds in datasets:
        valid = scoreable.loc[scoreable["dataset"] == ds]
        if valid.empty:
            # No rows for this dataset, or none of them is scoreable.
            continue

        y_true = valid["gt_binary"].astype(int)
        y_pred = valid["pred_binary"].astype(int)

        row = {"model": model, "dataset": ds, "clean_n": len(valid)}
        row["clean_accuracy"] = accuracy_score(y_true, y_pred)
        # zero_division=0: score 0 instead of warning when a denominator is 0.
        for score_name, scorer in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1", f1_score),
        ):
            row[f"clean_{score_name}"] = scorer(y_true, y_pred, zero_division=0)
        clean_rows.append(row)

clean_df = pd.DataFrame(clean_rows)

# =========================
# 2) Weak metrics (severity=weak)
# =========================
# One record per (model, prompt, dataset) that has weak-severity rows.
weak_rows = []
for model in models:
    for prompt in prompts:
        file = f"{model}_{prompt}.csv"
        if not Path(file).exists():
            # Not every model/prompt combination has a result file.
            continue

        df = pd.read_csv(file)
        df = df[df["severity"] == "weak"].copy()

        # Unify the dataset spelling and coerce labels to numbers
        # (anything unparseable becomes NaN).
        df["dataset"] = norm_dataset(df["dataset"])
        df["binarylabel"] = pd.to_numeric(df["binarylabel"], errors="coerce")
        df["pred_binary"] = pd.to_numeric(df["pred_binary"], errors="coerce")

        for ds in datasets:
            sub = df[df["dataset"] == ds].copy()
            if sub.empty:
                continue

            n_total = len(sub)
            n_gt_nan = int(sub["binarylabel"].isna().sum())
            n_pred_nan = int(sub["pred_binary"].isna().sum())

            # Only rows with both a label and a prediction can be scored.
            valid = sub.dropna(subset=["binarylabel", "pred_binary"]).copy()
            n_valid = len(valid)

            if n_valid == 0:
                # Keep the bookkeeping row but leave the metrics missing. A float
                # NaN (rather than pd.NA) keeps the metric columns numeric; pd.NA
                # mixed with floats would turn them into object columns, which
                # the later subtraction, sorting and colour gradient are not
                # meant to operate on.
                missing = float("nan")
                weak_rows.append({
                    "model": model, "prompt": prompt, "dataset": ds,
                    "n_total": n_total, "n_valid": 0,
                    "n_gt_nan": n_gt_nan, "n_pred_nan": n_pred_nan,
                    "accuracy": missing, "precision": missing,
                    "recall": missing, "f1": missing,
                })
                continue

            y_true = valid["binarylabel"].astype(int)
            y_pred = valid["pred_binary"].astype(int)

            weak_rows.append({
                "model": model, "prompt": prompt, "dataset": ds,
                "n_total": n_total, "n_valid": n_valid,
                "n_gt_nan": n_gt_nan, "n_pred_nan": n_pred_nan,
                "accuracy": accuracy_score(y_true, y_pred),
                # zero_division=0: score 0 instead of warning when a denominator is 0.
                "precision": precision_score(y_true, y_pred, zero_division=0),
                "recall": recall_score(y_true, y_pred, zero_division=0),
                "f1": f1_score(y_true, y_pred, zero_division=0),
            })

weak_df = pd.DataFrame(weak_rows)

# =========================
# 3) Merge + drop
# =========================
# Attach the clean scores of the same (model, dataset) to every weak row.
# how="left" keeps weak rows that have no clean counterpart (clean_* are NaN).
merged = pd.merge(weak_df, clean_df, how="left", on=["model", "dataset"])

# <metric>_drop = weak score minus clean score, so a negative value means the
# weak-severity result is worse than the clean one.
for m in ("accuracy", "precision", "recall", "f1"):
    merged[f"{m}_drop"] = merged[m] - merged[f"clean_{m}"]

# =========================
# 4) Best prompt (model x dataset)
# =========================
# Rank rows that have a BEST_METRIC value from best to worst, with more valid
# samples ranked first among equal scores ...
ranked = (
    merged
    .dropna(subset=[BEST_METRIC])
    .sort_values(by=[BEST_METRIC, "n_valid"], ascending=False)
)

# ... then keep the top entry of every (model, dataset) group. Note that
# GroupBy.first() takes the first non-null value of each column in the group.
best_df = ranked.groupby(["model", "dataset"], as_index=False).first()

# =========================
# 5) Pretty output
# =========================

# Number formats shared by the two "best prompt" views; the drop is signed.
score_formats = {
    "f1": "{:.4f}", "clean_f1": "{:.4f}", "f1_drop": "{:+.4f}",
    "recall": "{:.4f}", "precision": "{:.4f}", "accuracy": "{:.4f}",
}

# (A) Clean summary (short): clean F1 as a model x dataset grid.
print("üßº CLEAN baseline summary")
clean_pivot = clean_df.pivot_table(
    index="model", columns="dataset", values="clean_f1", aggfunc="first"
).reindex(index=models, columns=datasets)
display(clean_pivot.style.format("{:.4f}").set_caption("Clean F1 (higher is better)"))

# (B) BEST prompt summary table (the most important one).
print("\nüèÜ BEST prompt per (model √ó modality) ‚Äî based on F1 (weak only)")
best_columns = [
    "model", "dataset", "prompt", "n_valid",
    "f1", "clean_f1", "f1_drop",
    "recall", "precision", "accuracy",
]

# Ordered categoricals pin the model / modality order so the table sorts the
# way the lists above are written rather than alphabetically.
best_show = (
    best_df[best_columns]
    .assign(
        model=lambda frame: pd.Categorical(frame["model"], categories=models, ordered=True),
        dataset=lambda frame: pd.Categorical(frame["dataset"], categories=datasets, ordered=True),
    )
    .sort_values(["model", "dataset"])
)

best_styled = (
    best_show.style
    .format(score_formats)
    .background_gradient(subset=["f1"], axis=None)       # shade f1 by value
    .background_gradient(subset=["f1_drop"], axis=None)  # shade the drop as well
    .set_caption("Best prompt (weak) + clean reference + drop")
)
display(best_styled)

# (C) Compact per-model view: only the best prompt for each modality.
print("\nü§ñ Model-wise best prompts (compact view)")
compact_columns = ["prompt", "n_valid", "f1", "f1_drop", "clean_f1", "recall", "precision", "accuracy"]
for model in models:
    model_best = best_show.loc[best_show["model"] == model]
    if model_best.empty:
        continue

    # One row per dataset in the configured order; missing ones become NaN rows.
    compact = (
        model_best
        .set_index("dataset")
        .loc[:, compact_columns]
        .reindex(datasets)
    )

    print(f"\n‚ú® {model.upper()}")
    display(
        compact.style
        .format(score_formats)
        .background_gradient(subset=["f1"], axis=None)
        .background_gradient(subset=["f1_drop"], axis=None)
    )


üßº CLEAN baseline summary


dataset,MRI,OCT,Xray
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
internvl,0.75,0.5714,0.7059
lingshu,1.0,0.5714,0.8889
medgemma,0.8696,0.0,0.8235
qwen3,0.75,0.7619,0.25



üèÜ BEST prompt per (model √ó modality) ‚Äî based on F1 (weak only)


Unnamed: 0,model,dataset,prompt,n_valid,f1,clean_f1,f1_drop,recall,precision,accuracy
0,internvl,MRI,baseline-prompts,100,0.7013,0.75,-0.0487,0.54,1.0,0.77
1,internvl,OCT,baseline-prompts,100,0.1818,0.5714,-0.3896,0.1,1.0,0.55
2,internvl,Xray,disease_cue-1-prompts,100,0.5217,0.7059,-0.1841,0.36,0.9474,0.67
3,lingshu,MRI,baseline-prompts,100,0.8817,1.0,-0.1183,0.82,0.9535,0.89
4,lingshu,OCT,baseline-prompts,100,0.4375,0.5714,-0.1339,0.28,1.0,0.64
5,lingshu,Xray,baseline-prompts,100,0.735,0.8889,-0.1538,0.86,0.6418,0.69
6,medgemma,MRI,disease_cue-1-prompts,100,0.8627,0.8696,-0.0068,0.88,0.8462,0.86
7,medgemma,OCT,disease_cue-1-prompts,100,0.0392,0.0,0.0392,0.02,1.0,0.51
8,medgemma,Xray,disease_cue-1-prompts,100,0.3871,0.8235,-0.4364,0.24,1.0,0.62
9,qwen3,MRI,disease_cue-1-prompts,100,0.5294,0.75,-0.2206,0.36,1.0,0.68



ü§ñ Model-wise best prompts (compact view)

‚ú® INTERNVL


Unnamed: 0_level_0,prompt,n_valid,f1,f1_drop,clean_f1,recall,precision,accuracy
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MRI,baseline-prompts,100,0.7013,-0.0487,0.75,0.54,1.0,0.77
OCT,baseline-prompts,100,0.1818,-0.3896,0.5714,0.1,1.0,0.55
Xray,disease_cue-1-prompts,100,0.5217,-0.1841,0.7059,0.36,0.9474,0.67



‚ú® LINGSHU


Unnamed: 0_level_0,prompt,n_valid,f1,f1_drop,clean_f1,recall,precision,accuracy
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MRI,baseline-prompts,100,0.8817,-0.1183,1.0,0.82,0.9535,0.89
OCT,baseline-prompts,100,0.4375,-0.1339,0.5714,0.28,1.0,0.64
Xray,baseline-prompts,100,0.735,-0.1538,0.8889,0.86,0.6418,0.69



‚ú® MEDGEMMA


Unnamed: 0_level_0,prompt,n_valid,f1,f1_drop,clean_f1,recall,precision,accuracy
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MRI,disease_cue-1-prompts,100,0.8627,-0.0068,0.8696,0.88,0.8462,0.86
OCT,disease_cue-1-prompts,100,0.0392,0.0392,0.0,0.02,1.0,0.51
Xray,disease_cue-1-prompts,100,0.3871,-0.4364,0.8235,0.24,1.0,0.62



‚ú® QWEN3


Unnamed: 0_level_0,prompt,n_valid,f1,f1_drop,clean_f1,recall,precision,accuracy
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MRI,disease_cue-1-prompts,100,0.5294,-0.2206,0.75,0.36,1.0,0.68
OCT,baseline-prompts,100,0.2069,-0.555,0.7619,0.12,0.75,0.54
Xray,disease_cue-1-prompts,100,0.2456,-0.0044,0.25,0.14,1.0,0.57


# 2) Strong image performance

In [78]:
import pandas as pd
from pathlib import Path

models = ["internvl", "lingshu", "medgemma", "qwen3"]

# Same prompt variants as the weak-severity analysis. Defined here so that this
# cell does not rely on a `prompts` list left in the kernel by an earlier cell.
prompts = [
    "baseline-prompts",
    "disease-cue-reason-1-prompts",
    "disease_cue-1-prompts",
    "artefact-disease-cue-2-prompts",
    "artefact_disease_cue-1-prompts",
    "artefact-disease-cue-reason-1-prompts",
]

datasets = ["MRI", "OCT", "Xray"]

# One record per (model, prompt, dataset) that has strong-severity rows.
rows = []

for model in models:
    for prompt in prompts:
        file = f"{model}_{prompt}.csv"
        if not Path(file).exists():
            # Not every model/prompt combination has a result file.
            continue

        df = pd.read_csv(file)

        # Keep only the strong-severity rows.
        df = df[df["severity"] == "strong"].copy()
        if df.empty:
            continue

        # Unify the dataset spelling. "xray" is included so this covers the
        # same variants (XRAY, XRay, xray) that norm_dataset handles.
        df["dataset"] = df["dataset"].replace(
            {"XRAY": "Xray", "XRay": "Xray", "xray": "Xray"}
        )

        for ds in datasets:
            sub = df[df["dataset"] == ds].copy()
            if sub.empty:
                continue

            n_total = len(sub)
            # Rows whose model_pred is exactly the string "distorted".
            n_distorted = (sub["model_pred"] == "distorted").sum()

            rows.append({
                "model": model,
                "prompt": prompt,
                "dataset": ds,
                "n_total": n_total,
                "n_distorted": int(n_distorted),
                # Share of strong-severity rows answered with "distorted".
                "distorted_ratio": n_distorted / n_total,
            })

strong_df = pd.DataFrame(rows)


In [79]:
from IPython.display import display

# One model x dataset grid of distorted-output ratios per prompt, in the order
# the prompts first appear in strong_df.
for prompt, prompt_rows in strong_df.groupby("prompt", sort=False):
    print(f"\nüß® STRONG ‚Äî PROMPT: {prompt}")

    ratio_grid = prompt_rows.pivot_table(
        index="model",
        columns="dataset",
        values="distorted_ratio",
        aggfunc="first",
    )
    # Fixed row/column order; combinations without data are shown as NaN.
    ratio_grid = ratio_grid.reindex(index=models, columns=datasets)

    styled = (
        ratio_grid.style
        .format("{:.2%}")
        .background_gradient(axis=None)  # one colour scale across the whole grid
        .set_caption("Distorted output ratio (severity = strong)")
    )
    display(styled)



üß® STRONG ‚Äî PROMPT: baseline-prompts


dataset,MRI,OCT,Xray
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
internvl,14.00%,8.00%,44.00%
lingshu,11.00%,2.00%,35.00%
medgemma,1.00%,0.00%,1.00%
qwen3,2.00%,1.00%,14.00%



üß® STRONG ‚Äî PROMPT: disease-cue-reason-1-prompts


dataset,MRI,OCT,Xray
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
internvl,0.00%,0.00%,0.00%
lingshu,16.00%,1.00%,43.00%
medgemma,0.00%,0.00%,0.00%
qwen3,2.00%,0.00%,6.00%



üß® STRONG ‚Äî PROMPT: disease_cue-1-prompts


dataset,MRI,OCT,Xray
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
internvl,0.00%,2.00%,0.00%
lingshu,18.00%,3.00%,41.00%
medgemma,3.00%,3.00%,3.00%
qwen3,3.00%,0.00%,7.00%



üß® STRONG ‚Äî PROMPT: artefact-disease-cue-2-prompts


dataset,MRI,OCT,Xray
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
internvl,2.00%,5.00%,0.00%
lingshu,17.00%,4.00%,52.00%
medgemma,4.00%,4.00%,4.00%
qwen3,3.00%,2.00%,10.00%



üß® STRONG ‚Äî PROMPT: artefact_disease_cue-1-prompts


dataset,MRI,OCT,Xray
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
internvl,0.00%,3.00%,0.00%
lingshu,8.00%,1.00%,41.00%
medgemma,2.00%,2.00%,2.00%
qwen3,0.00%,0.00%,0.00%



üß® STRONG ‚Äî PROMPT: artefact-disease-cue-reason-1-prompts


dataset,MRI,OCT,Xray
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
internvl,0.00%,0.00%,0.00%
lingshu,26.00%,3.00%,53.00%
medgemma,2.00%,0.00%,1.00%
qwen3,3.00%,1.00%,7.00%
