In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
from pathlib import Path

In [None]:
# Results CSV for every model, keyed by the model name used in legends and in `colours`.
# Each run is stored as
#   <data root>/<experiment>/<run name>/<run name>_accuracy_results_test_no_resample.csv
# so the paths are derived from the variant name rather than typed out by hand. This keeps
# every key in sync with its directory; a copy-paste slip had "ig_tg-no-mlm" reading the
# "ig_tg-mlm" CSV, which made those two series identical in every plot.
# NOTE(review): assumes the ig_tg-no-mlm run directory exists next to the other no-mlm
# runs -- confirm on disk.
_DATA_ROOT = Path("/opt/gpudata/imadejski/search-model/remix/data")
_RUN_PREFIX = "mimic-biovil-frontal-impression"
_RESULTS_SUFFIX = "_accuracy_results_test_no_resample.csv"


def _results_csv(experiment: str, variant: str) -> Path:
    """Return the results CSV of run ``<_RUN_PREFIX>-<variant>`` inside ``experiment``."""
    run_name = f"{_RUN_PREFIX}-{variant}"
    return _DATA_ROOT / experiment / run_name / f"{run_name}{_RESULTS_SUFFIX}"


# Insertion order matters: it is the bar order within each cluster and the legend order.
paths = {
    "biovil": _results_csv("20250728_biovil_baseline_evaluation", "base"),
    **{
        variant: _results_csv("20250804_model_evaluation_mlm_correction", variant)
        for variant in (
            "ig_tg-no-mlm",
            "ig_tg-mlm",
            "igl_tg-no-mlm",
            "igl_tg-mlm",
            "ig_tgl-no-mlm",
            "ig_tgl-mlm",
            "igl_tgl-no-mlm",
            "igl_tgl-mlm",
        )
    },
}

In [None]:
# Baseline plus the four "-mlm" variants: the same CSV locations as the matching
# entries of `paths`, without the "-no-mlm" runs.
# NOTE(review): nothing in the cells visible here reads `mlm_paths` -- confirm it is
# still needed.
_mlm_experiment_dir = Path(
    "/opt/gpudata/imadejski/search-model/remix/data/20250804_model_evaluation_mlm_correction"
)
_mlm_run_names = {
    variant: f"mimic-biovil-frontal-impression-{variant}"
    for variant in ("ig_tg-mlm", "igl_tg-mlm", "ig_tgl-mlm", "igl_tgl-mlm")
}

mlm_paths = {
    "biovil": Path("/opt/gpudata/imadejski/search-model/remix/data/20250728_biovil_baseline_evaluation/mimic-biovil-frontal-impression-base/mimic-biovil-frontal-impression-base_accuracy_results_test_no_resample.csv"),
    **{
        variant: _mlm_experiment_dir / run / f"{run}_accuracy_results_test_no_resample.csv"
        for variant, run in _mlm_run_names.items()
    },
}

In [None]:
def read_metric(path: Path, metric_name: str) -> pd.DataFrame:
    """Load the per-label values of one metric from a long-format results file.

    Parameters
    ----------
    path : Path
        Delimited text file containing at least the columns ``label``,
        ``metric`` and ``value``. The delimiter is sniffed by pandas
        (``sep=None`` with the Python engine), so comma- and tab-separated
        files are both accepted.
    metric_name : str
        Exact value of the ``metric`` column to keep.

    Returns
    -------
    pd.DataFrame
        The ``label`` and ``value`` columns of the rows whose ``metric`` equals
        ``metric_name``, with their original row index. Empty when no row
        matches.

    Raises
    ------
    ValueError
        If a required column is absent; the message names the missing columns.
    """
    required = ("label", "metric", "value")
    df = pd.read_csv(path, sep=None, engine="python")  # auto-detect comma or tab
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Name the offending columns so a malformed results file can be diagnosed
        # straight from the traceback.
        raise ValueError(f"{path} missing required columns: {', '.join(missing)}")
    # One .loc selection instead of chained indexing (df[mask][cols]).
    return df.loc[df["metric"] == metric_name, ["label", "value"]]


In [None]:
# One (label, value) frame per model for the mean-accuracy metric, in `paths` order.
accuracy_dfs = dict(
    (model_name, read_metric(csv_path, "average_cosine_similarity_mean_accuracy"))
    for model_name, csv_path in paths.items()
)

def desaturate(rgba, alpha=0.4):
    """Return ``rgba`` with its alpha channel replaced by ``alpha``.

    Despite the name, the red/green/blue components are passed through
    unchanged; only the opacity differs, which makes the colour look paler
    when drawn over a light background.

    Parameters
    ----------
    rgba : sequence of four numbers
        ``(r, g, b, a)``; any other length raises ``ValueError`` on unpacking.
    alpha : float, default 0.4
        Opacity to use in place of the original fourth component.

    Returns
    -------
    tuple
        ``(r, g, b, alpha)``.
    """
    red, green, blue, _original_alpha = rgba
    return red, green, blue, alpha

# Colour scheme shared by every plot below: each model family gets one "tab20"
# colour; the "-mlm" variant uses it as is and the "-no-mlm" variant uses the same
# RGB at reduced opacity (see `desaturate`).
# plt.get_cmap is used because matplotlib.cm.get_cmap (the old `plt.cm.get_cmap`)
# was deprecated in Matplotlib 3.7 and removed in 3.9.
tab20 = plt.get_cmap("tab20")
base_idx = {             # pick every other colour to keep pairs distinct
    "biovil": 0,
    "ig_tg": 2,
    "igl_tg": 4,
    "ig_tgl": 6,
    "igl_tgl": 8,
}

colours = {
    "biovil":             tab20(base_idx["biovil"]),
    "ig_tg-mlm":          tab20(base_idx["ig_tg"]),
    "ig_tg-no-mlm":       desaturate(tab20(base_idx["ig_tg"])),
    "igl_tg-mlm":         tab20(base_idx["igl_tg"]),
    "igl_tg-no-mlm":      desaturate(tab20(base_idx["igl_tg"])),
    "ig_tgl-mlm":         tab20(base_idx["ig_tgl"]),
    "ig_tgl-no-mlm":      desaturate(tab20(base_idx["ig_tgl"])),
    "igl_tgl-mlm":        tab20(base_idx["igl_tgl"]),
    "igl_tgl-no-mlm":     desaturate(tab20(base_idx["igl_tgl"])),
}

# Stack the per-model frames into one long frame with a `model` column.
accuracy_long_df = (
    pd.concat([accuracy_df.assign(model=name) for name, accuracy_df in accuracy_dfs.items()], ignore_index=True)
      .sort_values(["label", "model"])
)
# Was `long_df`, a name that is never defined in this notebook (NameError on a
# fresh kernel); the labels must come from the frame built just above.
labels = sorted(accuracy_long_df["label"].unique())
models = list(paths.keys())   # preserve original order

In [None]:
# Grouped bar chart of mean accuracy: one cluster per label, one bar per model.
n_labels, n_models = len(labels), len(models)
bar_width = 0.8 / n_models                    # every cluster spans 0.8 x-units in total
fig, ax = plt.subplots(figsize=(max(10, n_labels * 0.6), 6))

cluster_left = np.arange(n_labels) - 0.4      # left edge of each cluster
for i, model in enumerate(models):
    model_rows = accuracy_long_df.loc[accuracy_long_df["model"].eq(model)]
    # Align this model's values to the shared label order; labels it lacks become NaN.
    vals = model_rows.set_index("label")["value"].reindex(labels).to_numpy()
    x = cluster_left + i*bar_width + bar_width/2   # centre of bar i in every cluster
    ax.bar(
        x,
        vals,
        width=bar_width,
        color=colours[model],
        edgecolor="black",
        linewidth=0,
        label=model,
    )

ax.set_xticks(np.arange(n_labels))
ax.set_xticklabels(labels, rotation=45, ha="right")
ax.set(
    xlabel="Label",
    ylabel="Mean accuracy",
    title="Accuracy @ Top N by Label without Resampling for MIMIC Test Set",
)
# Legend sits outside the axes, to the right, so it never covers the bars.
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)
fig.tight_layout()
plt.show()


In [None]:
# Mean-accuracy plot restricted to the MLM models (exclude any with "no-mlm" in the name).
# Reuses `labels`, `accuracy_long_df` and `colours` from the cells above.
mlm_models = [name for name in paths.keys() if "no-mlm" not in name]

n_labels, n_models = len(labels), len(mlm_models)
bar_width = 0.8 / n_models   # every cluster of bars spans 0.8 x-units in total
fig, ax = plt.subplots(figsize=(max(10, n_labels * 0.6), 6))

for i, model in enumerate(mlm_models):
    # Align this model's values to the shared label order; missing labels become NaN.
    vals = (
        accuracy_long_df[accuracy_long_df["model"] == model]
        .set_index("label")
        .reindex(labels)["value"]
        .values
    )
    # Centre of bar i inside each cluster.
    x = (np.arange(n_labels) - 0.4) + i*bar_width + bar_width/2
    ax.bar(x, vals, width=bar_width, color=colours[model],
           edgecolor="black", linewidth=0, label=model)

ax.set_xticks(np.arange(n_labels))
ax.set_xticklabels(labels, rotation=45, ha="right")
ax.set_xlabel("Label")
ax.set_ylabel("Mean accuracy")
# The title used to be identical to the all-models figure; mark the subset so the
# two figures can be told apart once exported.
ax.set_title("Accuracy @ Top N by Label without Resampling for MIMIC Test Set (MLM models only)")
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)
fig.tight_layout()
plt.show()

In [None]:
# NDCG@N plot for every model in `paths`.
ndcgn_dfs = {name: read_metric(p, "average_cosine_similarity_mean_ndcg_n") for name, p in paths.items()}

# Stack the per-model frames into one long frame with a `model` column.
ndcgn_long_df = (
    pd.concat([ndcgn_df.assign(model=name) for name, ndcgn_df in ndcgn_dfs.items()], ignore_index=True)
      .sort_values(["label", "model"])
)
ndcgn_labels = sorted(ndcgn_long_df["label"].unique())
ndcgn_models = list(paths.keys())   # preserve original order

# Size, positions and tick labels all come from this cell's own label/model lists.
# They used to come from the accuracy cell's `labels`/`models` while the bar values
# were reindexed by `ndcgn_labels`, which breaks (or mislabels bars) as soon as the
# two metrics do not cover the same labels.
n_labels, n_models = len(ndcgn_labels), len(ndcgn_models)
bar_width = 0.8 / n_models   # every cluster of bars spans 0.8 x-units in total
fig, ax = plt.subplots(figsize=(max(10, n_labels * 0.6), 6))

for i, model in enumerate(ndcgn_models):
    # Align this model's values to the label order; missing labels become NaN.
    vals = (
        ndcgn_long_df[ndcgn_long_df["model"] == model]
        .set_index("label")
        .reindex(ndcgn_labels)["value"]
        .values
    )
    # Centre of bar i inside each cluster.
    x = (np.arange(n_labels) - 0.4) + i*bar_width + bar_width/2
    ax.bar(x, vals, width=bar_width, color=colours[model],
           edgecolor="black", linewidth=0, label=model)

ax.set_xticks(np.arange(n_labels))
ax.set_xticklabels(ndcgn_labels, rotation=45, ha="right")
ax.set_xlabel("Label")
ax.set_ylabel("Mean NDCG@N")
ax.set_title("NDCG @ N by Label without Resampling for MIMIC Test Set")
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)
fig.tight_layout()
plt.show()


In [None]:
# NDCG@N, restricted to the MLM models.

ndcgn_metric = "average_cosine_similarity_mean_ndcg_n"
ndcgn_dfs = {
    model_name: read_metric(csv_path, ndcgn_metric)
    for model_name, csv_path in paths.items()
}

# Long frame: the rows of every model stacked, tagged with a `model` column.
ndcgn_long_df = pd.concat(
    [frame.assign(model=model_name) for model_name, frame in ndcgn_dfs.items()],
    ignore_index=True,
).sort_values(["label", "model"])

labels_ndcgn = sorted(ndcgn_long_df["label"].unique())

# Keep `paths` order, dropping every name that carries a no-mlm marker
# (either spelling: "no-mlm" or "no_mlm").
mlm_models = [
    name
    for name in paths.keys()
    if not any(marker in name for marker in ("no-mlm", "no_mlm"))
]

n_labels, n_models = len(labels_ndcgn), len(mlm_models)
bar_width = 0.8 / n_models                    # every cluster spans 0.8 x-units in total
fig, ax = plt.subplots(figsize=(max(10, n_labels * 0.6), 6))

cluster_left = np.arange(n_labels) - 0.4      # left edge of each cluster
for i, model in enumerate(mlm_models):
    model_rows = ndcgn_long_df.loc[ndcgn_long_df["model"].eq(model)]
    # Align to the label order; labels this model lacks become NaN.
    vals = model_rows.set_index("label")["value"].reindex(labels_ndcgn).to_numpy()
    x = cluster_left + i*bar_width + bar_width/2
    ax.bar(
        x,
        vals,
        width=bar_width,
        color=colours[model],
        edgecolor="black",
        linewidth=0,
        label=model,
    )

ax.set_xticks(np.arange(n_labels))
ax.set_xticklabels(labels_ndcgn, rotation=45, ha="right")
ax.set(
    xlabel="Label",
    ylabel="Mean NDCG@N",
    title="NDCG@N by Label without Resampling (MLM models only)",
)
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)
fig.tight_layout()
plt.show()

In [None]:
# Accuracy@Top10 for every model in `paths` (MLM and no-MLM variants alike).

top10_metric = "average_cosine_similarity_mean_accuracy_top_10"
top10_dfs = {
    model_name: read_metric(csv_path, top10_metric)
    for model_name, csv_path in paths.items()
}

# Long frame: the rows of every model stacked, tagged with a `model` column.
top10_long_df = pd.concat(
    [frame.assign(model=model_name) for model_name, frame in top10_dfs.items()],
    ignore_index=True,
).sort_values(["label", "model"])

labels_top10 = sorted(top10_long_df["label"].unique())

# No filtering here: all models, in the insertion order of `paths`.
models = list(paths.keys())

n_labels, n_models = len(labels_top10), len(models)
bar_width = 0.8 / n_models                    # every cluster spans 0.8 x-units in total
fig, ax = plt.subplots(figsize=(max(10, n_labels * 0.6), 6))

cluster_left = np.arange(n_labels) - 0.4      # left edge of each cluster
for i, model in enumerate(models):
    model_rows = top10_long_df.loc[top10_long_df["model"].eq(model)]
    # Align to the label order; labels this model lacks become NaN.
    vals = model_rows.set_index("label")["value"].reindex(labels_top10).to_numpy()
    x = cluster_left + i*bar_width + bar_width/2
    ax.bar(
        x,
        vals,
        width=bar_width,
        color=colours[model],
        edgecolor="black",
        linewidth=0,
        label=model,
    )

ax.set_xticks(np.arange(n_labels))
ax.set_xticklabels(labels_top10, rotation=45, ha="right")
ax.set(
    xlabel="Label",
    ylabel="Mean Accuracy@Top10",
    title="Accuracy@Top10 by Label for MIMIC Test",
)
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)
fig.tight_layout()
plt.show()

In [None]:
# Accuracy@Top20 plot for every model in `paths` (MLM and no-MLM variants alike).

top20_metric = "average_cosine_similarity_mean_accuracy_top_20"
top20_dfs = {name: read_metric(p, top20_metric) for name, p in paths.items()}

# Stack the per-model frames into one long frame with a `model` column.
top20_long_df = (
    pd.concat([df.assign(model=name) for name, df in top20_dfs.items()], ignore_index=True)
      .sort_values(["label", "model"])
)

labels_top20 = sorted(top20_long_df["label"].unique())

# All models, preserving the original order from `paths` (nothing is filtered out).
models = [name for name in paths.keys()]

n_labels, n_models = len(labels_top20), len(models)
bar_width = 0.8 / n_models   # every cluster of bars spans 0.8 x-units in total
fig, ax = plt.subplots(figsize=(max(10, n_labels * 0.6), 6))

for i, model in enumerate(models):
    # Align this model's values to the label order; missing labels become NaN.
    vals = (
        top20_long_df[top20_long_df["model"] == model]
        .set_index("label")
        .reindex(labels_top20)["value"]
        .values
    )
    # Centre of bar i inside each cluster.
    x = (np.arange(n_labels) - 0.4) + i*bar_width + bar_width/2
    ax.bar(x, vals, width=bar_width, color=colours[model],
           edgecolor="black", linewidth=0, label=model)

ax.set_xticks(np.arange(n_labels))
ax.set_xticklabels(labels_top20, rotation=45, ha="right")
ax.set_xlabel("Label")
ax.set_ylabel("Mean Accuracy@Top20")
# The title used to say "(MLM models only)" although every model is plotted;
# it now matches the Accuracy@Top10 figure.
ax.set_title("Accuracy@Top20 by Label for MIMIC Test")
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)
fig.tight_layout()
plt.show()

In [None]:
# nDCG@10 for every model in `paths` (MLM and no-MLM variants alike).

ndcg_top10_metric = "average_cosine_similarity_mean_ndcg_10"
ndcg_top10_dfs = {
    model_name: read_metric(csv_path, ndcg_top10_metric)
    for model_name, csv_path in paths.items()
}

# Long frame: the rows of every model stacked, tagged with a `model` column.
ndcg_top10_long_df = pd.concat(
    [frame.assign(model=model_name) for model_name, frame in ndcg_top10_dfs.items()],
    ignore_index=True,
).sort_values(["label", "model"])

labels_ndcg_top10 = sorted(ndcg_top10_long_df["label"].unique())

# No filtering here: all models, in the insertion order of `paths`.
models = list(paths.keys())

n_labels, n_models = len(labels_ndcg_top10), len(models)
bar_width = 0.8 / n_models                    # every cluster spans 0.8 x-units in total
fig, ax = plt.subplots(figsize=(max(10, n_labels * 0.6), 6))

cluster_left = np.arange(n_labels) - 0.4      # left edge of each cluster
for i, model in enumerate(models):
    model_rows = ndcg_top10_long_df.loc[ndcg_top10_long_df["model"].eq(model)]
    # Align to the label order; labels this model lacks become NaN.
    vals = model_rows.set_index("label")["value"].reindex(labels_ndcg_top10).to_numpy()
    x = cluster_left + i*bar_width + bar_width/2
    ax.bar(
        x,
        vals,
        width=bar_width,
        color=colours[model],
        edgecolor="black",
        linewidth=0,
        label=model,
    )

ax.set_xticks(np.arange(n_labels))
ax.set_xticklabels(labels_ndcg_top10, rotation=45, ha="right")
ax.set(
    xlabel="Label",
    ylabel="Mean nDCG@Top10",
    title="nDCG@Top10 by Label for MIMIC Test",
)
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)
fig.tight_layout()
plt.show()

In [None]:
# nDCG@20 plot for every model in `paths` (MLM and no-MLM variants alike).

ndcg_top20_metric = "average_cosine_similarity_mean_ndcg_20"
# Read the @20 metric. This used to pass `ndcg_top10_metric`, so the figure titled
# "nDCG@Top20" silently re-plotted the nDCG@10 values.
ndcg_top20_dfs = {name: read_metric(p, ndcg_top20_metric) for name, p in paths.items()}

# Stack the per-model frames into one long frame with a `model` column.
ndcg_top20_long_df = (
    pd.concat([df.assign(model=name) for name, df in ndcg_top20_dfs.items()], ignore_index=True)
      .sort_values(["label", "model"])
)

labels_ndcg_top20 = sorted(ndcg_top20_long_df["label"].unique())

# All models, preserving the original order from `paths` (nothing is filtered out).
models = [name for name in paths.keys()]

n_labels, n_models = len(labels_ndcg_top20), len(models)
bar_width = 0.8 / n_models   # every cluster of bars spans 0.8 x-units in total
fig, ax = plt.subplots(figsize=(max(10, n_labels * 0.6), 6))

for i, model in enumerate(models):
    # Align this model's values to the label order; missing labels become NaN.
    vals = (
        ndcg_top20_long_df[ndcg_top20_long_df["model"] == model]
        .set_index("label")
        .reindex(labels_ndcg_top20)["value"]
        .values
    )
    # Centre of bar i inside each cluster.
    x = (np.arange(n_labels) - 0.4) + i*bar_width + bar_width/2
    ax.bar(x, vals, width=bar_width, color=colours[model],
           edgecolor="black", linewidth=0, label=model)

ax.set_xticks(np.arange(n_labels))
ax.set_xticklabels(labels_ndcg_top20, rotation=45, ha="right")
ax.set_xlabel("Label")
ax.set_ylabel("Mean nDCG@Top20")
ax.set_title("nDCG@Top20 by Label for MIMIC Test")
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)
fig.tight_layout()
plt.show()