In [1]:
import os, sys, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import wandb

sys.path.insert(0, "..")
from script.main_utils import parse_yaml_config, setup_dump_env
from script.boxplot_helpers import (
    _load_forward_metrics_recursive,
    _apply_filters,
    _collect_values_by_encoder,
    _plot_box,
    _plot_violin,
    _sanitize_token,
    _ensure_out_path,
)
from script.evaluation.gather_results import gather_forward_metrics

EVAL_ROOT = "../evaluation"
OUT_DIR = os.path.join(EVAL_ROOT, "boxplots")
setup_dump_env()




'../dump'

In [3]:
CONFIG_PATH = "../sweeps/configs/boxplot_demo"
cfg = parse_yaml_config(CONFIG_PATH)
for k in ("gene_sets", "plot_box", "plot_violin"):
    if k not in cfg:
        raise RuntimeError(f"missing required config key: {k}")
gene_sets = cfg.get("gene_sets")


In [4]:
plot_box = cfg.get("plot_box")
plot_violin = cfg.get("plot_violin")


In [9]:
from pathlib import Path
configs = []
configs_dir = "../evaluation/"
for path in Path(configs_dir).rglob("metrics_summary.csv"):
    df = pd.read_csv(path)
    configs.append(df)
    print(path)


../evaluation/icms2down/47fcd4fc96/uni/icms2down/coad_icms2down_cmmn_genes_wmse enc-uni/predictions/metrics_summary.csv
../evaluation/icms2down/c73e940902/dinov3_vits16/icms2down/coad_icms2down_cmmn_genes_dinov3_vits16/predictions/metrics_summary.csv
../evaluation/icms2down/c73e940902/resnet50random/icms2down/coad_icms2down_cmmn_genes_resrand/predictions/metrics_summary.csv
../evaluation/icms2down/b04f205130/uni/icms2down/coad_icms2down_cmmn_genes_wmse redo enc-uni/predictions/metrics_summary.csv
../evaluation/icms3up/dd8c9bc880/dino/icms3up/coad icms3up dino no finetuning enc-dino/predictions/metrics_summary.csv
../evaluation/icms3up/dd8c9bc880/dino/icms3up/coad icms3up multimodel enc-dino/predictions/metrics_summary.csv
../evaluation/icms3up/b27914dcce/uni/icms3up/coad icms3up uni no finetuning/enc-uni/predictions/metrics_summary.csv
../evaluation/icms3up/b27914dcce/uni/icms3up/coad icms3up uni 3/enc-uni/predictions/metrics_summary.csv
../evaluation/icms2up/6db31de985/uni/icms2up/coa

In [5]:
def _plot_violin_seaborn(values, title, out_path, y_lim=(-1, 1), y_label="Pearson r", x_label="Group"):
    import os, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns

    # Normalize input → labels + list of arrays
    if isinstance(values, dict):
        labels = sorted(values)
        groups = [np.asarray(values[k], dtype=float) for k in labels]
    else:
        groups = values if values and isinstance(values[0], (list, tuple, np.ndarray)) else [values]
        groups = [np.asarray(g, dtype=float) for g in groups]
        labels = [f"G{i+1}" for i in range(len(groups))]

    # Long-form DataFrame for seaborn (drop NaNs)
    df = pd.DataFrame(
        [(lab, v) for lab, arr in zip(labels, groups) for v in np.asarray(arr, float) if not np.isnan(v)],
        columns=["Group", "Value"]
    )

    fig, ax = plt.subplots(figsize=(8, 4.5))
    sns.violinplot(data=df, x="Group", y="Value", inner=None, cut=0, ax=ax)

    # Overlay group means
    means = df.groupby("Group")["Value"].mean().reindex(labels)
    ax.scatter(range(len(labels)), means.values, s=20, color="black", zorder=3)

    ax.set_ylim(*y_lim); ax.set_ylabel(y_label); ax.set_xlabel(x_label); ax.set_title(title)
    fig.tight_layout()
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
    fig.savefig(out_path, dpi=200)
    plt.close(fig)

In [8]:
df_plot = df
group_key = "encoder_type"
saved_paths = []
skip_non_finite = bool(cfg.get("skip_non_finite", False))
for set_name, genes in gene_sets.items():
    pearson_cols = df.columns[df.columns.str.startswith('pearson', na=False)]

    print(len(genes))
    vals = df.loc[df.index[0], pearson_cols].to_list()
    print(vals)
    print(type(vals))
    title = f"Pearson by {group_key} — {set_name}"
    fname = _sanitize_token(set_name)
    if not plot_box:
        out_base_box = os.path.join(OUT_DIR, f"{fname}__box")
        out_path_box = _ensure_out_path(out_base_box, "png")
        _plot_box(vals, title, out_path_box)
        saved_paths.append(out_path_box)
    if plot_violin:
        out_base_violin = os.path.join(OUT_DIR, f"{fname}__violin")
        
        out_path_violin = _ensure_out_path(out_base_violin, "png")
        _plot_violin(vals, title, out_path_violin)
        saved_paths.append(out_path_violin)
        
        out_path_violin = _ensure_out_path(out_base_violin, "png")
        _plot_violin_seaborn(vals, title, out_path_violin)
        saved_paths.append(out_path_violin)
saved_paths


276
[0.5727116590663497, 0.4169215859124907, 0.3733026401960221, 0.5198905214091162, 0.5458237547135824, 0.6273977475154929, 0.4504377361514254, 0.6782294026871051, 0.5746649398330785, 0.4612794851647719, 0.3098151081599519, 0.4144640817537698, 0.6199109990819989, 0.3146850098231517, 0.1180570809426927, 0.5007576713736568, 0.5903214403678686, 0.5419754484540065, 0.6573878965284091, 0.3931185460397025, 0.2843863673567284, 0.4244171268662779, 0.3431578916421358, 0.5945892084272917, 0.4233276591570031, 0.3222934137823902, 0.439515077879234, 0.557350910959972, 0.197727038713803, 0.5610792367129007, 0.5524375983943622, 0.5992226642022411, 0.4838454784552338, 0.5046960740758059, 0.6316948950166287, 0.5360250782640346, 0.4760255131966856]
<class 'list'>


['../evaluation/boxplots/hvg__violin3.png',
 '../evaluation/boxplots/hvg__violin4.png']