In [None]:
import os, sys, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import wandb

sys.path.insert(0, "..")
from script.main_utils import parse_yaml_config, setup_dump_env
from script.boxplot_helpers import (
    _load_forward_metrics_recursive,
    _apply_filters,
    _collect_values_by_encoder,
    _plot_box,
    _plot_violin,
    _sanitize_token,
    _ensure_out_path,
)

# Root directory of evaluation outputs, given as a relative path.
EVAL_ROOT = "../evaluation"
# Directory under EVAL_ROOT that the plotting cell uses as the base for figure paths.
OUT_DIR = os.path.join(EVAL_ROOT, "boxplots")
# Project helper from script.main_utils; presumably prepares the dump/output
# environment -- TODO confirm against its definition.
setup_dump_env()


In [None]:
CONFIG_PATH = ""

# Fail fast on an unset (or non-string) placeholder before parsing anything.
if not (CONFIG_PATH and isinstance(CONFIG_PATH, str)):
    raise RuntimeError("CONFIG_PATH empty or invalid")

cfg = parse_yaml_config(CONFIG_PATH)
if not isinstance(cfg, dict):
    raise RuntimeError("config must be a mapping")

# Report the first required key that is absent, in declaration order.
missing_key = next(
    (
        required
        for required in ("gene_sets", "log_to_wandb", "plot_box", "plot_violin")
        if required not in cfg
    ),
    None,
)
if missing_key is not None:
    raise RuntimeError(f"missing required config key: {missing_key}")

# gene_sets: non-empty mapping of non-blank string names -> list of strings.
gene_sets = cfg["gene_sets"]
if not (isinstance(gene_sets, dict) and gene_sets):
    raise ValueError("gene_sets must be a non-empty mapping")
for gs_name, gs_genes in gene_sets.items():
    if not (isinstance(gs_name, str) and gs_name.strip()):
        raise ValueError("gene_sets keys must be non-empty strings")
    if not (
        isinstance(gs_genes, list)
        and all(isinstance(gene, str) for gene in gs_genes)
    ):
        raise ValueError(f"gene_sets['{gs_name}'] must be a list of strings")

# Optional run filters; any falsy value (missing, empty, None) collapses to None.
(
    include_projects,
    include_encoders,
    include_run_name_regex,
    exclude_run_name_regex,
) = (
    cfg.get(filter_key) or None
    for filter_key in (
        "include_projects",
        "include_encoders",
        "include_run_name_regex",
        "exclude_run_name_regex",
    )
)

# Plot toggles must be real booleans, and at least one must be enabled.
plot_box, plot_violin = cfg["plot_box"], cfg["plot_violin"]
if not all(isinstance(flag, bool) for flag in (plot_box, plot_violin)):
    raise ValueError("plot_box and plot_violin must be bools")
if not plot_box and not plot_violin:
    raise ValueError("at least one of plot_box/plot_violin must be true")
# W&B logging toggle. Validated as a real bool (like plot_box / plot_violin)
# rather than coerced with bool(): a quoted YAML value such as "false" is a
# non-empty string and would otherwise silently enable logging.
log_to_wandb = cfg.get("log_to_wandb", False)
if not isinstance(log_to_wandb, bool):
    raise ValueError("log_to_wandb must be a bool")

# `run` stays None when logging is disabled; later cells check it before logging.
run = None
if log_to_wandb:
    # All run-identifying parameters must be present before a run is created.
    for key in ("run_name", "group", "job_type", "tags", "project"):
        if key not in cfg:
            raise ValueError(f"Missing required parameter '{key}' for W&B logging")

    # Keys passed to wandb.init as dedicated arguments (plus "metric"/"method")
    # are kept out of the logged config payload.
    wb_excluded = ("project", "metric", "method", "run_name", "group", "job_type", "tags")
    wb_cfg = {k: v for k, v in cfg.items() if k not in wb_excluded}

    original_run_name = cfg.get("run_name")
    run = wandb.init(
        project=cfg["project"],
        name=cfg["run_name"],
        group=cfg["group"],
        job_type=cfg["job_type"],
        tags=cfg["tags"],
        config=wb_cfg,
    )

    # Work on a copy and let values held by run.config take precedence over
    # the file-based config (presumably to pick up overrides -- TODO confirm).
    cfg = dict(cfg)
    cfg.update(dict(run.config))
    # Keep the run name from the original config even if run.config carries one.
    if original_run_name is not None:
        cfg["run_name"] = original_run_name


In [None]:
SCAN_DIR = ""
# Reject an unset or non-string placeholder before building a path from it.
if not (SCAN_DIR and isinstance(SCAN_DIR, str)):
    raise RuntimeError("SCAN_DIR empty or invalid")

# The directory to scan is resolved beneath EVAL_ROOT and must already exist.
SCAN_ROOT = os.path.join(EVAL_ROOT, SCAN_DIR)
if not os.path.isdir(SCAN_ROOT):
    raise FileNotFoundError(f"scan_root not found or not a dir: {SCAN_ROOT}")

# Run-selection filters taken from the config cell (each may be None).
run_filters = {
    "include_projects": include_projects,
    "include_encoders": include_encoders,
    "include_run_name_regex": include_run_name_regex,
    "exclude_run_name_regex": exclude_run_name_regex,
}

# Load everything found under SCAN_ROOT, then narrow it with the filters.
df = _load_forward_metrics_recursive(SCAN_ROOT)
df = _apply_filters(df, **run_filters)
df.shape


In [None]:
# Paths of every figure written by this cell, in creation order.
saved_paths = []
skip_non_finite = bool(cfg.get("skip_non_finite", False))

# One entry per enabled plot kind: (file suffix, plotting helper, W&B key prefix).
plot_kinds = []
if plot_box:
    plot_kinds.append(("box", _plot_box, "boxplot"))
if plot_violin:
    plot_kinds.append(("violin", _plot_violin, "violinplot"))

for set_name, genes in gene_sets.items():
    vals = _collect_values_by_encoder(df, genes, skip_non_finite)
    # \u2014 is an em dash. It is written as an escape so the title cannot be
    # corrupted by a wrong file encoding (the literal had decayed to mojibake).
    title = f"Pearson by encoder \u2014 {set_name}"
    # Set names are sanitized for file names only; W&B keys use the raw name.
    fname = _sanitize_token(set_name)
    for suffix, plot_fn, wb_prefix in plot_kinds:
        out_path = _ensure_out_path(os.path.join(OUT_DIR, f"{fname}__{suffix}"), "png")
        plot_fn(vals, title, out_path)
        saved_paths.append(out_path)
        # Upload the saved figure only when a W&B run was started earlier.
        if run is not None:
            run.log({f"{wb_prefix}/{set_name}": wandb.Image(out_path)})
saved_paths


In [None]:
# Close the W&B run if one was started; `run` is None when log_to_wandb is false.
if run is not None:
    run.finish()
