# Manuscript figures

Notebook for generating publication-ready figures. Uses Nature series journal plot style (Arial/Helvetica, 5–7 pt, axis lines and tick marks, accessible colors, no grid).

## 1. CNA profile — grouped bar plots (AUROC / AUPRC)

Load AUROC and AUPRC from CNA profile benchmark outputs (`{cna_type}/2_metric/metric.{cna_type}.auroc.tsv` and `metric.{cna_type}.auprc.tsv`). Set `CNA_METRIC_BASE` in the config cell to your CNA profile output directory (parent of `loss/`, `loh/`, `gain/`). If unset or missing, demo data are used.

**Plots:**  
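- **Grouped bars** (1–3): bar order left→right = loss, gain, loh. InferCNV and CopyKAT do not report LOH; for tools without LOH output the LOH slot is drawn as a short hatched placeholder bar and listed in the legend as "LOH (no output)".  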
- **Stacked bars** (4–6): each bar = one tool; stack bottom→top = loss, gain, loh (LOH on top for easy comparison).

Style: Nature series (Arial/Helvetica, 5–7 pt, axis lines and ticks, colorblind‑friendly palette, no grid, PDF + PNG).

Set `CNA_METRIC_BASE` to your CNA profile output (parent of `loss/`, `loh/`, `gain/`). Figures are written to `figures/`.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

# Nature series style: sans-serif text at 5–7 pt, 0.5 pt axis and tick lines.
matplotlib.rcParams.update(
    {
        # Font stack; entries are tried in the order listed.
        "font.family": "sans-serif",
        "font.sans-serif": ["Arial", "Helvetica", "DejaVu Sans"],
        # Text sizes (pt): body and axis labels 6, titles 7, ticks and legend 5.
        "font.size": 6,
        "axes.labelsize": 6,
        "axes.titlesize": 7,
        "xtick.labelsize": 5,
        "ytick.labelsize": 5,
        "legend.fontsize": 5,
        # On-screen resolution; the PNG exports pass their own dpi.
        "figure.dpi": 150,
        # Type 42 (TrueType) fonts in PDF output.
        "pdf.fonttype": 42,
        # Line widths (pt) for the axes frame and major ticks.
        "axes.linewidth": 0.5,
        "xtick.major.width": 0.5,
        "ytick.major.width": 0.5,
    }
)

# Root of the CNA profile benchmark output; expected to contain loss/, loh/ and gain/.
CNA_METRIC_BASE = os.path.expanduser("~/path/to/cna_profile_output")

# CNA categories, in the order used for loading and for the wide-table columns.
CNA_TYPES = ("loss", "loh", "gain")

# Display label per CNA category (legend text).
CNA_TYPE_LABELS = {
    "loss": "Copy loss",
    "loh": "LOH",
    "gain": "Copy gain",
}

# Plotting orders. Grouped bars run left→right, stacked bars bottom→top;
# both put LOH last so it sits on the right / on top for easy comparison.
CNA_ORDER_GROUPED = ("loss", "gain", "loh")
CNA_ORDER_STACKED = ("loss", "gain", "loh")

# Position of each CNA category's color in the plotting palette.
CNA_COLOR_INDEX = {
    "loss": 0,
    "loh": 1,
    "gain": 2,
}

# Optional explicit list of tools that do not report LOH,
# e.g. ("InferCNV", "CopyKAT"); None means "infer from the data".
TOOLS_WITHOUT_LOH = None

# Figures go to <project root>/figures. When the notebook is run from a
# "scripts" directory, the project root is that directory's parent.
_cwd = os.getcwd()
if os.path.basename(_cwd) == "scripts":
    PROJECT_ROOT = os.path.dirname(_cwd)
else:
    PROJECT_ROOT = _cwd
OUT_DIR = os.path.join(PROJECT_ROOT, "figures")
os.makedirs(OUT_DIR, exist_ok=True)

In [None]:
def load_cna_metrics(base: str, cna_types=None):
    """Load per-CNA-type AUROC and AUPRC tables and stack them in long format.

    For every CNA type ``ct`` the tab-separated files
    ``{base}/{ct}/2_metric/metric.{ct}.auroc.tsv`` and
    ``{base}/{ct}/2_metric/metric.{ct}.auprc.tsv`` are read, and a
    ``cna_type`` column holding ``ct`` is set on each table (replacing any
    existing column of that name).

    Args:
        base: Directory that contains one sub-directory per CNA type.
        cna_types: Iterable of CNA type names to load. ``None`` (default)
            uses the module-level ``CNA_TYPES``.

    Returns:
        ``(df_auroc, df_auprc)``: the per-type tables concatenated in the
        order of ``cna_types``, each with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If a metric file is missing for any CNA type; the
            message names the directory and the missing file(s).
        ValueError: If ``cna_types`` is empty.
    """
    # Resolve the default lazily so callers can pass their own subset or order.
    cna_types = tuple(CNA_TYPES if cna_types is None else cna_types)
    if not cna_types:
        raise ValueError("cna_types must contain at least one CNA type.")

    frames = {"auroc": [], "auprc": []}
    for ct in cna_types:
        metric_dir = os.path.join(base, ct, "2_metric")
        paths = {
            metric: os.path.join(metric_dir, f"metric.{ct}.{metric}.tsv")
            for metric in frames
        }
        # Check both files before reading either, and say which are absent.
        missing = [os.path.basename(p) for p in paths.values() if not os.path.isfile(p)]
        if missing:
            raise FileNotFoundError(
                f"Missing metric files in {metric_dir}: {', '.join(missing)}. "
                "Check CNA_METRIC_BASE."
            )
        for metric, path in paths.items():
            table = pd.read_csv(path, sep="\t")
            table["cna_type"] = ct
            frames[metric].append(table)

    df_auroc = pd.concat(frames["auroc"], ignore_index=True)
    df_auprc = pd.concat(frames["auprc"], ignore_index=True)
    return df_auroc, df_auprc

# Fall back to synthetic metrics when CNA_METRIC_BASE is still the placeholder
# path or does not point at an existing directory.
_use_demo = not os.path.isdir(CNA_METRIC_BASE) or "path/to/cna_profile_output" in CNA_METRIC_BASE
if not _use_demo:
    df_auroc, df_auprc = load_cna_metrics(CNA_METRIC_BASE)
else:
    np.random.seed(42)  # fixed seed so the demo figures are reproducible
    _tools = ["InferCNV", "CopyKAT", "Numbat", "XClone"]
    _no_loh = {"InferCNV", "CopyKAT"}
    # One row per tool, one column per CNA type; AUROC is drawn before AUPRC.
    _auc = np.random.uniform(0.5, 0.95, (len(_tools), 3))
    _prc = np.random.uniform(0.3, 0.9, (len(_tools), 3))
    # (row, tool, column, CNA type) for every cell that gets a metric;
    # LOH is left out for the tools that do not report it.
    _cells = [
        (i, t, j, ct)
        for i, t in enumerate(_tools)
        for j, ct in enumerate(CNA_TYPES)
        if not (ct == "loh" and t in _no_loh)
    ]
    rows_auroc = [{"tool": t, "cna_type": ct, "AUROC": _auc[i, j]} for i, t, j, ct in _cells]
    rows_auprc = [{"tool": t, "cna_type": ct, "AUPRC": _prc[i, j]} for i, t, j, ct in _cells]
    df_auroc = pd.DataFrame(rows_auroc)
    df_auprc = pd.DataFrame(rows_auprc)
    print("Using demo data (set CNA_METRIC_BASE to your output dir for real data).")

# Pivot the long tables to wide form: rows = tools, columns = CNA types in
# CNA_TYPES order (loss, loh, gain).
auroc_wide = df_auroc.pivot(index="tool", columns="cna_type", values="AUROC").reindex(columns=list(CNA_TYPES))
auprc_wide = df_auprc.pivot(index="tool", columns="cna_type", values="AUPRC").reindex(columns=list(CNA_TYPES))
# Give both tables the same row index (union of tools); a metric missing for a
# tool, such as LOH for a tool that does not report it, becomes 0.
tools = sorted(set(auroc_wide.index) | set(auprc_wide.index))
auroc_wide = auroc_wide.reindex(tools).fillna(0.0)
auprc_wide = auprc_wide.reindex(tools).fillna(0.0)

# Tools without LOH: explicit override, else inferred as "no LOH row in the AUROC table".
if TOOLS_WITHOUT_LOH is not None:
    # Keep only override entries that occur in the data. An unknown name would
    # otherwise enter tools_order and the reindex below would add an all-NaN
    # row (an empty tick with a placeholder LOH bar) for a tool with no metrics.
    _requested = set(TOOLS_WITHOUT_LOH)
    tools_no_loh = _requested & set(tools)
    _unknown = sorted(_requested - tools_no_loh)
    if _unknown:
        print("Ignoring TOOLS_WITHOUT_LOH entries not present in the metrics:", _unknown)
else:
    in_loh = set(df_auroc.loc[df_auroc["cna_type"] == "loh", "tool"].unique())
    tools_no_loh = {t for t in tools if t not in in_loh}
# Order tools: with LOH first, then without (each subgroup sorted), for clearer comparison
tools_with_loh = sorted(t for t in tools if t not in tools_no_loh)
tools_order = tools_with_loh + sorted(tools_no_loh)
auroc_wide = auroc_wide.reindex(tools_order)
auprc_wide = auprc_wide.reindex(tools_order)
if tools_no_loh:
    print("Tools without LOH (shown as hatched bar):", sorted(tools_no_loh))
print("Tool order:", tools_order)
print("AUROC wide:\n", auroc_wide)
print("AUPRC wide:\n", auprc_wide)

In [None]:
# Colorblind-friendly colors (Nature-style): Copy loss, LOH, Copy gain.
# Entries are looked up through CNA_COLOR_INDEX (loss -> 0, loh -> 1, gain -> 2),
# so this order is fixed and independent of the order in which bars are drawn.
CNA_COLORS = ["#0072B2", "#E69F00", "#009E73"]  # blue, orange, green

def plot_grouped_bars(ax, data_wide, ylabel, title=None, tools_no_loh=None):
    """Draw a grouped bar chart: one group per tool, one bar per CNA type.

    Bars within a group follow ``CNA_ORDER_GROUPED`` (loss, gain, loh) and are
    centred on the tool's tick. For tools listed in ``tools_no_loh`` the LOH
    bar is replaced by a short hatched placeholder of fixed height, and a
    matching "LOH (no output)" entry is added to the legend.

    Args:
        ax: Matplotlib axes to draw on.
        data_wide: DataFrame indexed by tool with one column per CNA type in
            ``CNA_ORDER_GROUPED``. An empty table yields an empty, labelled
            axes instead of raising.
        ylabel: Y-axis label.
        title: Optional axes title.
        tools_no_loh: Optional collection of tool names without LOH output.

    Raises:
        KeyError: If a CNA type in ``CNA_ORDER_GROUPED`` is not a column of
            ``data_wide``.
    """
    from matplotlib.patches import Patch

    tools_no_loh = set(tools_no_loh) if tools_no_loh else set()
    n = len(data_wide)
    x = np.arange(n)
    width = 0.24
    na_height = 0.02  # height of the hatched "no LOH output" placeholder bar
    # Centre each group on its tick for any number of bars per group
    # (equals the former hard-coded ``j - 1`` when there are three bars).
    centre = (len(CNA_ORDER_GROUPED) - 1) / 2
    no_loh_rows = np.array([t in tools_no_loh for t in data_wide.index], dtype=bool)
    for j, ct in enumerate(CNA_ORDER_GROUPED):
        offset = (j - centre) * width
        col = CNA_COLORS[CNA_COLOR_INDEX[ct]]
        vals = np.asarray(data_wide[ct], dtype=float)
        placeholder = no_loh_rows if ct == "loh" else np.zeros(n, dtype=bool)
        real = ~placeholder
        # One bar() call per style instead of one per bar; skip empty
        # selections so no empty bar containers are created.
        if real.any():
            ax.bar(x[real] + offset, vals[real], width, color=col, linewidth=0)
        if placeholder.any():
            ax.bar(
                x[placeholder] + offset,
                np.full(int(placeholder.sum()), na_height),
                width,
                color=col,
                linewidth=0,
                hatch="///",
                edgecolor="gray",
            )
    # Explicit legend handles: one solid patch per CNA type, plus the hatched
    # placeholder entry when any tool lacks LOH output.
    handles = [Patch(facecolor=CNA_COLORS[CNA_COLOR_INDEX[ct]], label=CNA_TYPE_LABELS[ct]) for ct in CNA_ORDER_GROUPED]
    if tools_no_loh:
        handles.append(Patch(facecolor=CNA_COLORS[CNA_COLOR_INDEX["loh"]], hatch="///", edgecolor="gray", label="LOH (no output)"))
    ax.legend(handles=handles, loc="upper right", frameon=True)
    ax.set_xticks(x)
    ax.set_xticklabels(data_wide.index.tolist(), rotation=45, ha="right")
    ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    if n:
        # x[0] / x[-1] only exist when there is at least one tool.
        ax.set_xlim(x[0] - 0.6, x[-1] + 0.6)
    # Upper y limit: at least 1.0, at most 1.05. With no finite data to
    # measure (empty or all-NaN table), fall back to a peak of 1.0.
    values = np.asarray(data_wide.values, dtype=float)
    peak = 1.0 if values.size == 0 or np.isnan(values).all() else np.nanmax(values)
    ax.set_ylim(0, min(1.05, max(1.0, peak * 1.05)))
    ax.grid(False)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.yaxis.set_ticks_position("left")
    ax.xaxis.set_ticks_position("bottom")


def plot_stacked_bars(ax, data_wide, ylabel, title=None):
    """Draw a stacked bar chart: one bar per tool, one segment per CNA type.

    Segments are stacked bottom→top in ``CNA_ORDER_STACKED`` order (loss,
    gain, loh), so LOH ends up on top.

    Args:
        ax: Matplotlib axes to draw on.
        data_wide: DataFrame indexed by tool with one column per CNA type in
            ``CNA_ORDER_STACKED``. NaN entries are stacked as 0. An empty
            table yields an empty, labelled axes (no bars, no legend) instead
            of raising.
        ylabel: Y-axis label.
        title: Optional axes title.

    Raises:
        KeyError: If a CNA type in ``CNA_ORDER_STACKED`` is not a column of
            ``data_wide``.
    """
    n = len(data_wide)
    x = np.arange(n)
    width = 0.6
    bottom = np.zeros(n)
    if n:
        for ct in CNA_ORDER_STACKED:
            vals = np.asarray(data_wide[ct], dtype=float)
            # A NaN would make the running base NaN for that tool and hide
            # every segment stacked above it, so count missing values as 0.
            vals = np.where(np.isnan(vals), 0.0, vals)
            ax.bar(x, vals, width, bottom=bottom, label=CNA_TYPE_LABELS[ct], color=CNA_COLORS[CNA_COLOR_INDEX[ct]], linewidth=0)
            bottom = bottom + vals
    ax.set_xticks(x)
    ax.set_xticklabels(data_wide.index.tolist(), rotation=45, ha="right")
    ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    if n:
        # x[0] / x[-1] only exist when there is at least one tool.
        ax.set_xlim(x[0] - 0.6, x[-1] + 0.6)
    # Pin the lower limit to 0 and leave the upper limit to autoscaling.
    ax.set_ylim(0, None)
    if n:
        ax.legend(loc="upper right", frameon=True)
    ax.grid(False)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.yaxis.set_ticks_position("left")
    ax.xaxis.set_ticks_position("bottom")

In [None]:
# 1. AUROC only — grouped bars (bar order per tool: Copy loss, Copy gain, LOH)
fig, ax = plt.subplots(figsize=(4.25, 3.25))
plot_grouped_bars(ax, auroc_wide, "AUROC", title="CNA profile: AUROC", tools_no_loh=tools_no_loh)
fig.tight_layout()
# Save as PDF and as a 300 dpi PNG.
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auroc_grouped.pdf"), bbox_inches="tight")
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auroc_grouped.png"), dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# 2. AUPRC only — grouped bars (bar order per tool: Copy loss, Copy gain, LOH)
fig, ax = plt.subplots(figsize=(4.25, 3.25))
plot_grouped_bars(ax, auprc_wide, "AUPRC", title="CNA profile: AUPRC", tools_no_loh=tools_no_loh)
fig.tight_layout()
# Save as PDF and as a 300 dpi PNG.
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auprc_grouped.pdf"), bbox_inches="tight")
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auprc_grouped.png"), dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# 3. AUROC (panel a) and AUPRC (panel b) — grouped bars, side by side.
# Panel letters: 8 pt bold lowercase, placed in the top-left corner of each axes.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 3.25), sharey=False)
plot_grouped_bars(ax1, auroc_wide, "AUROC", title="AUROC", tools_no_loh=tools_no_loh)
plot_grouped_bars(ax2, auprc_wide, "AUPRC", title="AUPRC", tools_no_loh=tools_no_loh)
for ax, label in ((ax1, "a"), (ax2, "b")):
    ax.text(0.02, 0.98, label, transform=ax.transAxes, fontsize=8, fontweight="bold", va="top", ha="left")
fig.tight_layout()
# Save as PDF and as a 300 dpi PNG.
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auroc_auprc_grouped.pdf"), bbox_inches="tight")
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auroc_auprc_grouped.png"), dpi=300, bbox_inches="tight")
plt.show()

### Stacked bar plots (original)

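Same data, stacked layout: each bar = one tool; stack bottom→top = loss, gain, loh (LOH on top for easy comparison). Tools that do not report LOH contribute 0 for LOH, so their bars have no LOH segment and their total height is not directly comparable with tools that do.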

In [None]:
# 4. AUROC — stacked bars (stack bottom→top: Copy loss, Copy gain, LOH)
fig, ax = plt.subplots(figsize=(4.25, 3.25))
plot_stacked_bars(ax, auroc_wide, "AUROC", title="CNA profile: AUROC")
fig.tight_layout()
# Save as PDF and as a 300 dpi PNG.
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auroc_stacked.pdf"), bbox_inches="tight")
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auroc_stacked.png"), dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# 5. AUPRC — stacked bars (stack bottom→top: Copy loss, Copy gain, LOH)
fig, ax = plt.subplots(figsize=(4.25, 3.25))
plot_stacked_bars(ax, auprc_wide, "AUPRC", title="CNA profile: AUPRC")
fig.tight_layout()
# Save as PDF and as a 300 dpi PNG.
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auprc_stacked.pdf"), bbox_inches="tight")
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auprc_stacked.png"), dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# 6. AUROC (panel a) and AUPRC (panel b) — stacked bars, side by side.
# Panel letters: 8 pt bold lowercase, placed in the top-left corner of each axes.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 3.25), sharey=False)
plot_stacked_bars(ax1, auroc_wide, "AUROC", title="AUROC")
plot_stacked_bars(ax2, auprc_wide, "AUPRC", title="AUPRC")
for ax, label in ((ax1, "a"), (ax2, "b")):
    ax.text(0.02, 0.98, label, transform=ax.transAxes, fontsize=8, fontweight="bold", va="top", ha="left")
fig.tight_layout()
# Save as PDF and as a 300 dpi PNG.
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auroc_auprc_stacked.pdf"), bbox_inches="tight")
fig.savefig(os.path.join(OUT_DIR, "cna_profile_auroc_auprc_stacked.png"), dpi=300, bbox_inches="tight")
plt.show()