In [None]:
CELDA 1 — Imports + paths del repo + outputs

In [None]:
from pathlib import Path
import io
import contextlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

from sccoda.util import cell_composition_data as dat
from sccoda.util.comp_ana import CompositionalAnalysis

from src.paths import project_paths

# Resolve the repository layout from the current working directory.
# project_paths is expected to return a mapping that provides the three keys read below.
P = project_paths(Path.cwd())
PROJECT_ROOT, RESULTS_DIR, FIGURES_DIR = (
    P[_key] for _key in ("PROJECT_ROOT", "RESULTS_DIR", "FIGURES_DIR")
)

# Output folders used by this notebook (tables and figures); created on demand.
OUT_SUMMARY = RESULTS_DIR / "summary_tables" / "scCODA" / "level2_final"
OUT_FIG = FIGURES_DIR / "scCODA" / "level2_final"
for _out_dir in (OUT_SUMMARY, OUT_FIG):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Echo every resolved location so the run log records where outputs go.
for _label, _location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("RESULTS_DIR :", RESULTS_DIR),
    ("FIGURES_DIR :", FIGURES_DIR),
    ("OUT_SUMMARY :", OUT_SUMMARY),
    ("OUT_FIG     :", OUT_FIG),
):
    print(_label, _location)


In [None]:
CELDA 2 — Localizar input counts (NB15) + cargar CSV + parámetros

In [None]:
# Expected input: the per-patient count table written by notebook NB15.
# Locations are listed in order of preference; the first existing one is used.
_summary_root = RESULTS_DIR / "summary_tables"
candidates = [
    _summary_root / "composition_boxplots" / "cell_counts_Level2final_by_patient.csv",
    _summary_root / "cell_counts_Level2final_by_patient.csv",
]

counts_csv = None
for _candidate in candidates:
    if _candidate.exists():
        counts_csv = _candidate
        break
print("counts_csv:", counts_csv)

if counts_csv is None:
    _tried = "\n".join(f"- {x}" for x in candidates)
    raise FileNotFoundError(
        "No encuentro cell_counts_Level2final_by_patient.csv.\nProbé:\n" + _tried
    )

# ---- Parameters ----
# Column of the counts table used as the covariate in the model formula.
COVARIATE_COL = "disease"

# Reference selection: minimum fraction of patients in which a cell type must be
# present (primary threshold, then a more permissive fallback).
PRESENT_FRAC_PRIMARY = 0.90
PRESENT_FRAC_FALLBACK = 0.75

# HMC sampling sizes passed to sample_hmc.
NUM_RESULTS = 2000
NUM_BURNIN = 1000

# Maximum number of rows rendered in the results-table PNG.
MAX_ROWS_RESULTS_TABLE = 25

# Optional supplementary boxplot of input proportions (top N cell types + "Other").
MAKE_INPUT_BOXPLOT = True
TOP_N_FOR_INPUT_PLOT = 25

# Load the counts table; the Series check is purely defensive.
df = pd.read_csv(counts_csv)
if isinstance(df, pd.Series):
    df = df.to_frame()

print("Loaded counts:", df.shape)
print(df.head())


In [None]:
CELDA 3 — Preparar DataFrame para scCODA (RBC-out robusto) + proportions

In [None]:
# Build the scCODA input frame: one row per patient, the covariate column, and one
# integer count column per cell type. Also derives per-patient proportions.

# The patient identifier and the covariate must both be present.
needed_cols = {"patientID", COVARIATE_COL}
missing = sorted(needed_cols - set(df.columns))
if missing:
    raise KeyError(f"Faltan columnas requeridas en counts_csv: {missing}")

# Every column that is not metadata is treated as a cell-type count column.
celltype_cols = [c for c in df.columns if c not in ["patientID", COVARIATE_COL, "total_cells_patient"]]

# Drop an RBC column if one is still present in the table.
if "RBC" in celltype_cols:
    print("[WARN] Columna RBC encontrada en counts_csv. Eliminándola (post-RBC-out).")
    celltype_cols = [c for c in celltype_cols if c != "RBC"]

if len(celltype_cols) == 0:
    raise RuntimeError("No hay columnas de cell types en counts_csv (tras filtrar columnas base).")

df_sccoda = df[["patientID", COVARIATE_COL] + celltype_cols].copy()

# Counts must be finite, whole and non-negative. Validate BEFORE casting: a plain
# astype(int) would silently truncate fractional values (e.g. 3.7 -> 3).
for c in celltype_cols:
    numeric_counts = pd.to_numeric(df_sccoda[c], errors="raise")
    as_float = numeric_counts.astype(float)
    if not np.isfinite(as_float).all():
        raise ValueError(f"Counts no finitos (NaN/inf) detectados en columna {c}.")
    if (as_float != np.floor(as_float)).any():
        raise ValueError(f"Counts no enteros detectados en columna {c} (no se truncan silenciosamente).")
    if (as_float < 0).any():
        raise ValueError(f"Counts negativos detectados en columna {c} (esto no debería ocurrir).")
    df_sccoda[c] = numeric_counts.astype(int)

# Remove cell types that are never observed in any patient.
zero_cols = [c for c in celltype_cols if int(df_sccoda[c].sum()) == 0]
if zero_cols:
    print("[WARN] Columnas con suma 0 (se eliminan):", zero_cols)
    celltype_cols = [c for c in celltype_cols if c not in zero_cols]
    df_sccoda = df_sccoda[["patientID", COVARIATE_COL] + celltype_cols].copy()

# patientID must become the index; both identifier and covariate are stored as strings.
df_sccoda["patientID"] = df_sccoda["patientID"].astype(str)
df_sccoda[COVARIATE_COL] = df_sccoda[COVARIATE_COL].astype(str)
df_sccoda = df_sccoda.set_index("patientID")

# A per-patient table should have one row per patient; surface repeats instead of hiding them.
duplicated_ids = df_sccoda.index[df_sccoda.index.duplicated()].unique().tolist()
if duplicated_ids:
    print("[WARN] patientID duplicados en counts_csv:", duplicated_ids)

print("\nDataFrame para scCODA (head):")
print(df_sccoda.head())
print("Shape:", df_sccoda.shape)
print("N cell types:", len(celltype_cols))
print(f"{COVARIATE_COL} counts:")
print(df_sccoda[COVARIATE_COL].value_counts())

if df_sccoda[COVARIATE_COL].nunique() < 2:
    raise RuntimeError(f"Covariate '{COVARIATE_COL}' solo tiene 1 nivel. scCODA no tiene comparación que hacer.")

# Per-patient proportions. A patient with zero total cells gets NaN proportions
# (total replaced by NaN to avoid division by zero); report such patients explicitly.
counts_mat = df_sccoda[celltype_cols].astype(float)
tot = counts_mat.sum(axis=1)
empty_patients = tot.index[tot == 0].tolist()
if empty_patients:
    print("[WARN] Pacientes con 0 células totales (proporciones = NaN):", empty_patients)
tot = tot.replace(0, np.nan)
props_mat = counts_mat.div(tot, axis=0)


In [None]:
CELDA 4 — Selección de referencia explícita (reproducible) + guardar decisión

In [None]:
# Explicit reference selection: among cell types present in enough patients, pick the
# one whose proportion has the lowest coefficient of variation (std / mean), then
# record the decision in a text file.

# Fraction of patients in which each cell type has at least one cell.
present_frac = (counts_mat > 0).mean(axis=0)

# Candidate pool: primary threshold, then fallback threshold, then "present anywhere".
candidates_ref = present_frac[present_frac >= PRESENT_FRAC_PRIMARY].index.tolist()
threshold_used = PRESENT_FRAC_PRIMARY

if len(candidates_ref) == 0:
    candidates_ref = present_frac[present_frac >= PRESENT_FRAC_FALLBACK].index.tolist()
    threshold_used = PRESENT_FRAC_FALLBACK

if len(candidates_ref) == 0:
    candidates_ref = present_frac[present_frac > 0].index.tolist()
    threshold_used = 0.0

REFERENCE_CELL_TYPE = None
ref_reason = ""

if len(candidates_ref) > 0:
    means = props_mat[candidates_ref].mean(axis=0)
    stds  = props_mat[candidates_ref].std(axis=0)
    # Zero means are masked to avoid division by zero; undefined CVs are dropped.
    cv = (stds / means.replace(0, np.nan)).replace([np.inf, -np.inf], np.nan).dropna()

    if len(cv) > 0:
        # idxmin returns the first label at the minimum, so ties resolve deterministically.
        REFERENCE_CELL_TYPE = cv.idxmin()
        ref_reason = f"min_CV_among_present>={threshold_used:.2f} (chosen={REFERENCE_CELL_TYPE})"
    else:
        REFERENCE_CELL_TYPE = "automatic"
        ref_reason = "fallback_automatic (CV undefined)"
else:
    REFERENCE_CELL_TYPE = "automatic"
    ref_reason = "fallback_automatic (no candidates)"

# Tag embedded in every output filename. Besides spaces, path separators and characters
# that are invalid in filenames on common platforms are replaced, so a cell type such
# as "Mono/Mac" cannot redirect outputs into a sub-directory or make the write fail.
_UNSAFE_FILENAME_CHARS = ' /\\:*?"<>|'
ref_tag = "".join(
    "_" if ch in _UNSAFE_FILENAME_CHARS else ch
    for ch in str(REFERENCE_CELL_TYPE)
)

print("\n=== Reference selection ===")
print(f"Present fraction threshold used: {threshold_used:.2f}")
print("Top present_frac (10):")
print(present_frac.sort_values(ascending=False).head(10))
print("Candidates (head):", candidates_ref[:20], ("..." if len(candidates_ref) > 20 else ""))
print("REFERENCE_CELL_TYPE:", REFERENCE_CELL_TYPE)
print("ref_tag:", ref_tag)
print("Reason:", ref_reason)

# Persist the decision next to the other summary tables.
ref_path = OUT_SUMMARY / f"scCODA_Level2final_reference_ref-{ref_tag}.txt"
with open(ref_path, "w", encoding="utf-8") as f:
    f.write("scCODA reference selection\n")
    f.write(f"- present_frac_primary: {PRESENT_FRAC_PRIMARY}\n")
    f.write(f"- present_frac_fallback: {PRESENT_FRAC_FALLBACK}\n")
    f.write(f"- threshold_used: {threshold_used}\n")
    f.write(f"- reference_cell_type: {REFERENCE_CELL_TYPE}\n")
    f.write(f"- reason: {ref_reason}\n")
print("Saved:", ref_path)


In [None]:
CELDA 5 — Ejecutar scCODA + guardar summary/credible_effects (robusto a 0.1.9)

In [None]:
# Wrap the prepared frame as scCODA input, using the covariate column as metadata.
data = dat.from_pandas(df_sccoda, covariate_columns=[COVARIATE_COL])
print("\nscCODA data:", data)

# Model: the covariate is the formula; the reference comes from the selection step.
model = CompositionalAnalysis(
    data,
    formula=COVARIATE_COL,
    reference_cell_type=REFERENCE_CELL_TYPE,
)

# NOTE(review): no random seed is set in this cell, so repeated runs may differ
# slightly — confirm whether that is intended.
result = model.sample_hmc(num_results=NUM_RESULTS, num_burnin=NUM_BURNIN)

# summary() prints its report, so capture stdout to obtain it as text.
buf = io.StringIO()
with contextlib.redirect_stdout(buf):
    _ = result.summary()
summary_txt = buf.getvalue()

# credible_effects() may return a Series (or another structure); coerce to a DataFrame copy.
df_credible = result.credible_effects()
if isinstance(df_credible, pd.Series):
    df_credible = df_credible.to_frame()
elif not isinstance(df_credible, pd.DataFrame):
    df_credible = pd.DataFrame(df_credible)
df_credible = df_credible.copy()

# Normalise the column name: a lone boolean column, or a column called
# "Final Parameter", is exposed as "credible".
if df_credible.shape[1] == 1:
    col0 = df_credible.columns[0]
    if str(df_credible[col0].dtype) in ("bool", "boolean"):
        df_credible = df_credible.rename(columns={col0: "credible"})

_has_final = "Final Parameter" in df_credible.columns
_has_credible = "credible" in df_credible.columns
if _has_final and not _has_credible:
    df_credible = df_credible.rename(columns={"Final Parameter": "credible"})

print("\n=== credible_effects() (head) ===")
print(df_credible.head(10))
print("credible_effects shape:", df_credible.shape)
print("credible_effects columns:", list(df_credible.columns))

# Persist the textual summary and the credible-effects table, tagged by reference.
out_summary_txt = OUT_SUMMARY / f"scCODA_Level2final_summary_ref-{ref_tag}.txt"
out_credible_csv = OUT_SUMMARY / f"scCODA_Level2final_credible_effects_ref-{ref_tag}.csv"

with open(out_summary_txt, mode="w", encoding="utf-8") as f:
    f.write(summary_txt)
df_credible.to_csv(out_credible_csv, index=True)

print("\nSaved:", out_summary_txt)
print("Saved:", out_credible_csv)


In [None]:
CELDA 6 — Panel “resultados” (Figura 2C) a PNG

In [None]:
def _infer_credible_column(df):
    cols = list(getattr(df, "columns", []))
    if "credible" in cols:
        return "credible"
    for c in cols:
        if str(df[c].dtype) in ("bool", "boolean"):
            return c
    for c in cols:
        if str(c).lower().strip() in ("final parameter", "final_parameter"):
            return c
    return None

def _infer_effect_column(df):
    cols = list(getattr(df, "columns", []))
    preferred = []
    for c in cols:
        cl = str(c).lower()
        if ("effect" in cl) or ("beta" in cl) or ("coef" in cl):
            preferred.append(c)
    return preferred[0] if preferred else None

# Render the scCODA results as a PNG panel: a short text header plus a table of
# (preferably credible) effects taken from df_credible.

# Column holding the credible flag and, if any, an effect-size column (either may be None).
cred_col = _infer_credible_column(df_credible)
eff_col  = _infer_effect_column(df_credible)

tbl = df_credible.copy()

# Move the index into regular columns so the cell type is readable in the table.
# A plain unnamed index is labelled "celltype" first.
if not isinstance(tbl.index, pd.MultiIndex) and tbl.index.name is None:
    tbl.index.name = "celltype"
tbl = tbl.reset_index()

n_rows = int(tbl.shape[0])

# Keep only credible rows when the flag can be interpreted and at least one is credible.
tbl_show = tbl.copy()
n_credible = None
if cred_col is not None and cred_col in tbl_show.columns:
    try:
        cred_bool = pd.to_numeric(tbl_show[cred_col], errors="coerce").fillna(False).astype(bool)
        n_credible = int(cred_bool.sum())
        if n_credible > 0:
            tbl_show = tbl_show.loc[cred_bool].copy()
    except Exception as exc:
        # Best-effort: fall back to the unfiltered table, but say why.
        print(f"[WARN] No se pudo interpretar la columna '{cred_col}' como credible: {exc!r}")
        n_credible = None

# Cap the number of rows; when an effect column exists, keep the largest |effect| first.
if int(tbl_show.shape[0]) > MAX_ROWS_RESULTS_TABLE:
    if eff_col is not None and eff_col in tbl_show.columns:
        try:
            vals = pd.to_numeric(tbl_show[eff_col], errors="coerce")
            tbl_show = (
                tbl_show.assign(_abs_eff=vals.abs())
                        .sort_values("_abs_eff", ascending=False)
                        .drop(columns=["_abs_eff"])
            )
        except Exception as exc:
            # Best-effort: keep the original row order, but report the failure.
            print(f"[WARN] No se pudo ordenar por '{eff_col}': {exc!r}")
    tbl_show = tbl_show.head(MAX_ROWS_RESULTS_TABLE).copy()

# Format numeric columns to 3 significant digits. pandas' dtype helpers are used
# instead of np.issubdtype, which raises TypeError on pandas extension dtypes
# (e.g. string or nullable columns). Booleans are excluded so True/False are not
# rendered as 1/0.
tbl_disp = tbl_show.copy()
for c in tbl_disp.columns:
    col_dtype = tbl_disp[c].dtype
    if pd.api.types.is_numeric_dtype(col_dtype) and not pd.api.types.is_bool_dtype(col_dtype):
        tbl_disp[c] = tbl_disp[c].map(lambda x: f"{x:.3g}" if pd.notnull(x) else "")

fig = plt.figure(figsize=(12.5, 6.8))
gs = fig.add_gridspec(2, 1, height_ratios=[0.28, 0.72], hspace=0.02)

ax_txt = fig.add_subplot(gs[0, 0])
ax_tbl = fig.add_subplot(gs[1, 0])
ax_txt.axis("off")
ax_tbl.axis("off")

# Header text summarising the run.
lines = [
    f"scCODA results — Level2_final (reference = {REFERENCE_CELL_TYPE})",
    f"Input: counts per patient; covariate: {COVARIATE_COL}",
    f"N cell types in model: {len(celltype_cols)} | N patients: {df_sccoda.shape[0]}",
]
if n_credible is None:
    lines.append("Credible effects: (credible column not found in output)")
else:
    lines.append(f"Credible effects detected: {n_credible} / {n_rows}")
lines.append(f"Shown in table: {int(tbl_show.shape[0])} rows (full CSV saved)")

ax_txt.text(0.01, 0.98, "\n".join(lines), va="top", ha="left", fontsize=11)

# Axes.table cannot be built from zero rows, so show a placeholder in that case.
if tbl_disp.shape[0] > 0:
    table = ax_tbl.table(
        cellText=tbl_disp.values,
        colLabels=tbl_disp.columns.tolist(),
        loc="center",
        cellLoc="left",
    )
    table.auto_set_font_size(False)
    table.set_fontsize(8)
    table.scale(1, 1.2)
else:
    ax_tbl.text(0.5, 0.5, "(no rows to display)", va="center", ha="center", fontsize=9)

out_png_results = OUT_FIG / f"Fig2C_scCODA_Level2final_results_ref-{ref_tag}.png"
fig.savefig(out_png_results, dpi=300, bbox_inches="tight")
plt.close(fig)

print("Saved results panel:", out_png_results)


In [None]:
CELDA 7 — (Opcional) Input composition boxplot (Top N + Other) + QA

In [None]:
# Optional supplementary output: per-patient input proportions as grouped boxplots
# (top-N cell types plus a pooled "Other"), and a QA table with the same groups.
if MAKE_INPUT_BOXPLOT:
    # Rank cell types by mean proportion; everything beyond the top N is pooled.
    mean_props = props_mat.mean(axis=0).sort_values(ascending=False)
    top_cols = mean_props.head(TOP_N_FOR_INPUT_PLOT).index.tolist()
    other_cols = [c for c in props_mat.columns if c not in top_cols]

    plot_props = props_mat[top_cols].copy()
    if len(other_cols) > 0:
        # min_count=1 keeps NaN for patients whose proportions are all undefined,
        # instead of reporting a misleading 0.
        plot_props["Other"] = props_mat[other_cols].sum(axis=1, min_count=1)
        plot_cols = top_cols + ["Other"]
    else:
        plot_cols = top_cols

    plot_df = plot_props.copy()
    plot_df[COVARIATE_COL] = df_sccoda[COVARIATE_COL].astype(str)
    plot_df["patientID"] = plot_df.index.astype(str)

    # Long format: one row per (patient, cell type).
    long = plot_df.melt(
        id_vars=["patientID", COVARIATE_COL],
        value_vars=plot_cols,
        var_name="celltype",
        value_name="proportion",
    )

    # One colour per covariate level, taken from the active matplotlib colour cycle.
    diseases = sorted(long[COVARIATE_COL].unique())
    cycle = plt.rcParams["axes.prop_cycle"].by_key().get("color", ["C0", "C1", "C2", "C3"])
    colors = {dis: cycle[i % len(cycle)] for i, dis in enumerate(diseases)}

    fig_w = max(14, 0.45 * len(plot_cols))
    fig, ax = plt.subplots(figsize=(fig_w, 6))

    n_groups = len(diseases)
    base_positions = np.arange(len(plot_cols))
    width = 0.35 if n_groups == 2 else 0.25
    offsets = np.linspace(-width, width, n_groups)
    # Cap the box width by the spacing between neighbouring groups so boxes do not
    # overlap when there are many covariate levels (unchanged for 2 or 3 levels).
    group_spacing = float(offsets[1] - offsets[0]) if n_groups > 1 else 2 * width
    box_width = min(width * 0.8, group_spacing * 0.8)

    # Collect the NaN-free values of every (cell type, level) group once; the boxplot,
    # the jittered points and the QA table all use exactly these values.
    all_positions = []
    all_data = []
    group_keys = []
    for i, ct in enumerate(plot_cols):
        ct_mask = long["celltype"] == ct
        for j, dis in enumerate(diseases):
            vals = long.loc[ct_mask & (long[COVARIATE_COL] == dis), "proportion"].dropna().values
            all_data.append(vals)
            all_positions.append(base_positions[i] + offsets[j])
            group_keys.append((ct, dis))

    ax.boxplot(
        all_data,
        positions=all_positions,
        widths=box_width,
        showfliers=False,
        patch_artist=False,
    )

    # Overlay the individual patients with a small horizontal jitter (fixed seed).
    rng = np.random.default_rng(0)
    for (ct, dis), x0, vals in zip(group_keys, all_positions, all_data):
        jitter = rng.normal(0, width * 0.08, size=len(vals))
        ax.scatter(
            np.full_like(vals, x0, dtype=float) + jitter,
            vals,
            s=10,
            alpha=0.7,
            color=colors[dis],
        )

    # boxplot() replaces the ticks with its own positions; restore one tick per cell type.
    ax.set_xticks(base_positions)
    ax.set_xticklabels(plot_cols, rotation=90, ha="center", fontsize=7)
    ax.set_ylabel("Proportion of cells per patient")
    ax.set_title(f"scCODA input composition (Level2_final) — top{TOP_N_FOR_INPUT_PLOT} + Other | ref={REFERENCE_CELL_TYPE}")
    ax.set_ylim(0, max(0.05, float(long["proportion"].max()) * 1.15))

    legend_handles = [
        Line2D([0], [0], marker='s', linestyle='None', markersize=8, label=dis,
               markerfacecolor=colors[dis], markeredgecolor=colors[dis])
        for dis in diseases
    ]
    ax.legend(handles=legend_handles, title=COVARIATE_COL, frameon=False, loc="upper right")

    fig.tight_layout()
    out_png_box = OUT_FIG / f"SuppFig_scCODA_Level2final_input_boxplots_top{TOP_N_FOR_INPUT_PLOT}_ref-{ref_tag}.png"
    fig.savefig(out_png_box, dpi=300)
    plt.close(fig)
    print("Saved input composition (supp):", out_png_box)

    # QA table: summary statistics for the very same groups that were plotted.
    qa_rows = []
    for (ct, dis), vals in zip(group_keys, all_data):
        if len(vals) == 0:
            qa_rows.append([ct, dis, 0, np.nan, np.nan, np.nan, np.nan])
            continue
        qa_rows.append([
            ct,
            dis,
            int(len(vals)),
            float(np.mean(vals)),
            float(np.median(vals)),
            float(np.quantile(vals, 0.25)),
            float(np.quantile(vals, 0.75)),
        ])

    qa_df = pd.DataFrame(qa_rows, columns=[
        "celltype", COVARIATE_COL, "n_patients",
        "mean_prop", "median_prop", "q25_prop", "q75_prop",
    ])
    qa_path = OUT_SUMMARY / f"QA_scCODA_Level2final_boxplot_stats_top{TOP_N_FOR_INPUT_PLOT}_ref-{ref_tag}.csv"
    qa_df.to_csv(qa_path, index=False)
    print("Saved QA table:", qa_path)

print("\n[OK] scCODA Level2_final completado (summary + credible + results panel [+ optional input plot + QA]).")
