In [None]:
from pathlib import Path
import re
import pandas as pd

from src.paths import project_paths

# Repository paths: project_paths (from src.paths) is handed the current
# working directory and returns an object exposing the locations used below.
P = project_paths(Path.cwd())
PROJECT_ROOT, RESULTS_DIR = P.PROJECT_ROOT, P.RESULTS_DIR

# Summary tables live under the results directory; create the folder
# (including any missing parents) if it is not there yet.
SUM_DIR = RESULTS_DIR / "summary_tables"
SUM_DIR.mkdir(exist_ok=True, parents=True)

def pick_first_existing(candidates, label: str) -> Path:
    """Return the first candidate path that exists on disk.

    Parameters
    ----------
    candidates : iterable of path-like or None
        Locations to probe, in priority order. ``None`` entries are skipped.
        Any iterable is accepted, including one-shot generators.
    label : str
        Human-readable name of the thing being looked for; only used in the
        error message.

    Returns
    -------
    pathlib.Path
        The first candidate for which ``Path(candidate).exists()`` is true.

    Raises
    ------
    FileNotFoundError
        If no candidate exists. The message lists every path that was probed.
    """
    # Single pass: remember what was probed so the error message can list it
    # even when `candidates` is a generator that cannot be iterated twice.
    tried = []
    for raw in candidates:
        if raw is None:
            continue
        path = Path(raw)
        if path.exists():
            return path
        tried.append(path)

    listing = "\n".join(f" - {path}" for path in tried)
    raise FileNotFoundError(f"No encuentro {label}. Probé:\n" + listing)

# Input tables. Each file is looked up in the summary directory first and,
# failing that, in the legacy folder directly under the project root.
legacy_sum = PROJECT_ROOT / "summary_tables_final"

num_fn = "QA_dotplot_numeric_matrix_Level2final.csv"
markers_fn = "QA_dotplot_markers_2perLevel2_noRBC.csv"
totals_fn = "QA_dotplot_totals_by_Level2_plot_noRBC.csv"


def _resolve_input(filename: str) -> Path:
    """Locate `filename` in SUM_DIR, falling back to the legacy folder."""
    search_order = [SUM_DIR / filename, legacy_sum / filename]
    return pick_first_existing(search_order, label=filename)


num_path = _resolve_input(num_fn)
markers_path = _resolve_input(markers_fn)
totals_path = _resolve_input(totals_fn)

# Echo the resolved locations so a reader can see which copy was picked up.
for _tag, _value in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("SUM_DIR      :", SUM_DIR),
    ("num_path     :", num_path),
    ("markers_path :", markers_path),
    ("totals_path  :", totals_path),
):
    print(_tag, _value)

In [None]:
# Load the auxiliary tables.
# Markers table: one row per population with its two marker genes.
mk = pd.read_csv(markers_path)
needed_cols = {"Level2_final", "marker1", "marker2"}
if not needed_cols.issubset(set(mk.columns)):
    raise KeyError(f"markers_path debe tener columnas {needed_cols}. Tiene: {mk.columns.tolist()}")

# Totals table: number of cells per population.
tot = pd.read_csv(totals_path)
if "Level2_final" not in tot.columns or "n_cells" not in tot.columns:
    raise KeyError(f"totals_path debe tener Level2_final y n_cells. Tiene: {tot.columns.tolist()}")

# Key the totals by the *string* form of the population label. The summary
# step looks populations up with str(label), and the matrix row ids below are
# cast to str as well; without this cast, numeric labels read from the CSV
# would never match and every n_cells would silently fall back to 0.
# `tot` itself is left unmodified.
n_cells_map = (
    tot.assign(Level2_final=tot["Level2_final"].astype(str))
    .set_index("Level2_final")["n_cells"]
    .to_dict()
)

# Load the numeric matrix.
df = pd.read_csv(num_path)

# Detect the row-identity column: first known name present, otherwise fall
# back to the first column of the file.
row_id = next(
    (name for name in ["Level2_final", "Level2", "celltype", "cluster"] if name in df.columns),
    df.columns[0],
)

df[row_id] = df[row_id].astype(str)
cols = df.columns.tolist()

def find_col_for_gene(gene: str, kind: str, columns=None):
    """Find the column that holds a gene's mean or fraction statistic.

    Parameters
    ----------
    gene : str
        Gene/marker name (converted with ``str`` and stripped of surrounding
        whitespace). Empty strings and the textual placeholders "none", "nan"
        and "<na>" (any case) yield ``None``.
    kind : str
        ``"mean"`` selects columns tagged with one of mean/avg/mu; any other
        value (conventionally ``"frac"``) selects columns tagged with one of
        frac/pct/percent/nonzero/nz. Tags are matched case-insensitively.
    columns : iterable of str, optional
        Column names to search. When omitted, the module-level ``cols`` list
        is used, so existing two-argument calls keep working.

    Returns
    -------
    str or None
        The matching column name, or ``None`` when nothing matches.

    Notes
    -----
    Matching is attempted in two rounds: first the gene as a whole token
    (not adjacent to another letter or digit), then the gene as a plain
    substring. Within a round, ties are broken by preferring names that
    contain an underscore, then shorter names, then original column order.

    The statistic tag is searched in the column name *with the gene removed*;
    otherwise letters of the gene itself could satisfy the tag (e.g. "mu" in
    "MUC1_frac" would make it look like a mean column).
    """
    g = str(gene).strip()
    if g == "" or g.lower() in {"none", "nan", "<na>"}:
        return None

    names = cols if columns is None else list(columns)
    keys = ["mean", "avg", "mu"] if kind == "mean" else ["frac", "pct", "percent", "nonzero", "nz"]

    def _tagged(remainder: str) -> bool:
        # True when what is left of the column name carries a statistic tag.
        lowered = remainder.lower()
        return any(k in lowered for k in keys)

    def _pick(matches):
        # min() returns the first minimal element, i.e. the same winner as a
        # stable sort followed by taking element 0.
        if not matches:
            return None
        return min(matches, key=lambda name: ("_" not in name, len(name)))

    # Round 1: gene as a whole token.
    token = re.compile(rf"(?<![A-Za-z0-9]){re.escape(g)}(?![A-Za-z0-9])")
    exact = [c for c in names if token.search(c) and _tagged(token.sub(" ", c))]
    if exact:
        return _pick(exact)

    # Round 2: gene as a plain substring (looser fallback).
    loose = [c for c in names if g in c and _tagged(c.replace(g, " "))]
    return _pick(loose)

In [None]:
# Build the summary: two markers per population.
# Stage 1 - for every (population, marker) pair, resolve which columns of the
# numeric matrix carry its mean and fraction values.
meta_rows = []
for _, r in mk.iterrows():
    ct = str(r["Level2_final"])
    for m in [r["marker1"], r["marker2"]]:
        gene = str(m)
        meta_rows.append({
            "Level2_final": ct,
            "n_cells": int(n_cells_map.get(ct, 0)),
            "marker": gene,
            "mean_col": find_col_for_gene(gene, "mean"),
            "frac_col": find_col_for_gene(gene, "frac"),
        })

# Explicit column lists keep the frames well-formed even when the markers
# table has no rows; a frame built from an empty list would otherwise have no
# columns and the checks further down would fail with KeyError.
meta = pd.DataFrame(
    meta_rows,
    columns=["Level2_final", "n_cells", "marker", "mean_col", "frac_col"],
)

df_idx = df.set_index(row_id)


def _matrix_value(row_key, col_name):
    """Return df_idx.loc[row_key, col_name] as a float, or None if unavailable.

    None is returned when the column was not resolved, is not present in the
    matrix, or the population has no row. A duplicated row id raises a
    ValueError naming the offending row instead of failing inside float().
    """
    if col_name is None or col_name not in df_idx.columns or row_key not in df_idx.index:
        return None
    cell = df_idx.loc[row_key, col_name]
    if isinstance(cell, (pd.Series, pd.DataFrame)):
        raise ValueError(
            f"Identificador de fila duplicado en la matriz numérica: {row_key!r} (columna {col_name!r})"
        )
    return float(cell)


# Stage 2 - pull the numeric values for each resolved pair. Iterating the
# plain dicts avoids a needless round-trip through DataFrame.iterrows().
out_rows = [
    {
        "Level2_final": row["Level2_final"],
        "n_cells": row["n_cells"],
        "marker": row["marker"],
        "mean_log1p": _matrix_value(row["Level2_final"], row["mean_col"]),
        "frac_nonzero": _matrix_value(row["Level2_final"], row["frac_col"]),
        "mean_source_col": row["mean_col"],
        "frac_source_col": row["frac_col"],
    }
    for row in meta_rows
]

out = pd.DataFrame(
    out_rows,
    columns=[
        "Level2_final",
        "n_cells",
        "marker",
        "mean_log1p",
        "frac_nonzero",
        "mean_source_col",
        "frac_source_col",
    ],
)

# Sanity report: how many rows were produced and how many values are missing.
print("\n[CHECK] filas resumen:", out.shape[0])
print("[CHECK] missing mean:", int(out["mean_log1p"].isna().sum()),
      "| missing frac:", int(out["frac_nonzero"].isna().sum()))

out_path = SUM_DIR / "QA_dotplot_numeric_summary_2markers_Level2final.csv"
out.to_csv(out_path, index=False)
print("Saved:", out_path)

print("\nPreview (10 filas):")
print(out.head(10).to_string(index=False))