In [None]:
# ============================================================
# ANNEX — UMAPs por linaje + dotplots Level2_final (2 marcadores)
#
# Inputs:
#   - Objeto filtrado RBC-out (h5ad) desde results/
#   - Level2_final_map.json desde results/summary_tables/
#   - UMAP_Harmony_embeddings.csv desde results/summary_tables/
#
# Outputs:
#   - figures/annex_lineage_umap_dotplots/
#     - Annex_UMAP_<linaje>_colored_by_Level2final.png
#     - Annex_Dotplot_<linaje>_Level2final_2markers.png
# ============================================================

from pathlib import Path
import re, json
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt

from src.paths import project_paths

# Resolve the project layout from the current working directory.
P = project_paths(Path.cwd())
PROJECT_ROOT = P.PROJECT_ROOT
RESULTS_DIR = P.RESULTS_DIR
FIGURES_DIR = P.FIGURES_DIR

# Folder holding the summary tables this notebook reads.
# It is created eagerly, so it always exists once this cell has run.
SUM_DIR = RESULTS_DIR / "summary_tables"
SUM_DIR.mkdir(parents=True, exist_ok=True)

# Destination folder for every figure written by this notebook.
ANNEX_DIR = FIGURES_DIR / "annex_lineage_umap_dotplots"
ANNEX_DIR.mkdir(parents=True, exist_ok=True)

# Older project layout; these locations are only consulted as fallbacks.
legacy_sum = PROJECT_ROOT / "summary_tables_final"
legacy_results = PROJECT_ROOT / "data_processed"  # legacy fallback only

def pick_first_existing(candidates, label: str) -> Path:
    """Return the first candidate path that exists on disk.

    Args:
        candidates: Iterable of path-like objects, checked in order. Any
            iterable is accepted, including one-shot generators.
        label: Human-readable description of the artifact, used in the error.

    Returns:
        The first existing candidate, as a ``Path``.

    Raises:
        FileNotFoundError: If no candidate exists. The message lists every
            path that was tried.
    """
    # Materialise once: the candidates are walked here and again when building
    # the error message, which would come out empty for a consumed generator.
    tried = [Path(c) for c in candidates]
    for path in tried:
        if path.exists():
            return path
    raise FileNotFoundError(
        f"No encuentro {label}. Probé:\n" + "\n".join(f" - {path}" for path in tried)
    )

def find_first_match(root: Path, patterns, label: str) -> Path:
    """Recursively search ``root`` and return the first hit for the first matching pattern.

    Patterns are tried in the order given. For the first pattern that matches
    anything, the smallest path in sort order is returned.

    Args:
        root: Directory searched recursively.
        patterns: Glob patterns, in priority order.
        label: Human-readable description of the artifact, used in the error.

    Raises:
        FileNotFoundError: If no pattern matches anything under ``root``.
    """
    base = Path(root)
    for glob_pattern in patterns:
        # The minimum of the matches is the same element a full sort would put first.
        best = min(base.rglob(glob_pattern), default=None)
        if best is not None:
            return best
    raise FileNotFoundError(f"No encuentro {label} en {base}. Patterns: {patterns}")

def safe_name(x: str) -> str:
    """Turn an arbitrary label into a token that is safe to embed in a file name.

    Whitespace runs are replaced by ``_`` first. Then every run of characters
    other than ASCII letters, digits, ``_`` and ``-`` is replaced by ``_``.
    Leading and trailing underscores are stripped.
    """
    cleaned = str(x)
    # Two passes, in this order: whitespace first, then any other unsafe run.
    for pattern in (r"\s+", r"[^A-Za-z0-9_\-]+"):
        cleaned = re.sub(pattern, "_", cleaned)
    return cleaned.strip("_")

# Echo the resolved locations so a misconfigured project root is easy to spot.
for _tag, _where in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("RESULTS_DIR :", RESULTS_DIR),
    ("FIGURES_DIR :", FIGURES_DIR),
    ("SUM_DIR     :", SUM_DIR),
    ("ANNEX_DIR   :", ANNEX_DIR),
):
    print(_tag, _where)

# Legacy locations are optional, so also report whether each one is present.
for _tag, _where in (
    ("legacy_sum  :", legacy_sum),
    ("legacy_data :", legacy_results),
):
    print(_tag, _where, "| exists:", _where.exists())

In [None]:
# Inputs
# Filtered (RBC-out) object: the first existing candidate wins, with the legacy
# location consulted last.
OUT_FILTER = pick_first_existing(
    [
        RESULTS_DIR / "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad",
        RESULTS_DIR / "TFM_CIRRHOSIS_main_filtered.h5ad",
        RESULTS_DIR / "TFM_CIRRHOSIS_main_clean.h5ad",
        legacy_results / "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad",
    ],
    label="h5ad filtrado RBC-out (main_filtered_for_analysis)"
)


def _resolve_summary_artifact(filename: str, patterns) -> Path:
    """Locate a summary-table artifact produced by an earlier notebook.

    Lookup order:
      1. ``SUM_DIR / filename`` and ``legacy_sum / filename`` (exact name).
      2. Recursive glob for ``patterns`` under ``SUM_DIR``.
      3. Recursive glob for ``patterns`` under ``RESULTS_DIR``.

    Args:
        filename: Canonical file name of the artifact.
        patterns: Glob patterns for the recursive fallback, in priority order.

    Returns:
        Path to the artifact.

    Raises:
        FileNotFoundError: If none of the lookups finds a file.
    """
    try:
        return pick_first_existing(
            [SUM_DIR / filename, legacy_sum / filename],
            label=filename,
        )
    except FileNotFoundError:
        pass

    # SUM_DIR is searched first so files inside summary_tables are preferred.
    # RESULTS_DIR is always tried next: SUM_DIR exists by the time this runs
    # (it is created at setup), so an existence test on it cannot select the
    # broader root.
    last_error = None
    for search_root in (SUM_DIR, RESULTS_DIR):
        try:
            return find_first_match(
                search_root,
                patterns=patterns,
                label=f"{filename} (recursive)",
            )
        except FileNotFoundError as err:
            last_error = err
    raise last_error


# Level2_final_map.json (post-NB11 artifact) may sit in a subfolder of summary_tables.
MAP_PATH = _resolve_summary_artifact(
    "Level2_final_map.json",
    patterns=["Level2_final_map*.json", "*Level2_final*map*.json"],
)

# Harmony embeddings (post-NB13 artifact) may sit in a subfolder of summary_tables.
EMB_PATH = _resolve_summary_artifact(
    "UMAP_Harmony_embeddings.csv",
    patterns=["UMAP_Harmony_embeddings*.csv", "*Harmony*embeddings*.csv"],
)

print("OUT_FILTER:", OUT_FILTER)
print("MAP_PATH  :", MAP_PATH)
print("EMB_PATH  :", EMB_PATH)

with open(MAP_PATH, "r", encoding="utf-8") as f:
    level2_final_map = json.load(f)

In [None]:
# Import markers
try:
    from src import markers as M
    geneMarkers_level2, symbols_to_varnames = M.geneMarkers_level2, M.symbols_to_varnames
    print("Using markers from: src/markers.py")
except Exception as import_error:
    # Fall back to the legacy package when src.markers cannot be used.
    print("[WARN] No pude importar src.markers. Intento fallback legacy. Error:", repr(import_error))
    from AI_Package.markers.markers import geneMarkers_level2, symbols_to_varnames
    print("Using markers from: AI_Package.markers.markers (legacy fallback)")

# Flatten the nested marker table into {Level2 subtype: [gene symbols]},
# skipping subtypes whose marker collection is empty.
lvl2_to_symbols = {
    subtype: list(symbols)
    for per_lineage in geneMarkers_level2.values()
    for subtype, symbols in per_lineage.items()
    if symbols
}

# Two-marker overrides. Despite the name they act as a fallback: an entry is
# applied only when the subtype is missing from the table or has fewer than
# two non-empty markers.
OVERRIDE_2MARKERS = {
    "B_Other":       ["MS4A1", "CD74"],
    "CD4_Memory":    ["IL7R", "CCR7"],
    "ISG_Myeloid":   ["ISG15", "IFIT3"],
    "MonoDC_Other":  ["LYZ", "FCER1G"],
    "HSCs":          ["CD34", "KIT"],
    "Plasma":        ["MZB1", "JCHAIN"],
    "pDC":           ["IL3RA", "IRF7"],
}
for subtype, fallback_pair in OVERRIDE_2MARKERS.items():
    usable = [s for s in lvl2_to_symbols.get(subtype, []) if s]
    if len(usable) < 2:
        lvl2_to_symbols[subtype] = fallback_pair

# Display order of Level2 subtypes within each lineage group.
order_by_group = {
    "B":      ["B_Naive", "B_Memory", "B_Activated", "B_Atypical", "B_Other"],
    "Plasma": ["Plasma"],
    "pDC":    ["pDC"],
    "T":      [
        "CD4_Naive", "CD4_Memory", "CD8_Naive", "CD8_Effector_Cytotoxic",
        "Treg", "MAIT", "GammaDelta_T", "Proliferative_T", "Exhausted_T",
    ],
    "NK":     ["NK"],
    "Mono":   ["Classical_Mono", "NonClassical_Mono", "ISG_Myeloid", "MonoDC_Other"],
    "DC":     ["cDC1", "cDC2", "DC4", "aDC"],
    "HSCs":   ["HSCs"],
}

In [None]:
# Load object + build obs
# The h5ad is opened in backed (read-only) mode and deliberately left open on
# success, because the dotplot helper reads expression values from it later.
adata_b = sc.read_h5ad(OUT_FILTER, backed="r")

try:
    need_obs = ["patientID", "disease", "Level1_refined"]
    missing = [c for c in need_obs if c not in adata_b.obs.columns]
    if missing:
        raise KeyError(f"Faltan columnas en obs: {missing}. Cols disponibles: {adata_b.obs.columns.tolist()}")

    # Level2_final: use the stored column when present, otherwise derive it
    # from Level2 through the JSON map.
    if "Level2_final" in adata_b.obs.columns:
        lvl2_final = adata_b.obs["Level2_final"].copy()
    elif "Level2" in adata_b.obs.columns:
        # Map through object dtype rather than Series.replace: replace() on a
        # categorical column is deprecated in pandas >= 2.2. Labels that are
        # absent from the map are kept unchanged.
        lvl2_final = adata_b.obs["Level2"].astype(object).map(
            lambda label: level2_final_map.get(label, label)
        )
    else:
        raise KeyError("No encuentro ni 'Level2_final' ni 'Level2' en obs del h5ad.")

    obs = adata_b.obs[need_obs].copy()
    obs.index = obs.index.astype(str)
    obs["Level2_final"] = pd.Series(lvl2_final, index=adata_b.obs_names).astype("string").reindex(obs.index)

    # RBC-out: lineages to plot, excluding the red-blood-cell groups.
    lineages = sorted(set(obs["Level1_refined"].astype(str).unique()))
    lineages = [x for x in lineages if x not in {"RBC", "RBC_and_HSC"}]
    print("Lineages (Level1_refined, sin RBC/RBC_and_HSC):", lineages)

    # Load embeddings and align them with obs by index.
    emb = pd.read_csv(EMB_PATH, index_col=0)
    emb.index = emb.index.astype(str)

    for c in ["UMAP1_harmony", "UMAP2_harmony"]:
        if c not in emb.columns:
            raise KeyError(f"Falta columna '{c}' en {EMB_PATH}. Tiene: {emb.columns.tolist()}")

    # Duplicate ids would make emb.loc[common] longer than obs.loc[common] and
    # fail further down with an opaque length mismatch, so fail clearly here.
    if emb.index.has_duplicates:
        n_dup = int(emb.index.duplicated().sum())
        raise ValueError(
            f"{EMB_PATH} contiene {n_dup} índices duplicados; "
            "no se puede alinear de forma unívoca con obs del h5ad."
        )

    common = emb.index.intersection(obs.index)
    if len(common) == 0:
        raise ValueError(
            "No hay intersección entre índices de embeddings y obs del h5ad.\n"
            "Revisa que UMAP_Harmony_embeddings.csv use obs_names como índice."
        )

    # Cells without an embedding are dropped below, so report how many.
    n_without_emb = int((~obs.index.isin(common)).sum())
    if n_without_emb:
        print(f"[WARN] {n_without_emb} células del h5ad no tienen embedding y quedan fuera de los plots.")

    emb = emb.loc[common, ["UMAP1_harmony", "UMAP2_harmony"]].copy()
    obs = obs.loc[common].copy()

    # Annotate the embeddings with obs. Both frames are ordered by `common`,
    # so positional assignment is safe.
    df_plot = emb.copy()
    df_plot["disease"] = obs["disease"].astype(str).values
    df_plot["Level1_refined"] = obs["Level1_refined"].astype(str).values
    df_plot["Level2_final"] = obs["Level2_final"].fillna("Unlabeled").astype(str).values

    print("Células alineadas:", len(common))
except BaseException:
    # Release the backed file on any failure (including an interrupt), then
    # re-raise the original error unchanged.
    adata_b.file.close()
    raise

In [None]:
# Helpers
# Disable scanpy's autoshow setting; the plotting helpers in this cell pass
# show=False and write their figures to disk.
sc.settings.autoshow = False

def save_lineage_umap(df_sub: pd.DataFrame, out_png: Path, title: str):
    """Save a Harmony UMAP scatter of one lineage, coloured by ``Level2_final``.

    Args:
        df_sub: One row per cell. The columns read here are ``UMAP1_harmony``,
            ``UMAP2_harmony``, ``disease``, ``Level1_refined`` and
            ``Level2_final``.
        out_png: Destination image path.
        title: Plot title.
    """
    n_cells = df_sub.shape[0]
    coords = df_sub[["UMAP1_harmony", "UMAP2_harmony"]].to_numpy().astype(np.float32)
    annotations = df_sub[["disease", "Level1_refined", "Level2_final"]].copy()

    # Placeholder one-column zero matrix: only obs and the embedding are plotted.
    shell = ad.AnnData(X=np.zeros((n_cells, 1), dtype=np.float32), obs=annotations)
    shell.obsm["X_umap_harmony"] = coords

    plot_options = dict(
        basis="umap_harmony",
        color="Level2_final",
        show=False,
        frameon=False,
        title=title,
        legend_loc="right margin",
        legend_fontsize=6,
    )
    axes = sc.pl.embedding(shell, **plot_options)

    figure = axes.figure
    figure.savefig(out_png, dpi=300, bbox_inches="tight")
    # Close explicitly: this helper is called once per lineage in a loop.
    plt.close(figure)

def make_lineage_dotplot(cell_ids, lineage: str, present_l2, out_png: Path):
    """Save a dotplot of up to two markers per ``Level2_final`` subtype for one lineage.

    Reads the notebook-level ``order_by_group``, ``lvl2_to_symbols``,
    ``symbols_to_varnames``, ``adata_b`` (backed AnnData) and ``obs``.

    Args:
        cell_ids: Observation names of the lineage's cells.
        lineage: Lineage label, used as the key into ``order_by_group``.
        present_l2: ``Level2_final`` labels present in this lineage.
        out_png: Destination image path.

    Returns:
        None. When no marker resolves to a variable name, a warning is printed
        and nothing is written.
    """
    # Subtype order: curated order first, then any remaining labels alphabetically.
    preferred = order_by_group.get(lineage)
    if preferred is None:
        order = sorted(present_l2)
    else:
        curated = [name for name in preferred if name in present_l2]
        extras = sorted(name for name in present_l2 if name not in preferred)
        order = curated + extras

    # Genes: first two symbols per subtype, with empties dropped and duplicates
    # removed while keeping first-seen order.
    first_two = (
        symbol
        for name in order
        for symbol in lvl2_to_symbols.get(name, [])[:2]
    )
    gene_symbols = list(dict.fromkeys(symbol for symbol in first_two if symbol))

    gene_varnames = [
        var for var in symbols_to_varnames(adata_b, gene_symbols) if var is not None
    ]
    if not gene_varnames:
        print(f"[WARN] {lineage}: no se encontró ningún marcador en var_names.")
        return

    # Load only the required genes for only this lineage's cells.
    ad_mem = adata_b[cell_ids, gene_varnames].to_memory()

    # Attach Level2_final as an ordered categorical so groups follow `order`.
    labels = obs.loc[ad_mem.obs_names, "Level2_final"].fillna("Unlabeled").astype(str).values
    ad_mem.obs["Level2_final"] = pd.Categorical(labels, categories=order, ordered=True)

    # Use the "log1p_10k" layer when the object has it; otherwise pass layer=None.
    if "log1p_10k" in ad_mem.layers:
        use_layer = "log1p_10k"
    else:
        use_layer = None

    dotplot_options = dict(
        var_names=gene_varnames,
        groupby="Level2_final",
        layer=use_layer,
        use_raw=False,
        dendrogram=False,
        standard_scale="var",
        show=False,
        return_fig=True,
    )
    dp = sc.pl.dotplot(ad_mem, **dotplot_options)
    dp = dp.add_totals().style(dot_edge_color="black", dot_edge_lw=0.5)
    dp.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close("all")

In [None]:
# LOOP over lineages: UMAP + dotplot
# The backed h5ad must be released even when one lineage fails, so the whole
# loop runs under try/finally.
try:
    for lin in lineages:
        print("\n====", lin, "====")

        df_sub = df_plot.loc[df_plot["Level1_refined"].astype(str) == lin].copy()
        if df_sub.shape[0] == 0:
            print("[SKIP] sin células en embeddings")
            continue

        lin_safe = safe_name(lin)

        out_umap = ANNEX_DIR / f"Annex_UMAP_{lin_safe}_colored_by_Level2final.png"
        save_lineage_umap(df_sub, out_umap, title=f"{lin} — UMAP (Harmony) colored by Level2_final")
        print("Saved:", out_umap)

        mask_bool = (obs["Level1_refined"].astype(str) == lin)
        cell_ids = obs.index[mask_bool].tolist()

        present_l2 = sorted(set(obs.loc[mask_bool, "Level2_final"].fillna("Unlabeled").astype(str).unique()))
        out_dot = ANNEX_DIR / f"Annex_Dotplot_{lin_safe}_Level2final_2markers.png"

        # The dotplot helper can return without writing anything (no usable
        # markers). Compare the file's modification time before and after, so
        # "Saved" is only reported for a file written by this call and not
        # for a stale one left by a previous run.
        mtime_before = out_dot.stat().st_mtime_ns if out_dot.exists() else None
        make_lineage_dotplot(cell_ids, lin, present_l2, out_dot)
        if out_dot.exists() and out_dot.stat().st_mtime_ns != mtime_before:
            print("Saved:", out_dot)
        else:
            print("[SKIP] dotplot no generado:", out_dot)
finally:
    adata_b.file.close()

print("\n[OK] Annex generado en:", ANNEX_DIR)

In [None]:
# CHECK: is UMAP_Harmony_embeddings.csv aligned with OUT_FILTER.h5ad?
# These imports repeat the ones at the top of the notebook and are kept as-is.
import pandas as pd
import scanpy as sc

adata_b = sc.read_h5ad(OUT_FILTER, backed="r")
try:
    emb = pd.read_csv(EMB_PATH, index_col=0)

    need_cols = ["UMAP1_harmony", "UMAP2_harmony"]
    missing_cols = [c for c in need_cols if c not in emb.columns]
    print("\n[1] UMAP columns check:")
    print("  missing_cols:", missing_cols)
    if missing_cols:
        raise KeyError(f"Faltan columnas en EMB_PATH: {missing_cols}")

    idx_h5ad = adata_b.obs_names
    idx_csv = emb.index.astype(str)

    print("\n[2] Index sizes:")
    print("  n_h5ad:", len(idx_h5ad))
    print("  n_csv :", len(idx_csv))

    set_h5ad = set(map(str, idx_h5ad))
    set_csv  = set(map(str, idx_csv))

    # Repeated ids make a label-based reorder ambiguous, so report them.
    n_dup_h5ad = len(idx_h5ad) - len(set_h5ad)
    n_dup_csv = len(idx_csv) - len(set_csv)
    print("  duplicated ids in h5ad:", n_dup_h5ad)
    print("  duplicated ids in csv :", n_dup_csv)

    inter = set_h5ad & set_csv
    only_h5ad = set_h5ad - set_csv
    only_csv  = set_csv - set_h5ad

    # Guard the ratio so an empty object reports 0% instead of dividing by zero.
    frac_h5ad = len(inter) / len(set_h5ad) if set_h5ad else 0.0

    print("\n[3] Overlap:")
    print("  intersection:", len(inter), f"({frac_h5ad:.3%} of h5ad)")
    print("  only_in_h5ad:", len(only_h5ad))
    print("  only_in_csv :", len(only_csv))

    exact_same = (len(idx_h5ad) == len(idx_csv)) and (list(map(str, idx_h5ad)) == idx_csv.tolist())
    print("\n[4] Exact same index + same order?:", exact_same)

    # Reordering the CSV onto the h5ad order needs every h5ad id present in
    # the CSV and no repeated CSV ids.
    can_reorder = (len(only_h5ad) == 0) and (n_dup_csv == 0)
    print("[5] Can reorder CSV to h5ad order (h5ad subset of csv)?:", can_reorder)

    if not exact_same:
        if len(only_h5ad) > 0:
            print("\nEjemplos only_in_h5ad:", list(sorted(only_h5ad))[:5])
        if len(only_csv) > 0:
            print("Ejemplos only_in_csv :", list(sorted(only_csv))[:5])
finally:
    # Always release the backed file, whether the check passed or failed.
    adata_b.file.close()

print("\n[OK] Check terminado.")