In [None]:
# ============================================================
# ANNEX — UMAPs por linaje + dotplots de subanotación (Level2_final)
# Reusa: OUT_FILTER + Level2_final_map.json + markers.py + overrides
#
# OUTPUTS:
#   figures_final/annex_lineage_umap_dotplots/
#     - Annex_UMAP_<linaje>_colored_by_Level2final.png
#     - Annex_Dotplot_<linaje>_Level2final_2markers.png
# ============================================================

from pathlib import Path
import sys, json
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt

NOTEBOOK_DIR = Path.cwd()


def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upwards from ``start``.

    The root is the first directory (``start`` itself, then each of its
    parents, nearest first) that contains an entry named ``data_processed``.

    Args:
        start: directory from which the upward search begins.

    Returns:
        The nearest directory that contains ``data_processed``.

    Raises:
        FileNotFoundError: if none of the visited directories contains it.
    """
    candidates = (start, *start.parents)
    root = next((d for d in candidates if (d / "data_processed").exists()), None)
    if root is None:
        raise FileNotFoundError(f"No encuentro 'data_processed' subiendo desde: {start}")
    return root

# -----------------------------
# Paths
# -----------------------------
PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)

# Inputs: the filtered AnnData file and two files from the summary-tables folder.
DATA_PROCESSED = PROJECT_ROOT / "data_processed"
OUT_FILTER = DATA_PROCESSED / "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad"
OUT_SUM = PROJECT_ROOT / "summary_tables_final"
MAP_PATH = OUT_SUM / "Level2_final_map.json"
EMB_PATH = OUT_SUM / "UMAP_Harmony_embeddings.csv"

# Outputs: annex figures get their own sub-folder, created up front.
FIG_DIR = PROJECT_ROOT / "figures_final"
ANNEX_DIR = FIG_DIR / "annex_lineage_umap_dotplots"
ANNEX_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the notebook output documents what was used.
for _label, _path in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("OUT_FILTER  :", OUT_FILTER),
    ("MAP_PATH    :", MAP_PATH),
    ("EMB_PATH    :", EMB_PATH),
    ("ANNEX_DIR   :", ANNEX_DIR),
):
    print(_label, _path)

# Fail fast (checked in this order) when a required input file is missing.
for _required in (OUT_FILTER, MAP_PATH, EMB_PATH):
    if not _required.exists():
        raise FileNotFoundError(_required)

# JSON mapping that is later applied to the Level2 labels to obtain Level2_final.
level2_map = json.loads(MAP_PATH.read_text(encoding="utf-8"))

# -----------------------------
# Import markers.py
# -----------------------------
# The marker module lives in a package under the project root, so the root
# has to be importable before the import below can resolve.
_root_str = str(PROJECT_ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)

from AI_Package.markers.markers import geneMarkers_level2, symbols_to_varnames

# Flatten the nested marker dictionary into {Level2 label: [gene symbols]},
# skipping entries whose gene collection is empty. If a Level2 label occurs
# under several top-level keys, the last one iterated wins.
lvl2_to_symbols = {
    subtype: list(symbols)
    for per_lineage in geneMarkers_level2.values()
    for subtype, symbols in per_lineage.items()
    if symbols
}

# Fallback marker pairs (values copied verbatim from the final notebook).
# They are applied only when markers.py yields fewer than two usable symbols
# for the label, including the case where the label is absent altogether.
OVERRIDE_2MARKERS = {
    "B_Other": ["MS4A1", "CD74"],
    "CD4_Memory": ["IL7R", "CCR7"],
    "ISG_Myeloid": ["ISG15", "IFIT3"],
    "MonoDC_Other": ["LYZ", "FCER1G"],
    "HSCs": ["CD34", "KIT"],
    "Plasma": ["MZB1", "JCHAIN"],
    "pDC": ["IL3RA", "IRF7"],
}
for subtype, fallback in OVERRIDE_2MARKERS.items():
    usable = [g for g in lvl2_to_symbols.get(subtype, []) if g]
    if len(usable) < 2:
        lvl2_to_symbols[subtype] = fallback

# Final display order of the Level2 labels per lineage (consistent with Fig1C).
order_by_group = {
    "B": ["B_Naive", "B_Memory", "B_Activated", "B_Atypical", "B_Other"],
    "Plasma": ["Plasma"],
    "pDC": ["pDC"],
    "T": [
        "CD4_Naive",
        "CD4_Memory",
        "CD8_Naive",
        "CD8_Effector_Cytotoxic",
        "Treg",
        "MAIT",
        "GammaDelta_T",
        "Proliferative_T",
        "Exhausted_T",
    ],
    "NK": ["NK"],
    "Mono": ["Classical_Mono", "NonClassical_Mono", "ISG_Myeloid", "MonoDC_Other"],
    "DC": ["cDC1", "cDC2", "DC4", "aDC"],
    "HSCs": ["HSCs"],
}

# -----------------------------
# Load object in backed mode
# -----------------------------
# backed="r" keeps the expression matrix on disk; only obs is copied below.
adata_b = sc.read_h5ad(OUT_FILTER, backed="r")

need = ["patientID", "disease", "Level1_refined", "Level2"]
_obs_columns = set(adata_b.obs.columns)
missing = [col for col in need if col not in _obs_columns]
if missing:
    # Release the file handle before aborting.
    adata_b.file.close()
    raise KeyError(f"Faltan columnas en obs: {missing}")

# Working copy of the metadata, indexed by cell name as plain strings.
obs = adata_b.obs.loc[:, need].copy()
obs.index = obs.index.astype(str)

# Level2_final = Level2 with the JSON mapping applied; labels that are not
# keys of the mapping are kept unchanged.
_level2_as_str = obs["Level2"].astype(str)
obs["Level2_final"] = _level2_as_str.replace(level2_map).astype(str)

# The annex covers every Level1_refined lineage except RBC.
_level1_labels = obs["Level1_refined"].astype(str).unique()
lineages = [name for name in sorted(set(_level1_labels)) if name != "RBC"]
print("\nLineages (Level1_refined, sin RBC):", lineages)

# -----------------------------
# Load embeddings (CSV)
# -----------------------------
emb = pd.read_csv(EMB_PATH, index_col=0)
emb.index = emb.index.astype(str)

# Minimum set of columns expected in the CSV; the first absent one is reported.
_required_cols = ("UMAP1_harmony", "UMAP2_harmony", "Level1_refined", "Level2_final")
_absent = [name for name in _required_cols if name not in emb.columns]
if _absent:
    c = _absent[0]
    adata_b.file.close()
    raise KeyError(f"Falta columna '{c}' en {EMB_PATH}. Tiene: {emb.columns.tolist()}")

# Align emb <-> obs on the cell index: keep only cells present in both tables.
common = emb.index.intersection(obs.index)
if common.empty:
    adata_b.file.close()
    raise ValueError(
        "No hay intersección entre índices de embeddings y obs del h5ad.\n"
        "Revisa que UMAP_Harmony_embeddings.csv use obs_names como índice."
    )

# Both tables now share the same rows, selected with the same index.
emb, obs = emb.loc[common].copy(), obs.loc[common].copy()
print("\nCélulas alineadas (common):", len(common))

# ============================================================
# Helpers
# ============================================================
sc.settings.autoshow = False


def save_lineage_umap(df_sub: pd.DataFrame, out_png: Path, title: str):
    """Plot one lineage's Harmony UMAP coloured by Level2_final and save it.

    Args:
        df_sub: per-cell table for a single lineage. It must provide the
            columns ``disease``, ``Level1_refined``, ``Level2_final``,
            ``UMAP1_harmony`` and ``UMAP2_harmony``.
        out_png: destination path of the image (written at 300 dpi).
        title: title drawn on the plot.
    """
    n_cells = len(df_sub)
    meta = df_sub[["disease", "Level1_refined", "Level2_final"]].copy()

    # Throw-away AnnData used only for plotting: X is a single dummy column of
    # zeros, the real content is the metadata and the 2-D embedding.
    holder = ad.AnnData(X=np.zeros((n_cells, 1), dtype=np.float32), obs=meta)
    coords = df_sub[["UMAP1_harmony", "UMAP2_harmony"]].to_numpy()
    holder.obsm["X_umap_harmony"] = coords.astype(np.float32)

    plot_options = {
        "basis": "umap_harmony",
        "color": "Level2_final",
        "show": False,
        "frameon": False,
        "title": title,
        "legend_loc": "right margin",
        "legend_fontsize": 6,
    }
    axes = sc.pl.embedding(holder, **plot_options)

    # Save through the owning figure, then close it.
    figure = axes.figure
    figure.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close(figure)

def make_lineage_dotplot(cell_ids, lineage: str, present_l2, out_png: Path):
    """Save a dotplot of up to two marker genes per Level2_final subtype of one lineage.

    Reads the module-level ``order_by_group``, ``lvl2_to_symbols``, ``adata_b``
    (backed AnnData) and ``obs`` (aligned metadata table).

    Args:
        cell_ids: obs_names of the cells belonging to the lineage.
        lineage: lineage label, used to look up the curated subtype order and
            in the warning message.
        present_l2: Level2_final labels present in this lineage.
        out_png: destination path of the image (written at 300 dpi).

    Returns:
        None. If none of the marker symbols maps to a var_name, a warning is
        printed and nothing is written.
    """
    # Subtype order: curated order first, then the remaining labels
    # alphabetically. Lineages without a curated order are fully alphabetical.
    curated = order_by_group.get(lineage)
    if curated is None:
        order = sorted(present_l2)
    else:
        listed = [name for name in curated if name in present_l2]
        unlisted = sorted(name for name in present_l2 if name not in curated)
        order = listed + unlisted

    # First two symbols per subtype; drop empty entries and de-duplicate while
    # keeping first-seen order.
    top_two = (sym for name in order for sym in lvl2_to_symbols.get(name, [])[:2])
    gene_symbols = list(dict.fromkeys(sym for sym in top_two if sym))

    # Translate symbols to var_names with the project helper; None entries are
    # discarded (presumably symbols the helper could not resolve).
    resolved = symbols_to_varnames(adata_b, gene_symbols)
    gene_varnames = [name for name in resolved if name is not None]

    if not gene_varnames:
        print(f"[WARN] {lineage}: no se encontró ningún marcador en var_names.")
        return

    # Pull into memory only the requested cells (by obs_name) and genes.
    ad_mem = adata_b[cell_ids, gene_varnames].to_memory()

    # Attach Level2_final as an ordered categorical so groups plot in `order`.
    labels = obs.loc[ad_mem.obs_names, "Level2_final"].astype(str).values
    ad_mem.obs["Level2_final"] = pd.Categorical(labels, categories=order, ordered=True)

    # Prefer the "log1p_10k" layer when the object has it; otherwise use X.
    use_layer = None
    if "log1p_10k" in ad_mem.layers:
        use_layer = "log1p_10k"

    dp = sc.pl.dotplot(
        ad_mem,
        var_names=gene_varnames,
        groupby="Level2_final",
        layer=use_layer,
        use_raw=False,
        dendrogram=False,
        standard_scale="var",
        show=False,
        return_fig=True,
    )
    styled = dp.add_totals().style(dot_edge_color="black", dot_edge_lw=0.5)
    styled.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close("all")

# ============================================================
# LOOP over lineages: UMAP + dotplot
# ============================================================
# The try/finally guarantees that the backed .h5ad handle is released even if
# a plot fails part-way through; without it an exception would leave the file
# open.
try:
    for lin in lineages:
        print("\n====", lin, "====")

        # Subset by lineage using the embeddings table (already aligned with obs).
        df_sub = emb.loc[emb["Level1_refined"].astype(str) == lin].copy()
        if df_sub.shape[0] == 0:
            print("[SKIP] sin células en embeddings")
            continue

        # ---- UMAP for this lineage ----
        out_umap = ANNEX_DIR / f"Annex_UMAP_{lin}_colored_by_Level2final.png"
        save_lineage_umap(df_sub, out_umap, title=f"{lin} — UMAP (Harmony) colored by Level2_final")
        print("Saved:", out_umap)

        # ---- sub-annotation dotplot for this lineage ----
        # KEY POINT: cell_ids is a list of obs_names (NOT a boolean mask).
        mask_bool = obs["Level1_refined"].astype(str) == lin
        cell_ids = obs.index[mask_bool].tolist()
        if not cell_ids:
            # obs and the embeddings CSV each carry their own Level1_refined
            # column, so a lineage can have cells in one and none in the other.
            print("[SKIP] sin células en obs para el dotplot")
            continue

        present_l2 = sorted(obs.loc[mask_bool, "Level2_final"].astype(str).unique())
        out_dot = ANNEX_DIR / f"Annex_Dotplot_{lin}_Level2final_2markers.png"
        make_lineage_dotplot(cell_ids, lin, present_l2, out_dot)
        # make_lineage_dotplot returns without writing when no marker gene is
        # found in var_names, so only report the file if it is actually there.
        if out_dot.exists():
            print("Saved:", out_dot)
finally:
    adata_b.file.close()

print("\n[OK] Annex generado en:", ANNEX_DIR)


In [None]:
# ============================================================
# CHECK: ¿UMAP_Harmony_embeddings.csv está alineado con OUT_FILTER.h5ad?
# - Verifica columnas UMAP1/UMAP2
# - Verifica que el índice (obs_names) coincide con el .h5ad
# - Verifica solape y orden (si es 100% igual, perfecto)
# ============================================================

from pathlib import Path
import pandas as pd
import scanpy as sc

NOTEBOOK_DIR = Path.cwd()


def find_project_root(start: Path) -> Path:
    """Return the closest ancestor of ``start`` (or ``start`` itself) holding ``data_processed``.

    Directories are examined from ``start`` upwards, nearest first.

    Raises:
        FileNotFoundError: if no examined directory contains ``data_processed``.
    """
    search_path = (start, *start.parents)
    hits = (folder for folder in search_path if (folder / "data_processed").exists())
    found = next(hits, None)
    if found is None:
        raise FileNotFoundError(f"No encuentro 'data_processed' subiendo desde: {start}")
    return found

PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
DATA_PROCESSED = PROJECT_ROOT / "data_processed"
OUT_SUM = PROJECT_ROOT / "summary_tables_final"

OUT_FILTER = DATA_PROCESSED / "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad"
EMB_PATH   = OUT_SUM / "UMAP_Harmony_embeddings.csv"

print("OUT_FILTER:", OUT_FILTER)
print("EMB_PATH  :", EMB_PATH)

# --- load ---
# The .h5ad is opened in backed (read-only) mode. Everything that can fail
# afterwards runs inside try/finally so the file handle is always released,
# including when the column check below raises.
adata_b = sc.read_h5ad(OUT_FILTER, backed="r")
try:
    emb = pd.read_csv(EMB_PATH, index_col=0)

    # --- [1] Harmony UMAP columns must be present in the CSV ---
    need_cols = ["UMAP1_harmony", "UMAP2_harmony"]
    missing_cols = [c for c in need_cols if c not in emb.columns]
    print("\n[1] UMAP columns check:")
    print("  missing_cols:", missing_cols)
    if missing_cols:
        raise KeyError(f"Faltan columnas en EMB_PATH: {missing_cols}")

    # --- [2] index sizes (cell names compared as strings on both sides) ---
    idx_h5ad = adata_b.obs_names
    idx_csv = emb.index.astype(str)
    names_h5ad = [str(name) for name in idx_h5ad]
    names_csv = idx_csv.tolist()

    print("\n[2] Index sizes:")
    print("  n_h5ad:", len(names_h5ad))
    print("  n_csv :", len(names_csv))

    # --- [3] overlap between the two sets of cell names ---
    set_h5ad = set(names_h5ad)
    set_csv = set(names_csv)

    inter = set_h5ad & set_csv
    only_h5ad = set_h5ad - set_csv
    only_csv = set_csv - set_h5ad
    # Repeated labels in the CSV index make any reordering ambiguous.
    n_dup_csv = len(names_csv) - len(set_csv)

    # Guard the percentage against an h5ad without cells.
    frac_h5ad = len(inter) / len(set_h5ad) if set_h5ad else 0.0

    print("\n[3] Overlap:")
    print("  intersection:", len(inter), f"({frac_h5ad:.3%} of h5ad)")
    print("  only_in_h5ad:", len(only_h5ad))
    print("  only_in_csv :", len(only_csv))
    print("  duplicated_in_csv:", n_dup_csv)

    # --- [4] identical labels in identical order? ---
    exact_same = names_h5ad == names_csv
    print("\n[4] Exact same index + same order?:", exact_same)

    # --- [5] if not exact, can the CSV at least be reordered to the h5ad order?
    # Requires every h5ad cell to be present in the CSV and the CSV index to
    # be free of duplicates (otherwise a label selects more than one row).
    can_reorder = (not only_h5ad) and bool(idx_csv.is_unique)
    print("[5] Can reorder CSV to h5ad order (h5ad subset of csv)?:", can_reorder)

    # Show a few examples when there is a mismatch.
    if not exact_same:
        if only_h5ad:
            print("\nEjemplos only_in_h5ad:", sorted(only_h5ad)[:5])
        if only_csv:
            print("Ejemplos only_in_csv :", sorted(only_csv)[:5])
finally:
    adata_b.file.close()

print("\n[OK] Check terminado.")
