### 1. Imports + rutas

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt

from src.paths import project_paths

# Report the library versions in use so runs can be compared later.
print("Scanpy version:", sc.__version__)
print("AnnData version:", ad.__version__)

# project_paths() is indexed by name below; each entry is used as a Path-like directory.
P = project_paths(Path.cwd())
PROJECT_ROOT = P["PROJECT_ROOT"]
CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[name] for name in ("CONFIG_DIR", "DATA_DIR", "RESULTS_DIR", "FIGURES_DIR")
)

# Sub-directories owned by this notebook.
INT_RESULTS_DIR = RESULTS_DIR / "integration"
UMAP_FIG_DIR = FIGURES_DIR / "umap_global"

# Create base directories first, then the sub-directories (idempotent).
for folder in (
    CONFIG_DIR,
    DATA_DIR,
    RESULTS_DIR,
    FIGURES_DIR,
    INT_RESULTS_DIR,
    UMAP_FIG_DIR,
):
    folder.mkdir(parents=True, exist_ok=True)

print("\nPROJECT_ROOT:", PROJECT_ROOT)
print("INT_RESULTS_DIR:", INT_RESULTS_DIR)
print("UMAP_FIG_DIR:", UMAP_FIG_DIR)

### 2. Leer config/config.yaml

In [None]:
def _drop_inline_comment(value: str) -> str:
    """Return *value* with any trailing ``# comment`` removed.

    A ``#`` only starts a comment when it opens the value or follows
    whitespace, so fragments such as ``a#b`` are left alone. When the value
    opens with a quote, only text after the matching closing quote is
    considered, so a ``#`` inside the quotes is preserved.
    """
    if value[:1] in ('"', "'"):
        closing = value.find(value[0], 1)
        if closing != -1:
            tail = value[closing + 1:]
            if tail[:1].isspace() and tail.lstrip().startswith("#"):
                return value[:closing + 1]
        # Unterminated or unusual quoting: leave untouched for the caller.
        return value
    if value.startswith("#"):
        return ""
    hash_at = value.find("#")
    while hash_at != -1:
        # hash_at is never 0 here (handled above), so hash_at - 1 is valid.
        if value[hash_at - 1].isspace():
            return value[:hash_at].rstrip()
        hash_at = value.find("#", hash_at + 1)
    return value


def load_simple_yaml(path: Path) -> dict:
    """Parse a flat ``key: value`` YAML file into a dict of strings.

    Only single-line scalar entries are supported. Blank lines, full-line
    comments and lines without a colon are skipped. Each line is split on
    its first colon; trailing inline comments are dropped and surrounding
    quote characters are stripped from the value. Later duplicates of a key
    overwrite earlier ones.

    Args:
        path: Location of the YAML file, read as UTF-8.

    Returns:
        Mapping from key to raw string value (no type conversion is done).
    """
    cfg = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, sep, value = line.partition(":")
        if not sep:
            continue
        # Remove "# ..." tails first so numeric values stay parseable.
        value = _drop_inline_comment(value.strip())
        cfg[key.strip()] = value.strip('"').strip("'")
    return cfg

def as_int(x: str) -> int:
    """Convert a numeric string to ``int``, going through ``float`` first.

    The float step lets inputs such as ``"15.0"`` or ``"1e3"`` through; any
    fractional part is truncated toward zero.
    """
    as_number = float(x)
    return int(as_number)

def as_float(x: str) -> float:
    """Convert a numeric string to ``float``."""
    parsed = float(x)
    return parsed

cfg_path = CONFIG_DIR / "config.yaml"
config_present = cfg_path.exists()
if not config_present:
    raise FileNotFoundError(f"Falta {cfg_path}. Debe existir en el repo.")

CFG = load_simple_yaml(cfg_path)

# Required entries: a missing key raises KeyError here.
HARMONY_FILENAME = CFG["harmony_h5ad_filename"]
OUT_FILENAME = CFG["neighbors_umap_h5ad_filename"]

# Optional entries fall back to the defaults given below.
HARMONY_EMB_KEY = CFG.get("harmony_emb_key", "X_pca_harmony")

# Neighbor graph settings.
NEIGHBORS_KEY = CFG.get("neighbors_key", "harmony")
N_NEIGHBORS = as_int(CFG.get("neighbors_n_neighbors", "15"))

# UMAP settings.
UMAP_KEY = CFG.get("umap_key", "X_umap_harmony")
UMAP_MIN_DIST = as_float(CFG.get("umap_min_dist", "0.3"))
UMAP_SPREAD = as_float(CFG.get("umap_spread", "1.0"))
UMAP_RANDOM_STATE = as_int(CFG.get("umap_random_state", "0"))

# Leiden resolutions arrive as one comma-separated string; empty pieces are ignored.
LEIDEN_RESOLUTIONS_STR = CFG.get("leiden_resolutions", "0.2,0.5,0.8")
LEIDEN_RESOLUTIONS = [
    as_float(token)
    for token in (piece.strip() for piece in LEIDEN_RESOLUTIONS_STR.split(","))
    if token
]

# Resolution and obs column used for the main (Level 1) clustering.
LEIDEN_L1_RES = as_float(CFG.get("leiden_l1_resolution", "0.5"))
LEIDEN_L1_KEY = CFG.get("leiden_l1_key", "leiden_L1")

# For scanpy.pl.embedding: basis is the obsm key without its "X_" prefix.
if UMAP_KEY.startswith("X_"):
    UMAP_BASIS = UMAP_KEY[2:]
else:
    UMAP_BASIS = UMAP_KEY

# Bare expression so the notebook displays the parsed configuration.
CFG

### 3. Cargar input (salida del Notebook 04)

In [None]:
# Input is the Harmony-integrated object written under results/integration/.
in_path = INT_RESULTS_DIR / HARMONY_FILENAME

input_found = in_path.exists()
if not input_found:
    raise FileNotFoundError(
        f"No se encuentra el objeto con Harmony en:\n{in_path}\n\n"
        "Ejecuta primero el Notebook 04."
    )

print("Leyendo input:", in_path)
adata = sc.read_h5ad(in_path)
print(adata)

# The integrated embedding must be present; everything downstream is built on it.
if HARMONY_EMB_KEY not in adata.obsm:
    raise KeyError(
        f"Falta el embedding integrado en adata.obsm['{HARMONY_EMB_KEY}'].\n"
        "Revisa el Notebook 04."
    )

harmony_embedding = adata.obsm[HARMONY_EMB_KEY]
print("Embedding Harmony:", HARMONY_EMB_KEY, "| shape:", harmony_embedding.shape)

### 4. Vecinos kNN sobre Harmony

In [None]:
# Arguments for the neighbor graph: built on the Harmony embedding and
# stored under NEIGHBORS_KEY so later steps can refer to it by name.
neighbors_kwargs = {
    "n_neighbors": N_NEIGHBORS,
    "use_rep": HARMONY_EMB_KEY,
    "key_added": NEIGHBORS_KEY,
}

print("\nCalculando vecinos...")
print("neighbors_key:", neighbors_kwargs["key_added"])
print("n_neighbors:", neighbors_kwargs["n_neighbors"])
print("use_rep:", neighbors_kwargs["use_rep"])

sc.pp.neighbors(adata, **neighbors_kwargs)

print("Vecinos calculados y guardados en uns['...'] =", NEIGHBORS_KEY)

### 5. UMAP integrado (y copiar a X_umap_harmony)

In [None]:
# UMAP parameters taken from the config, using the neighbor graph stored
# under NEIGHBORS_KEY.
umap_kwargs = {
    "neighbors_key": NEIGHBORS_KEY,
    "min_dist": UMAP_MIN_DIST,
    "spread": UMAP_SPREAD,
    "random_state": UMAP_RANDOM_STATE,
}

print("\nCalculando UMAP...")
print("UMAP min_dist:", umap_kwargs["min_dist"])
print("UMAP spread:", umap_kwargs["spread"])
print("UMAP random_state:", umap_kwargs["random_state"])
print("neighbors_key usado:", umap_kwargs["neighbors_key"])

# Scanpy writes the coordinates to obsm["X_umap"].
sc.tl.umap(adata, **umap_kwargs)

# Keep an independent copy under the configured key used by later steps.
umap_coords = adata.obsm["X_umap"].copy()
adata.obsm[UMAP_KEY] = umap_coords

print("UMAP guardado en obsm['X_umap'] y copiado a obsm['%s']" % UMAP_KEY)
print("Forma UMAP:", adata.obsm[UMAP_KEY].shape)

### 6. Figuras UMAP

In [None]:
# Candidate obs columns to color by; only those present in this dataset are plotted.
candidate_cols = ("gem_id", "disease", "total_counts", "n_genes_by_counts", "pct_counts_mt")
color_vars = [name for name in candidate_cols if name in adata.obs.columns]

print("Variables para colorear UMAP:", color_vars)

if not color_vars:
    print("[INFO] No se encontraron columnas para colorear; se omite figura.")
else:
    overview_png = UMAP_FIG_DIR / "umap_colored_overview.png"
    # show=False keeps the figure open so it can be saved and closed explicitly.
    sc.pl.embedding(
        adata,
        basis=UMAP_BASIS,
        color=color_vars,
        ncols=3,
        frameon=False,
        show=False,
    )
    plt.savefig(overview_png, bbox_inches="tight", dpi=200)
    plt.close()
    print("Figura guardada:", overview_png)

### 7. Leiden (varias resoluciones) + definir leiden_L1

In [None]:
print("\nCalculando Leiden en resoluciones:", LEIDEN_RESOLUTIONS)


def _leiden_obs_key(resolution: float) -> str:
    """Return the obs column name for a Leiden run at *resolution*.

    The decimal point is replaced by an underscore, e.g. 0.5 -> "leiden_r0_5".
    """
    return f"leiden_r{str(resolution).replace('.', '_')}"


# Validate the Level-1 choice up front so a config mismatch fails before
# any clustering is computed. A column already present in obs is accepted.
target_key = _leiden_obs_key(LEIDEN_L1_RES)
planned_keys = [_leiden_obs_key(res) for res in LEIDEN_RESOLUTIONS]
if target_key not in planned_keys and target_key not in adata.obs.columns:
    raise KeyError(
        f"No existe {target_key} en obs. Revisa LEIDEN_RESOLUTIONS / LEIDEN_L1_RES en config."
    )

# One Leiden run per configured resolution, all on the same neighbor graph.
leiden_keys = []
for res, key in zip(LEIDEN_RESOLUTIONS, planned_keys):
    sc.tl.leiden(
        adata,
        resolution=res,
        key_added=key,
        neighbors_key=NEIGHBORS_KEY,
    )
    leiden_keys.append(key)

print("Clustering Leiden calculados:", leiden_keys)

# Main clustering for Level 1: stored under the configured obs column.
adata.obs[LEIDEN_L1_KEY] = adata.obs[target_key].astype("category")
print(f"Clustering principal guardado en obs['{LEIDEN_L1_KEY}'] (desde {target_key}).")

leiden_png = UMAP_FIG_DIR / "umap_leiden_L1.png"
# show=False keeps the figure open so it can be saved and closed explicitly.
sc.pl.embedding(
    adata,
    basis=UMAP_BASIS,
    color=[LEIDEN_L1_KEY],
    frameon=False,
    show=False,
)
plt.savefig(leiden_png, bbox_inches="tight", dpi=200)
plt.close()
print("Figura guardada:", leiden_png)

### 8. Guardar salida en results/integration/

In [None]:
# Write the annotated object next to the input, under results/integration/.
out_path = INT_RESULTS_DIR / OUT_FILENAME
adata.write_h5ad(out_path)

# Summarize where the object went and which keys this notebook added.
stored_keys = (NEIGHBORS_KEY, UMAP_KEY, LEIDEN_L1_KEY)
summary_line = "Incluye: vecinos ('%s'), UMAP ('%s'), Leiden L1 ('%s')" % stored_keys

print("\nObjeto guardado en:")
print(out_path)
print(summary_line)