### 1. Imports + rutas

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt

from src.paths import project_paths

# Log the versions of the core single-cell libraries for reproducibility.
print("Scanpy version:", sc.__version__)
print("AnnData version:", ad.__version__)

# Harmony is expected to be installed in the environment, not from the
# notebook itself; stop immediately with install instructions if it is absent.
try:
    import harmonypy as hm
except ImportError as missing_dep:
    install_hint = (
        "Falta 'harmonypy' en el entorno.\n"
        "Instálalo fuera del notebook y reintenta:\n"
        "  pip install harmonypy\n"
        "o\n"
        "  conda install -c conda-forge harmonypy"
    )
    raise ImportError(install_hint) from missing_dep

# Resolve the project layout starting from the current working directory.
# NOTE(review): assumes project_paths returns a mapping of pathlib.Path
# objects (they are joined with "/" and mkdir'd below) - confirm in src.paths.
P = project_paths(Path.cwd())
PROJECT_ROOT, CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[name]
    for name in (
        "PROJECT_ROOT",
        "CONFIG_DIR",
        "DATA_DIR",
        "RESULTS_DIR",
        "FIGURES_DIR",
    )
)

# Notebook-specific locations derived from the base directories.
PREP_RESULTS_DIR = RESULTS_DIR / "preprocess"
INT_RESULTS_DIR = RESULTS_DIR / "integration"
HARM_FIG_DIR = FIGURES_DIR / "harmony"

# Create every directory up front (base ones first, then the derived ones)
# so later cells can read and write without checking for existence.
for folder in (
    CONFIG_DIR,
    DATA_DIR,
    RESULTS_DIR,
    FIGURES_DIR,
    PREP_RESULTS_DIR,
    INT_RESULTS_DIR,
    HARM_FIG_DIR,
):
    folder.mkdir(parents=True, exist_ok=True)

print("\nPROJECT_ROOT:", PROJECT_ROOT)
print("PREP_RESULTS_DIR:", PREP_RESULTS_DIR)
print("INT_RESULTS_DIR:", INT_RESULTS_DIR)
print("HARM_FIG_DIR:", HARM_FIG_DIR)
print("Harmony importado correctamente.")

### 2. Leer config/config.yaml

In [None]:
def _strip_inline_comment(value: str) -> str:
    """Remove a trailing YAML comment from a raw scalar value.

    A ``#`` starts a comment only when it is the first character of the value
    or is preceded by whitespace. A ``#`` inside a quoted scalar, or glued to
    other text (e.g. a URL fragment), is kept.
    """
    value = value.strip()
    if value[:1] in ('"', "'"):
        # Quoted scalar: only drop what follows the closing quote, and only
        # when that remainder really is a comment.
        closing = value.find(value[0], 1)
        if closing != -1:
            rest = value[closing + 1:]
            if rest[:1].isspace() and rest.lstrip().startswith("#"):
                return value[: closing + 1]
        return value
    for idx, char in enumerate(value):
        if char == "#" and (idx == 0 or value[idx - 1] in " \t"):
            return value[:idx].rstrip()
    return value


def load_simple_yaml(path: Path) -> dict:
    """Parse a flat ``key: value`` YAML file into a dict of strings.

    Only the minimal subset used by the project config is supported: one
    scalar mapping per line. Blank lines, full-line comments and lines
    without a colon are skipped. Trailing inline comments are removed and
    surrounding quotes are stripped from the value. Values are always
    returned as strings; later duplicates of a key overwrite earlier ones.

    Args:
        path: Location of the YAML file (read as UTF-8).

    Returns:
        Mapping from key to its string value.
    """
    cfg = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or ":" not in line:
            continue
        key, value = line.split(":", 1)
        # Drop the comment before unquoting so `"a # b"  # note` keeps `a # b`.
        value = _strip_inline_comment(value)
        cfg[key.strip()] = value.strip().strip('"').strip("'")
    return cfg

def as_int(x: str) -> int:
    """Convert a numeric config string to ``int``.

    The value goes through ``float`` first so strings such as ``"50.0"`` or
    ``"1e2"`` are accepted; any fractional part is truncated toward zero.
    """
    as_float = float(x)
    return int(as_float)

def as_bool(x: str) -> bool:
    """Interpret a config value as a boolean.

    Returns ``True`` only when the value, after trimming whitespace and
    lowercasing, is one of ``true``, ``1``, ``yes`` or ``y``. Anything else
    is ``False``.
    """
    truthy_tokens = ("true", "1", "yes", "y")
    token = str(x).strip().lower()
    return token in truthy_tokens

# Location of the flat key/value configuration read by this notebook.
cfg_path = CONFIG_DIR / "config.yaml"
if not cfg_path.exists():
    raise FileNotFoundError(f"Falta {cfg_path}. Debe existir en el repo.")

CFG = load_simple_yaml(cfg_path)

# Fail early and name every missing mandatory key at once, instead of hitting
# a bare KeyError on the first missing lookup below.
_REQUIRED_KEYS = (
    "normalized_h5ad_filename",
    "harmony_h5ad_filename",
    "pca_n_comps",
    "harmony_batch_key",
)
_missing_keys = [key for key in _REQUIRED_KEYS if key not in CFG]
if _missing_keys:
    raise KeyError(
        f"Faltan claves obligatorias en {cfg_path}: {', '.join(_missing_keys)}"
    )

# File names of the input (normalized) and output (integrated) AnnData objects.
NORMALIZED_FILENAME = CFG["normalized_h5ad_filename"]
HARMONY_FILENAME    = CFG["harmony_h5ad_filename"]

# PCA settings; zero-centering is optional and defaults to off.
PCA_N_COMPS     = as_int(CFG["pca_n_comps"])
PCA_ZERO_CENTER = as_bool(CFG.get("pca_zero_center", "false"))

# Harmony settings: obs column holding the batch and the obsm key for the
# corrected embedding.
HARMONY_BATCH_KEY = CFG["harmony_batch_key"]
HARMONY_EMB_KEY   = CFG.get("harmony_emb_key", "X_pca_harmony")

# For plotting with scanpy: basis = embedding key without the "X_" prefix.
HARMONY_BASIS = HARMONY_EMB_KEY[2:] if HARMONY_EMB_KEY.startswith("X_") else HARMONY_EMB_KEY

# Bare expression so the notebook displays the parsed configuration.
CFG

### 3. Cargar input (salida del Notebook 03)

In [None]:
# Input produced by the previous notebook (normalization + HVG selection).
normalized_path = PREP_RESULTS_DIR / NORMALIZED_FILENAME

input_available = normalized_path.exists()
if not input_available:
    raise FileNotFoundError(
        f"No se encuentra el objeto normalizado en:\n{normalized_path}\n\n"
        "Ejecuta primero el Notebook 03 (normalización + HVG)."
    )

print("Leyendo input normalizado:", normalized_path)
adata = sc.read_h5ad(normalized_path)
print(adata)

# Both the HVG mask and the log-normalized layer are required downstream
# (PCA on HVGs over the 'log1p_10k' layer), so validate them right away.
has_hvg_mask = "highly_variable" in adata.var.columns
if not has_hvg_mask:
    raise ValueError("Falta adata.var['highly_variable']. Ejecuta Notebook 03.")

has_log_layer = "log1p_10k" in adata.layers
if not has_log_layer:
    raise ValueError("Falta adata.layers['log1p_10k']. Ejecuta Notebook 03.")

n_hvg = int(adata.var["highly_variable"].sum())
print("Nº HVG marcados:", n_hvg)

### 4. Preparar matriz para PCA (usa log1p_10k)

In [None]:
# Use the log-normalized matrix as the PCA input.
adata.X = adata.layers["log1p_10k"]

# Sanity check of the batch key (informational only; nothing is raised here).
batch_key_present = HARMONY_BATCH_KEY in adata.obs.columns
print("Batch key configurado para Harmony:", HARMONY_BATCH_KEY)
print("¿Existe en obs?:", batch_key_present)
if batch_key_present:
    print("Nº niveles:", adata.obs[HARMONY_BATCH_KEY].nunique())

### 5. PCA sobre HVG

In [None]:
print("\nEjecutando PCA...")
print("PCA_N_COMPS:", PCA_N_COMPS)
print("PCA_ZERO_CENTER:", PCA_ZERO_CENTER)

# use_highly_variable=True restricts PCA to the var['highly_variable'] mask.
# zero_center=False avoids densifying a sparse matrix (safer on large
# datasets); the solver is chosen to match the centering mode.
# NOTE(review): newer Scanpy releases deprecate `use_highly_variable` in
# favour of `mask_var` - confirm against the pinned Scanpy version.
sc.tl.pca(
    adata,
    n_comps=PCA_N_COMPS,
    use_highly_variable=True,
    svd_solver="arpack" if PCA_ZERO_CENTER else "randomized",
    zero_center=PCA_ZERO_CENTER,
)

print("PCA guardado en adata.obsm['X_pca']:", adata.obsm["X_pca"].shape)

# Plot every computed component: pca_variance_ratio defaults to n_pcs=30,
# which would silently truncate the scree plot when PCA_N_COMPS > 30.
variance_fig_path = HARM_FIG_DIR / "pca_variance_ratio.png"
sc.pl.pca_variance_ratio(adata, n_pcs=PCA_N_COMPS, log=True, show=False)
plt.savefig(variance_fig_path, bbox_inches="tight", dpi=200)
plt.close()
print("Figura guardada:", variance_fig_path)

### 6. Harmony sobre PCs

In [None]:
# Harmony operates on the first PCA_N_COMPS principal components.
X_pca = adata.obsm["X_pca"][:, :PCA_N_COMPS]

# Batch correction is only meaningful when the batch column exists and has
# at least two distinct levels.
run_harmony = bool(
    HARMONY_BATCH_KEY in adata.obs.columns
    and adata.obs[HARMONY_BATCH_KEY].nunique() > 1
)

if run_harmony:
    print(f"\nEjecutando Harmony con batch_key='{HARMONY_BATCH_KEY}' y {PCA_N_COMPS} PCs...")
    ho = hm.run_harmony(X_pca, adata.obs, HARMONY_BATCH_KEY)
    # Orient the corrected matrix as (cells x PCs) by checking against the
    # number of observations instead of transposing unconditionally.
    # NOTE(review): harmonypy has exposed Z_corr as (PCs x cells); some
    # releases appear to return (cells x PCs) - this handles both layouts.
    Z_corr = ho.Z_corr
    if Z_corr.shape[0] != adata.n_obs:
        Z_corr = Z_corr.T
    adata.obsm[HARMONY_EMB_KEY] = Z_corr
    print("Embedding Harmony guardado en:", HARMONY_EMB_KEY, "| shape:", Z_corr.shape)
else:
    # Keep the downstream interface stable: later steps can always read
    # HARMONY_EMB_KEY, even when no correction was applied.
    print(
        "\n[INFO] No se ejecuta Harmony (batch_key no existe o tiene 1 solo nivel). "
        "Se copia el PCA a la clave de Harmony para mantener interfaz."
    )
    adata.obsm[HARMONY_EMB_KEY] = X_pca.copy()
    print("Embedding (copia PCA) guardado en:", HARMONY_EMB_KEY, "| shape:", adata.obsm[HARMONY_EMB_KEY].shape)

### 7. Plot rápido del embedding

In [None]:
# Colour the embedding only by the annotation columns actually present in obs.
candidate_colors = ["gem_id", "disease"]
obs_columns = adata.obs.columns
colors = [name for name in candidate_colors if name in obs_columns]

if not colors:
    print("[INFO] No hay columnas gem_id/disease en obs para colorear; se omite figura.")
else:
    embedding_fig_path = HARM_FIG_DIR / "harmony_embedding.png"
    panels_per_row = min(2, len(colors))
    sc.pl.embedding(
        adata,
        basis=HARMONY_BASIS,  # scanpy looks up obsm['X_' + basis]
        color=colors,
        ncols=panels_per_row,
        show=False,
    )
    plt.savefig(embedding_fig_path, bbox_inches="tight", dpi=200)
    plt.close()
    print("Figura guardada:", embedding_fig_path)

### 8. Guardar salida en results/integration/

In [None]:
# Persist the AnnData object (PCA + integrated embedding) for later notebooks.
out_path = INT_RESULTS_DIR / HARMONY_FILENAME
adata.write_h5ad(out_path)

# Header and path go on separate lines, followed by the embedding key.
print("Objeto con PCA + Harmony guardado en:", out_path, sep="\n")
print("Embedding integrado en adata.obsm[...]:", HARMONY_EMB_KEY)