### 1. Imports + rutas del repo

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

from scipy.sparse import issparse

from src.paths import project_paths

# Print the installed versions of the two core libraries.
print("Scanpy version:", sc.__version__)
print("AnnData version:", ad.__version__)

# project_paths is a project helper; it is given the current working directory
# and returns a mapping keyed by the names unpacked below.
# NOTE(review): presumably it locates the repo root from cwd — confirm in src/paths.
P = project_paths(Path.cwd())
PROJECT_ROOT, CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[name]
    for name in ("PROJECT_ROOT", "CONFIG_DIR", "DATA_DIR", "RESULTS_DIR", "FIGURES_DIR")
)

# QC-specific output folders live under RESULTS_DIR and FIGURES_DIR.
QC_RESULTS_DIR = RESULTS_DIR / "qc"
QC_FIG_DIR = FIGURES_DIR / "qc"

# Create every directory used by this notebook; existing ones are left untouched.
for d in (CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR, QC_RESULTS_DIR, QC_FIG_DIR):
    d.mkdir(parents=True, exist_ok=True)

print("\nPROJECT_ROOT:", PROJECT_ROOT)
print("DATA_DIR:", DATA_DIR)
print("QC_RESULTS_DIR:", QC_RESULTS_DIR)
print("QC_FIG_DIR:", QC_FIG_DIR)

### 2. Leer config/config.yaml + casteo numérico

In [None]:
def _clean_yaml_scalar(raw: str) -> str:
    """Return the scalar text of a ``key: value`` right-hand side.

    Handles the two cases the flat config format needs:

    * a value wrapped in matching single or double quotes, optionally followed
      by an inline ``# comment`` -> the text between the quotes (a ``#`` inside
      the quotes is kept);
    * an unquoted value -> everything before the first ``#`` that starts the
      value or is preceded by whitespace, with surrounding whitespace and
      stray quote characters stripped.
    """
    value = raw.strip()

    # Quoted scalar: accept it only when nothing but an optional comment
    # follows the closing quote; otherwise fall through to the lenient path.
    if value[:1] in ('"', "'"):
        closing = value.find(value[0], 1)
        if closing != -1:
            tail = value[closing + 1:].strip()
            if not tail or tail.startswith("#"):
                return value[1:closing]

    # Unquoted scalar: a '#' only opens a comment at the start of the value or
    # after whitespace, so fragments such as "http://host/x#frag" survive.
    for pos, char in enumerate(value):
        if char == "#" and (pos == 0 or value[pos - 1].isspace()):
            value = value[:pos]
            break

    return value.strip().strip('"').strip("'")


def load_simple_yaml(path: Path) -> dict:
    """Parse a flat ``key: value`` YAML-like file into a dict of strings.

    Only top-level scalar mappings are understood. Blank lines, full-line
    comments (starting with ``#``) and lines without a colon are skipped.
    Each line is split on its first colon; the key is whitespace-stripped and
    the value is cleaned of surrounding quotes and trailing inline comments.
    Values are always returned as ``str``; callers cast them as needed.

    Args:
        path: Location of the config file, read as UTF-8.

    Returns:
        Mapping of key to string value. A later duplicate key overwrites an
        earlier one.
    """
    cfg = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if ":" not in line:
            continue
        key, value = line.split(":", 1)
        cfg[key.strip()] = _clean_yaml_scalar(value)
    return cfg

def as_int(x: str) -> int:
    """Convert a numeric string to ``int``, going through ``float`` first.

    The float round-trip lets values such as ``"1e3"`` or ``"200.0"`` parse;
    any fractional part is truncated toward zero by ``int``.
    """
    as_number = float(x)
    return int(as_number)

def as_float(x: str) -> float:
    """Convert a numeric string to ``float``; raises ``ValueError`` if it cannot be parsed."""
    parsed = float(x)
    return parsed

# The config file must exist in the repo; fail early with a clear message.
cfg_path = CONFIG_DIR / "config.yaml"
if not cfg_path.exists():
    raise FileNotFoundError(f"Falta {cfg_path}. Debe existir en el repo.")

# All values come back as strings and are cast explicitly below.
CFG = load_simple_yaml(cfg_path)

# File names: raw input and post-QC output (both required keys).
RAW_H5AD_FILENAME, AFTER_QC_FILENAME = (
    CFG["raw_h5ad_filename"],
    CFG["after_qc_h5ad_filename"],
)
# Optional key; falls back to "r" when absent.
BACKED_MODE = CFG.get("backed_mode", "r")

# QC thresholds: the four gene/count bounds are integers, the mito bound is a float.
MIN_GENES, MAX_GENES, MIN_COUNTS, MAX_COUNTS = (
    as_int(CFG[key])
    for key in ("qc_min_genes", "qc_max_genes", "qc_min_counts", "qc_max_counts")
)
MAX_PCT_MT = as_float(CFG["qc_max_pct_mt"])

# Bare expression so the notebook displays the parsed config as the cell output.
CFG

### 3. Cargar .h5ad

In [None]:
# The raw dataset is expected inside DATA_DIR under the name given in the config.
raw_h5ad_path = DATA_DIR / RAW_H5AD_FILENAME

# Data is not shipped with the repo: stop with instructions when the file is absent.
if not raw_h5ad_path.exists():
    missing_data_msg = (
        f"No se encuentra el .h5ad en:\n{raw_h5ad_path}\n\n"
        "Este repo NO incluye datos. Para ejecutar QC:\n"
        f"1) Consigue el archivo {RAW_H5AD_FILENAME} por vías internas\n"
        "2) Colócalo en la carpeta data/ (local)\n"
        "3) Vuelve a ejecutar el notebook"
    )
    raise FileNotFoundError(missing_data_msg)

print("Leyendo .h5ad desde:", raw_h5ad_path)
# NOTE(review): BACKED_MODE is read from the config but not passed here, so the
# file is read with read_h5ad's default settings — confirm this is intended.
adata = sc.read_h5ad(raw_h5ad_path)
print(adata)

### 4. Inspección básica de obs/var + comprobar si X parece counts

In [None]:
# List up to the first 40 metadata columns on each axis.
obs_columns = adata.obs.columns.tolist()
var_columns = adata.var.columns.tolist()

print("\nColumnas obs (primeras 40):")
print(obs_columns[:40])

print("\nColumnas var (primeras 40):")
print(var_columns[:40])

# Look only at the top-left 100x100 corner of X, densified when it is sparse.
X = adata.X
corner = X[:100, :100]
sub = corner.toarray() if issparse(corner) else corner

sample_values = np.asarray(sub, dtype=float)
lowest, highest = float(sample_values.min()), float(sample_values.max())
print("\nMin X sample:", lowest)
print("Max X sample:", highest)

# Counts-like check: every sampled value must equal its rounded value (tolerance 1e-8).
rounded = np.round(sample_values)
is_integer_like = np.allclose(sample_values, rounded, atol=1e-8)
print("¿X mayoritariamente entero (counts-like)?", bool(is_integer_like))

### 5. Recalcular counts/genes desde X y comparar con columnas tipo Seurat

In [None]:
# Per-cell totals computed directly from X. The same expressions are used for
# sparse and dense X; np.ravel flattens the row-wise sums to a 1-D array.
X = adata.X
counts_per_cell = np.ravel(X.sum(axis=1))        # sum of all values in each row
genes_per_cell = np.ravel((X > 0).sum(axis=1))   # number of strictly positive entries per row

adata.obs["total_counts_from_X"] = counts_per_cell
adata.obs["n_genes_from_X"] = genes_per_cell

# Place each Seurat-style column (when present in obs) right before its
# recomputed counterpart so they can be compared side by side.
cols_to_show = []
for seurat_col, recomputed_col in (
    ("nCount_RNA", "total_counts_from_X"),
    ("nFeature_RNA", "n_genes_from_X"),
):
    if seurat_col in adata.obs.columns:
        cols_to_show.append(seurat_col)
    cols_to_show.append(recomputed_col)

print("\nComparación (primeras 5 filas) si existen columnas tipo Seurat:")
print(adata.obs[cols_to_show].head())

### 6. QC metrics (mitocondria) + plots

In [None]:
import matplotlib.pyplot as plt


def _mt_prefix_mask(labels):
    """Return a boolean array flagging labels that start with "MT-", ignoring case."""
    return pd.Index(labels).str.upper().str.startswith("MT-")


def _save_and_close(filename):
    """Save the current matplotlib figure into QC_FIG_DIR under `filename`, then close it."""
    plt.savefig(QC_FIG_DIR / filename, bbox_inches="tight", dpi=200)
    plt.close()


# Flag mitochondrial genes by name prefix (the upper-casing also matches "mt-").
gene_names = adata.var_names.astype(str)
mt_mask = _mt_prefix_mask(gene_names)

# Nothing matched on var_names: retry on the "features" column when var has one.
if not mt_mask.any() and "features" in adata.var.columns:
    feat = adata.var["features"].astype(str)
    mt_mask = _mt_prefix_mask(feat)

adata.var["mt"] = np.array(mt_mask, dtype=bool)
print("Nº genes mitocondriales detectados:", int(adata.var["mt"].sum()))

# Adds the QC columns to adata in place, including the "mt" variants used below.
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)

# Violin plots of the three QC metrics before filtering.
ax = sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0.0,
    multi_panel=True,
    show=False
)
_save_and_close("qc_violin_before_filter.png")

# Scatter 1: total counts vs detected genes, coloured by mitochondrial percentage.
sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts", color="pct_counts_mt", show=False)
_save_and_close("qc_scatter_counts_vs_genes.png")

# Scatter 2: total counts vs mitochondrial percentage.
sc.pl.scatter(adata, x="total_counts", y="pct_counts_mt", show=False)
_save_and_close("qc_scatter_counts_vs_pctmt.png")

print("Figuras guardadas en:", QC_FIG_DIR)

### 7. Aplicar filtros QC + plots post-QC

In [None]:
import matplotlib.pyplot as plt

# Echo the thresholds that were read from the config.
print("\nUmbrales QC (desde config/config.yaml):")
for label, threshold in (
    ("MIN_GENES :", MIN_GENES),
    ("MAX_GENES :", MAX_GENES),
    ("MIN_COUNTS:", MIN_COUNTS),
    ("MAX_COUNTS:", MAX_COUNTS),
    ("MAX_PCT_MT:", MAX_PCT_MT),
):
    print(label, threshold)

obs = adata.obs

# A cell is kept only when all three criteria hold; both range bounds are inclusive.
genes_ok = obs["n_genes_by_counts"].between(MIN_GENES, MAX_GENES)
counts_ok = obs["total_counts"].between(MIN_COUNTS, MAX_COUNTS)
mito_ok = obs["pct_counts_mt"] <= MAX_PCT_MT
mask = (genes_ok & counts_ok & mito_ok).to_numpy(dtype=bool)

n_before = adata.n_obs
n_after = int(mask.sum())

print("\nCélulas antes:", n_before)
print("Células después (QC técnico):", n_after)
print("Filtradas:", n_before - n_after)
print("Proporción conservada:", round(n_after / n_before, 4))

# Take an independent copy of the kept cells rather than a view of adata.
adata_qc = adata[mask].copy()
print("\nObjeto tras QC:")
print(adata_qc)

# Violin plots of the same three metrics after filtering.
sc.pl.violin(
    adata_qc,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0.0,
    multi_panel=True,
    show=False
)
plt.savefig(QC_FIG_DIR / "qc_violin_after_filter.png", bbox_inches="tight", dpi=200)
plt.close()

### 8. Guardar objeto tras QC en results/

In [None]:
qc_output_path = QC_RESULTS_DIR / AFTER_QC_FILENAME

# Clear .raw before writing when one is attached.
# NOTE(review): presumably done to avoid persisting a raw layer in the QC file — confirm.
has_raw = adata_qc.raw is not None
if has_raw:
    adata_qc.raw = None

adata_qc.write_h5ad(qc_output_path)

print("Objeto tras QC guardado en:", qc_output_path, sep="\n")