In [None]:
from pathlib import Path
import shutil
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Current working directory at execution time; the project-root search starts here.
NOTEBOOK_DIR = Path.cwd()

def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upwards from ``start``.

    The root is the first directory, checking ``start`` itself and then each
    of its parents in order, that contains an entry named ``data_processed``.

    Args:
        start: Directory where the upward search begins.

    Returns:
        The closest directory (``start`` or an ancestor) holding ``data_processed``.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor contains it.
    """
    marker = "data_processed"
    candidates = (start, *start.parents)
    root = next((folder for folder in candidates if (folder / marker).exists()), None)
    if root is None:
        raise FileNotFoundError(f"No encuentro 'data_processed' subiendo desde: {start}")
    return root

# Resolve the project root once and derive the working folders from it.
PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
FIG_DIR, SUM_DIR, OUT_DIR = (
    PROJECT_ROOT / folder_name
    for folder_name in ("figures_final", "summary_tables_final", "final_deliverables")
)
# Only the deliverables folder is created here; the other two are just referenced.
OUT_DIR.mkdir(exist_ok=True)

# Echo the resolved locations so a wrong root is easy to spot.
for label, location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("FIG_DIR     :", FIG_DIR),
    ("SUM_DIR     :", SUM_DIR),
    ("OUT_DIR     :", OUT_DIR),
):
    print(label, location)


In [None]:
import pandas as pd
import numpy as np

# -----------------------------
# Paths (derived from the setup cell)
# -----------------------------
S1_PATH = SUM_DIR / "DE_pseudobulk_FOR_REPORT_LINFO_CLEAN_top10_by_celltype.csv"
S2_PATH = SUM_DIR / "cell_counts_Level1refined_by_patient.csv"

print("S1_PATH:", S1_PATH)
print("S2_PATH:", S2_PATH)
assert S1_PATH.exists(), f"No existe: {S1_PATH}"
assert S2_PATH.exists(), f"No existe: {S2_PATH}"

# -----------------------------
# Load Table S1 (DE top10)
# -----------------------------
df_s1 = pd.read_csv(S1_PATH)

# Normalise a decimal comma to a decimal point in the numeric columns (when
# present) and coerce to numbers; anything unparseable becomes NaN.
num_cols_s1 = [c for c in ["log2FC_Healthy_vs_Cirrhosis", "FDR"] if c in df_s1.columns]
for c in num_cols_s1:
    df_s1[c] = pd.to_numeric(
        df_s1[c].astype(str).str.replace(",", ".", regex=False),
        errors="coerce",
    )

# Sort for readability: Level1_refined -> direction -> FDR.
# `ascending=True` (a scalar) is used on purpose: a fixed three-element list
# raises ValueError as soon as one of the optional sort columns is missing.
sort_cols = [c for c in ["Level1_refined", "direction", "FDR"] if c in df_s1.columns]
if sort_cols:
    df_s1 = df_s1.sort_values(sort_cols, ascending=True).reset_index(drop=True)

# -----------------------------
# Load Table S2 (counts per patient)
# -----------------------------
df_s2 = pd.read_csv(S2_PATH)

# Preferred column order (each piece only if the column exists):
# identifiers first, then the known cell types, then everything else.
preferred_ct = ["B","DC","HSCs","Mono","NK","Plasma","T","pDC"]
front = [c for c in ["patientID","disease"] if c in df_s2.columns]
cts   = [c for c in preferred_ct if c in df_s2.columns]
placed = set(front + cts)
rest  = [c for c in df_s2.columns if c not in placed]
# Push the per-patient total to the very end when it is available.
if "total_cells_patient" in rest:
    rest = [c for c in rest if c != "total_cells_patient"] + ["total_cells_patient"]

df_s2 = df_s2[front + cts + rest]

# Sort by disease and patientID when both columns are present.
if "disease" in df_s2.columns and "patientID" in df_s2.columns:
    df_s2 = df_s2.sort_values(["disease","patientID"]).reset_index(drop=True)

# -----------------------------
# Styled display in the notebook
# -----------------------------
# Only pass format specs for the numeric columns that actually exist.
s1_formats = {
    "log2FC_Healthy_vs_Cirrhosis": "{:.3f}",
    "FDR": "{:.2e}",
}
display(
    df_s1.style
      .format({c: s1_formats[c] for c in num_cols_s1}, na_rep="")
      .set_caption("Table S1. Top differentially expressed genes by lineage (pseudobulk)")
)

display(
    df_s2.style
      .set_caption("Table S2. Per-patient cell counts by lineage (Level1_refined)")
      .format(na_rep="")
)

# -----------------------------
# (Optional) Export both tables to a single Excel workbook
# -----------------------------
PRETTY_XLSX = OUT_DIR / "Tables_S1_S2_pretty.xlsx"
with pd.ExcelWriter(PRETTY_XLSX, engine="openpyxl") as writer:
    df_s1.to_excel(writer, sheet_name="TableS1_DE_top10", index=False)
    df_s2.to_excel(writer, sheet_name="TableS2_counts_by_patient", index=False)

print("Saved pretty Excel:", PRETTY_XLSX)


In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# --- paths ---
# Current working directory at execution time; the project-root search starts here.
NOTEBOOK_DIR = Path.cwd()
def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upwards from ``start``.

    The root is the first directory, checking ``start`` itself and then each
    of its parents in order, that contains an entry named ``data_processed``.

    Args:
        start: Directory where the upward search begins.

    Returns:
        The closest directory (``start`` or an ancestor) holding ``data_processed``.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor contains it.
            The message names the starting directory, as the other
            definitions of this helper in the notebook do.
    """
    for p in (start, *start.parents):
        if (p / "data_processed").exists():
            return p
    raise FileNotFoundError(f"No encuentro 'data_processed' subiendo desde: {start}")

PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
# Derived locally (like OUT_DIR) so this cell does not need SUM_DIR from an
# earlier cell to be present in the kernel.
SUM_DIR = PROJECT_ROOT / "summary_tables_final"
OUT_DIR = PROJECT_ROOT / "final_deliverables"
OUT_DIR.mkdir(exist_ok=True)

# --- load table ---
path = SUM_DIR / "DE_pseudobulk_FOR_REPORT_LINFO_CLEAN_top10_by_celltype.csv"
df = pd.read_csv(path)

# Convert decimal commas to points and coerce to numeric (unparseable -> NaN).
for col in ["log2FC_Healthy_vs_Cirrhosis", "FDR"]:
    df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ".", regex=False), errors="coerce")

# Order rows by lineage, direction and FDR (all ascending).
df = df.sort_values(["Level1_refined", "direction", "FDR"], ascending=True)

# --- export: 1 page per lineage ---
pdf_path = OUT_DIR / "TableS1_pretty_by_lineage.pdf"
# Drop missing lineage labels: `== NaN` never matches, so a NaN lineage would
# yield an empty subset, and ax.table fails on empty cellText.
lineages = df["Level1_refined"].dropna().unique().tolist()

with PdfPages(pdf_path) as pdf:
    for lin in lineages:
        sub = df[df["Level1_refined"] == lin].copy()

        # Keep only the columns shown on the page so the table fits.
        sub = sub[["direction", "gene", "log2FC_Healthy_vs_Cirrhosis", "FDR"]]

        fig, ax = plt.subplots(figsize=(11.7, 8.3))  # roughly A4 landscape (inches)
        ax.axis("off")
        ax.set_title(f"Table S1 — {lin} (pseudobulk Healthy vs Cirrhosis)", pad=12)

        table = ax.table(
            cellText=sub.values,
            colLabels=sub.columns,
            loc="center",
            cellLoc="left",
            colLoc="left",
        )
        # Fixed font size; auto-sizing is turned off so set_fontsize takes effect.
        table.auto_set_font_size(False)
        table.set_fontsize(8)
        table.scale(1, 1.2)

        pdf.savefig(fig, bbox_inches="tight")
        # Close each figure so they do not pile up in memory across pages.
        plt.close(fig)

print("Saved:", pdf_path)


In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Current working directory at execution time; the project-root search starts here.
NOTEBOOK_DIR = Path.cwd()

def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upwards from ``start``.

    The root is the first directory, checking ``start`` itself and then each
    of its parents in order, that contains an entry named ``data_processed``.

    Args:
        start: Directory where the upward search begins.

    Returns:
        The closest directory (``start`` or an ancestor) holding ``data_processed``.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor contains it.
    """
    marker = "data_processed"
    candidates = (start, *start.parents)
    root = next((folder for folder in candidates if (folder / marker).exists()), None)
    if root is None:
        raise FileNotFoundError(f"No encuentro 'data_processed' subiendo desde: {start}")
    return root

PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
# Derived locally (like OUT_DIR) so this cell does not need SUM_DIR from an
# earlier cell to be present in the kernel.
SUM_DIR = PROJECT_ROOT / "summary_tables_final"
OUT_DIR = PROJECT_ROOT / "final_deliverables"
OUT_DIR.mkdir(exist_ok=True)

path = SUM_DIR / "DE_pseudobulk_FOR_REPORT_LINFO_CLEAN_top10_by_celltype.csv"
df = pd.read_csv(path)

# --- convert decimal commas to points in the numeric columns (unparseable -> NaN) ---
for col in ["log2FC_Healthy_vs_Cirrhosis", "FDR"]:
    df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ".", regex=False), errors="coerce")

# --- selection rule: top N per lineage and direction, smallest FDR first ---
N_PER_DIRECTION = 3  # <-- raise to 4 or 5 for a longer table

df = df.sort_values(["Level1_refined", "direction", "FDR"], ascending=True)

# head() keeps the first N rows of each (lineage, direction) group in the
# sorted order above, i.e. the N smallest FDR values per group.
df_repr = (
    df.groupby(["Level1_refined", "direction"], as_index=False)
      .head(N_PER_DIRECTION)
      .copy()
)

# Row order: by lineage, and within it Healthy first, then Cirrhosis.
# Any direction label outside the two expected ones is appended as an extra
# category instead of being silently turned into NaN by pd.Categorical.
dir_order = ["Higher_in_Healthy", "Higher_in_Cirrhosis"]
extra_dirs = sorted(
    (d for d in df_repr["direction"].dropna().unique() if d not in dir_order),
    key=str,
)
df_repr["direction"] = pd.Categorical(
    df_repr["direction"], categories=dir_order + extra_dirs, ordered=True
)
df_repr = df_repr.sort_values(["Level1_refined", "direction", "FDR"])

# Select and rename the columns for the final table.
df_repr = df_repr.rename(columns={
    "Level1_refined": "Lineage",
    "direction": "Direction",
    "gene": "Gene",
    "log2FC_Healthy_vs_Cirrhosis": "log2FC (Healthy vs Cirrhosis)",
    "FDR": "FDR"
})[["Lineage", "Direction", "Gene", "log2FC (Healthy vs Cirrhosis)", "FDR"]]

# Presentation formatting: values become strings from here on.
df_repr["log2FC (Healthy vs Cirrhosis)"] = df_repr["log2FC (Healthy vs Cirrhosis)"].map(lambda x: f"{x:.3f}")
df_repr["FDR"] = df_repr["FDR"].map(lambda x: f"{x:.2e}")

display(df_repr)

# --- save as PNG so it can be pasted into the report ---
fig, ax = plt.subplots(figsize=(11.7, 8.3))  # roughly A4 landscape (inches)
ax.axis("off")
ax.set_title(f"Representative DE genes per lineage (pseudobulk; top {N_PER_DIRECTION} per direction by FDR)", pad=12)

table = ax.table(
    cellText=df_repr.values,
    colLabels=df_repr.columns,
    loc="center",
    cellLoc="left",
    colLoc="left",
)

# Fixed font size; auto-sizing is turned off so set_fontsize takes effect.
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1, 1.2)

out_png = OUT_DIR / f"TableS1_representative_top{N_PER_DIRECTION}_per_direction.png"
# Save through the figure object rather than pyplot's implicit current figure.
fig.savefig(out_png, dpi=300, bbox_inches="tight")
plt.close(fig)

print("Saved:", out_png)


In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Current working directory at execution time; the project-root search starts here.
NOTEBOOK_DIR = Path.cwd()

def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upwards from ``start``.

    The root is the first directory, checking ``start`` itself and then each
    of its parents in order, that contains an entry named ``data_processed``.

    Args:
        start: Directory where the upward search begins.

    Returns:
        The closest directory (``start`` or an ancestor) holding ``data_processed``.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor contains it.
    """
    marker = "data_processed"
    candidates = (start, *start.parents)
    root = next((folder for folder in candidates if (folder / marker).exists()), None)
    if root is None:
        raise FileNotFoundError(f"No encuentro 'data_processed' subiendo desde: {start}")
    return root

PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
SUM_DIR = PROJECT_ROOT / "summary_tables_final"
OUT_DIR = PROJECT_ROOT / "final_deliverables"
OUT_DIR.mkdir(exist_ok=True)

# The per-patient counts CSV is read from the summary-tables folder, where the
# table-loading cell of this notebook asserts it exists. The previous location
# (the deliverables folder) is kept as a fallback for existing setups.
counts_csv_name = "cell_counts_Level1refined_by_patient.csv"
path = SUM_DIR / counts_csv_name
if not path.exists():
    path = OUT_DIR / counts_csv_name
df = pd.read_csv(path)

# Row order: Cirrhosis first, then Healthy (swap the list to change it).
# Labels outside the expected two are appended as extra categories instead of
# being silently turned into NaN by pd.Categorical.
disease_order = ["Cirrhosis", "Healthy"]
if "disease" in df.columns:
    disease_labels = df["disease"].astype(str)
    extra_labels = sorted(d for d in disease_labels.unique() if d not in disease_order)
    df["disease"] = pd.Categorical(
        disease_labels, categories=disease_order + extra_labels, ordered=True
    )
    # Sort by patientID as well only when that column is actually present.
    df = df.sort_values([c for c in ["disease", "patientID"] if c in df.columns])

# (Optional) Reorder columns for presentation: preferred ones first (if present),
# then any remaining columns in their original order.
preferred_cols = ["patientID", "disease", "total_cells_patient", "B", "Plasma", "pDC", "T", "NK", "Mono", "DC", "HSCs"]
present_cols = [c for c in preferred_cols if c in df.columns] + [c for c in df.columns if c not in preferred_cols]
df = df[present_cols]

display(df)

# Save the full table as a PNG so it can be inserted into the report.
fig, ax = plt.subplots(figsize=(12, 4.5))  # increase the height for more breathing room
ax.axis("off")
ax.set_title("Per-patient cell counts (Level1_refined)", pad=12)

table = ax.table(
    cellText=df.values,
    colLabels=df.columns,
    loc="center",
    cellLoc="center",
    colLoc="center",
)

# Fixed font size; auto-sizing is turned off so set_fontsize takes effect.
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1, 1.25)

out_png = OUT_DIR / "TableS2_cell_counts_Level1refined_by_patient.png"
# Save through the figure object rather than pyplot's implicit current figure.
fig.savefig(out_png, dpi=300, bbox_inches="tight")
plt.close(fig)

print("Saved:", out_png)
