# 01 - EDA
Propósito: cargar datos y generar estadísticos y gráficos básicos.

Inputs:
- `config/project.json`
- CSV en `cfg["data_csv_path"]`

Outputs esperados:
- `outputs/figures/hist_var.png`
- `outputs/figures/qq_var.png`
- `outputs/figures/xy_scatter.png`


### 1. Project setup


In [None]:
import os, sys, json, glob
from IPython.display import Image, display

# Resolve the project root. When the notebook is launched from a directory
# named "notebooks", the root is its parent; otherwise the current working
# directory is taken to be the root already.
launch_dir = os.getcwd()
if os.path.basename(launch_dir) == "notebooks":
    PROJECT_ROOT = os.path.abspath(os.path.join(launch_dir, ".."))
else:
    PROJECT_ROOT = launch_dir

# Run everything from the project root so the relative paths used below
# ("config/...", "outputs/...") resolve against it.
os.chdir(PROJECT_ROOT)

# Put the root at the front of the import path so `src.*` can be imported.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("CWD:", os.getcwd())

from src.io_utils import read_csv_robust, standardize_columns, coerce_numeric
from src.eda import apply_topcut, basic_stats, plot_hist, plot_qq, plot_xy_scatter
from src.make_demo_data import make_demo_csv


### 2. Load config


In [None]:
# Read the project configuration (path is relative to the working directory).
cfg_path = "config/project.json"
print("Config:", os.path.abspath(cfg_path))
with open(cfg_path, "r", encoding="utf-8") as config_file:
    cfg = json.loads(config_file.read())

print("Data CSV:", cfg["data_csv_path"])

# Standard column name -> key under cfg["columns"] holding the source column
# name. A key missing from the config yields None for that entry.
config_key_by_role = {
    "x": "x",
    "y": "y",
    "z": "z",
    "var": "variable_objetivo",
    "domain": "domain",
}
column_cfg = cfg["columns"]
mapping = {
    role: column_cfg.get(config_key)
    for role, config_key in config_key_by_role.items()
}
print("Column mapping:", mapping)


### 3. Load data


In [None]:
# Load the CSV named in the config, standardize its columns, drop rows that
# lack coordinates or the target variable, and print a short summary.

# If the configured CSV is missing, call the demo-data helper with that path
# (presumably it writes a demo CSV there -- confirm in src.make_demo_data).
if not os.path.exists(cfg["data_csv_path"]):
    make_demo_csv(cfg["data_csv_path"])

df_raw = read_csv_robust(cfg["data_csv_path"])
df = standardize_columns(df_raw, mapping)

# Replace the configured no-data sentinel values with None so that the
# dropna() below treats them as missing.
# NOTE(review): DataFrame.replace with value=None has behaved differently
# across pandas versions -- confirm against the pinned pandas version.
df.replace(cfg.get("nodata_values", []), None, inplace=True)

# NOTE(review): the return value is discarded, so this presumably converts
# the listed columns in place -- confirm in src.io_utils.
coerce_numeric(df, ["x", "y", "z", "var"])

# Keep only rows with both coordinates and the target variable present, and
# report the share of rows that were dropped.
rows_before = len(df)
df.dropna(subset=["x", "y", "var"], inplace=True)
rows_after = len(df)
removed_pct = 0.0 if rows_before == 0 else (rows_before - rows_after) * 100.0 / rows_before

print("Shape raw:", df_raw.shape, "clean:", df.shape)
# The newline is written as an escape: a literal line break inside a
# single-quoted string is a SyntaxError.
print("Dtypes:\n", df.dtypes)
print("Rows removed (%):", f"{removed_pct:.2f}")
print("X range:", (df["x"].min(), df["x"].max()))
print("Y range:", (df["y"].min(), df["y"].max()))
# Z is optional: report it only when the column exists and has data.
if "z" in df.columns and df["z"].notna().any():
    print("Z range:", (df["z"].min(), df["z"].max()))
else:
    print("Z range: n/a")


### 4. EDA


In [None]:
# Optional top-cut: applied only when it is enabled in the config and a
# truthy "high" threshold is given.
topcut_cfg = cfg.get("topcut", {})
if topcut_cfg.get("enabled") and topcut_cfg.get("high"):
    df = apply_topcut(df, "var", float(topcut_cfg["high"]))

# Summary statistics of the target variable.
var_series = df["var"]
stats = basic_stats(var_series)
print(stats)

# Write the figures, creating the output folder on first use.
os.makedirs("outputs/figures", exist_ok=True)
plot_hist(var_series, "outputs/figures/hist_var.png")
plot_qq(var_series, "outputs/figures/qq_var.png")
plot_xy_scatter(
    df,
    "x",
    "y",
    "var",
    "outputs/figures/xy_scatter.png",
    color_by="domain",
)


### 5. Artifacts generated


In [None]:
# Collect whatever artifacts exist under outputs/, sorted by path.
figure_paths, table_paths, model_paths = (
    sorted(glob.glob(pattern))
    for pattern in (
        "outputs/figures/*.png",
        "outputs/tables/*.csv",
        "outputs/models/*",
    )
)

# List each artifact group with absolute paths.
for label, paths in (
    ("Figures:", figure_paths),
    ("Tables:", table_paths),
    ("Models:", model_paths),
):
    print(label, list(map(os.path.abspath, paths)))

# Render every PNG figure inline in the notebook.
for figure_path in figure_paths:
    display(Image(filename=figure_path))
