In [2]:
import os
import re
import json
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import Dict, Any, Tuple, List

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline

import tensorflow as tf
from tensorflow.keras import layers, regularizers, optimizers, callbacks, Model

In [3]:
# =========================
# 0) CONFIGURACIÓN GENERAL
# =========================

CSV_PATH = "./dataset_natalidad.csv"  # ajusta si es necesario (p.ej. "/mnt/data/dataset_natalidad.csv")
OUT_DIR = "./m8_output"
ZIP_PATH = "./entrega_final_m8.zip"
RNG = 42
os.makedirs(OUT_DIR, exist_ok=True)
np.random.seed(RNG)
tf.random.set_seed(RNG)

In [4]:
# ====================================
# 1) CARGA Y EXPLORACIÓN DE LOS DATOS
# ====================================

def normalize_colnames(cols: List[str]) -> List[str]:
    """Convierte a snake_case y elimina tildes/espacios para trabajar cómodo."""
    def _norm(c):
        c = c.strip()
        # reemplazos básicos
        c = c.replace("á","a").replace("é","e").replace("í","i").replace("ó","o").replace("ú","u").replace("ñ","n")
        c = re.sub(r"[^A-Za-z0-9]+", "_", c)
        c = re.sub(r"_+", "_", c).strip("_")
        return c.lower()
    return [_norm(c) for c in cols]


def detect_target(df: pd.DataFrame) -> str:
    """
    Detecta la columna objetivo (tasa de natalidad) de forma robusta.
    Busca por patrones comunes: 'tasa_natalidad', 'natalidad', 'birth_rate'.
    """
    lower_map = {c.lower(): c for c in df.columns}
    # candidatos explícitos
    candidates = [
        "tasa_de_natalidad", "tasa_natalidad", "natalidad", "tasa__natalidad",
        "birth_rate", "births_per_woman", "births_per_1000", "births_per_1000_people"
    ]
    for k in candidates:
        if k in lower_map:
            return lower_map[k]
    # heurística: columna que contenga 'natalid' o 'birth'
    for c in df.columns:
        cl = c.lower()
        if "natalid" in cl or "birth" in cl:
            return c
    raise ValueError("No se encontró la columna objetivo (tasa de natalidad). Renómbrala o ajusta detect_target().")


def load_dataset(path: str) -> Tuple[pd.DataFrame, str]:
    df = pd.read_csv(path)
    original_cols = df.columns.tolist()
    df.columns = normalize_colnames(df.columns.tolist())
    # Mapeo antiguo->nuevo por si quieres rastrear
    colmap = dict(zip(original_cols, df.columns))
    # detectar objetivo
    target_col = detect_target(df)
    return df, target_col


def basic_eda(df: pd.DataFrame, target: str) -> None:
    """Distribuciones y matriz de correlaciones (matplotlib puro)."""
    # 1) Histograma de cada columna
    fig_dir = OUT_DIR
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Hist distribuciones
    for col in num_cols:
        plt.figure()
        plt.hist(df[col].dropna().values, bins=20)
        plt.title(f"Distribución de {col}")
        plt.xlabel(col); plt.ylabel("Frecuencia")
        plt.tight_layout()
        plt.savefig(os.path.join(fig_dir, f"dist_{col}.png"), dpi=180)
        plt.close()

    # 2) Correlaciones
    corr = df[num_cols].corr()
    plt.figure(figsize=(8, 6))
    im = plt.imshow(corr.values, aspect='auto')
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.xticks(range(len(num_cols)), num_cols, rotation=90)
    plt.yticks(range(len(num_cols)), num_cols)
    plt.title("Matriz de correlaciones (Pearson)")
    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, "correlaciones.png"), dpi=200)
    plt.close()

    # Guardar resumen de stats
    summary = {
        "shape": df.shape,
        "columns": df.columns.tolist(),
        "numeric_columns": num_cols,
        "describe": df[num_cols].describe().to_dict()
    }
    with open(os.path.join(OUT_DIR, "eda_summary.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

In [5]:
# =============================================
# 2) PREPROCESAMIENTO + DISEÑO DEL MODELO (NN)
# =============================================

def make_preprocessor(feature_cols: List[str]) -> Pipeline:
    """Pipeline: imputación (mediana) + escalado estándar."""
    return Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])


def build_regression_mlp(
    n_features: int,
    activation: str = "relu",
    l2_lambda: float = 1e-4,
    dropout_rate: float = 0.2,
    hidden_units: Tuple[int, int] = (64, 32),
    lr: float = 1e-3,
    optimizer_name: str = "adam"
) -> Model:
    """MLP con 2 capas ocultas mínimo, L2 + Dropout."""
    reg = regularizers.l2(l2_lambda) if l2_lambda and l2_lambda > 0 else None

    inputs = layers.Input(shape=(n_features,))
    x = layers.Dense(hidden_units[0], activation=activation, kernel_regularizer=reg)(inputs)
    x = layers.Dropout(dropout_rate)(x)
    x = layers.Dense(hidden_units[1], activation=activation, kernel_regularizer=reg)(x)
    x = layers.Dropout(dropout_rate)(x)
    outputs = layers.Dense(1, activation="linear")(x)

    model = Model(inputs, outputs, name="NatalidadRegressor")

    if optimizer_name.lower() == "adam":
        opt = optimizers.Adam(learning_rate=lr)
    elif optimizer_name.lower() == "sgd":
        opt = optimizers.SGD(learning_rate=lr, momentum=0.9)
    elif optimizer_name.lower() == "rmsprop":
        opt = optimizers.RMSprop(learning_rate=lr)
    else:
        opt = optimizers.Adam(learning_rate=lr)

    model.compile(optimizer=opt, loss="mse", metrics=["mae"])
    return model


def train_and_evaluate(
    X_train: np.ndarray, y_train: np.ndarray,
    X_val: np.ndarray, y_val: np.ndarray,
    X_test: np.ndarray, y_test: np.ndarray,
    config: Dict[str, Any],
    run_name: str
) -> Dict[str, Any]:
    """Entrena un modelo con EarlyStopping/ReduceLROnPlateau y evalúa en test."""
    model = build_regression_mlp(
        n_features=X_train.shape[1],
        activation=config.get("activation", "relu"),
        l2_lambda=config.get("l2_lambda", 1e-4),
        dropout_rate=config.get("dropout", 0.2),
        hidden_units=config.get("hidden_units", (64, 32)),
        lr=config.get("lr", 1e-3),
        optimizer_name=config.get("optimizer", "adam"),
    )

    es = callbacks.EarlyStopping(monitor="val_loss", patience=50, restore_best_weights=True)
    rlrop = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=20, min_lr=1e-6)

    hist = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=config.get("epochs", 500),
        batch_size=config.get("batch_size", 16),
        verbose=0,
        callbacks=[es, rlrop]
    )

    preds = model.predict(X_test, verbose=0).reshape(-1)
    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(np.mean((y_test - preds) ** 2))
    r2 = r2_score(y_test, preds)

    # Guardar curva de entrenamiento
    plt.figure()
    plt.plot(hist.history["loss"], label="train_loss")
    plt.plot(hist.history["val_loss"], label="val_loss")
    plt.title(f"Pérdida (MSE) - {run_name}")
    plt.xlabel("Epoch"); plt.ylabel("MSE")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, f"loss_curve_{run_name}.png"), dpi=180)
    plt.close()

    return {
        "run": run_name,
        "config": config,
        "mae": float(mae),
        "rmse": float(rmse),
        "r2": float(r2),
        "model": model
    }

In [6]:
# ====================================
# 3) IMPORTANCIA (PERMUTATION IMPORT.)
# ====================================

def permutation_importance_mae(
    model: Model,
    X_test: np.ndarray,
    y_test: np.ndarray,
    feature_names: List[str],
    n_repeats: int = 8,
    random_state: int = RNG
) -> pd.DataFrame:
    """
    Importancia por permutación medida como incremento de MAE al permutar cada feature.
    """
    rng = np.random.default_rng(random_state)
    base_pred = model.predict(X_test, verbose=0).reshape(-1)
    base_mae = mean_absolute_error(y_test, base_pred)

    results = []
    X_work = X_test.copy()
    for j, name in enumerate(feature_names):
        incs = []
        for _ in range(n_repeats):
            X_perm = X_work.copy()
            rng.shuffle(X_perm[:, j])  # permuta columna j
            pred_perm = model.predict(X_perm, verbose=0).reshape(-1)
            mae_perm = mean_absolute_error(y_test, pred_perm)
            incs.append(mae_perm - base_mae)
        results.append({"feature": name, "mae_increase_mean": float(np.mean(incs)),
                        "mae_increase_std": float(np.std(incs))})
    imp_df = pd.DataFrame(results).sort_values("mae_increase_mean", ascending=False).reset_index(drop=True)
    return imp_df

In [7]:
# ======================
# 4) PROGRAMA PRINCIPAL
# ======================

def main():
    # Carga
    df, target = load_dataset(CSV_PATH)
    basic_eda(df, target)

    # Features y objetivo
    feature_cols = [c for c in df.columns if c != target]
    num_cols = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
    X_all = df[num_cols].values.astype(np.float32)
    y_all = df[target].values.astype(np.float32).reshape(-1)

    # Split train/val/test: 70/15/15 (estricto)
    X_temp, X_test, y_temp, y_test = train_test_split(X_all, y_all, test_size=0.15, random_state=RNG)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=RNG)  # 0.1765*0.85≈0.15

    # Preprocesador
    pre = make_preprocessor(num_cols)
    X_train_p = pre.fit_transform(X_train)
    X_val_p = pre.transform(X_val)
    X_test_p = pre.transform(X_test)

    # Configuraciones a comparar (activaciones y learning rates; regularización fija de base)
    configs = []
    for act in ["relu", "tanh", "elu"]:
        for lr in [1e-3, 5e-4]:
            configs.append({
                "activation": act,
                "lr": lr,
                "optimizer": "adam",
                "l2_lambda": 1e-4,
                "dropout": 0.2,
                "hidden_units": (64, 32),
                "epochs": 500,
                "batch_size": 16
            })

    results = []
    best = None
    for i, cfg in enumerate(configs, start=1):
        run_name = f"run{i}_{cfg['activation']}_lr{cfg['lr']}"
        res = train_and_evaluate(X_train_p, y_train, X_val_p, y_val, X_test_p, y_test, cfg, run_name)
        results.append(res)
        if (best is None) or (res["rmse"] < best["rmse"]):
            best = res

    # Guardar resultados agregados (sin el objeto modelo)
    results_to_save = [
        {k: v for k, v in r.items() if k not in ("model",)}
        for r in results
    ]
    with open(os.path.join(OUT_DIR, "resultados_experimentos.json"), "w", encoding="utf-8") as f:
        json.dump(results_to_save, f, ensure_ascii=False, indent=2)

    # ==========
    # 5) ANÁLISIS
    # ==========

    best_model = best["model"]
    # Importancia por permutación (en espacio preprocesado, nombres = num_cols)
    imp_df = permutation_importance_mae(best_model, X_test_p, y_test, feature_names=num_cols, n_repeats=16)
    imp_df.to_csv(os.path.join(OUT_DIR, "importancia_variables_perm.csv"), index=False, encoding="utf-8")

    # Predicciones vs reales (test)
    y_pred = best_model.predict(X_test_p, verbose=0).reshape(-1)
    # Scatter y = ŷ
    plt.figure()
    plt.scatter(y_test, y_pred)
    lims = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
    plt.plot(lims, lims)
    plt.title(f"Predicción vs Real (Mejor modelo) — R²={best['r2']:.3f}")
    plt.xlabel("Real (tasa de natalidad)")
    plt.ylabel("Predicción (tasa de natalidad)")
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "scatter_pred_vs_real.png"), dpi=180)
    plt.close()

    # Residuales
    residuals = y_test - y_pred
    plt.figure()
    plt.scatter(y_pred, residuals)
    plt.axhline(0)
    plt.title("Residuales vs Predicción (Mejor modelo)")
    plt.xlabel("Predicción"); plt.ylabel("Residuo")
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "residuales_vs_prediccion.png"), dpi=180)
    plt.close()

    # =============================
    # 6) REPORTE Y COMPRESIÓN (ZIP)
    # =============================

    # Reporte con reflexiones (se basa en métricas y top variables)
    top_vars_txt = ", ".join(imp_df.head(5)["feature"].tolist())
    report_lines = []
    report_lines.append("EVALUACIÓN FINAL M8 — RED NEURONAL PARA PREDICCIÓN DE NATALIDAD\n")
    report_lines.append("1) Diseño y entrenamiento\n")
    report_lines.append(f"- Mejor configuración: activación={best['config']['activation']}, lr={best['config']['lr']}, "
                        f"L2={best['config']['l2_lambda']}, dropout={best['config']['dropout']}, capas={best['config']['hidden_units']}\n")
    report_lines.append(f"- Métricas (test): MAE={best['mae']:.3f}, RMSE={best['rmse']:.3f}, R²={best['r2']:.3f}\n")

    report_lines.append("2) Evaluación/Comparación\n")
    report_lines.append("- Se compararon activaciones (ReLU, Tanh, ELU) y tasas de aprendizaje (1e-3, 5e-4) con Adam. "
                        "Se emplearon EarlyStopping y ReduceLROnPlateau para estabilidad y evitar sobreajuste.\n")

    report_lines.append("3) Importancia de variables (permuta, ΔMAE)\n")
    report_lines.append(f"- Más influyentes (top-5): {top_vars_txt}\n")

    report_lines.append("4) Reflexión\n")
    report_lines.append("- Las variables con mayor ΔMAE influyen más en la predicción: se relacionan con determinantes "
                        "socioeconómicos/estructurales (p. ej., ingreso, educación, salud, urbanización, edad media de maternidad). "
                        "La red captura efectos no lineales y posibles interacciones.\n")
    report_lines.append("- Mejoras futuras: búsqueda de hiperparámetros más amplia (capas/unidades), "
                        "regularización elástica, k-fold CV, y análisis SHAP para interpretabilidad más fina.\n")
    report_text = "\n".join(report_lines)

    with open(os.path.join(OUT_DIR, "reporte_final_m8.txt"), "w", encoding="utf-8") as f:
        f.write(report_text)

    # Crear ZIP con archivos clave
    with zipfile.ZipFile(ZIP_PATH, "w", zipfile.ZIP_DEFLATED) as z:
        # EDA
        z.write(os.path.join(OUT_DIR, "eda_summary.json"), arcname="eda_summary.json")
        if os.path.exists(os.path.join(OUT_DIR, "correlaciones.png")):
            z.write(os.path.join(OUT_DIR, "correlaciones.png"), arcname="correlaciones.png")
        # Curvas de pérdida
        for file in os.listdir(OUT_DIR):
            if file.startswith("loss_curve_") and file.endswith(".png"):
                z.write(os.path.join(OUT_DIR, file), arcname=file)
        # Figuras de evaluación
        for fn in ["scatter_pred_vs_real.png", "residuales_vs_prediccion.png"]:
            fp = os.path.join(OUT_DIR, fn)
            if os.path.exists(fp):
                z.write(fp, arcname=fn)
        # Importancia + resultados
        for fn in ["importancia_variables_perm.csv", "resultados_experimentos.json", "reporte_final_m8.txt"]:
            fp = os.path.join(OUT_DIR, fn)
            if os.path.exists(fp):
                z.write(fp, arcname=fn)

    # Consola
    print("=== LISTO ===")
    print(f"Mejor modelo -> MAE={best['mae']:.3f} | RMSE={best['rmse']:.3f} | R²={best['r2']:.3f}")
    print(f"Top variables (permuta): {', '.join(imp_df.head(5)['feature'].tolist())}")
    print(f"Carpeta de salida: {os.path.abspath(OUT_DIR)}")
    print(f"ZIP de entrega:    {os.path.abspath(ZIP_PATH)}")


if __name__ == "__main__":
    main()



=== LISTO ===
Mejor modelo -> MAE=1.736 | RMSE=1.854 | R²=0.955
Top variables (permuta): pib_per_capita, urbanizacion, edad_maternidad, tasa_empleo_femenino, nivel_educativo
Carpeta de salida: /content/m8_output
ZIP de entrega:    /content/entrega_final_m8.zip
