In [10]:
# === CAMBIO 1: Setup reproducible + utilidades ===
import os, json, numpy as np, pandas as pd
from pathlib import Path
from datetime import datetime
import joblib
import sys
sys.path.append("..")  # permite: from src import ...

# Reproducibilidad (evita sorpresas en re-ejecuciones)
os.environ["PYTHONHASHSEED"] = "42"
np.random.seed(42)

# Carpeta donde guardaremos artefactos (gráficas, json, modelos, etc.)
OUT = Path("artifacts")
OUT.mkdir(exist_ok=True)

def dump_json(obj, path):
    """Guarda un dict/objeto como JSON bonito (UTF-8, identado)."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

# Log rápido de entorno (útil para reproducibilidad)
VERSIONS = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "python": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}",
    "numpy": np.__version__,
    "pandas": pd.__version__,
}
try:
    import sklearn, scipy
    VERSIONS.update({
        "scikit_learn": sklearn.__version__,
        "scipy": scipy.__version__,
    })
except Exception:
    pass

dump_json(VERSIONS, OUT / "env_versions.json")
print("OK: entorno inicializado. Artefactos en:", OUT.resolve())


OK: entorno inicializado. Artefactos en: C:\Users\pc\Documents\Proyecto Final\notebooks\artifacts


In [11]:
import pandas as pd
from pathlib import Path

DATA_PATH = Path("./../data/df_model_prepared.csv")  # ajusta si tu ruta es otra
df = pd.read_csv(DATA_PATH, sep=";")

# Tipado: categóricas a 'category', numéricas a número
cat_cols = ["Product","Sub-product","Issue","Sub-issue","State","Company","Company response"]
for c in cat_cols:
    if c in df.columns:
        df[c] = df[c].astype("category")


df["Timely response?"] = df["Timely response?"].astype("int8")

# target: 0/1 -> int8 (sobra)
df["target"] = df["target"].astype("int8")

# days_to_company: 0..10 -> uint8 (entero sin signo, ocupa menos)
df["days_to_company"] = df["days_to_company"].astype("uint8")

df.shape, df.dtypes


((4678, 10),
 Product             category
 Sub-product         category
 Issue               category
 Sub-issue           category
 State               category
 Company             category
 Company response    category
 Timely response?        int8
 days_to_company        uint8
 target                  int8
 dtype: object)

In [12]:
print("Problema: Clasificación binaria (predecir 'target' 0/1)")

vc = df["target"].value_counts().sort_index()
print("\nDistribución target:")
print(vc)
print(f"\nRatio positivos (1): {df['target'].mean():.3f}")


Problema: Clasificación binaria (predecir 'target' 0/1)

Distribución target:
target
0    1076
1    3602
Name: count, dtype: int64

Ratio positivos (1): 0.770


In [13]:
# === Split recomendado: Train/Test (CV interno para validación) ===
from sklearn.model_selection import train_test_split
import pandas as pd

TARGET = "target"
X = df.drop(columns=[TARGET])
y = df[TARGET].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print("Shapes -> train:", X_train.shape, "| test:", X_test.shape)
print("Ratios de positivos -> train:", y_train.mean(), "| test:", y_test.mean())


Shapes -> train: (3742, 9) | test: (936, 9)
Ratios de positivos -> train: 0.7699091394975949 | test: 0.7702991452991453


In [14]:
# === Preprocesamiento columnas ===
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import inspect

# Columnas por tipo
cat_cols = X_train.select_dtypes(include=["category"]).columns.tolist()
num_cols = [c for c in X_train.select_dtypes(include=["number"]).columns if c not in ["target", "Timely response?"]]
bin_cols = [c for c in ["Timely response?"] if c in X_train.columns]

# OneHotEncoder con fallback y salida densa
ohe_kwargs = {"handle_unknown": "ignore"}
if "min_frequency" in inspect.signature(OneHotEncoder).parameters:
    ohe_kwargs["min_frequency"] = 0.01  # agrupa categorías muy raras
# salida densa (nombre depende de versión)
if "sparse_output" in inspect.signature(OneHotEncoder).parameters:
    ohe_kwargs["sparse_output"] = False
else:
    ohe_kwargs["sparse"] = False

cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(**ohe_kwargs))
])

num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocess = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, cat_cols),
        ("num", num_pipe, num_cols),
        ("bin", "passthrough", bin_cols),
    ],
    remainder="drop",
    n_jobs=-1
)

preprocess

0,1,2
,transformers,"[('cat', ...), ('num', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,0.01
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [15]:
# === Inspección de columnas por pipeline ===
print("Columnas categóricas (cat_pipe):", cat_cols)
print("Columnas numéricas (num_pipe):", num_cols)
print("Columnas binarias (passthrough):", bin_cols)

# También: ver cómo quedará el total de features
preprocess.fit(X_train)  # ajustamos para que genere nombres

try:
    feat_names = preprocess.get_feature_names_out()
    print(f"\nTotal de features tras preprocess: {len(feat_names)}")
    print("Ejemplo nombres:", feat_names[:15])
except Exception as e:
    print("No se pudieron obtener nombres de features:", e)


Columnas categóricas (cat_pipe): ['Product', 'Sub-product', 'Issue', 'Sub-issue', 'State', 'Company', 'Company response']
Columnas numéricas (num_pipe): ['days_to_company']
Columnas binarias (passthrough): ['Timely response?']

Total de features tras preprocess: 121
Ejemplo nombres: ['cat__Product_Bank account or service' 'cat__Product_Consumer loan'
 'cat__Product_Credit card' 'cat__Product_Credit reporting'
 'cat__Product_Debt collection' 'cat__Product_Mortgage'
 'cat__Product_Student loan' 'cat__Product_infrequent_sklearn'
 'cat__Sub-product_Checking account'
 'cat__Sub-product_Conventional adjustable mortgage (ARM)'
 'cat__Sub-product_Conventional fixed mortgage'
 'cat__Sub-product_Credit card' 'cat__Sub-product_FHA mortgage'
 'cat__Sub-product_Home equity loan or line of credit'
 'cat__Sub-product_Medical']


In [16]:
!pip install xgboost lightgbm



In [17]:
# === Leaderboard ===
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, average_precision_score
import numpy as np
import pandas as pd

# --- Métricas y CV ---
SCORING = {
    "roc_auc": "roc_auc",
    "f1": make_scorer(f1_score),
    "precision": make_scorer(precision_score),
    "recall": make_scorer(recall_score),
    "ap": make_scorer(average_precision_score, needs_threshold=True)  # PR AUC
}
cv_ = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
primary_metric = "roc_auc"

# --- Modelos base ---
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

models = {
    "LogReg": LogisticRegression(max_iter=1000, solver="lbfgs"),
    "RandomForest": RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1),
    "GBDT": GradientBoostingClassifier(random_state=42),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "KNN": KNeighborsClassifier(),
    "LinearSVC": LinearSVC(class_weight="balanced", random_state=42)
}

# --- XGBoost y LightGBM ---
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

models["XGBoost"] = XGBClassifier(
    eval_metric="logloss", random_state=42, n_jobs=-1,
    tree_method="hist"  # rápido y robusto
)
models["LightGBM"] = LGBMClassifier(random_state=42, n_jobs=-1)

# --- Pipelines usando TU 'preprocess' ya definido ---
pipelines = {name: Pipeline([("prep", preprocess), ("model", mdl)]) for name, mdl in models.items()}

# --- Cross-validate todos y crear leaderboard ---
rows, errors = [], []
for name, pipe in pipelines.items():
    try:
        cvres = cross_validate(
            pipe, X_train, y_train,
            cv=cv_, scoring=SCORING, n_jobs=-1, return_train_score=False
        )
        row = {"model": name, "fit_time": np.mean(cvres["fit_time"])}
        for k in SCORING.keys():
            row[f"{k}_mean"] = float(np.mean(cvres[f"test_{k}"]))
            row[f"{k}_std"]  = float(np.std(cvres[f"test_{k}"]))
        rows.append(row)
    except Exception as e:
        errors.append((name, repr(e)))

leaderboard = pd.DataFrame(rows).sort_values(f"{primary_metric}_mean", ascending=False).reset_index(drop=True)
print("Modelos con error:", errors if errors else "ninguno")
leaderboard


Modelos con error: ninguno


Unnamed: 0,model,fit_time,roc_auc_mean,roc_auc_std,f1_mean,f1_std,precision_mean,precision_std,recall_mean,recall_std,ap_mean,ap_std
0,XGBoost,0.280989,0.663136,0.018704,0.857687,0.005329,0.792886,0.005607,0.934048,0.00679,,
1,GBDT,0.607369,0.661625,0.02665,0.868928,0.002546,0.778151,0.003111,0.983686,0.001773,,
2,LightGBM,0.309328,0.657318,0.021374,0.860481,0.003379,0.789055,0.005328,0.946197,0.006957,,
3,RandomForest,0.346152,0.651712,0.013724,0.845287,0.002408,0.792552,0.003146,0.905585,0.006313,,
4,LogReg,0.080815,0.645281,0.026868,0.866347,0.004404,0.780623,0.001913,0.973271,0.009862,,
5,LinearSVC,0.053237,0.640993,0.022307,0.714113,0.014239,0.837146,0.014584,0.622693,0.015256,,
6,KNN,0.041996,0.591527,0.01556,0.844857,0.005968,0.78609,0.003493,0.913226,0.012979,,
7,DecisionTree,0.071782,0.579833,0.017137,0.782293,0.008571,0.807784,0.008401,0.758414,0.010737,,


In [None]:
# === TUNING + SELECCIÓN DEL MEJOR (sin LightGBM) ===
# Requisitos previos en memoria: preprocess, X_train, y_train, X_test, y_test
# (SCORING no es necesario aquí porque usamos 'roc_auc' como métrica de refit)

from pathlib import Path
from datetime import datetime
import json, joblib, numpy as np, pandas as pd

from joblib import Memory
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

# Modelos del leaderboard
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
try:
    from xgboost import XGBClassifier
except Exception:
    XGBClassifier = None  # si no está instalado, se omite

# -------------------- Configuración global --------------------
OUT = Path("artifacts"); OUT.mkdir(exist_ok=True)
CACHE_DIR = OUT / "cache"; CACHE_DIR.mkdir(exist_ok=True)
memory = Memory(location=str(CACHE_DIR), verbose=0)

RANDOM_STATE = 42
CV_FOLDS = 2              # CV estable
N_ITER = 10               # <-- "un poco más" de iteraciones que antes
cv_ = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

def save_json(obj, path):
    """JSON seguro (convierte tipos numpy/pandas)."""
    import numpy as _np, pandas as _pd
    def _cast(x):
        if isinstance(x, dict): return {str(k): _cast(v) for k, v in x.items()}
        if isinstance(x, (list, tuple, set)): return [_cast(v) for v in x]
        if isinstance(x, (_np.integer,)): return int(x)
        if isinstance(x, (_np.floating,)): return float(x)
        if isinstance(x, (_np.ndarray,)): return x.tolist()
        if isinstance(x, (_pd.Timestamp,)): return x.isoformat()
        return x
    with open(path, "w", encoding="utf-8") as f:
        json.dump(_cast(obj), f, ensure_ascii=False, indent=2)

# -------------------- Catálogo de modelos --------------------
models = {
    "LogReg":      LogisticRegression(max_iter=1000, solver="lbfgs"),
    "RandomForest":RandomForestClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_jobs=-1),
    "GBDT":        GradientBoostingClassifier(random_state=RANDOM_STATE),
    "DecisionTree":DecisionTreeClassifier(class_weight="balanced", random_state=RANDOM_STATE),
    "KNN":         KNeighborsClassifier(),
    "LinearSVC":   LinearSVC(class_weight="balanced", random_state=RANDOM_STATE),
}
if XGBClassifier is not None:
    models["XGBoost"] = XGBClassifier(
        eval_metric="logloss",
        tree_method="hist",     # rápido/robusto (CPU)
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

# -------------------- Espacios de hiperparámetros --------------------
param_spaces = {
    "LogReg": {
        "model__C":        loguniform(1e-3, 1e2),
        "model__solver":   ["lbfgs", "liblinear"],
        "model__penalty":  ["l2"],
        "model__max_iter": [1000],
    },
    "RandomForest": {
        "model__n_estimators":     randint(400, 1001),
        "model__max_depth":        randint(6, 31),
        "model__min_samples_split":randint(2, 16),
        "model__min_samples_leaf": randint(1, 9),
        "model__max_features":     ["sqrt", "log2", 0.6, 0.8],
        "model__bootstrap":        [True, False],
    },
    "GBDT": {
        "model__n_estimators":  randint(200, 1001),
        "model__learning_rate": loguniform(1e-2, 3e-1),
        "model__max_depth":     randint(2, 7),
        "model__subsample":     uniform(0.6, 0.4),
    },
    "DecisionTree": {
        "model__max_depth":        randint(3, 41),
        "model__min_samples_split": randint(2, 21),
        "model__min_samples_leaf":  randint(1, 11),
        "model__max_features":      ["sqrt", "log2", None],
    },
    "KNN": {
        "model__n_neighbors": randint(3, 51),
        "model__weights":     ["uniform", "distance"],
        "model__p":           [1, 2],  # Manhattan o Euclídea
    },
    "LinearSVC": {
        "model__C":            loguniform(1e-3, 1e2),
        "model__loss":         ["squared_hinge"],
        "model__fit_intercept":[True, False],
    },
}
if "XGBoost" in models:
    param_spaces["XGBoost"] = {
        "model__n_estimators":     randint(400, 1001),
        "model__max_depth":        randint(3, 11),
        "model__learning_rate":    loguniform(1e-2, 3e-1),
        "model__subsample":        uniform(0.6, 0.4),
        "model__colsample_bytree": uniform(0.6, 0.4),
        "model__min_child_weight": randint(1, 8),
        "model__reg_alpha":        loguniform(1e-8, 1e0),
        "model__reg_lambda":       loguniform(1e-3, 2e0),
        "model__gamma":            uniform(0.0, 2.0),
    }

# -------------------- Tuning + selección del mejor --------------------
leader_rows = []
best_name, best_model = None, None
best_cv_auc = -np.inf

for name, estimator in models.items():
    print(f"\n=== Tuning {name} ===")
    pipe = Pipeline([("prep", preprocess), ("model", estimator)], memory=memory)
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_spaces[name],
        n_iter=N_ITER,
        scoring="roc_auc",         # métrica principal para elegir el mejor
        refit=True,                # reentrena con los mejores hiperparámetros
        cv=cv_,
        n_jobs=-1,
        verbose=1,
        random_state=RANDOM_STATE,
    )
    search.fit(X_train, y_train)

    cv_auc = float(search.best_score_)
    leader_rows.append({
        "model": name,
        "cv_auc": cv_auc,
        "best_params": search.best_params_,
        "refit_time_s": float(getattr(search, "refit_time_", np.nan)),
    })

    if cv_auc > best_cv_auc:
        best_cv_auc = cv_auc
        best_name = name
        best_model = search.best_estimator_

# -------------------- Leaderboard CV + guardado --------------------
leaderboard_cv = pd.DataFrame(leader_rows).sort_values("cv_auc", ascending=False).reset_index(drop=True)
display(leaderboard_cv)

tag = best_name.lower()
joblib.dump(best_model, OUT / f"pipeline_best_{tag}.joblib")
save_json(
    {"best_name": best_name, "cv_auc": best_cv_auc, "timestamp": datetime.now().isoformat()},
    OUT / "best_overall.json"
)
print(f"\n✔ Mejor por CV: {best_name} (ROC-AUC CV={best_cv_auc:.4f}). Guardado en artifacts/pipeline_best_{tag}.joblib")


NameError: name 'pipeline' is not defined

In [None]:
# === EVALUACIÓN EN TEST DEL MEJOR + GRÁFICAS ===
import json, joblib, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay

OUT = Path("artifacts"); OUT.mkdir(exist_ok=True)

# --- cargar el mejor (si no viene en memoria de la celda anterior) ---
try:
    best_model, best_name
except NameError:
    meta = json.load(open(OUT / "best_overall.json", "r", encoding="utf-8"))
    best_name = meta["best_name"]
    best_model = joblib.load(OUT / f"pipeline_best_{best_name.lower()}.joblib")

# --- helper para obtener score continuo ---
def get_scores(est, X):
    if hasattr(est, "predict_proba"):
        return est.predict_proba(X)[:, 1]
    if hasattr(est, "decision_function"):
        return est.decision_function(X)
    # último recurso: etiqueta binaria como score (no ideal para AUC/AP)
    return est.predict(X).astype(float)

y_score = get_scores(best_model, X_test)
y_pred_05 = (y_score >= 0.5).astype(int)

# --- umbral óptimo por F1 ---
thresholds = np.linspace(0.05, 0.95, 19)
f1_vals = [metrics.f1_score(y_test, (y_score >= t).astype(int)) for t in thresholds]
t_opt = thresholds[int(np.argmax(f1_vals))]
y_pred_opt = (y_score >= t_opt).astype(int)

# --- métricas resumen (0.5 y umbral óptimo) ---
summary = {
    "model": best_name,
    "test_roc_auc": float(metrics.roc_auc_score(y_test, y_score)),
    "test_ap": float(metrics.average_precision_score(y_test, y_score)),
    "f1@0.5": float(metrics.f1_score(y_test, y_pred_05)),
    "precision@0.5": float(metrics.precision_score(y_test, y_pred_05)),
    "recall@0.5": float(metrics.recall_score(y_test, y_pred_05)),
    "threshold_opt_f1": float(t_opt),
    "f1@opt": float(metrics.f1_score(y_test, y_pred_opt)),
    "precision@opt": float(metrics.precision_score(y_test, y_pred_opt)),
    "recall@opt": float(metrics.recall_score(y_test, y_pred_opt)),
}
print("Resumen test:")
for k, v in summary.items():
    print(f"  {k}: {v:.4f}" if isinstance(v, float) else f"  {k}: {v}")

# --- ROC ---
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
auc = metrics.roc_auc_score(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
plt.plot([0, 1], [0, 1], "--")
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title(f"ROC - {best_name}")
plt.legend(); plt.tight_layout()
plt.savefig(OUT / f"roc_{best_name}.png", dpi=160)
plt.show()

# --- Precision-Recall ---
prec, rec, _ = metrics.precision_recall_curve(y_test, y_score)
ap = metrics.average_precision_score(y_test, y_score)
plt.figure()
plt.plot(rec, prec, label=f"AP={ap:.3f}")
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title(f"PR Curve - {best_name}")
plt.legend(); plt.tight_layout()
plt.savefig(OUT / f"pr_{best_name}.png", dpi=160)
plt.show()

# --- Matriz de confusión (con umbral óptimo) ---
plt.figure()
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_opt)
plt.title(f"Confusion Matrix - {best_name} @ thr={t_opt:.2f}")
plt.tight_layout()
plt.savefig(OUT / f"cm_{best_name}_thr{t_opt:.2f}.png", dpi=160)
plt.show()

print(f"\nImágenes guardadas en: {OUT.resolve()}")


In [None]:
# === REBALANCEO + TUNING DEL TOP-3 (LogReg, RandomForest, GBDT) ===
# Requisitos: preprocess, X_train, y_train, X_test, y_test

from pathlib import Path
from datetime import datetime
import json, joblib, numpy as np, pandas as pd

from joblib import Memory
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight

# Modelos candidatos
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# -------------------- Configuración --------------------
OUT = Path("artifacts"); OUT.mkdir(exist_ok=True)
CACHE_DIR = OUT / "cache"; CACHE_DIR.mkdir(exist_ok=True)
memory = Memory(location=str(CACHE_DIR), verbose=0)

RANDOM_STATE = 42
CV_FOLDS = 2
N_ITER = 10  # sube/baja para afinar más/menos
cv_ = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# -------------------- TOP-3 explícito --------------------
# (Si tienes un leaderboard previo con columna 'model', puedes leer de ahí; aquí lo fijamos)
top3_names = ["RandomForest", "LogReg", "GBDT"]

catalog = {
    "LogReg":      LogisticRegression(max_iter=1000, solver="lbfgs"),
    "RandomForest":RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    "GBDT":        GradientBoostingClassifier(random_state=RANDOM_STATE),
}
models = {name: catalog[name] for name in top3_names}

# -------------------- Espacios de hiperparámetros --------------------
param_spaces = {
    "LogReg": {
        "model__C":        loguniform(1e-3, 1e2),
        "model__solver":   ["lbfgs", "liblinear"],
        "model__penalty":  ["l2"],
        "model__max_iter": [1000],
    },
    "RandomForest": {
        "model__n_estimators":      randint(400, 1001),
        "model__max_depth":         randint(6, 31),
        "model__min_samples_split": randint(2, 16),
        "model__min_samples_leaf":  randint(1, 9),
        "model__max_features":      ["sqrt", "log2", 0.6, 0.8],
        "model__bootstrap":         [True, False],
    },
    "GBDT": {
        "model__n_estimators":  randint(200, 1001),
        "model__learning_rate": loguniform(1e-2, 3e-1),
        "model__max_depth":     randint(2, 7),
        "model__subsample":     uniform(0.6, 0.4),
    },
}

# -------------------- Pesos balanceados por muestra --------------------
sw_train = compute_sample_weight(class_weight="balanced", y=y_train)

# -------------------- Tuning con sample_weight + selección del mejor --------------------
rows = []
best_name, best_model, best_cv = None, None, -np.inf

for name, est in models.items():
    print(f"\n=== Tuning (rebalanceado) {name} ===")
    pipe = Pipeline([("prep", preprocess), ("model", est)], memory=memory)
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_spaces[name],
        n_iter=N_ITER,
        scoring="roc_auc",   # métrica principal (puedes cambiar a "average_precision")
        refit=True,
        cv=cv_,
        n_jobs=-1,
        verbose=1,
        random_state=RANDOM_STATE,
    )
    # Rebajamos el sesgo usando sample_weight en todo el CV/refit
    search.fit(X_train, y_train, **{"model__sample_weight": sw_train})

    cv_score = float(search.best_score_)
    rows.append({"model": name, "cv_auc": cv_score, "best_params": search.best_params_})

    if cv_score > best_cv:
        best_cv = cv_score
        best_name = name
        best_model = search.best_estimator_

leaderboard_cv_bal = pd.DataFrame(rows).sort_values("cv_auc", ascending=False).reset_index(drop=True)
display(leaderboard_cv_bal)

# Guardamos el mejor
tag = best_name.lower()
joblib.dump(best_model, OUT / f"pipeline_best_bal_{tag}.joblib")
with open(OUT / "best_overall_bal.json", "w", encoding="utf-8") as f:
    json.dump({"best_name": best_name, "cv_auc": best_cv, "timestamp": datetime.now().isoformat()}, f, indent=2)

print(f"\n✔ Mejor por CV (rebalanceado): {best_name} (ROC-AUC={best_cv:.4f}). Guardado en artifacts/pipeline_best_bal_{tag}.joblib")


In [None]:
# === TEST DEL MEJOR (rebalanceado) + UMBRAL ÓPTIMO + GRÁFICAS ===
import json, joblib, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay

OUT = Path("artifacts"); OUT.mkdir(exist_ok=True)

# Cargar el mejor re-balanceado si no está ya en memoria
try:
    best_model, best_name
except NameError:
    meta = json.load(open(OUT / "best_overall_bal.json", "r", encoding="utf-8"))
    best_name = meta["best_name"]
    best_model = joblib.load(OUT / f"pipeline_best_bal_{best_name.lower()}.joblib")

def scores(est, X):
    if hasattr(est, "predict_proba"):    return est.predict_proba(X)[:, 1]
    if hasattr(est, "decision_function"): return est.decision_function(X)
    return est.predict(X).astype(float)   # último recurso

# Scores y baseline @0.5
y_score = scores(best_model, X_test)
y_pred_05 = (y_score >= 0.5).astype(int)

# Umbral óptimo por F1
grid = np.linspace(0.05, 0.95, 19)
f1s = [metrics.f1_score(y_test, (y_score >= t).astype(int)) for t in grid]
t_opt = grid[int(np.argmax(f1s))]
y_pred_opt = (y_score >= t_opt).astype(int)

# Resumen de métricas
summary = {
    "model": best_name,
    "test_roc_auc": float(metrics.roc_auc_score(y_test, y_score)),
    "test_ap": float(metrics.average_precision_score(y_test, y_score)),
    "f1@0.5": float(metrics.f1_score(y_test, y_pred_05)),
    "precision@0.5": float(metrics.precision_score(y_test, y_pred_05)),
    "recall@0.5": float(metrics.recall_score(y_test, y_pred_05)),
    "threshold_opt_f1": float(t_opt),
    "f1@opt": float(metrics.f1_score(y_test, y_pred_opt)),
    "precision@opt": float(metrics.precision_score(y_test, y_pred_opt)),
    "recall@opt": float(metrics.recall_score(y_test, y_pred_opt)),
}
print("Resumen test (rebalanceado):")
for k, v in summary.items():
    print(f"  {k}: {v:.4f}" if isinstance(v, float) else f"  {k}: {v}")

# ====== Gráficas ======
# ROC
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
auc = metrics.roc_auc_score(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
plt.plot([0,1],[0,1],'--')
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title(f"ROC - {best_name} (rebalanceado)")
plt.legend(); plt.tight_layout()
plt.savefig(OUT / f"roc_bal_{best_name}.png", dpi=160)
plt.show()

# Precision-Recall
prec, rec, _ = metrics.precision_recall_curve(y_test, y_score)
ap = metrics.average_precision_score(y_test, y_score)
plt.figure()
plt.plot(rec, prec, label=f"AP={ap:.3f}")
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title(f"PR Curve - {best_name} (rebalanceado)")
plt.legend(); plt.tight_layout()
plt.savefig(OUT / f"pr_bal_{best_name}.png", dpi=160)
plt.show()

# Matriz de confusión (umbral óptimo)
plt.figure()
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_opt)
plt.title(f"Confusion Matrix - {best_name} (rebalanceado) @ thr={t_opt:.2f}")
plt.tight_layout()
plt.savefig(OUT / f"cm_bal_{best_name}_thr{t_opt:.2f}.png", dpi=160)
plt.show()

print(f"\nImágenes guardadas en: {OUT.resolve()}")


In [None]:
!pip install -U imbalanced-learn


In [None]:
# === SMOTE + RANDOM FOREST (tuning con CV, sin fuga) ===
# Requisitos en memoria: preprocess, X_train, y_train, X_test, y_test

from pathlib import Path
from datetime import datetime
import json, joblib, numpy as np, pandas as pd

from joblib import Memory
from scipy.stats import randint
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

# Pipeline con SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  # OJO: el Pipeline de imblearn
from sklearn.ensemble import RandomForestClassifier

# -------------------- Configuración --------------------
OUT = Path("artifacts"); OUT.mkdir(exist_ok=True)
CACHE_DIR = OUT / "cache"; CACHE_DIR.mkdir(exist_ok=True)
memory = Memory(location=str(CACHE_DIR), verbose=0)

RANDOM_STATE = 42
cv_ = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
N_ITER = 10  # sube/baja para afinar más/menos

# -------------------- Pipeline con SMOTE -> RF --------------------
smote = SMOTE(
    sampling_strategy="auto",   # balancea la clase minoritaria
    k_neighbors=5,              # vecinos para sintetizar
    random_state=RANDOM_STATE
)

rf = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight=None  # con SMOTE no solemos usar class_weight
)

pipe = Pipeline(steps=[
    ("prep", preprocess),  # tu ColumnTransformer (OHE + escala + passthrough)
    ("smote", smote),      # oversampling aplicado SOLO en train de cada fold
    ("model", rf)
], memory=memory)

# -------------------- Espacio de hiperparámetros --------------------
param_dist = {
    "model__n_estimators":      randint(400, 1001),
    "model__max_depth":         randint(6, 31),
    "model__min_samples_split": randint(2, 16),
    "model__min_samples_leaf":  randint(1, 9),
    "model__max_features":      ["sqrt", "log2", 0.6, 0.8],
    "model__bootstrap":         [True, False],
    # (opcional) SMOTE también se puede buscar:
    # "smote__k_neighbors":       randint(3, 11),
}

# -------------------- RandomizedSearchCV --------------------
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=N_ITER,
    scoring="roc_auc",      # métrica principal para elegir
    refit=True,
    cv=cv_,
    n_jobs=-1,
    verbose=1,
    random_state=RANDOM_STATE
)

search.fit(X_train, y_train)

best_model = search.best_estimator_
best_params = search.best_params_
cv_auc = float(search.best_score_)

# -------------------- Guardado --------------------
def _to_jsonable(o):
    import numpy as _np, pandas as _pd
    if isinstance(o, dict): return {str(k): _to_jsonable(v) for k, v in o.items()}
    if isinstance(o, (list, tuple, set)): return [_to_jsonable(x) for x in o]
    if isinstance(o, (_np.integer,)): return int(o)
    if isinstance(o, (_np.floating,)): return float(o)
    if isinstance(o, (_np.ndarray,)): return o.tolist()
    if isinstance(o, (_pd.Timestamp,)): return o.isoformat()
    return o

tag = "rf_smote"
joblib.dump(best_model, OUT / f"pipeline_best_{tag}.joblib")
with open(OUT / f"best_{tag}.json", "w", encoding="utf-8") as f:
    json.dump(_to_jsonable({
        "model": "RandomForest + SMOTE",
        "cv_auc": cv_auc,
        "best_params": best_params,
        "timestamp": datetime.now().isoformat()
    }), f, ensure_ascii=False, indent=2)

print(f"\n✔ RF+SMOTE listo. CV ROC-AUC={cv_auc:.4f}")
print("Guardado:", (OUT / f"pipeline_best_{tag}.joblib").resolve())


In [None]:
# === TEST RF+SMOTE + GRÁFICAS + IMPORTANCIAS ===
import json, joblib, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay

OUT = Path("artifacts"); OUT.mkdir(exist_ok=True)

# cargar modelo
tag = "rf_smote"
best_model = joblib.load(OUT / f"pipeline_best_{tag}.joblib")

# helper de score continuo
def scores(est, X):
    # el pipeline termina en RandomForest -> tiene predict_proba
    if hasattr(est, "predict_proba"): return est.predict_proba(X)[:, 1]
    if hasattr(est, "decision_function"): return est.decision_function(X)
    return est.predict(X).astype(float)

# ---- métricas base en test ----
y_score = scores(best_model, X_test)
y_pred_05 = (y_score >= 0.5).astype(int)

# Umbral óptimo por F1
grid = np.linspace(0.05, 0.95, 19)
f1s = [metrics.f1_score(y_test, (y_score >= t).astype(int)) for t in grid]
t_opt = grid[int(np.argmax(f1s))]
y_pred_opt = (y_score >= t_opt).astype(int)

summary = {
    "test_roc_auc": float(metrics.roc_auc_score(y_test, y_score)),
    "test_ap": float(metrics.average_precision_score(y_test, y_score)),
    "f1@0.5": float(metrics.f1_score(y_test, y_pred_05)),
    "precision@0.5": float(metrics.precision_score(y_test, y_pred_05)),
    "recall@0.5": float(metrics.recall_score(y_test, y_pred_05)),
    "thr_opt_f1": float(t_opt),
    "f1@opt": float(metrics.f1_score(y_test, y_pred_opt)),
    "precision@opt": float(metrics.precision_score(y_test, y_pred_opt)),
    "recall@opt": float(metrics.recall_score(y_test, y_pred_opt)),
}
print("Resumen test RF+SMOTE:")
for k, v in summary.items():
    print(f"  {k}: {v:.4f}" if isinstance(v, float) else f"  {k}: {v}")

# ---- ROC ----
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
auc = metrics.roc_auc_score(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
plt.plot([0,1],[0,1],'--')
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC - RF + SMOTE")
plt.legend(); plt.tight_layout()
plt.savefig(OUT / "roc_rf_smote.png", dpi=160); plt.show()

# ---- Precision-Recall ----
prec, rec, _ = metrics.precision_recall_curve(y_test, y_score)
ap = metrics.average_precision_score(y_test, y_score)
plt.figure()
plt.plot(rec, prec, label=f"AP={ap:.3f}")
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("PR Curve - RF + SMOTE")
plt.legend(); plt.tight_layout()
plt.savefig(OUT / "pr_rf_smote.png", dpi=160); plt.show()

# ---- Matriz de confusión (umbral óptimo F1) ----
plt.figure()
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_opt)
plt.title(f"Confusion Matrix - RF+SMOTE @ thr={t_opt:.2f}")
plt.tight_layout()
plt.savefig(OUT / f"cm_rf_smote_thr{t_opt:.2f}.png", dpi=160); plt.show()

# ---- Importancias de características (Gini) ----
# Necesitamos obtener las importancias del paso "model" y los nombres del "prep"
rf = best_model.named_steps["model"]
try:
    feat_names = best_model.named_steps["prep"].get_feature_names_out()
except Exception:
    # fallback si tu preprocess no soporta nombres
    feat_names = [f"f_{i}" for i in range(rf.n_features_in_)]

importances = rf.feature_importances_
imp_df = (pd.DataFrame({"feature": feat_names, "importance": importances})
            .sort_values("importance", ascending=False)
            .reset_index(drop=True))

imp_top = imp_df.head(20)
print("\nTop 20 importancias (Gini):")
print(imp_top)

# Barplot simple de importancias top-20
plt.figure(figsize=(8, 6))
plt.barh(imp_top["feature"][::-1], imp_top["importance"][::-1])
plt.xlabel("Importance"); plt.title("RandomForest Feature Importances (Top 20)")
plt.tight_layout()
plt.savefig(OUT / "rf_smote_importances_top20.png", dpi=160); plt.show()

print(f"\nArtefactos guardados en: {OUT.resolve()}")


In [None]:
# === NN: PREPARACIÓN (Train/Val + Preprocess + SMOTE + CUDA) ===
# Requisitos previos: df ya dividido en X_train, y_train, X_test, y_test y definido 'preprocess'

import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from pathlib import Path

OUT = Path("artifacts"); OUT.mkdir(exist_ok=True)

# 1) Split de validación a partir del train existente (80/20)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# 2) Ajusta preprocess SOLO con train y transforma a arrays densos float32
preprocess.fit(X_tr)
X_tr_tx  = preprocess.transform(X_tr)
X_val_tx = preprocess.transform(X_val)
X_te_tx  = preprocess.transform(X_test)

# Aseguramos float32 para PyTorch
X_tr_tx  = np.asarray(X_tr_tx, dtype=np.float32)
X_val_tx = np.asarray(X_val_tx, dtype=np.float32)
X_te_tx  = np.asarray(X_te_tx, dtype=np.float32)
y_tr     = np.asarray(y_tr, dtype=np.int64)
y_val    = np.asarray(y_val, dtype=np.int64)
y_test_np= np.asarray(y_test, dtype=np.int64)

# 3) SMOTE SOLO sobre el set de entrenamiento (sin tocar val/test)
smote = SMOTE(sampling_strategy="auto", k_neighbors=5, random_state=42)
X_tr_bal, y_tr_bal = smote.fit_resample(X_tr_tx, y_tr)

print("Shapes tras preprocess + SMOTE:")
print("  Train (bal):", X_tr_bal.shape, y_tr_bal.shape)
print("  Val:", X_val_tx.shape, y_val.shape)
print("  Test:", X_te_tx.shape, y_test_np.shape)

# 4) CUDA / GPU
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Guardamos a disco por si quieres reusar en otra sesión
np.save(OUT / "X_tr_bal.npy", X_tr_bal)
np.save(OUT / "y_tr_bal.npy", y_tr_bal)
np.save(OUT / "X_val_tx.npy", X_val_tx)
np.save(OUT / "y_val.npy", y_val)
np.save(OUT / "X_te_tx.npy", X_te_tx)
np.save(OUT / "y_test.npy", y_test_np)
with open(OUT / "nn_device.txt", "w") as f:
    f.write(str(device))


In [None]:
# Guardar modelo entrenado para evaluación posterior
import os, joblib
os.makedirs("../models", exist_ok=True)
joblib.dump(search, "../models/final_model.pkl")
print("Modelo guardado en ../models/final_model.pkl")
