# MODELOS PREDICTIVOS CON MACHINE/DEEP LEARNING

In [8]:
# Versión robusta para evitar TerminatedWorkerError / OOM en GridSearch
# Pega en tu notebook (ajusta FILE_PATH si quieres)
# Requisitos: pip install xgboost scikit-learn pandas joblib

import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import joblib
from scipy import stats

RANDOM_STATE = 42

# ---------- CONFIG ----------
FILE_PATH = "/home/jovyan/work/data/curated/"
TARGET_COL = None   # si conoces el nombre, ponlo aquí
APPROVAL_THRESHOLD = 0.8   # por defecto 80% (si quieres 0.8% usar 0.008)
MODEL_OUTPUT = "xgb_loan_model_robust.joblib"
# ----------------------------

# 1) Cargar datos
if not os.path.exists(FILE_PATH):
    raise FileNotFoundError(f"Archivo no encontrado en FILE_PATH: {FILE_PATH}")

df = pd.read_parquet(FILE_PATH)
print("Datos cargados. Shape:", df.shape)

# 2) Detectar target automaticamente (si TARGET_COL es None)
if TARGET_COL is None:
    candidates = ["target", "y", "paid", "repaid", "repaid_flag", "loan_status", "default", "is_default",
                  "paid_loan", "status", "GOOD_BAD", "good_bad"]
    lower_cols = {c.lower(): c for c in df.columns}
    found = None
    for c in candidates:
        if c.lower() in lower_cols:
            found = lower_cols[c.lower()]
            break
    if found is None:
        # intenta detectar columnas binarias 0/1
        for col in df.columns:
            vals = pd.Series(df[col].dropna().unique())
            if set(vals.unique()).issubset({0,1}) and len(vals.unique())<=2:
                found = col
                break
    if found is None:
        raise ValueError(
            "No se ha detectado la columna objetivo. Asigna TARGET_COL manualmente al nombre de la columna 0/1."
        )
    TARGET_COL = found
    print(f"TARGET_COL detectado automáticamente: '{TARGET_COL}'")

# 3) Preparar X e y
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# 4) Train/test split (estratificado si posible)
stratify = y if len(np.unique(y))>1 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=stratify
)
print("Train/Test shapes:", X_train.shape, X_test.shape)

# 5) Detectar num y categ
num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
print(f"Numéricas: {len(num_cols)} / Categóricas: {len(cat_cols)}")

# 6) Preprocesamiento (ordinal para categóricas -> reduce memoria)
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# OrdinalEncoder para mantener columnas compactas (XGBoost maneja bien encoding ordinal)
# usamos encoded_missing_value=-1 para desconocidos
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="__missing__")),
    ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ],
    remainder="drop"
)

# 7) Configurar XGBoost (n_jobs=1 para evitar nested parallelism)
# Calculamos scale_pos_weight para clase positiva si hay desbalance:
pos = np.sum(y_train == 1)
neg = np.sum(y_train == 0)
scale_pos_weight = 1.0
if pos > 0 and neg > 0:
    scale_pos_weight = neg / pos
    print(f"scale_pos_weight calculado: {scale_pos_weight:.3f}")

xgb = XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=1,   # important: avoid parallel workers inside each joblib worker
    tree_method="hist",
    scale_pos_weight=scale_pos_weight
)

pipe = Pipeline(steps=[("preproc", preprocessor), ("model", xgb)])

# 8) RandomizedSearchCV ligero (n_jobs=1 para evitar workers extra)
param_dist = {
    "model__n_estimators": [100, 200, 400],
    "model__max_depth": stats.randint(3, 8),
    "model__learning_rate": [0.01, 0.03, 0.05, 0.1],
    "model__subsample": [0.6, 0.8, 1.0],
    "model__colsample_bytree": [0.5, 0.7, 1.0],
    "model__gamma": [0, 0.1, 0.3]
}

rs = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=12,           # rápido: prueba 12 combinaciones
    cv=3,
    scoring="roc_auc",
    random_state=RANDOM_STATE,
    verbose=2,
    n_jobs=1,            # <-- importante: evita workers múltiples
    refit=True
)

# 9) Fit con early stopping: pasamos fit_params prefijados con el nombre del paso ('model__')
# Usamos eval_set con una porción del train para early stopping (aquí usamos X_test,y_test)
fit_params = {
    "model__early_stopping_rounds": 30,
    "model__eval_set": [(preprocessor.fit_transform(X_test), y_test)]  # pre-transform test once for early stopping
}
# Nota: preprocessor.fit_transform(X_test) aplica transform con la preproc en su estado actual:
# para evitar discrepancia entre folds, podríamos pasar eval_set con datos no transformados y dejar que pipeline lo haga.
# Sin embargo RandomizedSearchCV pasará esos fit_params al pipeline y el prefijo 'model__eval_set' espera matrices
# en la forma que XGBoost acepta (numpy/scipy). La transformación del eval_set la hacemos manualmente arriba para seguridad.

# IMPORTANTE: si la línea anterior falla por transform en preproc no ajustado, hagamos fit parcial directo:
# Para simplificar y robustez, en lugar de pasar eval_set pre-transformado, pasamos eval_set vacío y usamos early stopping en el modelo final.
# Para entornos con problemas, puedes comentar fit_params y ejecutar rs.fit(X_train, y_train) sin early stopping en CV.

# Vamos a intentar primero sin pasar eval_set a cada fit (más robusto en CV). En su lugar usaremos early stopping al final sobre validación.
rs_fit_params = {}  # dejamos vacío para evitar errores durante CV

try:
    rs.fit(X_train, y_train, **rs_fit_params)
except Exception as e:
    print("Advertencia: RandomizedSearchCV con fit_params produjo un error. Reintentando sin fit_params.")
    rs = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_iter=8,
        cv=3,
        scoring="roc_auc",
        random_state=RANDOM_STATE,
        verbose=2,
        n_jobs=1,
        refit=True
    )
    rs.fit(X_train, y_train)

print("Mejor params (RandomizedSearchCV):", rs.best_params_)

best_model = rs.best_estimator_

# 10) Como good practice: reentrenamos el mejor XGB con early stopping sobre un validation set
# Separamos del train un pequeño validation
X_tr_sub, X_val, y_tr_sub, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=RANDOM_STATE, stratify=y_train if len(np.unique(y_train))>1 else None)

# Transformamos mediante preprocessor del pipeline (ya ajustado en rs.best_estimator_.named_steps['preproc'])
preproc = best_model.named_steps["preproc"]
X_tr_sub_t = preproc.transform(X_tr_sub)
X_val_t = preproc.transform(X_val)
X_test_t = preproc.transform(X_test)

# Extraemos parámetros encontrados
best_params_for_xgb = {k.replace("model__",""): v for k,v in rs.best_params_.items() if k.startswith("model__")}
# Creamos nuevo XGB con esos params y early stopping
xgb_final = XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=1,
    tree_method="hist",
    scale_pos_weight=scale_pos_weight,
    **best_params_for_xgb
)

xgb_final.fit(
    X_tr_sub_t,
    y_tr_sub,
    early_stopping_rounds=30,
    eval_set=[(X_val_t, y_val)],
    verbose=False
)

# 11) Predicción sobre test y métricas
y_prob_test = xgb_final.predict_proba(X_test_t)[:, 1]
y_pred_threshold = (y_prob_test >= APPROVAL_THRESHOLD).astype(int)

auc = roc_auc_score(y_test, y_prob_test)
print(f"\nROC AUC en test: {auc:.4f}")
print("\nClasificación con umbral (threshold = {:.4f}):".format(APPROVAL_THRESHOLD))
print(classification_report(y_test, y_pred_threshold))
cm = confusion_matrix(y_test, y_pred_threshold)
print("Matriz de confusión (verdadero x predicho):\n", cm)

# 12) Añadir probabilidades y decisión al dataset de test y guardar
X_test_out = X_test.reset_index(drop=True).copy()
results = pd.DataFrame({
    "prob_pay": y_prob_test,
    "approve": y_pred_threshold,
    "actual": y_test.reset_index(drop=True)
})
output = pd.concat([X_test_out, results], axis=1)
output_file = "test_with_probs_robust.csv"
output.to_csv(output_file, index=False)
print(f"\nResultados de test guardados en {output_file}")

# 13) Guardar pipeline (preproc + modelo final integrado)
# Construimos pipeline final: preproc + xgb_final (necesitamos un pipeline nuevo)
final_pipeline = Pipeline(steps=[("preproc", preproc), ("model", xgb_final)])
joblib.dump(final_pipeline, MODEL_OUTPUT)
print(f"Pipeline final guardado en: {MODEL_OUTPUT}")

# 14) Mostrar top 10 por probabilidad
print("\nTop 10 clientes por probabilidad de pago (test set):")
display(output.sort_values("prob_pay", ascending=False).head(10))

# Ejemplo de uso posterior:
print(f"""
USO posterior:
from joblib import load
m = load('{MODEL_OUTPUT}')
probs = m.predict_proba(nuevos_datos)[:,1]   # nuevos_datos debe tener las mismas columnas que X (sin la target)
decisiones = (probs >= {APPROVAL_THRESHOLD}).astype(int)
""")


Datos cargados. Shape: (322156, 49)
TARGET_COL detectado automáticamente: 'NON_COMPLIANT_CONTRACT'
Train/Test shapes: (257724, 48) (64432, 48)
Numéricas: 36 / Categóricas: 12
scale_pos_weight calculado: 11.280
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END model__colsample_bytree=1.0, model__gamma=0, model__learning_rate=0.05, model__max_depth=5, model__n_estimators=100, model__subsample=0.6; total time=  11.6s
[CV] END model__colsample_bytree=1.0, model__gamma=0, model__learning_rate=0.05, model__max_depth=5, model__n_estimators=100, model__subsample=0.6; total time=   7.3s
[CV] END model__colsample_bytree=1.0, model__gamma=0, model__learning_rate=0.05, model__max_depth=5, model__n_estimators=100, model__subsample=0.6; total time=   7.4s
[CV] END model__colsample_bytree=1.0, model__gamma=0.1, model__learning_rate=0.05, model__max_depth=5, model__n_estimators=400, model__subsample=0.6; total time=  12.7s
[CV] END model__colsample_bytree=1.0, model__gamma=0.1, mod

TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
# ============================================================
# 1. CARGA DE DATOS
# ============================================================

import pandas as pd
import numpy as np

# Ruta al dataset curado
df = pd.read_parquet("data/curated/Master_FinPlus_Final.parquet")

print("Dimensiones:", df.shape)
df.head()


# ============================================================
# 2. PREPARACIÓN DE VARIABLES
# ============================================================

# Variable objetivo:
# NON_COMPLIANT_CONTRACT = 1 → No paga
# Queremos probabilidad de PAGO → target = 1 si paga
df["TARGET"] = (df["NON_COMPLIANT_CONTRACT"] == 0).astype(int)

# Eliminamos columnas no útiles / fugas de información
cols_drop = [
    "NON_COMPLIANT_CONTRACT",
    "CLIENT_ID",
    "CONTRACT_ID",
    "DATE"
]

df = df.drop(columns=[c for c in cols_drop if c in df.columns])


# =======================
# Detectar numéricas y categóricas
# =======================
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

print("Numéricas:", len(numeric_cols))
print("Categóricas:", len(categorical_cols))


# ============================================================
# 3. PREPROCESAMIENTO
# ============================================================

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

X = df.drop("TARGET", axis=1)
y = df["TARGET"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# One-hot-encoding para variables categóricas
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)


# ============================================================
# 4. MODELO XGBOOST
# ============================================================

model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", model)
])

pipeline.fit(X_train, y_train)

print("Modelo entrenado correctamente.")


# ============================================================
# 5. PROBABILIDAD DE PAGO & DECISIÓN DE CRÉDITO
# ============================================================

# Probabilidad de pagar (TARGET = 1)
df["PROB_PAGO"] = pipeline.predict_proba(X)[:, 1]

# Decisión: se aprueba si PROB_PAGO ≥ 0.8
df["APROBADO"] = (df["PROB_PAGO"] >= 0.8).astype(int)

df[["PROB_PAGO", "APROBADO"]].head()


# ============================================================
# 6. EXPORTAR RESULTADOS
# ============================================================

df.to_csv("output_probabilidad_credito.csv", index=False)
print("Archivo exportado: output_probabilidad_credito.csv")


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve, accuracy_score
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier
import optuna
import joblib


FILE_PATH = "/home/jovyan/work/data/curated/"

# Aquí cargas tu dataset final (ajusta el nombre si es distinto)
df = pd.read_csv(FILE_PATH + "final_df.csv")  


TARGET = "NON_COMPLIANT_CONTRACT"

y = df[TARGET]
X = df.drop(columns=[TARGET])

cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, stratify=y_train, random_state=42
)


preprocess = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols)
    ]
)


def objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 600),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    model = XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        early_stopping_rounds=30,
        tree_method="hist",
        **params
    )

    pipe = Pipeline([
        ("prep", preprocess),
        ("model", model)
    ])

    pipe.fit(
        X_train, y_train,
        model__eval_set=[(X_val, y_val)],
        model__verbose=False
    )

    y_pred_prob = pipe.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, y_pred_prob)


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25)


best_params = study.best_params

xgb_final = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    early_stopping_rounds=30,
    tree_method="hist",
    **best_params
)

pipe_final = Pipeline([
    ("prep", preprocess),
    ("model", xgb_final)
])

pipe_final.fit(
    X_train, y_train,
    model__eval_set=[(X_val, y_val)],
    model__verbose=False
)


y_prob_test = pipe_final.predict_proba(X_test)[:, 1]
y_pred_test = (y_prob_test > 0.5).astype(int)

print("ROC AUC:", roc_auc_score(y_test, y_prob_test))
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))


fpr, tpr, thr = roc_curve(y_test, y_prob_test)
ks = np.max(tpr - fpr)
print("KS:", ks)


model_inner = pipe_final.named_steps["model"]
pre = pipe_final.named_steps["prep"]

oh = pre.named_transformers_["cat"].named_steps["oh"]
cat_names = oh.get_feature_names_out(cat_cols)
final_features = np.concatenate([num_cols, cat_names])

fimp = pd.DataFrame({
    "feature": final_features,
    "importance": model_inner.feature_importances_
}).sort_values("importance", ascending=False)

fimp.head(20)


joblib.dump(pipe_final, FILE_PATH + "xgb_model_final.pkl")

