# MODELOS PREDICTIVOS CON MACHINE/DEEP LEARNING

In [2]:
pip install xgboost scikit-learn pandas joblib scipy

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m998.5 kB/s[0m eta [36m0:00:00[0m0:01[0m00:02[0mm
[?25hDownloading nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.8/296.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0mm
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.28.9 xgboost-3.1.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Simplified script, ready to copy/paste
# Requirements: pip install xgboost scikit-learn pandas joblib scipy
import os
import glob
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import joblib
from scipy import stats

# Seed shared by the train/test split, the XGBoost model and the random search.
RANDOM_STATE = 42

# ---------- CONFIG (tune as needed) ----------
# Location handed to pandas.read_parquet.
FILE_PATH = "/home/jovyan/work/data/curated/"
# Name of the label column.
TARGET_COL = 'NON_COMPLIANT_CONTRACT'
# Cut-off applied to the predicted class-1 probability.
APPROVAL_THRESHOLD = 0.8
# File the fitted pipeline is dumped to with joblib.
MODEL_OUTPUT = "xgb_loan_model_simplified.joblib"
# ---------------------------------------------


def load_table(path):
    """Read the parquet data located at ``path`` and return it as a DataFrame."""
    table = pd.read_parquet(path)
    return table


# 1) Load the data
df = load_table(FILE_PATH)
print("Datos cargados. Shape:", df.shape)

# 2) Build X and y (target coerced to 0/1)
y_raw = df[TARGET_COL]

# Rows without a label cannot be used for supervised training, so they are
# dropped up front. Before this guard a missing label was turned into the
# string "nan" and silently mapped to class 0 in the true/false branch, or it
# crashed the integer cast in the other branches.
has_label = y_raw.notna()
n_unlabelled = int((~has_label).sum())
if n_unlabelled:
    print(f"Se descartan {n_unlabelled} filas sin valor en {TARGET_COL}.")
df_labelled = df.loc[has_label] if n_unlabelled else df
y = df_labelled[TARGET_COL].copy()

if y.empty:
    raise ValueError(f"La columna {TARGET_COL} no tiene valores informados.")

# Simple conversion to 0/1
if y.dtype == bool:
    y = y.astype(int)
elif pd.api.types.is_numeric_dtype(y):
    uniq = np.unique(y)
    if set(uniq).issubset({0, 1}):
        y = y.astype(int)
    elif len(uniq) == 2:
        # Two arbitrary numeric codes: the smaller becomes 0, the larger 1
        # (np.unique returns sorted values).
        mapping = {uniq[0]: 0, uniq[1]: 1}
        y = y.map(mapping).astype(int)
    else:
        raise ValueError(f"Target numérico no binario: {uniq}. Asigna TARGET_COL correcto.")
else:
    # Normalise once and reuse the SAME lowercased labels for both the class
    # detection and the mapping. Mapping the raw (non-lowercased) values with
    # lowercased keys produced NaN for mixed-case labels.
    labels = y.astype(str).str.lower()
    uniq = pd.Series(labels.unique())
    # clear-cut true/false/yes/no spellings
    tf_set = {"true", "false", "t", "f", "yes", "no", "y", "n"}
    if set(uniq).issubset(tf_set):
        map_true = {"true", "t", "yes", "y"}
        y = labels.isin(map_true).astype(int)
    elif len(uniq) == 2:
        # Classes are numbered by order of first appearance in the data.
        mapping = {uniq.iloc[0]: 0, uniq.iloc[1]: 1}
        print("Mapeo automático del target:", mapping)
        y = labels.map(mapping).astype(int)
    else:
        raise ValueError(f"Target categórico no binario: {uniq.tolist()}. Asigna TARGET_COL correcto.")

# final sanity check
if not set(pd.Series(y.dropna()).unique()).issubset({0, 1}):
    raise ValueError("Después de la conversión, la columna target no contiene sólo 0/1. Revisa TARGET_COL.")

# Features are taken from the same (labelled) rows as y so both stay aligned.
X = df_labelled.drop(columns=[TARGET_COL]).copy()

# 3) Train/test split (stratified only when the target has more than one class)
n_classes = len(np.unique(y))
stratify = None if n_classes <= 1 else y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=stratify,
)
print("Train/Test shapes:", X_train.shape, X_test.shape)

# 4) Detect numeric and categorical feature columns by dtype
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)
categorical_like = X_train.select_dtypes(include=["object", "category", "bool"]).columns
# keep a column in at most one of the two groups
cat_cols = [col for col in categorical_like if col not in num_cols]

print(f"Numéricas: {len(num_cols)} / Categóricas: {len(cat_cols)}")

if not (num_cols or cat_cols):
    raise ValueError("No se detectaron columnas numéricas ni categóricas. Revisa tus features.")

# 5) Simple preprocessing: imputer + scaler for numeric columns,
#    imputer + one-hot encoding for categorical columns
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)


def _build_dense_one_hot():
    """Return a dense-output OneHotEncoder, whichever keyword this sklearn accepts."""
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # this sklearn does not accept `sparse_output`; fall back to `sparse`
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


ohe = _build_dense_one_hot()

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="__missing__")),
        ("onehot", ohe),
    ]
)

# Only register a transformer when its column group is non-empty.
transformers = [
    (label, transformer, columns)
    for label, transformer, columns in (
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    )
    if columns
]

# Columns that belong to neither group are dropped.
preprocessor = ColumnTransformer(transformers=transformers, remainder="drop")

# 6) Base XGBoost classifier
# scale_pos_weight = negatives / positives of the training labels; it falls
# back to 1.0 when either class is absent so the division is always defined.
pos = np.sum(y_train == 1)
neg = np.sum(y_train == 0)
scale_pos_weight = float(neg) / float(pos) if (pos > 0 and neg > 0) else 1.0
print(f"scale_pos_weight: {scale_pos_weight:.3f}")

# `use_label_encoder` is intentionally not passed: the option was deprecated
# and later removed from XGBoost, so recent releases (3.1.2 is what the install
# cell pulled in) no longer use it.
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=1,
    tree_method="hist",
    scale_pos_weight=scale_pos_weight
)

# Full pipeline: preprocessing followed by the classifier.
pipe = Pipeline(steps=[("preproc", preprocessor), ("model", xgb)])

# 7) Hyper-parameter search (RandomizedSearchCV)
# Keys use the "model__" prefix so they reach the XGBoost step of the pipeline.
param_dist = dict(
    model__n_estimators=[100, 200],
    model__max_depth=stats.randint(3, 8),
    model__learning_rate=[0.01, 0.03],
    model__subsample=[0.6, 0.8],
    model__colsample_bytree=[0.5, 0.7],
    model__gamma=[0, 0.1],
)

rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=6,
    cv=3,
    scoring="roc_auc",
    random_state=RANDOM_STATE,
    verbose=2,
    n_jobs=1,
    refit=True,
)

print("Iniciando RandomizedSearchCV (esto puede tardar según tamaño de datos y n_iter)...")
rs.fit(X_train, y_train)

print("Mejores parámetros (RandomizedSearchCV):", rs.best_params_)
# With refit=True the best candidate has already been refitted on all of X_train.
best_pipeline = rs.best_estimator_

# 8) Predictions and metrics on the test split (full pipeline: preprocessing + model)
test_probabilities = best_pipeline.predict_proba(X_test)
y_prob_test = test_probabilities[:, 1]  # probability of class 1
above_cutoff = y_prob_test >= APPROVAL_THRESHOLD
y_pred_threshold = above_cutoff.astype(int)

auc = roc_auc_score(y_test, y_prob_test)
print(f"\nROC AUC en test: {auc:.4f}")

print(f"\nReporte de clasificación con umbral (threshold = {APPROVAL_THRESHOLD:.4f}):")
print(classification_report(y_test, y_pred_threshold))

cm = confusion_matrix(y_test, y_pred_threshold)
print("Matriz de confusión (verdadero x predicho):\n", cm)

# 9) Persist the test-set predictions next to the original features
# NOTE(review): "prob_pay"/"approve" hold the class-1 probability and its
# thresholded flag, while the target column is NON_COMPLIANT_CONTRACT —
# presumably class 1 means non-compliance, so these names may be inverted.
# TODO confirm before consuming the CSV.
X_test_out = X_test.reset_index(drop=True).copy()
results = pd.DataFrame(
    {
        "prob_pay": y_prob_test,
        "approve": y_pred_threshold,
        "actual": y_test.reset_index(drop=True),
    }
)
output_file = "test_with_probs_simplified.csv"
output = pd.concat([X_test_out, results], axis=1)
output.to_csv(output_file, index=False)
print(f"\nResultados de test guardados en: {output_file}")

# 10) Persist the fitted pipeline (preprocessing + model)
joblib.dump(best_pipeline, MODEL_OUTPUT)
print(f"Pipeline final guardado en: {MODEL_OUTPUT}")

# 11) Printable top 10, ranked by descending predicted probability
print("\nTop 10 clientes por probabilidad (test set):")
ranked = output.sort_values("prob_pay", ascending=False)
top10 = ranked.head(10)
print(top10.reset_index(drop=True).to_string(index=False))


Datos cargados. Shape: (322156, 49)
Train/Test shapes: (257724, 48) (64432, 48)
Numéricas: 36 / Categóricas: 12
scale_pos_weight: 11.280
Iniciando RandomizedSearchCV (esto puede tardar según tamaño de datos y n_iter)...
Fitting 3 folds for each of 12 candidates, totalling 36 fits


: 

: 

: 

In [8]:
# script ligero y estable para dataset grande
import os
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import joblib
from scipy import stats

# Seed shared by the split, the model and the random search.
RANDOM_STATE = 42
# Location handed to pandas.read_parquet.
FILE_PATH = "/home/jovyan/work/data/curated/"
# Name of the label column.
TARGET_COL = 'NON_COMPLIANT_CONTRACT'
# Cut-off applied to the predicted class-1 probability.
APPROVAL_THRESHOLD = 0.8
# File the fitted pipeline is dumped to with joblib.
MODEL_OUTPUT = "xgb_model_stable.joblib"

# ---- load ----
df = pd.read_parquet(FILE_PATH)
print("Datos cargados. Shape:", df.shape)

# ---- target ----
y_raw = df[TARGET_COL]

# Drop rows without a label: a missing value used to become the string "nan"
# and be silently labelled as class 0 (or leave NaN labels for the split).
has_label = y_raw.notna()
n_unlabelled = int((~has_label).sum())
if n_unlabelled:
    print(f"Se descartan {n_unlabelled} filas sin valor en {TARGET_COL}.")
df_labelled = df.loc[has_label] if n_unlabelled else df
y = df_labelled[TARGET_COL].copy()

if y.dtype == bool:
    y = y.astype(int)
elif y.dtype == object:
    # Lowercase once and reuse the same values for detection AND mapping;
    # mapping the raw values with lowercased keys yielded NaN labels.
    labels = y.astype(str).str.lower()
    uniq = labels.unique()
    truthy = {"true", "t", "yes", "y"}
    if set(uniq).issubset(truthy | {"false", "f", "no", "n"}):
        y = labels.isin(truthy).astype(int)
    elif len(uniq) == 2:
        # automatic mapping for two classes, numbered by order of first appearance
        mapping = {uniq[0]: 0, uniq[1]: 1}
        y = labels.map(mapping).astype(int)
    else:
        raise ValueError("TARGET no binario")
else:
    # Numeric target: it must already be coded as 0/1. Casting blindly would
    # accept non-binary codes and truncate fractional values.
    if pd.api.types.is_numeric_dtype(y) and not set(np.unique(y)).issubset({0, 1}):
        raise ValueError("TARGET no binario")
    y = y.astype(int)

# Features come from the same (labelled) rows as y so both stay aligned.
X = df_labelled.drop(columns=[TARGET_COL])

# ---- split (80/20, stratified on the target) ----
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts
print("Train/Test shapes:", X_train.shape, X_test.shape)

# ---- column groups, selected by dtype ----
numeric_frame = X_train.select_dtypes(include=[np.number])
categorical_frame = X_train.select_dtypes(include=["object", "bool", "category"])
num_cols = numeric_frame.columns.tolist()
cat_cols = categorical_frame.columns.tolist()

print("Num:", len(num_cols), "Cat:", len(cat_cols))

# ---- lightweight preprocessing ----
# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: ordinal codes (one output column per input column,
# much lighter than one-hot); categories unseen at fit time are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="__missing__")),
        ("encoder", ordinal_encoder),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

# ---- model ----
# scale_pos_weight = negatives / positives of the training labels. The ratio
# is only computed when both classes are present; otherwise it falls back to
# 1.0 instead of dividing by zero (same guard as the simplified script above).
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
scale_pos_weight = neg / pos if (pos > 0 and neg > 0) else 1.0
print("scale_pos_weight:", scale_pos_weight)

xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=1,              # VERY IMPORTANT so the run does not hang
    tree_method="hist",
    scale_pos_weight=scale_pos_weight
)

# Full pipeline: preprocessing followed by the classifier.
pipe = Pipeline([("preproc", preprocessor), ("model", xgb)])

# ---- lightweight search ----
# Keys use the "model__" prefix so they reach the XGBoost step of the pipeline.
param_dist = dict(
    model__n_estimators=[200, 400],
    model__max_depth=[4, 6],
    model__learning_rate=[0.03, 0.1],
)

# 6 sampled candidates x 3 folds, run sequentially (n_jobs=1).
search_settings = dict(
    n_iter=6,
    cv=3,
    scoring="roc_auc",
    random_state=RANDOM_STATE,
    n_jobs=1,
    verbose=1,
    refit=True,
)
rs = RandomizedSearchCV(estimator=pipe, param_distributions=param_dist, **search_settings)

print("Entrenando modelos (ligero)...")
rs.fit(X_train, y_train)

print("Mejores parámetros:", rs.best_params_)
# With refit=True the best candidate has already been refitted on all of X_train.
best_pipeline = rs.best_estimator_

# ---- evaluation on the test split ----
class_probabilities = best_pipeline.predict_proba(X_test)
y_prob = class_probabilities[:, 1]  # probability of class 1
meets_cutoff = y_prob >= APPROVAL_THRESHOLD
y_pred = meets_cutoff.astype(int)

test_auc = roc_auc_score(y_test, y_prob)
print("AUC:", test_auc)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# ---- persist the fitted pipeline ----
joblib.dump(best_pipeline, MODEL_OUTPUT)
print("Modelo guardado en:", MODEL_OUTPUT)

# ------------------ SECTION: final dataset with the new columns ------------------
# Builds one DataFrame containing:
# - the original X_test columns
# - the preprocessed columns (prefix "proc_")
# - the result columns: prob_pay, approve, actual
# NOTE(review): "prob_pay"/"approve" hold the class-1 probability and its
# thresholded flag, while the target column is NON_COMPLIANT_CONTRACT —
# presumably class 1 means non-compliance, so these names may be inverted.
# TODO confirm before consuming the CSV.

# reset the index so every piece aligns positionally
X_test_out = X_test.reset_index(drop=True).copy()
y_test_out = y_test.reset_index(drop=True).copy()

# output of the fitted preprocessing step
preproc = best_pipeline.named_steps["preproc"]
X_test_proc = preproc.transform(X_test_out)

# Densify when the transformer returned something exposing `toarray` (e.g. a
# sparse matrix). Failures now propagate instead of being swallowed by a
# blanket `except Exception: pass`.
if hasattr(X_test_proc, "toarray"):
    X_test_proc = X_test_proc.toarray()

# One processed name per column handed to the preprocessor (numeric first,
# then categorical, matching the transformer order).
proc_cols = [f"proc_{c}" for c in list(num_cols) + list(cat_cols)]

# If the transformed width differs from the number of input columns, the names
# above cannot be trusted; fall back to positional names.
if X_test_proc.shape[1] != len(proc_cols):
    proc_cols = [f"proc_{i}" for i in range(X_test_proc.shape[1])]

processed_df = pd.DataFrame(X_test_proc, columns=proc_cols)

# Results (probabilities and thresholded predictions)
results_df = pd.DataFrame({
    "prob_pay": y_prob,
    "approve": y_pred,
    "actual": y_test_out
})

# All three frames already carry a fresh 0..n-1 index, so they can be
# concatenated column-wise without resetting it again.
final_df = pd.concat([X_test_out, processed_df, results_df], axis=1)

# Summary and preview
print("\nDataset final (original features + preprocessed features + preds). Shape:", final_df.shape)
print("\nPrimeras 10 filas del dataset final:")
print(final_df.head(10).to_string(index=False))

# Save a CSV for inspection
final_csv = "final_dataset_with_features_and_preds.csv"
final_df.to_csv(final_csv, index=False)
print(f"\nDataset final guardado en: {final_csv}")
# ------------------------------------------------------------------------------

print(MODEL_OUTPUT)


Datos cargados. Shape: (483234, 49)
Train/Test shapes: (386587, 48) (96647, 48)
Num: 36 Cat: 12
scale_pos_weight: 11.280010164861345
Entrenando modelos (ligero)...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Mejores parámetros: {'model__n_estimators': 400, 'model__max_depth': 6, 'model__learning_rate': 0.1}
AUC: 0.8854641691124511
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     88777
           1       0.59      0.25      0.36      7870

    accuracy                           0.92     96647
   macro avg       0.76      0.62      0.66     96647
weighted avg       0.91      0.92      0.91     96647

[[87390  1387]
 [ 5872  1998]]
Modelo guardado en: xgb_model_stable.joblib

Dataset final (original features + preprocessed features + preds). Shape: (96647, 99)

Primeras 10 filas del dataset final:
   CLIENT_ID NAME_PRODUCT_TYPE GENDER  TOTAL_INCOME  AMOUNT_PRODUCT  INSTALLMENT EDUCATION MARITAL_STATUS       HOME_SITUATI

In [7]:
# Persist the test-set predictions next to the original features.
# Relies on X_test, y_test, y_prob, y_pred, best_pipeline and MODEL_OUTPUT
# already being defined by the training cell.
X_test_out = X_test.reset_index(drop=True).copy()
results = pd.DataFrame(
    {
        "prob_pay": y_prob,
        "approve": y_pred,
        "actual": y_test.reset_index(drop=True),
    }
)
output_file = "test_with_probs_simplified.csv"
output = pd.concat([X_test_out, results], axis=1)
output.to_csv(output_file, index=False)
print(f"\nResultados de test guardados en: {output_file}")

# 10) Persist the fitted pipeline (preprocessing + model)
joblib.dump(best_pipeline, MODEL_OUTPUT)
print(f"Pipeline final guardado en: {MODEL_OUTPUT}")

# 11) Printable top 100, ranked by descending predicted probability
print("\nTop 100 clientes por probabilidad (test set):")
ranked = output.sort_values("prob_pay", ascending=False)
top10 = ranked.head(100)  # despite its name, this holds 100 rows
print(top10.reset_index(drop=True).to_string(index=False))


Resultados de test guardados en: test_with_probs_simplified.csv
Pipeline final guardado en: xgb_model_stable.joblib

Top 10 clientes por probabilidad (test set):
   CLIENT_ID NAME_PRODUCT_TYPE GENDER  TOTAL_INCOME  AMOUNT_PRODUCT  INSTALLMENT             EDUCATION MARITAL_STATUS       HOME_SITUATION  REGION_SCORE  AGE_IN_YEARS  JOB_SENIORITY  HOME_SENIORITY  LAST_UPDATE OWN_INSURANCE_CAR  CAR_AGE  FAMILY_SIZE  REACTIVE_SCORING  PROACTIVE_SCORING  BEHAVIORAL_SCORING  DAYS_LAST_INFO_CHANGE  NUMBER_OF_PRODUCTS OCCUPATION  DIGITAL_CLIENT HOME_OWNER EMPLOYER_ORGANIZATION_TYPE  NUM_PREVIOUS_LOAN_APP  LOAN_ANNUITY_PAYMENT_SUM  LOAN_APPLICATION_AMOUNT_SUM  LOAN_CREDIT_GRANTED_SUM  NUM_STATUS_ANNULLED  NUM_STATUS_AUTHORIZED  NUM_STATUS_DENIED  NUM_STATUS_NOT_USED  NUM_FLAG_INSURED       DATE  CREDICT_CARD_BALANCE  CREDIT_CARD_LIMIT  CREDIT_CARD_PAYMENT  NUMBER_DRAWINGS_ATM  NUMBER_DRAWINGS  NUMBER_INSTALMENTS  KPI_TOTAL_SPEND  KPI_DEBT_RATIO KPI_AGE_GROUP  KPI_LOAN_VOLATILITY  KPI_APPROVAL_RAT