# MODELOS PREDICTIVOS CON MACHINE/DEEP LEARNING

In [None]:
pip install xgboost scipy

In [6]:

# script ligero y estable para dataset grande
import os
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import joblib
from scipy import stats

# Seed shared by the train/test split, the XGBoost model and the randomized search.
RANDOM_STATE = 42
# Parquet file that is read into `df` below.
FILE_PATH = "/home/jovyan/work/data/curated/Master_Model_FinPlus.parquet"
# Column of `df` used as the binary label; every other column is kept as a feature.
TARGET_COL = 'NON_COMPLIANT_CONTRACT'
# Cut-off applied to the predicted positive-class probability to obtain hard 0/1 predictions.
APPROVAL_THRESHOLD = 0.8
# File the fitted pipeline (preprocessing + model) is serialized to with joblib.
MODEL_OUTPUT = "xgb_model_stable.joblib"

# ---- load ----
df = pd.read_parquet(FILE_PATH)
print("Datos cargados. Shape:", df.shape)

# ---- target ----
def _encode_binary_target(raw: pd.Series) -> pd.Series:
    """Return the target column encoded as integer labels.

    Args:
        raw: The target column exactly as loaded from the dataset.

    Returns:
        A Series of integers. Non-object columns (bool, numeric, ...) are cast with
        ``astype(int)``. Object columns are compared case-insensitively: the spellings
        true/t/yes/y map to 1 and false/f/no/n map to 0; any other column with exactly
        two distinct values is mapped to 0/1 in order of first appearance (so which
        class becomes 1 depends on row order).

    Raises:
        ValueError: If the target has missing values, or if an object column has a
            number of distinct values other than two and is not a true/false spelling.
    """
    # Missing labels must be rejected up front: once cast to str they become the
    # literal "nan" and would otherwise be silently counted as class 0.
    if raw.isna().any():
        raise ValueError("TARGET contiene valores nulos")

    if raw.dtype != object:
        return raw.astype(int)

    # Lower-case once and reuse the same Series for both detection and mapping, so the
    # mapping keys always match the values they are applied to.
    lowered = raw.astype(str).str.lower()
    uniq = lowered.unique()

    truthy = {"true", "t", "yes", "y"}
    falsy = {"false", "f", "no", "n"}
    if set(uniq).issubset(truthy | falsy):
        return lowered.isin(truthy).astype(int)

    if len(uniq) != 2:
        raise ValueError("TARGET no binario")
    return lowered.map({uniq[0]: 0, uniq[1]: 1})


y = _encode_binary_target(df[TARGET_COL])

# Every column except the label is used as a feature.
X = df.drop(columns=[TARGET_COL])

# ---- split ----
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
    test_size=0.2,
)
print("Train/Test shapes:", X_train.shape, X_test.shape)

# ---- columns ----
# Feature groups are derived from the dtypes of the training partition.
categorical_dtypes = ["object", "bool", "category"]
numeric_frame = X_train.select_dtypes(include=[np.number])
categorical_frame = X_train.select_dtypes(include=categorical_dtypes)
num_cols = numeric_frame.columns.tolist()
cat_cols = categorical_frame.columns.tolist()

print("Num:", len(num_cols), "Cat:", len(cat_cols))

# ---- lightweight preprocessing ----
# Numeric features: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill gaps with a sentinel category, then encode each category
# as an integer code. Ordinal encoding was chosen by the author as a much lighter
# alternative to one-hot encoding; categories not seen during fit are encoded as -1.
categorical_imputer = SimpleImputer(strategy="constant", fill_value="__missing__")
categorical_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", categorical_imputer),
        ("encoder", categorical_encoder),
    ]
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

# ---- model ----
# Class counts on the training partition, used to re-weight the positive class.
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()

# With a missing class the ratio below degenerates to inf (no positives) or 0 (no
# negatives); warnings are silenced in this notebook, so fail loudly instead of
# training with a meaningless weight.
if pos == 0 or neg == 0:
    raise ValueError("y_train debe contener ejemplos de ambas clases (0 y 1)")

# Negatives-to-positives ratio: compensates for the class imbalance.
scale_pos_weight = neg / pos
print("scale_pos_weight:", scale_pos_weight)

xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=1,              # single-threaded on purpose: the author flags this as essential to avoid hangs
    tree_method="hist",
    scale_pos_weight=scale_pos_weight
)

# Preprocessing and model are chained so the search and the saved artifact both
# include the fitted preprocessing.
pipe = Pipeline([("preproc", preprocessor), ("model", xgb)])

# ---- lightweight search ----
# Search space for the "model" step of the pipeline. Lists are sampled uniformly;
# max_depth is drawn from a discrete uniform distribution over 3..7 (upper bound exclusive).
param_dist = dict(
    model__n_estimators=[100, 200],
    model__max_depth=stats.randint(3, 8),
    model__learning_rate=[0.01, 0.03],
    model__subsample=[0.6, 0.8],
    model__colsample_bytree=[0.5, 0.7],
    model__gamma=[0, 0.1],
)

# Deliberately small budget (12 candidates x 3 folds, one worker) to keep the run stable.
rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=12,
    scoring="roc_auc",
    cv=3,
    refit=True,
    n_jobs=1,
    verbose=1,
    random_state=RANDOM_STATE,
)

print("Entrenando modelos (ligero)...")
rs.fit(X_train, y_train)

# refit=True means best_estimator_ is the winning pipeline retrained on all of X_train.
best_pipeline = rs.best_estimator_
print("Mejores parámetros:", rs.best_params_)

# ---- evaluation ----
# Second column of predict_proba is taken as the positive-class probability.
test_proba = best_pipeline.predict_proba(X_test)
y_prob = test_proba[:, 1]
# Hard predictions: 1 when the probability reaches the configured threshold.
y_pred = (y_prob >= APPROVAL_THRESHOLD).astype(int)

# AUC is threshold-free; the report and confusion matrix depend on APPROVAL_THRESHOLD.
test_auc = roc_auc_score(y_test, y_prob)
print("AUC:", test_auc)
test_report = classification_report(y_test, y_pred)
print(test_report)
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

# ---- save ----
# Persist the whole fitted pipeline (preprocessing + model).
joblib.dump(best_pipeline, MODEL_OUTPUT)
print("Modelo guardado en:", MODEL_OUTPUT)

# ------------------ SECTION: final dataset with the new columns ------------------
# Builds one DataFrame holding, for every test row:
# - the original X_test columns
# - the preprocessed columns (prefixed with "proc_")
# - the result columns: prob_pay, approve, actual
# NOTE(review): "prob_pay" holds the second predict_proba column (presumably the
# probability of TARGET_COL == 1) and "approve" its thresholded value; given the target's
# name these labels look misleading — confirm the intended semantics with the consumers
# of this CSV before renaming anything.

# Positional (0..n-1) indexes so the three pieces line up row by row when concatenated.
X_test_out = X_test.reset_index(drop=True)
y_test_out = y_test.reset_index(drop=True)

# Re-run only the fitted preprocessing step to expose the transformed features.
preproc = best_pipeline.named_steps["preproc"]
X_test_proc = preproc.transform(X_test_out)

# Densify when the transformer returned a sparse matrix.
if hasattr(X_test_proc, "toarray"):
    X_test_proc = X_test_proc.toarray()

# One processed column per column handed to the preprocessor, numeric group first,
# matching the order in which the ColumnTransformer was declared.
proc_cols = [f"proc_{c}" for c in num_cols + cat_cols]

# If the transformer produced a different number of columns than it was given,
# the names above cannot be trusted: fall back to purely positional names.
if X_test_proc.shape[1] != len(proc_cols):
    proc_cols = [f"proc_{i}" for i in range(X_test_proc.shape[1])]

processed_df = pd.DataFrame(X_test_proc, columns=proc_cols)

# Results (probabilities, thresholded predictions and true labels)
results_df = pd.DataFrame({
    "prob_pay": y_prob,
    "approve": y_pred,
    "actual": y_test_out
})

# All three frames already share the same 0..n-1 index, so a column-wise concat aligns them.
final_df = pd.concat([X_test_out, processed_df, results_df], axis=1)

# Summary and preview
print("\nDataset final (original features + preprocessed features + preds). Shape:", final_df.shape)
print("\nPrimeras 10 filas del dataset final:")
print(final_df.head(10).to_string(index=False))

# Save as CSV for inspection
final_csv = "final_dataset_with_features_and_preds.csv"
final_df.to_csv(final_csv, index=False)
print(f"\nDataset final guardado en: {final_csv}")
# ------------------------------------------------------------------------------

print(MODEL_OUTPUT)


Datos cargados. Shape: (162977, 48)
Train/Test shapes: (130381, 47) (32596, 47)
Num: 47 Cat: 0
scale_pos_weight: 11.312871848144301
Entrenando modelos (ligero)...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Mejores parámetros: {'model__colsample_bytree': 0.7, 'model__gamma': 0.1, 'model__learning_rate': 0.03, 'model__max_depth': 7, 'model__n_estimators': 200, 'model__subsample': 0.6}
AUC: 0.7564042539361368
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     29949
           1       0.40      0.10      0.16      2647

    accuracy                           0.91     32596
   macro avg       0.66      0.54      0.56     32596
weighted avg       0.88      0.91      0.89     32596

[[29549   400]
 [ 2376   271]]
Modelo guardado en: xgb_model_stable.joblib

Dataset final (original features + preprocessed features + preds). Shape: (32596, 97)

Primeras 10 filas del dataset final:
 TOTAL_INCOME  AMOUNT_PRODUCT  INSTALLMENT  R

In [7]:
# Test features with a positional index so they line up with the prediction arrays.
X_test_out = X_test.reset_index(drop=True).copy()
actual_labels = y_test.reset_index(drop=True)
results = pd.DataFrame(
    {
        "prob_pay": y_prob,
        "approve": y_pred,
        "actual": actual_labels,
    }
)

# Simplified export: original features plus the three result columns.
output = pd.concat([X_test_out, results], axis=1)
output_file = "test_with_probs_simplified.csv"
output.to_csv(output_file, index=False)
print(f"\nResultados de test guardados en: {output_file}")

# Save the final pipeline (preprocessing + model)
joblib.dump(best_pipeline, MODEL_OUTPUT)
print(f"Pipeline final guardado en: {MODEL_OUTPUT}")

# Printable listing of the 100 test rows with the highest predicted probability.
# The variable keeps its historical name `top10` even though it holds 100 rows.
print("\nTop 100 clientes por probabilidad (test set):")
ranked_output = output.sort_values("prob_pay", ascending=False)
top10 = ranked_output.head(100)
top_listing = top10.reset_index(drop=True)
print(top_listing.to_string(index=False))


Resultados de test guardados en: test_with_probs_simplified.csv
Pipeline final guardado en: xgb_model_stable.joblib

Top 100 clientes por probabilidad (test set):
 TOTAL_INCOME  AMOUNT_PRODUCT  INSTALLMENT  REGION_SCORE  AGE_IN_YEARS  JOB_SENIORITY  HOME_SENIORITY  LAST_UPDATE  CAR_AGE  FAMILY_SIZE  REACTIVE_SCORING  PROACTIVE_SCORING  BEHAVIORAL_SCORING  DAYS_LAST_INFO_CHANGE  NUMBER_OF_PRODUCTS  DIGITAL_CLIENT  NUM_PREVIOUS_LOAN_APP  LOAN_ANNUITY_PAYMENT_SUM  LOAN_APPLICATION_AMOUNT_SUM  LOAN_CREDIT_GRANTED_SUM  NUM_STATUS_ANNULLED  NUM_STATUS_AUTHORIZED  NUM_STATUS_DENIED  NUM_STATUS_NOT_USED  NUM_FLAG_INSURED  CREDICT_CARD_BALANCE  CREDIT_CARD_LIMIT  CREDIT_CARD_PAYMENT  NUMBER_DRAWINGS_ATM  NUMBER_DRAWINGS  NUMBER_INSTALMENTS  KPI_DAYS_LAST_MOV  KPI_TOTAL_SPEND  KPI_DEBT_RATIO  KPI_LOAN_VOLATILITY  KPI_APPROVAL_RATIO  KPI_DENIAL_RATE  NAME_PRODUCT_TYPE_IDX  GENDER_IDX  EDUCATION_IDX  MARITAL_STATUS_IDX  HOME_SITUATION_IDX  OWN_INSURANCE_CAR_IDX  OCCUPATION_IDX  HOME_OWNER_IDX  EM