<a href="https://colab.research.google.com/github/johansbustamante-gif/Proyecto-Inteligencia-Artificial/blob/main/04-Modelo%20CatBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =========================
# Notebook completo (celda única) - Split 70/30
# CatBoost (GPU opcional, fallback CPU) + XGBoost + LGBM + Stacking
# Guarda submission_ensemble_fastV2.csv
# =========================

# ---------- 0) Instalación mínima (descomenta si necesitas instalar paquetes) ----------
import sys, subprocess, os, time
def pip_install(packages):
    """Install the entries of ``packages`` that cannot be imported.

    Args:
        packages: Iterable of pip requirement strings, e.g. ``"catboost"``,
            ``"scikit-learn==1.4.2"`` or ``"pandas>=2.0"``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.

    The importability check uses the *module* name, which is derived from the
    requirement by stripping version specifiers/extras/markers and mapping
    distribution names that differ from their module name (for example
    ``scikit-learn`` -> ``sklearn``). Without this, such packages would always
    look missing and be reinstalled on every run.
    """
    # Function-scope imports keep this helper self-contained.
    import importlib
    import re

    # Distributions whose importable module has a different name.
    import_aliases = {
        "scikit-learn": "sklearn",
        "pillow": "PIL",
        "pyyaml": "yaml",
        "opencv-python": "cv2",
        "beautifulsoup4": "bs4",
    }

    to_install = []
    for pkg in packages:
        # Keep only the distribution name: cut at the first version operator,
        # extras bracket, environment-marker separator or whitespace.
        dist = re.split(r"[\s<>=!~\[;]", pkg.strip(), maxsplit=1)[0]
        module = import_aliases.get(dist.lower(), dist.replace("-", "_"))
        try:
            importlib.import_module(module)
        except Exception:
            # Deliberately broad: a broken install can fail with errors other
            # than ImportError, and reinstalling is the desired reaction.
            to_install.append(pkg)

    if to_install:
        print("Instalando:", to_install)
        subprocess.check_call([sys.executable, "-m", "pip", "install", *to_install])
    else:
        print("Dependencias OK")

# Si te faltan paquetes en el runtime descomenta la siguiente línea:
# pip_install(["category_encoders","catboost","xgboost","lightgbm","joblib","scikit-learn","pandas","numpy","seaborn"])

# ---------- 1) Imports ----------
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from category_encoders import TargetEncoder
import xgboost as xgb
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from catboost import CatBoostClassifier, Pool

# ---------- 2) Mount Google Drive ----------
# Colab-only import, kept next to its single use.
from google.colab import drive

print("Montando Google Drive...")
drive.mount("/content/drive", force_remount=True)

# ---------- 3) Paths ----------
DRIVE_PATH = "/content/drive/MyDrive/DataProyectoIA"
TRAIN_PATH, TEST_PATH = (
    os.path.join(DRIVE_PATH, csv_name)
    for csv_name in ("train_limpio.csv", "test.csv")
)
OUT_DIR = "/content/models_saberpro"
os.makedirs(OUT_DIR, exist_ok=True)

# The training file is mandatory; stop early with a clear message.
if not os.path.exists(TRAIN_PATH):
    raise FileNotFoundError(f"No se encontró {TRAIN_PATH}. Ajusta DRIVE_PATH o sube el archivo.")

# ---------- 4) Load data ----------
print("Cargando datos...")
train = pd.read_csv(TRAIN_PATH)
print("Train shape:", train.shape)

# The test file is optional: `test` stays None when it is absent.
if os.path.exists(TEST_PATH):
    test = pd.read_csv(TEST_PATH)
    print("Test shape:", test.shape)
else:
    test = None
    print("No se encontró test.csv. El pipeline correrá sin submission final.")

# ---------- 5) Build X, y ----------
TARGET = "RENDIMIENTO_GLOBAL"
if TARGET not in train.columns:
    raise KeyError(f"Columna {TARGET} no encontrada en train.")

# Target first, then the feature frame without it (both as copies).
y_raw = train[TARGET].copy()
X_raw = train.drop(columns=[TARGET]).copy()

# Keep the identifier aside and drop it from the features when present.
train_ids = None
if "ID" in X_raw.columns:
    train_ids = X_raw["ID"].copy()
    X_raw = X_raw.drop(columns=["ID"])

# Column lists by dtype: object/category vs. numeric.
cat_cols = list(X_raw.select_dtypes(include=["object", "category"]).columns)
num_cols = list(X_raw.select_dtypes(include=[np.number]).columns)
print("Categorical cols:", len(cat_cols), "Numeric cols:", len(num_cols))

# ---------- 6) Stratified 70/30 split ----------
# A temporary encoder turns the raw target into integer codes so the split
# can be stratified on them; 30% of the rows go to validation (seed 42).
print("Realizando split 70/30 estratificado...")
le_tmp = LabelEncoder()
y_tmp = le_tmp.fit_transform(y_raw)
X_train_raw, X_val_raw, y_train_tmp, y_val_tmp = train_test_split(
    X_raw, y_tmp, test_size=0.30, random_state=42, stratify=y_tmp
)

# Final label encoder: fitted on the training labels (temporary codes mapped
# back to their original values) and then applied to the validation labels.
# `le` is the encoder used further down for reports, inverse transforms and
# persistence; `le_tmp` is only needed for the split above.
le = LabelEncoder()
y_train_labels = le.fit_transform(le_tmp.inverse_transform(y_train_tmp))
y_val_labels   = le.transform(le_tmp.inverse_transform(y_val_tmp))
print("Clases:", list(le.classes_))

# Encoded targets as NumPy arrays for training / validation
y_train_enc = np.array(y_train_labels)
y_val_enc   = np.array(y_val_labels)

# ---------- 7) Ingeniería de features ----------
def feature_engineering(df):
    """Return a copy of ``df`` with derived count/frequency/rarity features.

    Reads the module-level ``cat_cols`` list. Every statistic is computed
    from ``df`` itself, so frames passed separately get statistics from their
    own rows. The input frame is not modified.

    Added columns, in order:
      * ``<col>_cnt`` / ``<col>_freq`` for each categorical column present
        (occurrence count including missing values, and count / row total);
      * ``ingreso_rank_mun``, ``edad_x_estrato``, ``edad_sq`` when their
        source columns exist;
      * ``<col>_rare`` (1 when the level's share is below 1%).

    Remaining missing values are replaced with -999.
    """
    out = df.copy()
    n_rows = len(out)

    # Count / frequency features for categorical columns.
    for col in cat_cols:
        if col not in out.columns:
            continue
        level_counts = out[col].value_counts(dropna=False)
        cnt_name = col + "_cnt"
        out[cnt_name] = out[col].map(level_counts).fillna(0).astype(int)
        out[col + "_freq"] = out[cnt_name] / n_rows

    # Example interactions / transformations (depend on the real columns).
    if "INGRESO_HOGAR" in out.columns and "MUNICIPIO" in out.columns:
        income_rank = out.groupby("MUNICIPIO")["INGRESO_HOGAR"].rank(pct=True)
        out["ingreso_rank_mun"] = income_rank.fillna(0)
    if "EDAD" in out.columns:
        if "ESTRATO" in out.columns:
            out["edad_x_estrato"] = out["EDAD"] * out["ESTRATO"]
        out["edad_sq"] = out["EDAD"] ** 2

    # Rare-level flags (share computed over non-missing values).
    for col in cat_cols:
        if col not in out.columns:
            continue
        level_share = out[col].map(out[col].value_counts(normalize=True))
        out[col + "_rare"] = (level_share < 0.01).astype(int)

    # Fill whatever is still missing with a sentinel.
    return out.fillna(-999)

# Engineered features for the training and validation splits.
X_train_fe = feature_engineering(X_train_raw)
X_val_fe = feature_engineering(X_val_raw)

# Optional test set: set the identifier aside (when there is one) before
# running the same feature engineering.
if test is None:
    X_test_fe = None
else:
    if "ID" in test.columns:
        test_ids, X_test_raw = test["ID"], test.drop(columns=["ID"])
    else:
        test_ids, X_test_raw = None, test.copy()
    X_test_fe = feature_engineering(X_test_raw)

# ---------- 8) Preprocessing: TargetEncoder + Scaler ----------
# Categorical columns handed to CatBoost as native categorical features.
cat_cols_cb = [c for c in cat_cols if c in X_train_fe.columns]
print("CatBoost categorical cols:", len(cat_cols_cb))

# The same column list is target-encoded for the models that need a fully
# numeric matrix. The encoder is fitted on the training split only and then
# applied to validation (and test, when available).
# NOTE(review): the target passed here is the integer class code from the
# label encoder, so the encoding averages class indices — confirm this is the
# intended treatment for a multi-class target.
te_cols = [c for c in cat_cols if c in X_train_fe.columns]
te = TargetEncoder(cols=te_cols, smoothing=0.3)
X_train_te = te.fit_transform(X_train_fe, y_train_enc)
X_val_te   = te.transform(X_val_fe)
X_test_te  = te.transform(X_test_fe) if X_test_fe is not None else None

# Scale numeric columns (useful for MLP / meta models). The scaler is fitted
# on the training split and the frames are overwritten in place.
scaler = StandardScaler()
numeric_for_scaler = [c for c in X_train_te.select_dtypes(include=[np.number]).columns.tolist()]
if len(numeric_for_scaler) > 0:
    X_train_te[numeric_for_scaler] = scaler.fit_transform(X_train_te[numeric_for_scaler])
    X_val_te[numeric_for_scaler]   = scaler.transform(X_val_te[numeric_for_scaler])
    if X_test_te is not None:
        X_test_te[numeric_for_scaler] = scaler.transform(X_test_te[numeric_for_scaler])

# Plain arrays for XGB / LGB / MLP
X_train_xgb = X_train_te.values
X_val_xgb   = X_val_te.values
X_test_xgb  = X_test_te.values if X_test_te is not None else None

# ---------- 9) Class weights (optional) ----------
# Each class is weighted by the inverse square root of its training count.
counter = Counter(y_train_enc)
class_weights = {label: 1.0 / np.sqrt(n_obs) for label, n_obs in counter.items()}

# Per-row weights, rescaled so that their mean is 1.
weights_list = np.fromiter(
    (class_weights[int(lbl)] for lbl in y_train_enc),
    dtype=float,
    count=len(y_train_enc),
)
weights_list = weights_list / weights_list.mean()

# Weights ordered by class index (1.0 for a class absent from training).
class_weight_list = [
    class_weights.get(class_idx, 1.0) for class_idx in range(len(le.classes_))
]
print("Class weights computed (normalized).")

# ---------- 10) Robust CatBoost training (weighted Pools; try GPU -> fall back to CPU) ----------
USE_GPU = True   # Set to False to force CPU
catboost_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    random_seed=42,
    early_stopping_rounds=50,
    verbose=100,
)

print("=== Entrenamiento CatBoost (intentando GPU si USE_GPU=True) ===")

# Only use the sample weights when there is exactly one per training row.
if weights_list is not None and len(weights_list) == len(y_train_enc):
    weights_list_local = weights_list
else:
    print("Advertencia: 'weights_list' inválido o longitud diferente. No se usarán weights.")
    weights_list_local = None

# Función para crear Pools (con weights opcionalmente)
def make_pools(use_weights):
    """Build the CatBoost pools from the module-level engineered frames.

    Args:
        use_weights: When truthy and ``weights_list_local`` is available, the
            training pool carries those per-row weights.

    Returns:
        ``(train_pool, val_pool, test_pool)``; ``test_pool`` is ``None`` when
        there is no test frame.
    """
    train_kwargs = {
        "data": X_train_fe,
        "label": y_train_enc,
        "cat_features": cat_cols_cb,
    }
    if use_weights and weights_list_local is not None:
        train_kwargs["weight"] = weights_list_local
    train_pool = Pool(**train_kwargs)

    val_pool = Pool(data=X_val_fe, label=y_val_enc, cat_features=cat_cols_cb)

    if X_test_fe is None:
        test_pool = None
    else:
        test_pool = Pool(data=X_test_fe, label=None, cat_features=cat_cols_cb)
    return train_pool, val_pool, test_pool

# Función de entrenamiento usando Pools (no pasar sample_weight en fit cuando usamos Pool)
def train_catboost_with_pool(params, use_gpu, use_weights_for_pool=True):
    """Fit a CatBoostClassifier on the pools returned by ``make_pools``.

    Args:
        params: Base CatBoost parameters; the mapping is not modified.
        use_gpu: When true, ``task_type="GPU"`` and ``devices="0"`` are set;
            otherwise both keys are removed from the parameters.
        use_weights_for_pool: Forwarded to ``make_pools``.

    Returns:
        The fitted model, trained against the validation pool with
        ``use_best_model=True``.
    """
    if use_gpu:
        params_local = {**params, "task_type": "GPU", "devices": "0"}
    else:
        params_local = {
            key: value
            for key, value in params.items()
            if key not in ("task_type", "devices")
        }

    model_local = CatBoostClassifier(**params_local)
    # Weights travel inside the Pool, so none are passed to fit(); the test
    # pool is not needed for training.
    tr_pool, va_pool, _ = make_pools(use_weights_for_pool)
    model_local.fit(tr_pool, eval_set=va_pool, use_best_model=True)
    return model_local

# Train CatBoost. With USE_GPU the GPU is tried first and any failure falls
# back to a CPU run with the same parameters; a CPU failure is reported and
# re-raised, so `model_cb` is always a fitted model past this point.
model_cb = None
if USE_GPU:
    try:
        print("Intentando CatBoost en GPU...")
        model_cb = train_catboost_with_pool(catboost_params, use_gpu=True, use_weights_for_pool=True)
        print("CatBoost entrenado en GPU correctamente.")
    except Exception as e_gpu:
        # Any GPU-side error triggers the CPU retry below.
        print("Error entrenando CatBoost en GPU (se intentará CPU). Error:", repr(e_gpu))
        try:
            print("Reintentando CatBoost en CPU...")
            model_cb = train_catboost_with_pool(catboost_params, use_gpu=False, use_weights_for_pool=True)
            print("CatBoost entrenado en CPU correctamente.")
        except Exception as e_cpu:
            # No further fallback: report and propagate.
            print("Error CatBoost CPU:", repr(e_cpu))
            raise e_cpu
else:
    # CPU-only path: report and propagate any failure.
    try:
        print("Entrenando CatBoost en CPU...")
        model_cb = train_catboost_with_pool(catboost_params, use_gpu=False, use_weights_for_pool=True)
        print("CatBoost entrenado en CPU correctamente.")
    except Exception as e:
        print("Error CatBoost CPU:", repr(e))
        raise e

# CatBoost class probabilities for validation and, when present, test.
probs_cb_val = model_cb.predict_proba(X_val_fe)
if X_test_fe is None:
    probs_cb_test = None
else:
    probs_cb_test = model_cb.predict_proba(X_test_fe)

# Persist CatBoost: joblib first, native .cbm format as the fallback.
try:
    joblib.dump(model_cb, os.path.join(OUT_DIR, "catboost_model_safe.joblib"))
    print("CatBoost guardado (joblib).")
except Exception:
    try:
        model_cb.save_model(os.path.join(OUT_DIR, "catboost_model_safe.cbm"))
        print("CatBoost guardado (.cbm).")
    except Exception as save_err:
        print("No se pudo guardar CatBoost:", save_err)

# ---------- 11) XGBoost training (CPU, DMatrix) ----------
print("Entrenando XGBoost (CPU, DMatrix)...")
dtrain = xgb.DMatrix(X_train_xgb, label=y_train_enc)
dval   = xgb.DMatrix(X_val_xgb, label=y_val_enc)
xgb_params = {
    "objective":"multi:softprob",
    "num_class": len(le.classes_),
    "eta": 0.05,
    "max_depth": 6,
    "subsample": 0.85,
    "colsample_bytree": 0.8,
    "eval_metric": "mlogloss",
    "verbosity": 0
}
# Early stopping monitors the last entry of `evals` (the validation set).
bst = xgb.train(xgb_params, dtrain, num_boost_round=600, evals=[(dtrain,"train"), (dval,"eval")], early_stopping_rounds=50, verbose_eval=100)

# xgb.train returns the booster from the LAST boosting round, not the best
# one, so predictions are restricted explicitly to the best iteration found
# by early stopping. (0, 0) is the library default and means "all trees".
xgb_best_iteration = getattr(bst, "best_iteration", None)
xgb_iteration_range = (0, xgb_best_iteration + 1) if xgb_best_iteration is not None else (0, 0)

probs_xgb_val  = bst.predict(dval, iteration_range=xgb_iteration_range)
probs_xgb_test = (
    bst.predict(xgb.DMatrix(X_test_xgb), iteration_range=xgb_iteration_range)
    if X_test_xgb is not None
    else None
)

# Save XGBoost (best effort)
try:
    bst.save_model(os.path.join(OUT_DIR, "xgb_model.json"))
except Exception as e:
    print("No se pudo guardar XGBoost:", e)

# ---------- 12) LightGBM training (LGBMClassifier) - early stopping through callbacks ----------
print("Entrenando LightGBM (LGBMClassifier)...")
lgbm = LGBMClassifier(
    n_estimators=1500,
    learning_rate=0.05,
    objective="multiclass",
    num_class=len(le.classes_),
    colsample_bytree=0.9,
    subsample=0.9,
    reg_alpha=0.3,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
)

# Callbacks are used instead of the early_stopping_rounds argument.
lgbm.fit(
    X=X_train_xgb,
    y=y_train_enc,
    eval_metric="multi_logloss",
    eval_set=[(X_val_xgb, y_val_enc)],
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)],
)

probs_lgb_val = lgbm.predict_proba(X_val_xgb)
if X_test_xgb is None:
    probs_lgb_test = None
else:
    probs_lgb_test = lgbm.predict_proba(X_test_xgb)

# Save LGBM (best effort)
try:
    joblib.dump(lgbm, os.path.join(OUT_DIR, "lgbm_model_safeV2.joblib"))
except Exception as save_err:
    print("No se pudo guardar LGBM:", save_err)

print("Modelos individuales listos.")

# ---------- 13) Lightweight MLP (optional) ----------
# Defaults describe "no MLP"; they are only overwritten as training succeeds.
probs_mlp_val = probs_mlp_test = None
use_mlp = False
try:
    print("Entrenando MLP (opcional)...")
    mlp = MLPClassifier(
        hidden_layer_sizes=(256, 128),
        max_iter=200,
        random_state=42,
        verbose=False,
    )
    mlp.fit(X_train_xgb, y_train_enc)
    probs_mlp_val = mlp.predict_proba(X_val_xgb)
    if X_test_xgb is not None:
        probs_mlp_test = mlp.predict_proba(X_test_xgb)
    use_mlp = True
    joblib.dump(mlp, os.path.join(OUT_DIR, "mlp_model.joblib"))
except Exception as mlp_err:
    # Any failure (including the save) drops the MLP from the ensemble.
    print("MLP omitido por memoria/tiempo:", mlp_err)
    use_mlp = False

# ---------- 14) Stacking - build the meta-learner features (validation) ----------
# Imported here because it is not part of the notebook's import section.
from sklearn.model_selection import cross_val_predict

print("Construyendo dataset para meta-learner...")
# One block of class probabilities per base model, side by side.
stack_features_val = np.hstack([probs_cb_val, probs_xgb_val, probs_lgb_val])
if use_mlp and (probs_mlp_val is not None):
    stack_features_val = np.hstack([stack_features_val, probs_mlp_val])
y_meta = y_val_enc

print("Entrenando meta-learner (LogisticRegression)...")
# `multi_class` is deprecated in recent scikit-learn releases; with the saga
# solver the default already fits a multinomial model on a multi-class target.
meta = LogisticRegression(max_iter=2000, solver='saga', n_jobs=-1)

# The meta-learner is trained on the validation rows, so scoring its own
# predictions on those rows would report a training accuracy. Out-of-fold
# predictions give every row a prediction from a model that never saw it.
# cross_val_predict fits clones, leaving `meta` itself unfitted until below.
stack_val_preds = cross_val_predict(meta, stack_features_val, y_meta, cv=5)

# Final meta-learner: fitted on all validation rows, used for the test set.
meta.fit(stack_features_val, y_meta)
joblib.dump(meta, os.path.join(OUT_DIR, "stacker_meta.joblib"))

# ---------- 15) Hold-out evaluation (out-of-fold predictions) ----------
acc_stack_val = accuracy_score(y_meta, stack_val_preds)
print("Accuracy hold-out (stack):", acc_stack_val)
print("Reporte clasificación (stack):\n", classification_report(y_meta, stack_val_preds, target_names=le.classes_))

# ---------- 16) Write submission_ensemble_fastV2.csv ----------
# Requires the test matrix and the test probabilities of all three base models.
if X_test_xgb is not None and probs_cb_test is not None and probs_xgb_test is not None and probs_lgb_test is not None:
    # Same column layout as the features the meta-learner was trained on.
    stack_features_test = np.hstack([probs_cb_test, probs_xgb_test, probs_lgb_test])
    if use_mlp and (probs_mlp_test is not None):
        stack_features_test = np.hstack([stack_features_test, probs_mlp_test])
    preds_test_idx = meta.predict(stack_features_test)
    preds_test_labels = le.inverse_transform(preds_test_idx)

    # `test_ids` is None exactly when test.csv has no 'ID' column, so looking
    # up test['ID'] in that case would raise KeyError. Fall back to the row
    # index so a submission can still be written.
    submission_ids = test_ids.values if (test_ids is not None) else test.index.values

    submission = pd.DataFrame({
        "ID": submission_ids,
        "RENDIMIENTO_GLOBAL": preds_test_labels
    })

    sub_path = os.path.join(DRIVE_PATH, "submission_ensemble_fastV2.csv")
    submission.to_csv(sub_path, index=False)
    print("Submission guardado en:", sub_path)
    print(submission.head())
else:
    print("No hay test completo o faltan probabilidades; no se generó submission.")

# ---------- 17) Save artifacts and preprocessors ----------
# Model saves stay best-effort (a failure must not abort the run), but each
# failure is reported instead of being silently discarded.
try:
    joblib.dump(model_cb, os.path.join(OUT_DIR, "catboost_model_safe.joblib"))
except Exception as joblib_err:
    print("No se pudo guardar CatBoost (joblib), se intentará .cbm:", joblib_err)
    try:
        model_cb.save_model(os.path.join(OUT_DIR,"catboost_model_safe.cbm"))
    except Exception as cbm_err:
        print("No se pudo guardar CatBoost:", cbm_err)
try:
    bst.save_model(os.path.join(OUT_DIR,"xgb_model.json"))
except Exception as xgb_err:
    print("No se pudo guardar XGBoost:", xgb_err)
try:
    joblib.dump(lgbm, os.path.join(OUT_DIR,"lgbm_model_safeV2.joblib"))
except Exception as lgbm_err:
    print("No se pudo guardar LGBM:", lgbm_err)

# Preprocessors needed to reproduce the feature matrix at inference time.
# These are not guarded: a failure here raises.
joblib.dump(te, os.path.join(OUT_DIR, "target_encoder.joblib"))
joblib.dump(scaler, os.path.join(OUT_DIR, "scaler.joblib"))
joblib.dump(le, os.path.join(OUT_DIR, "label_encoder.joblib"))
print("Artefactos guardados en:", OUT_DIR)

# ---------- 18) Quick diagnostics ----------
# Diagnostics are optional, so failures do not stop the run; they are
# reported rather than swallowed so a missing table can be explained.
try:
    print("Top importancias CatBoost (prettified):")
    print(model_cb.get_feature_importance(prettified=True)[:20])
except Exception as cb_diag_err:
    print("No se pudieron obtener las importancias de CatBoost:", cb_diag_err)
try:
    importances_xgb = bst.get_score(importance_type='gain')
    print("Top 20 XGBoost (gain):")
    print(sorted(importances_xgb.items(), key=lambda x: x[1], reverse=True)[:20])
except Exception as xgb_diag_err:
    print("No se pudieron obtener las importancias de XGBoost:", xgb_diag_err)

print("Ejecución completa. Revisa submission_ensemble_fastV2.csv en tu Drive si se generó.")





Montando Google Drive...
Mounted at /content/drive
Cargando datos...
Train shape: (644232, 21)
Test shape: (296786, 20)
Categorical cols: 14 Numeric cols: 5
Realizando split 70/30 estratificado...
Clases: ['alto', 'bajo', 'medio-alto', 'medio-bajo']
CatBoost categorical cols: 14
Class weights computed (normalized).
=== Entrenamiento CatBoost (intentando GPU si USE_GPU=True) ===
Intentando CatBoost en GPU...
Error entrenando CatBoost en GPU (se intentará CPU). Error: CatBoostError('catboost/cuda/cuda_lib/cuda_base.h:281: CUDA error 35: CUDA driver version is insufficient for CUDA runtime version')
Reintentando CatBoost en CPU...
0:	learn: 1.3742201	test: 1.3743282	best: 1.3743282 (0)	total: 6.97s	remaining: 1h 56m 6s
100:	learn: 1.2028303	test: 1.2055966	best: 1.2055966 (100)	total: 7m 36s	remaining: 1h 7m 40s
200:	learn: 1.1935543	test: 1.1983314	best: 1.1983314 (200)	total: 15m 29s	remaining: 1h 1m 35s
300:	learn: 1.1891704	test: 1.1962313	best: 1.1962220 (298)	total: 23m 12s	remainin