<a href="https://colab.research.google.com/github/johansbustamante-gif/Proyecto-Inteligencia-Artificial/blob/main/Modelo%20Optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================
# Notebook Colab: Versión rápida y corregida (70/30 split)
# Optuna rápido + XGBoost + LightGBM + Ensemble
# Pegar y ejecutar en UNA sola celda en Colab.
# =========================

# ---------- 0) Instalación de dependencias ----------
import sys, subprocess, os, time

def pip_install(packages):
    """Install with pip every requirement whose module cannot be found.

    Each entry may be a bare name (``"xgboost"``) or carry a version
    specifier, extras or marker (``"optuna>=3"``, ``"pkg[extra]==1.0"``).
    Only the leading name is used for the availability check; the full
    requirement string is what gets passed to pip.

    The module name is assumed to be the distribution name with ``-``
    replaced by ``_``. A distribution that installs under a different module
    name is always treated as missing and handed to pip again.

    Availability is checked with ``importlib.util.find_spec`` so the package
    is located without being imported (no import side effects here).

    Args:
        packages: Iterable of pip requirement strings.

    Raises:
        subprocess.CalledProcessError: If the pip invocation fails.
    """
    # Function-scope imports: the helper runs before the notebook's main
    # import section, so it brings in what it needs itself.
    import importlib.util
    import re

    to_install = []
    for pkg in packages:
        # Cut the requirement at the first specifier/extras/marker character.
        dist_name = re.split(r"[\s<>=!~\[;]", pkg.strip(), maxsplit=1)[0]
        module_name = dist_name.replace("-", "_")
        try:
            available = bool(module_name) and importlib.util.find_spec(module_name) is not None
        except (ImportError, ValueError):
            # find_spec raises for dotted names with a missing parent or for
            # modules registered without a spec; treat both as "not installed".
            available = False
        if not available:
            to_install.append(pkg)

    if to_install:
        print("Instalando:", to_install)
        subprocess.check_call([sys.executable, "-m", "pip", "install", *to_install])
    else:
        print("Dependencias OK")

# Third-party packages this notebook relies on; pip_install only installs the
# ones it cannot import.
REQUIRED_PACKAGES = [
    "category_encoders",
    "xgboost",
    "lightgbm",
    "optuna",
    "joblib",
    "seaborn",
]
pip_install(REQUIRED_PACKAGES)

# ---------- 1) Imports ----------
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Prefer category_encoders' TargetEncoder. If it cannot be imported, load the
# scikit-learn one-hot tooling instead and record which path is available in
# `has_target_encoder`.
try:
    from category_encoders import TargetEncoder
except Exception as import_error:
    print("category_encoders no disponible -> OneHotEncoder fallback. Error:", import_error)
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer
    has_target_encoder = False
else:
    has_target_encoder = True

# xgboost / lightgbm / optuna
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
import optuna

# ---------- 2) Mount Google Drive ----------
from google.colab import drive
print("Montando Google Drive...")
drive.mount('/content/drive', force_remount=True)

# ---------- 3) Paths ----------
DRIVE_PATH = '/content/drive/MyDrive/DataProyectoIA'
TRAIN_PATH = os.path.join(DRIVE_PATH, 'train_limpio.csv')
TEST_PATH = os.path.join(DRIVE_PATH, 'test.csv')

# Abort early when the data folder is absent, printing what MyDrive contains
# so the path can be corrected.
drive_dir_found = os.path.exists(DRIVE_PATH)
if not drive_dir_found:
    print("ERROR: No existe la carpeta esperada:", DRIVE_PATH)
    print("Contenido de /content/drive/MyDrive:")
    try:
        mydrive_entries = os.listdir('/content/drive/MyDrive')
        print(mydrive_entries)
    except Exception:
        # The listing is only a diagnostic aid; the error below is raised
        # whether or not it could be produced.
        pass
    raise FileNotFoundError("Ajusta DRIVE_PATH")

# Same idea for the training CSV: list the folder, then stop.
train_csv_found = os.path.exists(TRAIN_PATH)
if not train_csv_found:
    print("ERROR: No se encontró:", TRAIN_PATH)
    folder_entries = os.listdir(DRIVE_PATH)
    print("Contenido de", DRIVE_PATH, ":", folder_entries)
    raise FileNotFoundError("Coloca train_limpio.csv en la carpeta o corrige la ruta")

# ---------- 4) Load data ----------
print("Cargando:", TRAIN_PATH)
train = pd.read_csv(TRAIN_PATH)
print("Train shape:", train.shape)

# test.csv is optional: `test` stays None when the file is not present, and
# later steps check for that.
test = None
test_csv_found = os.path.exists(TEST_PATH)
if not test_csv_found:
    print("No se encontró test.csv; no se generará submission automáticamente.")
else:
    print("Cargando:", TEST_PATH)
    test = pd.read_csv(TEST_PATH)
    print("Test shape:", test.shape)

# ---------- 5) Build X and y ----------
# The target column is mandatory; stop with a clear error when it is missing.
if 'RENDIMIENTO_GLOBAL' not in train.columns:
    raise KeyError("Falta columna 'RENDIMIENTO_GLOBAL' en train_limpio.csv")

y = train['RENDIMIENTO_GLOBAL'].copy()
X = train.drop(columns=['RENDIMIENTO_GLOBAL']).copy()

# The row identifier is not a feature: keep it aside and remove it from X.
train_ids = None
if 'ID' in X.columns:
    train_ids = X['ID'].copy()
    X = X.drop(columns=['ID'])

# Map the target labels to integer codes.
le = LabelEncoder()
y_enc = le.fit_transform(y)
print("Clases:", list(le.classes_))

# Column lists by dtype, used later for encoding (categorical) and scaling (numeric).
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
print("Num cols:", len(num_cols), "Cat cols:", len(cat_cols))

# ---------- 6) Split (stratified train/val, 70/30) ----------
# Stratifying on the encoded target keeps the class proportions in both parts.
split_parts = train_test_split(
    X,
    y_enc,
    test_size=0.3,
    random_state=42,
    stratify=y_enc,
)
X_train, X_val, y_train, y_val = split_parts
print("Split realizado (70/30):", X_train.shape, X_val.shape)

# ---------- 7) Preprocessing ----------
# Categorical columns are target-encoded when category_encoders is available,
# otherwise one-hot encoded. Encoders are fitted on the training split only
# and then applied to the validation split and, if loaded, to the test set.
encoder = None
ct = None
# Defined up front so later steps can test them instead of hitting a NameError
# when test.csv was not loaded.
test_ids = None
X_test_raw = None
X_test_enc = None

if test is not None:
    # Keep the IDs aside for the submission.
    test_ids = test['ID'] if 'ID' in test.columns else None
    # Later steps turn the frames into positional arrays, so the test features
    # must be exactly the training columns, in training order.
    missing_in_test = [c for c in X_train.columns if c not in test.columns]
    if missing_in_test:
        raise KeyError(f"test.csv no contiene columnas usadas en train: {missing_in_test}")
    X_test_raw = test[list(X_train.columns)].copy()

if has_target_encoder and len(cat_cols) > 0:
    # NOTE(review): y_train holds the integer class codes produced by the
    # LabelEncoder, so the encoder is fitted against those codes. Confirm this
    # is the intended treatment for a multiclass target.
    encoder = TargetEncoder(cols=cat_cols, smoothing=0.3)
    X_train_enc = encoder.fit_transform(X_train, y_train)
    X_val_enc = encoder.transform(X_val)
    if X_test_raw is not None:
        X_test_enc = encoder.transform(X_test_raw)
elif len(cat_cols) > 0:
    print("Usando OneHotEncoder (fallback)")
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old name; try the new spelling first and fall back for older releases.
    try:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    ct = ColumnTransformer(transformers=[('ohe', ohe, cat_cols)], remainder='passthrough')
    X_train_tmp = ct.fit_transform(X_train)
    X_val_tmp = ct.transform(X_val)
    # The transformer emits the one-hot columns first and the passthrough
    # (non-categorical) columns after them; rebuild the labels in that order.
    ohe_cols = ct.named_transformers_['ohe'].get_feature_names_out(cat_cols).tolist()
    remainder_cols = [c for c in X_train.columns if c not in cat_cols]
    cols_final = ohe_cols + remainder_cols
    X_train_enc = pd.DataFrame(X_train_tmp, columns=cols_final)
    X_val_enc = pd.DataFrame(X_val_tmp, columns=cols_final)
    if X_test_raw is not None:
        X_test_tmp = ct.transform(X_test_raw)
        X_test_enc = pd.DataFrame(X_test_tmp, columns=cols_final)
else:
    # No categorical columns: the features are used as they are.
    X_train_enc = X_train.copy()
    X_val_enc = X_val.copy()
    if X_test_raw is not None:
        X_test_enc = X_test_raw

# Reset indices so every encoded frame is indexed 0..n-1.
for _frame in (X_train_enc, X_val_enc, X_test_enc):
    if _frame is not None:
        _frame.reset_index(drop=True, inplace=True)

# ---------- 8) Scaling ----------
# Standardise the originally-numeric columns that are still present after
# encoding. The scaler is fitted on the training frame and reused for the
# validation frame and, when loaded, the test frame.
scaler = StandardScaler()
common_num_cols = [col for col in num_cols if col in X_train_enc.columns]
if common_num_cols:
    X_train_enc[common_num_cols] = scaler.fit_transform(X_train_enc[common_num_cols])
    frames_to_transform = [X_val_enc]
    if test is not None:
        frames_to_transform.append(X_test_enc)
    for frame in frames_to_transform:
        frame[common_num_cols] = scaler.transform(frame[common_num_cols])

# ---------- 9) Quick initial training (baseline) ----------
import numpy as _np


def _frame_to_array(table):
    """Return ``table.values`` when the attribute exists, else ``_np.array(table)``."""
    if hasattr(table, "values"):
        return table.values
    return _np.array(table)


# Plain arrays are what the models below are trained and evaluated on.
X_train_arr = _frame_to_array(X_train_enc)
X_val_arr = _frame_to_array(X_val_enc)
if test is not None:
    X_test_arr = _frame_to_array(X_test_enc)

# Fixed-hyperparameter XGBoost used as the reference score.
baseline_kwargs = dict(
    objective='multi:softprob',
    num_class=len(le.classes_),
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
)
baseline = XGBClassifier(**baseline_kwargs)

try:
    baseline.fit(X_train_arr, y_train, eval_set=[(X_val_arr, y_val)], verbose=50)
except Exception as fit_error:
    # If the scikit-learn wrapper fails, train an equivalent booster through
    # the native API and wrap it so `predict` still returns class indices.
    print("Baseline XGBClassifier.fit falló (usando fallback xgb.train()). Error:", fit_error)
    baseline_ok = False
    dtrain_bl = xgb.DMatrix(X_train_arr, label=y_train)
    dval_bl = xgb.DMatrix(X_val_arr, label=y_val)
    fallback_params = {
        "objective": "multi:softprob",
        "num_class": len(le.classes_),
        "eta": 0.05,
        "max_depth": 6,
        "eval_metric": "mlogloss",
    }
    booster_bl = xgb.train(
        fallback_params,
        dtrain_bl,
        num_boost_round=300,
        evals=[(dtrain_bl, "train"), (dval_bl, "eval")],
        verbose_eval=50,
    )

    class WrappedBoosterBl:
        """Minimal adapter exposing ``predict`` on top of a native booster."""

        def __init__(self, booster_obj):
            self.booster = booster_obj

        def predict(self, X):
            # The booster outputs one probability per class; pick the largest.
            class_probs = self.booster.predict(xgb.DMatrix(X))
            return _np.argmax(class_probs, axis=1)

    baseline = WrappedBoosterBl(booster_bl)
else:
    baseline_ok = True

# Baseline evaluation on the validation split.
y_val_pred = baseline.predict(X_val_arr)
acc_baseline = accuracy_score(y_val, y_val_pred)
print("Accuracy validación (baseline):", acc_baseline)

# ---------- 10) Fast Optuna search (subsample) with pruner and timeout ----------
# Quick-search settings (tunable)
SAMPLE_SIZE = 150_000       # rows drawn for the search (lower it if memory is tight)
N_TRIALS_FAST = 40          # maximum number of trials
NUM_BOOST_ROUND_FAST = 600  # boosting rounds per trial
EARLY_STOP_FAST = 30        # early-stopping patience per trial
OPTUNA_TIMEOUT = 3600       # overall time budget in seconds (1 hour)

print("Preparando Optuna rápido (submuestra). Esto tarda menos que explorar todo el dataset.")

# Draw a reproducible subsample (fixed seed, no replacement) of the training rows.
n_rows = len(X_train_arr)
sample_n = int(min(SAMPLE_SIZE, n_rows))
rng = np.random.RandomState(42)
idx = rng.choice(n_rows, size=sample_n, replace=False)
y_sub = np.array(y_train)[idx]
X_sub = X_train_arr[idx]

# Inner stratified 80/20 split of the subsample, used by the objective.
inner_split = train_test_split(
    X_sub,
    y_sub,
    test_size=0.2,
    random_state=42,
    stratify=y_sub,
)
X_tr_s, X_va_s, y_tr_s, y_va_s = inner_split

# Objective with real pruning support: intermediate validation accuracy is
# reported to Optuna every boosting round so the study's pruner can act.
class _ValAccuracyPruningCallback(xgb.callback.TrainingCallback):
    """Report per-round validation accuracy to an Optuna trial and prune on request."""

    def __init__(self, trial, eval_name="eval"):
        super().__init__()
        self._trial = trial
        self._eval_name = eval_name

    def after_iteration(self, model, epoch, evals_log):
        # "merror" is the multiclass error rate, so 1 - merror is the same
        # quantity (accuracy) that the objective finally returns.
        accuracy = 1.0 - evals_log[self._eval_name]["merror"][-1]
        self._trial.report(accuracy, step=epoch)
        if self._trial.should_prune():
            raise optuna.TrialPruned(f"Trial podado en la iteración {epoch}")
        return False  # False tells XGBoost to keep training


def objective_fast(trial):
    """Train one XGBoost candidate on the subsample and return its validation accuracy.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to receive
            the intermediate accuracies.

    Returns:
        Accuracy on the inner validation split (``X_va_s`` / ``y_va_s``),
        computed with the trees up to the best early-stopping iteration.

    Raises:
        optuna.TrialPruned: When the study's pruner decides to stop the trial.
    """
    params = {
        "objective": "multi:softprob",
        "num_class": len(le.classes_),
        "eta": trial.suggest_float("eta", 1e-3, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        # Order matters: early stopping follows the LAST metric listed, so it
        # still monitors mlogloss; merror is only tracked for the pruner.
        "eval_metric": ["merror", "mlogloss"],
        "verbosity": 0
    }
    dtr = xgb.DMatrix(X_tr_s, label=y_tr_s)
    dva = xgb.DMatrix(X_va_s, label=y_va_s)
    booster = xgb.train(
        params=params,
        dtrain=dtr,
        num_boost_round=NUM_BOOST_ROUND_FAST,
        evals=[(dtr, "train"), (dva, "eval")],
        early_stopping_rounds=EARLY_STOP_FAST,
        verbose_eval=False,
        callbacks=[_ValAccuracyPruningCallback(trial, eval_name="eval")],
    )
    # xgb.train returns the model from the last round, not the best one, so
    # restrict prediction to the trees up to the best iteration.
    preds = booster.predict(dva, iteration_range=(0, booster.best_iteration + 1))
    pred_idx = np.argmax(preds, axis=1)
    return accuracy_score(y_va_s, pred_idx)

# Create the study with a MedianPruner; the time budget is enforced by optimize().
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
# Seed the sampler like every other random component in this notebook (42) so
# the sequence of suggested hyperparameters is repeatable across runs.
sampler = optuna.samplers.TPESampler(seed=42)
study_fast = optuna.create_study(
    direction="maximize",
    study_name="xgb_opt_fast",
    pruner=pruner,
    sampler=sampler,
)

print("Ejecutando Optuna (rápido). Timeout:", OPTUNA_TIMEOUT, "s. Trials máximos:", N_TRIALS_FAST)
t0 = time.time()
study_fast.optimize(objective_fast, n_trials=N_TRIALS_FAST, n_jobs=1, timeout=OPTUNA_TIMEOUT)
t1 = time.time()
print("Optuna rápido finalizado. Tiempo (s):", int(t1-t0))

# Count only trials that actually finished; study_fast.trials also holds
# trials in any other state.
completed_trials = [t for t in study_fast.trials if t.state == optuna.trial.TrialState.COMPLETE]
print("Trials completados:", len(completed_trials))
if not completed_trials:
    # best_value / best_params are undefined without a completed trial; fail
    # here with an actionable message instead.
    raise RuntimeError(
        "Optuna no completó ningún trial dentro del timeout; "
        "aumenta OPTUNA_TIMEOUT o reduce SAMPLE_SIZE / NUM_BOOST_ROUND_FAST"
    )
print("Mejor accuracy (submuestra):", study_fast.best_value)
print("Mejores parámetros (submuestra):", study_fast.best_params)

# Save the study
OUT_DIR = "/content/models_saberpro"
os.makedirs(OUT_DIR, exist_ok=True)
joblib.dump(study_fast, os.path.join(OUT_DIR, "optuna_xgb_fast.joblib"))

# ---------- 11) Retrain the final XGBoost on train+val with the best parameters ----------
print("Reentrenando XGBoost final sobre train+val con mejores parámetros...")
# Start from the tuned hyperparameters and add the fixed training settings.
best = dict(study_fast.best_params)
best.update({
    "objective": "multi:softprob",
    "num_class": len(le.classes_),
    "eval_metric": "mlogloss",
    "verbosity": 0,
})

# Stack the training and validation splits into one dataset.
X_full = np.concatenate((X_train_arr, X_val_arr), axis=0)
y_full = np.concatenate((np.asarray(y_train), np.asarray(y_val)))
dfull = xgb.DMatrix(X_full, label=y_full)

# Estimate the number of boosting rounds with 5-fold stratified CV; the
# length of the returned table is taken as the round count.
cv_settings = dict(
    num_boost_round=800,
    nfold=5,
    stratified=True,
    early_stopping_rounds=EARLY_STOP_FAST,
    verbose_eval=False,
    as_pandas=True,
    seed=42,
)
cvres = xgb.cv(params=best, dtrain=dfull, **cv_settings)
best_nrounds = len(cvres)
print("num_boost_round estimado por CV:", best_nrounds)

# Train the final booster and persist it as JSON.
xgb_model_path = os.path.join(OUT_DIR, "xgb_booster_final_fast.json")
booster_final = xgb.train(
    params=best,
    dtrain=dfull,
    num_boost_round=best_nrounds,
    verbose_eval=50,
)
booster_final.save_model(xgb_model_path)
print("XGBoost final guardado:", xgb_model_path)

# ---------- 12) Train LightGBM on train+val ----------
print("Entrenando LightGBM sobre train+val...")
lgb_train = lgb.Dataset(X_full, label=y_full)
# Fixed (untuned) LightGBM configuration for the multiclass problem.
lgb_params = {
    "objective": "multiclass",
    "num_class": len(le.classes_),
    "metric": "multi_logloss",
    "learning_rate": 0.03,
    "num_leaves": 128,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1,
    "min_data_in_leaf": 20,
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
}
# NOTE(review): the round count reuses best_nrounds, which was estimated by
# xgb.cv for the XGBoost model - confirm it is also suitable for LightGBM.
lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=best_nrounds)
lgb_model_path = os.path.join(OUT_DIR, "lgb_model_fast.txt")
lgb_model.save_model(lgb_model_path)
print("LightGBM guardado:", lgb_model_path)

# ---------- 13) Ensemble on test (average of class probabilities) ----------
# Everything in this step needs test.csv. When it was not loaded the step is
# skipped, rather than failing on names that are only defined when `test`
# exists.
errors = []
if test is None:
    print("No se cargó test.csv; se omite la generación del submission.")
else:
    print("Generando probabilidades en test y promediando (ensemble)...")
    if 'X_test_arr' not in globals():
        X_test_arr = X_test_enc.values if hasattr(X_test_enc, "values") else np.array(X_test_enc)

    dtest = xgb.DMatrix(X_test_arr)
    probs_xgb_test = booster_final.predict(dtest)
    probs_lgb_test = lgb_model.predict(X_test_arr)

    # Averaging only makes sense when both models return the same
    # (rows, classes) layout; stop instead of relying on the addition to fail.
    if probs_xgb_test.shape != probs_lgb_test.shape:
        raise ValueError(
            f"Shapes de probabilidades diferentes: {probs_xgb_test.shape} vs {probs_lgb_test.shape}"
        )

    avg_probs_test = (probs_xgb_test + probs_lgb_test) / 2.0
    preds_test_idx = np.argmax(avg_probs_test, axis=1)
    # Map the class indices back to the original label strings.
    preds_test_labels = le.inverse_transform(preds_test_idx)

    # Validations: nothing is written unless all of them pass.
    if test_ids is None:
        # Without an ID column there is nothing to key the submission on.
        errors.append("test.csv no tiene columna 'ID'; no se puede construir el submission")
        submission_ens = pd.DataFrame({"RENDIMIENTO_GLOBAL": preds_test_labels})
    else:
        submission_ens = pd.DataFrame({
            "ID": test_ids.values if hasattr(test_ids, "values") else test_ids,
            "RENDIMIENTO_GLOBAL": preds_test_labels
        })
        if len(submission_ens) != len(test):
            errors.append(f"Mismatch longitudes: submission={len(submission_ens)} test={len(test)}")
        try:
            ids_equal = submission_ens['ID'].astype(str).tolist() == test['ID'].astype(str).tolist()
        except Exception:
            ids_equal = False
        if not ids_equal:
            # Try to put the rows back in the order of test['ID'].
            try:
                submission_ens = submission_ens.set_index("ID").loc[test['ID']].reset_index()
            except Exception as e:
                errors.append("No se pudo reordenar submission según test[ID]: " + str(e))
        if submission_ens['ID'].duplicated().any():
            errors.append("IDs duplicados en submission")

    valid_classes = set(le.classes_)
    if not set(submission_ens['RENDIMIENTO_GLOBAL']).issubset(valid_classes):
        errors.append("Etiquetas en submission no coinciden con clases del encoder")

    if not errors:
        path_ens = os.path.join(DRIVE_PATH, "submission_ensemble_fast.csv")
        submission_ens.to_csv(path_ens, index=False)
        print("Submission ensemble guardado en:", path_ens)
        print(submission_ens.head())
    else:
        print("Errores detectados, no se guardó submission:")
        for e in errors:
            print("-", e)

# ---------- 14) Ensemble accuracy on the validation split ----------
# booster_final and lgb_model were both fitted on X_full, which is the
# training split stacked with this validation split. The rows scored here
# were therefore seen during training: this is an in-sample figure, not a
# hold-out estimate, and the message says so explicitly.
dval = xgb.DMatrix(X_val_arr)
probs_xgb_val = booster_final.predict(dval)
probs_lgb_val = lgb_model.predict(X_val_arr)
avg_probs_val = (probs_xgb_val + probs_lgb_val) / 2.0
preds_ens_val = np.argmax(avg_probs_val, axis=1)
acc_ens = accuracy_score(y_val, preds_ens_val)
print(
    "Accuracy del ensemble sobre el split de validación "
    "(in-sample: incluido en el reentrenamiento, estimación optimista):",
    acc_ens,
)

# ---------- 15) Save artifacts ----------
# NOTE(review): OUT_DIR is not inside the mounted DRIVE_PATH folder;
# presumably it does not persist once the Colab runtime is recycled - confirm
# and copy the files to Drive if they must be kept.
joblib.dump(study_fast, os.path.join(OUT_DIR, "optuna_xgb_fast.joblib"))
joblib.dump(le, os.path.join(OUT_DIR, "label_encoder.joblib"))
joblib.dump(scaler, os.path.join(OUT_DIR, "scaler.joblib"))

# Saving the fitted encoders stays best-effort (a failure does not abort the
# run), but it is reported: the saved models cannot be applied to new raw data
# without the matching encoder.
if has_target_encoder and encoder is not None:
    try:
        joblib.dump(encoder, os.path.join(OUT_DIR, "target_encoder.joblib"))
    except Exception as e:
        print("Aviso: no se pudo guardar target_encoder.joblib:", e)
if ct is not None:
    try:
        joblib.dump(ct, os.path.join(OUT_DIR, "column_transformer_ohe.joblib"))
    except Exception as e:
        print("Aviso: no se pudo guardar column_transformer_ohe.joblib:", e)

print("Proceso rápido completado. Revisa submission_ensemble_fast.csv en tu Drive.")





Dependencias OK
Montando Google Drive...
Mounted at /content/drive
Cargando: /content/drive/MyDrive/DataProyectoIA/train_limpio.csv
Train shape: (644232, 21)
Cargando: /content/drive/MyDrive/DataProyectoIA/test.csv
Test shape: (296786, 20)
Clases: ['alto', 'bajo', 'medio-alto', 'medio-bajo']
Num cols: 5 Cat cols: 14
Split realizado (70/30): (450962, 19) (193270, 19)
[0]	validation_0-mlogloss:1.37815
[50]	validation_0-mlogloss:1.24427
[100]	validation_0-mlogloss:1.22591
[150]	validation_0-mlogloss:1.21933
[200]	validation_0-mlogloss:1.21559
[250]	validation_0-mlogloss:1.21284
[299]	validation_0-mlogloss:1.21101


[I 2025-11-23 02:21:59,116] A new study created in memory with name: xgb_opt_fast


Accuracy validación (baseline): 0.42692606198582295
Preparando Optuna rápido (submuestra). Esto tarda menos que explorar todo el dataset.
Ejecutando Optuna (rápido). Timeout: 3600 s. Trials máximos: 40


[I 2025-11-23 02:22:42,478] Trial 0 finished with value: 0.4246666666666667 and parameters: {'eta': 0.045464867828470444, 'max_depth': 6, 'subsample': 0.8469283365722686, 'colsample_bytree': 0.9777576695519508, 'lambda': 1.154878748794114, 'alpha': 0.06781806088084787, 'min_child_weight': 7}. Best is trial 0 with value: 0.4246666666666667.
[I 2025-11-23 02:23:27,192] Trial 1 finished with value: 0.4076666666666667 and parameters: {'eta': 0.001836030198941853, 'max_depth': 4, 'subsample': 0.7115406951451138, 'colsample_bytree': 0.6784829792068909, 'lambda': 1.6332501299920132, 'alpha': 0.14056964687152326, 'min_child_weight': 5}. Best is trial 0 with value: 0.4246666666666667.
[I 2025-11-23 02:24:15,195] Trial 2 finished with value: 0.42343333333333333 and parameters: {'eta': 0.04994466840077553, 'max_depth': 5, 'subsample': 0.9110736036665124, 'colsample_bytree': 0.8838252134873121, 'lambda': 0.24548978780772765, 'alpha': 4.733561799652069, 'min_child_weight': 8}. Best is trial 0 with 

Optuna rápido finalizado. Tiempo (s): 1859
Trials completados: 40
Mejor accuracy (submuestra): 0.42743333333333333
Mejores parámetros (submuestra): {'eta': 0.03685349372490847, 'max_depth': 7, 'subsample': 0.8736633266374918, 'colsample_bytree': 0.5478973934964821, 'lambda': 0.05254994286772205, 'alpha': 0.04490229168860766, 'min_child_weight': 8}
Reentrenando XGBoost final sobre train+val con mejores parámetros...
num_boost_round estimado por CV: 800
XGBoost final guardado: /content/models_saberpro/xgb_booster_final_fast.json
Entrenando LightGBM sobre train+val...
LightGBM guardado: /content/models_saberpro/lgb_model_fast.txt
Generando probabilidades en test y promediando (ensemble)...
Submission ensemble guardado en: /content/drive/MyDrive/DataProyectoIA/submission_ensemble_fast.csv
       ID RENDIMIENTO_GLOBAL
0  550236               bajo
1   98545         medio-alto
2  499179               alto
3  782980               bajo
4  785185               bajo
Accuracy hold-out del ensemble