In [None]:
# ===============================
# Notebook completo para Colab
# - EDA básico
# - Preprocesamiento (imputación, factorize)
# - Target encoding CV-based para categóricas de alta cardinalidad
# - StratifiedKFold CV + OOF
# - Optuna HPO para LightGBM (opcional) + RandomizedSearch para RF
# - Ensemble promediando probabilidades LGBM + RF
# - Generación y descarga de submission.csv
# ===============================

# 0) Install dependencies (the leading "!" is an IPython shell escape, so this
#    line only works when the file is executed as a notebook cell)
!pip install -q lightgbm scikit-learn pandas optuna

# 1) Imports
from google.colab import files
import pandas as pd
import numpy as np
import os, time
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import optuna
import warnings
warnings.filterwarnings("ignore")

# 2) Upload the input files through the Colab file picker, one prompt per file.
#    The third prompt now names both spellings accepted by the loading step
#    (it used to repeat "submission_example.csv" twice).
for upload_prompt in (
    "üìå Sube train.csv",
    "üìå Sube test.csv",
    "üìå Sube submission_example.csv (o sample_submission.csv)",
):
    print(upload_prompt)
    _ = files.upload()

# 3) Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# The sample submission may exist under several names; keep the first
# candidate found on disk, or None when there is no match.
sample_name = next(
    (
        name
        for name in ["submission_example.csv", "sample_submission.csv", "submission_example", "sample_submission"]
        if os.path.exists(name)
    ),
    None,
)

if sample_name is not None:
    sample = pd.read_csv(sample_name)
else:
    # No sample file available: build a placeholder from the first column of test
    print("No encontr√© sample_submission en los nombres comunes; se crear√° uno temporal desde test.")
    sample = pd.DataFrame({ 'ID': test.iloc[:,0], 'RENDIMIENTO_GLOBAL': ['bajo']*len(test) })

print(f"Shapes -> train: {train.shape}, test: {test.shape}, sample: {sample.shape}")

# 4) Target column: validate it BEFORE the first access, so that a missing
#    column raises the explicit ValueError below instead of a bare KeyError
#    coming from the EDA prints.
TARGET = "RENDIMIENTO_GLOBAL"
if TARGET not in train.columns:
    raise ValueError(f"No encontr√© la columna target '{TARGET}' en train.csv")

# Quick EDA and sanity checks
print("\n-- EDA R√°pido --")
print("Nulos por columna (top 20):")
print(train.isna().sum().sort_values(ascending=False).head(20))
print("\nDistribuci√≥n target:")
print(train[TARGET].value_counts())
print("\nPrimeras columnas:", train.columns.tolist()[:30])

# 5) Map target labels to integer codes. Sorting the unique labels makes the
#    label -> code assignment reproducible across runs.
target_as_str = train[TARGET].astype(str)
unique_targets = sorted(target_as_str.unique().tolist())
class_mapping = {c:i for i,c in enumerate(unique_targets)}
inverse_mapping = {i:c for c,i in class_mapping.items()}
print("\nMapeo target (label -> c√≥digo):", class_mapping)

train['target_num'] = target_as_str.map(class_mapping)

# 6) Split features and target
X = train.drop(columns=[TARGET, 'target_num'])
y = train['target_num']

# Detect the identifier column by case-insensitive name. The comparison is
# done on the lower-cased name, so only lower-case candidates are meaningful
# (the former 'Id' / 'ID' entries could never match).
test_ids_candidates = [c for c in test.columns if c.lower() in ("id", "ids")]
if test_ids_candidates:
    id_col = test_ids_candidates[0]
else:
    # fallback: use the first column as ID
    id_col = test.columns[0]
    print(f"No encontr√© columna ID expl√≠cita en test; usar√© la primera columna '{id_col}' como ID.")
print("ID column detected:", id_col)

# 7) Stack train+test so both receive identical transformations
all_data = pd.concat([X, test], axis=0, ignore_index=True)
if test_ids_candidates:
    # A row identifier carries no signal; leaving it in the matrix lets the
    # tree models split on it. It is only removed when it was detected by
    # name: the positional fallback might be a real feature. The submission
    # step still reads the IDs from `test`, which is left untouched.
    all_data = all_data.drop(columns=[id_col])
n_train = len(X)
n_test = len(test)
print("Total filas combinadas (train+test):", all_data.shape)

# 8) Split the columns by dtype to decide how each one is encoded
num_cols = list(all_data.select_dtypes(include=["int64","float64"]).columns)
cat_cols = list(all_data.select_dtypes(include=["object","category"]).columns)
print(f"Num cols: {len(num_cols)}, Cat cols: {len(cat_cols)}")

# 9) Median-impute numeric columns; the positions that were missing are kept
#    in a companion 0/1 "<column>_na_flag" column.
for col_name in num_cols:
    missing_mask = all_data[col_name].isna()
    if not missing_mask.any():
        continue
    all_data[col_name + "_na_flag"] = missing_mask.astype(int)
    all_data[col_name] = all_data[col_name].fillna(all_data[col_name].median())

# 10) Categorical columns: count distinct values ONCE and reuse the result
#     (the counts used to be recomputed twice per column over the full frame
#     while this Series went unused).
cardinality = all_data[cat_cols].nunique().sort_values(ascending=False)
# Columns with more distinct values than this threshold are target-encoded
TE_CARDINALITY = 20
te_cols = [c for c in cat_cols if cardinality[c] > TE_CARDINALITY]
low_card_cols = [c for c in cat_cols if cardinality[c] <= TE_CARDINALITY]
print("Target-encoding columns (high-cardinality):", te_cols)
print("Low-cardinality (factorize):", low_card_cols)

# 11) Integer-code the low-cardinality columns; missing values become their
#     own "MISSING" category before coding.
for c in low_card_cols:
    all_data[c] = all_data[c].fillna("MISSING").astype(str).factorize()[0]

# 12) Target encoding (out-of-fold) for high-cardinality columns
# We'll perform target-encoding using only TRAIN rows; for test use global means from train
def target_encode_oof(train_df, full_df, col, target, n_splits=5, random_state=42):
    """Out-of-fold mean-target encoding of one categorical column.

    Parameters
    ----------
    train_df : DataFrame holding only the train rows; must contain ``col``.
    full_df : DataFrame with the train rows first and the test rows after
        them; every row from position ``len(train_df)`` on is treated as test.
    col : name of the column to encode.
    target : numeric labels for the train rows, in the same row order as
        ``train_df``. It is also used to stratify the folds.
    n_splits, random_state : forwarded to ``StratifiedKFold`` (shuffled).

    Returns
    -------
    tuple ``(oof, test_values, mapping)``
        ``oof`` - array of length ``len(train_df)``; each row is encoded with
        category means computed on the other folds only.
        ``test_values`` - array with the test rows encoded using means
        computed on all train rows.
        ``mapping`` - dict ``category -> mean target`` over all train rows.
        Categories without a mean fall back to the global target mean.
    """
    # The train/test boundary comes from the argument itself rather than from
    # a module-level ``n_train`` variable.
    n_rows = len(train_df)
    # Work positionally so neither the caller's index nor the presence of the
    # target as a column inside train_df matters.
    categories = train_df[col].reset_index(drop=True)
    labels = pd.Series(np.asarray(target, dtype=float))
    global_mean = labels.mean()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof = np.full(n_rows, np.nan)
    for tr_idx, val_idx in skf.split(train_df, target):
        fold_means = labels.iloc[tr_idx].groupby(categories.iloc[tr_idx]).mean()
        oof[val_idx] = categories.iloc[val_idx].map(fold_means).to_numpy(dtype=float)
    # Categories unseen in the fitting folds get the global mean
    oof = np.where(np.isnan(oof), global_mean, oof)

    # Test rows are encoded with the means of the complete train set
    train_means_full = labels.groupby(categories).mean()
    test_values = (
        full_df.iloc[n_rows:][col].map(train_means_full).fillna(global_mean).to_numpy()
    )
    return oof, test_values, train_means_full.to_dict()

# Target encoding groups by the original category strings, so make sure the
# high-cardinality columns are plain strings with an explicit "MISSING" level.
for c in te_cols:
    all_data[c] = all_data[c].fillna("MISSING").astype(str)

# Containers: per-column OOF values, test values and category -> mean maps
train_te_arrays = {}
test_te_arrays = {}
te_maps = {}

if te_cols:
    print("\nAplicando target-encoding CV-based a columnas con alta cardinalidad...")
    # Only the columns being encoded plus the numeric target are needed for
    # the group means, so slice just those instead of copying the whole
    # combined frame twice. all_data already has a 0..N-1 index (it was
    # concatenated with ignore_index=True) and can be passed as-is.
    train_slice = all_data.iloc[:n_train][te_cols].reset_index(drop=True)
    train_slice['target_num'] = y.values
    for c in te_cols:
        print("TE:", c)
        oof_vals, test_vals, mapping = target_encode_oof(train_slice, all_data, c, train_slice['target_num'], n_splits=5)
        # Train rows come first in all_data, so one concatenated array fills
        # the new "<column>_te" feature in a single assignment.
        all_data[c + "_te"] = np.concatenate([oof_vals, test_vals])
        train_te_arrays[c] = oof_vals
        test_te_arrays[c] = test_vals
        te_maps[c] = mapping
else:
    print("No hay columnas de alta cardinalidad para target-encoding.")

# 13) After TE, keep ONE integer-coded copy of each high-cardinality column
#     ("<column>_raw_factor") next to its "<column>_te" feature and drop the
#     raw strings. Previously the raw column was factorized a second time by
#     step 14, which put two identical features into the model matrix.
for c in te_cols:
    all_data[c + "_raw_factor"] = all_data[c].factorize()[0]
all_data = all_data.drop(columns=te_cols)

# 14) Safety net: integer-code any object/category column that is still left
#     (none are expected at this point).
remaining_cat_cols = all_data.select_dtypes(include=["object","category"]).columns.tolist()
for c in remaining_cat_cols:
    all_data[c] = all_data[c].astype(str).factorize()[0]

# 15) Split the combined frame back into train and test feature matrices
X_proc, test_proc = (
    all_data.iloc[rows].reset_index(drop=True)
    for rows in (slice(None, n_train), slice(n_train, None))
)
print("X_proc shape:", X_proc.shape, " test_proc shape:", test_proc.shape)

# 16) Cross-validation settings shared by both models
SEED = 42
N_SPLITS = 5
skf = StratifiedKFold(shuffle=True, random_state=SEED, n_splits=N_SPLITS)

# Per-class probability buffers: out-of-fold for train, fold-averaged for test
n_classes = len(class_mapping)
oof_probs_lgb, oof_probs_rf = (np.zeros((n_train, n_classes)) for _ in range(2))
preds_test_lgb, preds_test_rf = (np.zeros((n_test, n_classes)) for _ in range(2))

# LightGBM settings that stay fixed; the tunable ones are added later
lgb_base_params = dict(
    objective="multiclass",
    num_class=n_classes,
    metric="multi_logloss",
    verbosity=-1,
    seed=SEED,
)

# 17) Optuna search for LightGBM: run once to obtain one global parameter set
USE_OPTUNA = True
N_TRIALS_OPTUNA = 20  # lower/raise according to the available time budget

def optuna_lgb_objective(trial, Xtr, ytr, Xval, yval):
    """Train one LightGBM multiclass model for an Optuna trial and score it.

    The hyper-parameters are sampled from ``trial``; the model is fitted on
    ``(Xtr, ytr)`` with early stopping (50 rounds, at most 1000 boosting
    rounds) monitored on ``(Xval, yval)``.

    Returns the negative validation accuracy, so the study must minimize it.
    """
    # Sampled in a fixed order so a seeded sampler reproduces the same trials
    search_space = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 200),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 10),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
    }
    params = {
        "objective": "multiclass",
        "num_class": n_classes,
        "metric": "multi_logloss",
        "verbosity": -1,
        "seed": SEED,
        **search_space,
    }

    train_set = lgb.Dataset(Xtr, label=ytr)
    valid_set = lgb.Dataset(Xval, label=yval, reference=train_set)
    booster = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=0)],
    )

    val_probs = booster.predict(Xval, num_iteration=booster.best_iteration)
    val_accuracy = accuracy_score(yval, np.argmax(val_probs, axis=1))
    # Early stopping tracks log-loss, but the value handed to Optuna is accuracy
    return -1.0 * val_accuracy

best_lgb_params = None
if not USE_OPTUNA:
    # No tuning requested: fixed settings plus two hand-picked defaults
    best_lgb_params = {**lgb_base_params, "learning_rate": 0.05, "num_leaves": 31}
else:
    print("\n==> Ejecutando Optuna para LightGBM (puede tardar)...")
    # A single stratified 80/20 hold-out keeps each trial cheap
    X_tr_split, X_val_split, y_tr_split, y_val_split = train_test_split(
        X_proc, y, test_size=0.2, random_state=SEED, stratify=y
    )

    def optuna_wrapper(trial):
        """Bind the hold-out split to the objective so Optuna passes only the trial."""
        return optuna_lgb_objective(trial, X_tr_split, y_tr_split, X_val_split, y_val_split)

    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )
    study.optimize(optuna_wrapper, n_trials=N_TRIALS_OPTUNA, show_progress_bar=True)
    print("Optuna best value (neg acc):", study.best_value)
    print("Optuna best params:", study.best_params)

    # Fixed settings first, tuned values layered on top
    best_lgb_params = {**lgb_base_params, **study.best_params}
    # make sure num_leaves is an integer
    best_lgb_params["num_leaves"] = int(best_lgb_params.get("num_leaves", 31))
    print("Best LGB params prepared.")

# 18) Randomized hyper-parameter search for RandomForest (one global search)
print("\n==> Haciendo RandomizedSearch para RandomForest (r√°pido)...")
N_ITER_RF = 20
param_dist = dict(
    n_estimators=[100, 200, 400],
    max_depth=[5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rf = RandomForestClassifier(n_jobs=-1, random_state=SEED)
rs = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=N_ITER_RF,
    scoring="accuracy",
    cv=3,
    random_state=SEED,
    n_jobs=-1,
    verbose=0,
)
# The search runs on the full processed train matrix (factorized + TE features)
rs.fit(X_proc, y)
best_rf_params = rs.best_params_
print("Best RF params:", best_rf_params)

# 19) Cross-validation: fit LightGBM and RandomForest on every fold, storing
#     out-of-fold probabilities and accumulating the fold-averaged test ones
print("\n==> Iniciando StratifiedKFold CV (Entrenando LGBM + RF en cada fold)...")
for fold, (train_idx, val_idx) in enumerate(skf.split(X_proc, y), start=1):
    print(f"\n--- Fold {fold} ---")
    X_tr, y_tr = X_proc.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val_fold = X_proc.iloc[val_idx], y.iloc[val_idx]

    # --- LightGBM: early stopping on the validation fold picks the round count
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val_fold, reference=dtrain)
    lgb_params = best_lgb_params.copy()
    bst = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=1000,
        valid_sets=[dval],
        callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=0)],
    )
    best_iter = bst.best_iteration
    oof_probs_lgb[val_idx] = bst.predict(X_val, num_iteration=best_iter)
    preds_test_lgb += bst.predict(test_proc, num_iteration=best_iter) / N_SPLITS

    # --- RandomForest, refitted on this fold with the searched parameters
    rf_model = RandomForestClassifier(**best_rf_params, random_state=SEED, n_jobs=-1)
    rf_model.fit(X_tr, y_tr)
    pred_val_rf_probs = rf_model.predict_proba(X_val)
    pred_test_rf_probs = rf_model.predict_proba(test_proc)
    # predict_proba orders its columns by rf_model.classes_; scatter them into
    # fixed 0..n_classes-1 positions so the columns line up with LightGBM's.
    class_cols = rf_model.classes_.astype(int)
    aligned_val_rf = np.zeros((len(pred_val_rf_probs), n_classes))
    aligned_test_rf = np.zeros((n_test, n_classes))
    aligned_val_rf[:, class_cols] = pred_val_rf_probs
    aligned_test_rf[:, class_cols] = pred_test_rf_probs
    oof_probs_rf[val_idx] = aligned_val_rf
    preds_test_rf += aligned_test_rf / N_SPLITS

# 20) Out-of-fold accuracy of each model and of the averaged ensemble
oof_probs_ensemble = (oof_probs_lgb + oof_probs_rf) / 2
oof_pred_lgb, oof_pred_rf, oof_pred_ensemble = (
    probs.argmax(axis=1)
    for probs in (oof_probs_lgb, oof_probs_rf, oof_probs_ensemble)
)

for report_label, oof_pred in (
    ("\nOOF Accuracy LGB:", oof_pred_lgb),
    ("OOF Accuracy RF: ", oof_pred_rf),
    ("OOF Accuracy Ensemble (avg):", oof_pred_ensemble),
):
    print(report_label, accuracy_score(y, oof_pred))

# 21) Final test predictions: average both models' probabilities, take the
#     most likely class and map the integer code back to its label
final_test_probs = (preds_test_lgb + preds_test_rf) / 2
final_test_labels = final_test_probs.argmax(axis=1)
final_test_labels_str = list(map(inverse_mapping.__getitem__, final_test_labels))

# 22) Build submission.csv. The sample submission was loaded earlier but never
#     consulted; its header is used here so the file carries exactly the
#     column names the sample shows. If the sample does not have exactly two
#     columns, fall back to the detected ID column and the target name.
if sample.shape[1] == 2:
    sub_id_name, sub_target_name = sample.columns
else:
    sub_id_name, sub_target_name = id_col, "RENDIMIENTO_GLOBAL"

submission = pd.DataFrame({
    sub_id_name: test[id_col].values,
    sub_target_name: final_test_labels_str
})

# Ensure same number of rows as test
if len(submission) != n_test:
    raise ValueError("El archivo submission no tiene el mismo n√∫mero de filas que test.csv")
# A row-count difference against the sample is only reported, not raised, so
# a wrong sample file cannot discard predictions that are already computed.
if len(submission) != len(sample):
    print(f"Aviso: submission tiene {len(submission)} filas pero el ejemplo tiene {len(sample)}.")

submission.to_csv("submission.csv", index=False)
print("\n‚úî submission.csv creado correctamente. Primeras filas:")
print(submission.head())

# 23) Download the submission through the browser
files.download("submission.csv")

# 24) Optionally persist models and artifacts (disabled by default).
#     `bst` and `rf_model` are the objects left over from the LAST CV fold.
# bst.save_model("lgb_final_model.txt")  # uncomment to save the last LightGBM booster
# import joblib
# joblib.dump(rf_model, "rf_final_model.pkl")

print("\nFIN. Revisa submission.csv y s√∫belo manualmente a Kaggle (Submit Predictions).")


[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/404.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[90m‚ï∫[0m [32m399.4/404.7 kB[0m [31m22.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m404.7/404.7 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hüìå Sube train.csv


Saving train.csv to train.csv
üìå Sube test.csv


Saving test.csv to test.csv
üìå Sube submission_example.csv (o submission_example.csv)


Saving submission_example.csv to submission_example.csv
Shapes -> train: (692500, 21), test: (296786, 20), sample: (296786, 2)

-- EDA R√°pido --
Nulos por columna (top 20):
F_TIENEAUTOMOVIL               43623
F_TIENELAVADORA                39773
F_TIENECOMPUTADOR              38103
F_ESTRATOVIVIENDA              32137
E_HORASSEMANATRABAJA           30857
F_TIENEINTERNET.1              26629
F_TIENEINTERNET                26629
F_EDUCACIONMADRE               23664
F_EDUCACIONPADRE               23178
E_PAGOMATRICULAPROPIO           6498
E_VALORMATRICULAUNIVERSIDAD     6287
PERIODO_ACADEMICO                  0
ID                                 0
E_PRGM_DEPARTAMENTO                0
E_PRGM_ACADEMICO                   0
E_PRIVADO_LIBERTAD                 0
RENDIMIENTO_GLOBAL                 0
INDICADOR_1                        0
INDICADOR_2                        0
INDICADOR_3                        0
dtype: int64

Distribuci√≥n target:
RENDIMIENTO_GLOBAL
alto          175619
bajo      

[I 2025-11-28 00:09:49,942] A new study created in memory with name: no-name-33910d7c-c374-48ab-804e-9d5122e56192


  0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[431]	valid_0's multi_logloss: 1.19205
[I 2025-11-28 00:15:16,398] Trial 0 finished with value: -0.43812274368231047 and parameters: {'learning_rate': 0.030710573677773714, 'num_leaves': 192, 'min_child_samples': 75, 'feature_fraction': 0.759195090518222, 'bagging_fraction': 0.4936111842654619, 'bagging_freq': 1, 'lambda_l1': 0.2904180608409973, 'lambda_l2': 4.330880728874676}. Best is trial 0 with value: -0.43812274368231047.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[277]	valid_0's multi_logloss: 1.19146
[I 2025-11-28 00:18:31,565] Trial 1 finished with value: -0.4376823104693141 and parameters: {'learning_rate': 0.06054365855469246, 'num_leaves': 148, 'min_child_samples': 6, 'feature_fraction': 0.9819459112971965, 'bagging_fraction': 0.899465584480253, 'bagging_freq': 2, 'lambda_l1': 0.9091248360355031, 'lambda_l2': 0.9170225492671691}