# Modelo LightGBM

## Instalación de dependencias e imports

In [None]:
# Dependencias necesarias (ejecutar una vez por sesión de Colab T4)
!pip install -q lightgbm optuna seaborn kaggle category_encoders

In [None]:
import gc
import json
import os
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import torch
from category_encoders import TargetEncoder
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Keep Optuna quiet: only messages at WARNING level or above are logged.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seed Python's and NumPy's global random generators so runs are repeatable.
random.seed(42)
np.random.seed(42)

In [None]:
# === PREPROCESSING VARIABLES ===
# Name of the label column.
TARGET_COL = "RENDIMIENTO_GLOBAL"

# === COLUMNS IDENTIFIED DURING EXPLORATORY ANALYSIS ===

# Columns that will not be used
DROP_CANDIDATES = [
    "ID",  # No predictive signal
    "F_TIENEINTERNET.1",  # Duplicates the information in F_TIENEINTERNET
    "E_PRIVADO_LIBERTAD",  # VERY low variance (≈0.005%)
]

# High-cardinality columns
HIGH_CARDINALITY = [
    "E_PRGM_ACADEMICO",  # More than 900 categories
    "E_PRGM_DEPARTAMENTO",  # More than 30 categories
]

# Low-cardinality columns
LOW_CARDINALITY = [
    "F_TIENEINTERNET",
    "F_TIENECOMPUTADOR",
    "F_TIENEAUTOMOVIL",
    "F_TIENELAVADORA",
    "E_PAGOMATRICULAPROPIO",
]

# Mapping for ordinal columns: column name -> its categories listed from
# lowest to highest rank. "Desconocido" comes first in every list.
# NOTE(review): for the parent-education columns, "No Aplica" and "No sabe"
# are ranked above "Postgrado"; confirm that this ordering is intended.
ORDINAL_MAP = {
    "F_ESTRATOVIVIENDA": [
        "Desconocido",
        "Sin Estrato",
        "Estrato 1",
        "Estrato 2",
        "Estrato 3",
        "Estrato 4",
        "Estrato 5",
        "Estrato 6",
    ],
    "E_HORASSEMANATRABAJA": [
        "Desconocido",
        "0",
        "Menos de 10 horas",
        "Entre 11 y 20 horas",
        "Entre 21 y 30 horas",
        "Más de 30 horas",
    ],
    "E_VALORMATRICULAUNIVERSIDAD": [
        "Desconocido",
        "No pagó matrícula",
        "Menos de 500 mil",
        "Entre 500 mil y menos de 1 millón",
        "Entre 1 millón y menos de 2.5 millones",
        "Entre 2.5 millones y menos de 4 millones",
        "Entre 4 millones y menos de 5.5 millones",
        "Entre 5.5 millones y menos de 7 millones",
        "Más de 7 millones",
    ],
    "F_EDUCACIONPADRE": [
        "Desconocido",
        "Ninguno",
        "Primaria incompleta",
        "Primaria completa",
        "Secundaria (Bachillerato) incompleta",
        "Secundaria (Bachillerato) completa",
        "Técnica o tecnológica incompleta",
        "Técnica o tecnológica completa",
        "Educación profesional incompleta",
        "Educación profesional completa",
        "Postgrado",
        "No Aplica",
        "No sabe",
    ],
    "F_EDUCACIONMADRE": [
        "Desconocido",
        "Ninguno",
        "Primaria incompleta",
        "Primaria completa",
        "Secundaria (Bachillerato) incompleta",
        "Secundaria (Bachillerato) completa",
        "Técnica o tecnológica incompleta",
        "Técnica o tecnológica completa",
        "Educación profesional incompleta",
        "Educación profesional completa",
        "Postgrado",
        "No Aplica",
        "No sabe",
    ],
}

# Numeric columns
NUMERIC_COLUMNS = [
    "PERIODO_ACADEMICO",
    "INDICADOR_1",
    "INDICADOR_2",
    "INDICADOR_3",
    "INDICADOR_4",
]

# === CLASS IDENTIFICATION ===

# Target labels; a label's position in this list is its integer class index.
CLASS_NAMES = ["alto", "medio-alto", "medio-bajo", "bajo"]
# Lookup tables between label names and integer class indices.
CLASS2IDX = {cls: idx for idx, cls in enumerate(CLASS_NAMES)}
IDX2CLASS = {idx: cls for cls, idx in CLASS2IDX.items()}

# === MODEL CONFIGURATION ===

N_SPLITS = 5  # number of cross-validation folds
RANDOM_STATE = 42  # shared seed for fold splitting and model training
# Directory for saved artifacts; created up front (with parents) if missing.
ARTIFACT_DIR = Path("./artifacts_lightgbm")
ARTIFACT_DIR.mkdir(exist_ok=True, parents=True)

# Reports whether PyTorch can see a CUDA device. In the code shown here the
# value is only printed as a hint.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device hint: {device} | Artifacts: {ARTIFACT_DIR.resolve()}")

## Creación del Dataset

In [None]:
# Point the Kaggle CLI at the current directory for its configuration
# (presumably kaggle.json must be uploaded here first; TODO confirm).
os.environ["KAGGLE_CONFIG_DIR"] = "."
# Download the competition archive and unpack it into the working directory.
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia
!unzip -q udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

In [None]:
def load_dataset(path: Path) -> pd.DataFrame:
    """Read the CSV file at ``path`` into a DataFrame, decoding it as Latin-1."""
    return pd.read_csv(path, encoding="latin-1")

## Pipeline de Preprocesamiento

### Clases Helpers

In [None]:
class DataFrameImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with a constant while keeping a DataFrame.

    Unlike an array-returning imputer, ``transform`` always returns a
    ``pandas.DataFrame`` so that downstream steps can select columns by name.

    Parameters:
        fill_value: Value written into every missing cell.

    Fitted attributes:
        columns_: Column names seen during ``fit``. When ``fit`` receives an
            input without a ``columns`` attribute, names ``x0``, ``x1``, ...
            are generated from the number of columns.
    """

    def __init__(self, fill_value: str = "Desconocido") -> None:
        # Only hyper-parameters are stored here; the fitted ``columns_``
        # attribute is created in ``fit`` so that an unfitted instance does
        # not look fitted to scikit-learn's introspection helpers.
        self.fill_value = fill_value

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> "DataFrameImputer":
        """Remember the column names of ``X``; ``y`` is ignored."""
        if hasattr(X, "columns"):
            self.columns_ = list(X.columns)
        else:
            # Array-like input: derive positional names so that transform
            # works instead of reporting the transformer as unfitted.
            n_features = pd.DataFrame(X).shape[1]
            self.columns_ = [f"x{i}" for i in range(n_features)]
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return ``X`` as a DataFrame with missing values filled.

        Raises:
            RuntimeError: If ``fit`` has not been called.
        """
        columns = getattr(self, "columns_", None)
        if columns is None:
            raise RuntimeError("Se debe llamar fit antes de transform.")
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=columns)
        return X.fillna(self.fill_value)

    def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> np.ndarray:
        """Return output column names (the inputs pass through unchanged).

        Raises:
            RuntimeError: If no ``input_features`` are given and ``fit`` has
                not been called.
        """
        if input_features is not None:
            return np.asarray(input_features, dtype=object)
        columns = getattr(self, "columns_", None)
        if columns is None:
            raise RuntimeError("Llamar fit antes de solicitar los nombres.")
        return np.asarray(columns, dtype=object)

In [None]:
@dataclass
class FeatureConfig:
    """Column groups that drive how each column is preprocessed."""

    # Columns removed before preprocessing.
    drop: List[str]
    # High-cardinality categorical columns.
    high_card: List[str]
    # Low-cardinality categorical columns.
    low_card: List[str]
    # Ordinal categorical columns.
    ordinal: List[str]
    # Numeric columns.
    numeric: List[str]
    # Category order for each entry of ``ordinal``, in the same order.
    ordinal_categories: List[List[str]]

    @classmethod
    def from_dataframe(cls, df: pd.DataFrame) -> "FeatureConfig":
        """Build a config holding only the columns actually present in ``df``.

        Each group keeps the order of its module-level definition. The target
        column and every column selected for dropping are excluded from the
        feature groups, so a column listed both as a drop candidate and as a
        feature cannot be requested after it has been removed.
        """
        available_columns = set(df.columns)
        drop = [c for c in DROP_CANDIDATES if c in available_columns]
        usable = available_columns - {TARGET_COL} - set(drop)
        high_card = [c for c in HIGH_CARDINALITY if c in usable]
        low_card = [c for c in LOW_CARDINALITY if c in usable]
        ordinal = [c for c in ORDINAL_MAP if c in usable]
        numeric = [c for c in NUMERIC_COLUMNS if c in usable]
        ordinal_categories = [ORDINAL_MAP[c] for c in ordinal]
        return cls(drop, high_card, low_card, ordinal, numeric, ordinal_categories)

### Construcción

In [None]:
def make_preprocessor(config: FeatureConfig, random_state: int = RANDOM_STATE) -> Pipeline:
    """Assemble the unfitted preprocessing pipeline described by ``config``.

    One branch is added per non-empty column group, in this order:

    * ``high_card``: ``DataFrameImputer`` followed by ``TargetEncoder``.
    * ``ordinal``: constant imputation followed by ``OrdinalEncoder`` using
      ``config.ordinal_categories`` (unknown values are encoded as -1).
    * ``one_hot``: constant imputation followed by a dense ``OneHotEncoder``
      that ignores categories not seen during fit.
    * ``numeric``: median imputation followed by ``StandardScaler``.

    Columns outside every group are dropped, and a ``VarianceThreshold`` step
    runs over the combined output.

    Args:
        config: Column groups to build branches for.
        random_state: Accepted for call-site compatibility; none of the steps
            built here uses it.

    Returns:
        A two-step ``Pipeline`` with steps ``"preprocess"`` and ``"variance"``.
    """
    missing_label = "Desconocido"
    branches = []

    if config.high_card:
        target_encoder = TargetEncoder(
            cols=config.high_card,
            smoothing=0.5,
            handle_unknown="value",
            handle_missing="value",
        )
        high_card_steps = [
            ("imputer", DataFrameImputer(fill_value=missing_label)),
            ("encoder", target_encoder),
        ]
        branches.append(("high_card", Pipeline(steps=high_card_steps), config.high_card))

    if config.ordinal:
        ordinal_encoder = OrdinalEncoder(
            categories=config.ordinal_categories,
            dtype=float,
            handle_unknown="use_encoded_value",
            unknown_value=-1,
        )
        ordinal_steps = [
            ("imputer", SimpleImputer(strategy="constant", fill_value=missing_label)),
            ("encoder", ordinal_encoder),
        ]
        branches.append(("ordinal", Pipeline(steps=ordinal_steps), config.ordinal))

    if config.low_card:
        one_hot_steps = [
            ("imputer", SimpleImputer(strategy="constant", fill_value=missing_label)),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
        branches.append(("one_hot", Pipeline(steps=one_hot_steps), config.low_card))

    if config.numeric:
        numeric_steps = [
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
        branches.append(("numeric", Pipeline(steps=numeric_steps), config.numeric))

    # Only the listed columns are kept, and output names carry no branch prefix.
    combined = ColumnTransformer(
        transformers=branches,
        remainder="drop",
        verbose_feature_names_out=False,
    )

    return Pipeline(
        steps=[
            ("preprocess", combined),
            ("variance", VarianceThreshold(threshold=1e-5)),
        ]
    )

### Procesar Datasets

In [None]:
def process_dataset(
    df: pd.DataFrame,
    config: FeatureConfig,
    random_state: int = RANDOM_STATE,
) -> pd.DataFrame:
    """Fit the preprocessing pipeline on ``df`` and return the encoded frame.

    Args:
        df: Raw data; it must contain the target column.
        config: Column groups used to build the preprocessing pipeline.
        random_state: Forwarded to ``make_preprocessor``.

    Returns:
        A DataFrame indexed like ``df`` holding the features that survive the
        variance filter, plus the original (unencoded) target column.

    Raises:
        KeyError: If the target column is missing from ``df``.
        ValueError: If a non-numeric target holds missing values or labels
            that are not in ``CLASS2IDX``.
    """
    X = df.drop(columns=[TARGET_COL] + config.drop, errors="ignore")
    y = df[TARGET_COL]

    # The target encoder averages the target per category, so it needs numbers.
    # String labels are mapped to their ordinal class index for fitting only;
    # the returned frame still carries the original labels.
    if pd.api.types.is_numeric_dtype(y):
        y_fit = y
    else:
        y_fit = y.map(CLASS2IDX)
        unmapped = y_fit.isna()
        if unmapped.any():
            unknown = sorted(set(y[unmapped].dropna().astype(str)))
            raise ValueError(
                f"{TARGET_COL} contiene etiquetas faltantes o desconocidas: {unknown}"
            )

    # NOTE(review): the encoders are fitted on every row of ``df``. If the
    # result is later split for cross-validation, the target-encoded features
    # of validation rows were computed using those rows' own labels.
    preprocessing = make_preprocessor(config, random_state=random_state)
    preprocessing.fit(X, y_fit)

    # Recover the names of the columns kept by the variance filter.
    feature_names = preprocessing.named_steps["preprocess"].get_feature_names_out()
    mask = preprocessing.named_steps["variance"].get_support()
    selected_feature_names = feature_names[mask]
    transformed = preprocessing.transform(X)

    processed_df = pd.DataFrame(transformed, columns=selected_feature_names, index=df.index)
    processed_df[TARGET_COL] = y.values

    return processed_df

In [None]:
# Load the raw training CSV, derive the column groups present in it, and
# produce the fully encoded training frame.
train_raw = load_dataset(Path("train.csv"))
train_config = FeatureConfig.from_dataframe(train_raw)
train_df = process_dataset(train_raw, train_config)
print(f"train_df shape: {train_df.shape}")

## Modelo

### Utilidades

In [None]:
def collect_fold_metrics(name: str, fold_scores: List[dict]) -> Tuple[pd.DataFrame, dict]:
    """Tabulate per-fold scores and summarise their accuracy.

    Args:
        name: Model name stored under ``"model"`` in the summary.
        fold_scores: One dict per fold; each must contain an ``"accuracy"`` key.

    Returns:
        A ``(df, summary)`` pair: ``df`` has one row per fold, and ``summary``
        holds the mean, population standard deviation (``ddof=0``), minimum
        and maximum of the fold accuracies.

    Raises:
        KeyError: If the scores have no ``"accuracy"`` entry, which includes
            an empty ``fold_scores``.
    """
    df = pd.DataFrame(fold_scores)
    accuracy = df["accuracy"]
    summary = {
        "model": name,
        "mean_acc": accuracy.mean(),
        "std_acc": accuracy.std(ddof=0),
        "min_acc": accuracy.min(),
        "max_acc": accuracy.max(),
    }
    return df, summary


def describe_class_balance(labels: np.ndarray):
    """Display the number of samples per class, labelled by class name."""
    per_class = pd.Series(labels).value_counts()
    ordered = per_class.sort_index()
    named = ordered.rename(index=IDX2CLASS)
    display(named)

### Folds Estratificados

In [None]:
print(f"Shape: {train_df.shape} | Memoria ~{train_df.memory_usage().sum() / 1e6:.1f} MB")

# Integer class indices for the target and a float32 matrix for the features.
y = train_df[TARGET_COL].map(CLASS2IDX).to_numpy(dtype=np.int64)
X = train_df.drop(columns=[TARGET_COL]).to_numpy(dtype=np.float32)

describe_class_balance(y)
print(f"Feature dims: {X.shape[1]}")

# The DataFrame is no longer needed once the arrays exist; release it.
del train_df
_ = gc.collect()

# Materialise the stratified, shuffled splits once so they can be reused.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
folds = list(skf.split(X, y))
print(f"Folds preparados: {len(folds)}")

### Entrenamiento LightGBM

In [None]:
def train_lightgbm_cv(X: np.ndarray, y: np.ndarray, folds) -> dict:
    """Train one LightGBM classifier per fold and collect out-of-fold results.

    For every ``(train_idx, val_idx)`` pair a fresh model is fitted on the
    training rows, scored on the validation rows, and its booster is saved to
    ``ARTIFACT_DIR / "lightgbm_fold{fold_id}.txt"``.

    Args:
        X: Feature matrix, indexed by row with the fold indices.
        y: Integer class labels aligned with ``X``; they are compared directly
            with the argmax of the predicted probabilities.
        folds: Iterable of ``(train_idx, val_idx)`` index pairs.

    Returns:
        A dict with the model name (``"name"``), the parameters used
        (``"params"``), the per-fold accuracy records (``"fold_metrics"``),
        the out-of-fold class probabilities (``"oof_predictions"``) and the
        saved model paths (``"model_paths"``).
    """
    # This baseline keeps a reasonable balance between bias and variance.
    # NOTE(review): per the LightGBM parameter docs, row subsampling only takes
    # effect when subsample_freq > 0 (default 0), so "subsample" is likely
    # inert as configured. Confirm whether bagging was intended.
    base_params = {
        "objective": "multiclass",
        "num_class": len(CLASS_NAMES),
        "metric": "multi_logloss",
        "learning_rate": 0.05,
        "n_estimators": 600,
        "num_leaves": 63,
        "subsample": 0.8,
        "colsample_bytree": 0.7,
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
    }

    oof_pred = np.zeros((len(y), len(CLASS_NAMES)), dtype=np.float32)
    fold_scores = []
    model_paths = []

    for fold_id, (tr_idx, val_idx) in enumerate(folds):
        print(f"[LightGBM] Fold {fold_id}")
        # Re-instantiate so that no state is shared between folds.
        model = lgb.LGBMClassifier(**base_params)
        model.fit(
            X[tr_idx],
            y[tr_idx],
            eval_set=[(X[val_idx], y[val_idx])],
            eval_metric="multi_logloss",
            # LightGBM 4.x removed fit(verbose=...); periodic evaluation
            # logging is requested through a callback instead.
            callbacks=[lgb.log_evaluation(period=100)],
        )
        probs = model.predict_proba(X[val_idx])
        # Keep the class probabilities (not logits) for later analysis.
        oof_pred[val_idx] = probs
        preds = probs.argmax(axis=1)
        fold_scores.append({
            "fold": fold_id,
            "accuracy": accuracy_score(y[val_idx], preds),
        })
        model_path = ARTIFACT_DIR / f"lightgbm_fold{fold_id}.txt"
        # Persisting each fold allows ensembling later without retraining.
        model.booster_.save_model(str(model_path))
        model_paths.append(model_path)
        gc.collect()

    # Compact summary for downstream logging.
    return {
        "name": "LightGBM",
        "params": base_params,
        "fold_metrics": fold_scores,
        "oof_predictions": oof_pred,
        "model_paths": model_paths,
    }

# Run cross-validated training and summarise the per-fold accuracies.
lightgbm_results = train_lightgbm_cv(X, y, folds)
lightgbm_fold_df, lightgbm_summary = collect_fold_metrics(
    lightgbm_results["name"], lightgbm_results["fold_metrics"]
)
print(f"Promedio accuracy LightGBM: {lightgbm_summary['mean_acc']:.4f}")
# Bare expression, presumably here so the notebook renders the per-fold table.
lightgbm_fold_df

### Evaluación final

In [None]:
# NOTE(review): a bare expression is only echoed when it is the last statement
# of a notebook cell. If this whole block is a single cell, nothing is shown
# here; confirm, and wrap it in print()/display() if the summary should appear.
lightgbm_summary

# Collapse the out-of-fold class probabilities into hard predictions and score
# them against the true labels over the whole training set.
y_pred = lightgbm_results["oof_predictions"].argmax(axis=1)
acc = accuracy_score(y, y_pred)
print(f"Accuracy OOF global LightGBM: {acc:.4f}")

# Confusion matrix heatmap: rows are true classes, columns are predictions.
conf_mat = confusion_matrix(y, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt="d",
    cmap="Greens",
    xticklabels=CLASS_NAMES,
    yticklabels=CLASS_NAMES,
)
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.title("Matriz de confusión LightGBM")
plt.tight_layout()
plt.show()

# Per-class precision, recall and F1 using the class names as labels.
print(classification_report(y, y_pred, target_names=CLASS_NAMES))