# Modelo XGBoost para `RENDIMIENTO_GLOBAL`

Incluye Optuna, validación estratificada (K=5), métricas por fold y utilidades de exportación/inferencia.


## Instalación de Dependencias e Imports

In [None]:
# Dependencias necesarias (ejecutar una vez por sesión de Colab T4)
!pip install -q xgboost optuna seaborn kaggle category_encoders

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/404.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import gc
import os
import json
import random
from pathlib import Path
import pandas as pd
import numpy as np

# Anotaciones, clases, tipos
from dataclasses import dataclass
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Dict, Iterable, List, Optional, Tuple

# Encoders
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Preprocesamiento
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

# Métricas y visualización
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Modelo
import torch
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

# Optimización
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

optuna.logging.set_verbosity(optuna.logging.WARNING)
random.seed(42)
np.random.seed(42)

In [None]:
# === PREPROCESSING VARIABLES ===
# Name of the label column; it is split from the features before fitting.
TARGET_COL = "RENDIMIENTO_GLOBAL"

# === COLUMNS IDENTIFIED DURING THE EXPLORATORY ANALYSIS ===

# Columns that are not used as features
DROP_CANDIDATES = [
    "ID",  # No predictive signal
    "F_TIENEINTERNET.1",  # Duplicates the information in F_TIENEINTERNET
    "E_PRIVADO_LIBERTAD",  # VERY low variance (≈0.005%)
]

# High-cardinality columns (target-encoded by make_preprocessor)
HIGH_CARDINALITY = [
    "E_PRGM_ACADEMICO",  # More than 900 categories
    "E_PRGM_DEPARTAMENTO",  # More than 30 categories
]

# Low-cardinality columns (one-hot encoded by make_preprocessor)
LOW_CARDINALITY = [
    "F_TIENEINTERNET",
    "F_TIENECOMPUTADOR",
    "F_TIENEAUTOMOVIL",
    "F_TIENELAVADORA",
    "E_PAGOMATRICULAPROPIO",
]

# Mapping for ordinal columns: column name -> categories in ascending order.
# The position of each category in its list is the integer code produced by the
# OrdinalEncoder in make_preprocessor, so both order and spelling matter.
# "Desconocido" comes first because it is the fill value used for missing data.
# NOTE(review): "No Aplica" / "No sabe" are ranked above "Postgrado" in the
# parental-education lists — confirm that this ordering is intended.
ORDINAL_MAP = {
    "F_ESTRATOVIVIENDA": [
        "Desconocido",
        "Sin Estrato",
        "Estrato 1",
        "Estrato 2",
        "Estrato 3",
        "Estrato 4",
        "Estrato 5",
        "Estrato 6",
    ],
    "E_HORASSEMANATRABAJA": [
        "Desconocido",
        "0",
        "Menos de 10 horas",
        "Entre 11 y 20 horas",
        "Entre 21 y 30 horas",
        "Más de 30 horas",
    ],
    "E_VALORMATRICULAUNIVERSIDAD": [
        "Desconocido",
        "No pagó matrícula",
        "Menos de 500 mil",
        "Entre 500 mil y menos de 1 millón",
        "Entre 1 millón y menos de 2.5 millones",
        "Entre 2.5 millones y menos de 4 millones",
        "Entre 4 millones y menos de 5.5 millones",
        "Entre 5.5 millones y menos de 7 millones",
        "Más de 7 millones",
    ],
    "F_EDUCACIONPADRE": [
        "Desconocido",
        "Ninguno",
        "Primaria incompleta",
        "Primaria completa",
        "Secundaria (Bachillerato) incompleta",
        "Secundaria (Bachillerato) completa",
        "Técnica o tecnológica incompleta",
        "Técnica o tecnológica completa",
        "Educación profesional incompleta",
        "Educación profesional completa",
        "Postgrado",
        "No Aplica",
        "No sabe",
    ],
    "F_EDUCACIONMADRE": [
        "Desconocido",
        "Ninguno",
        "Primaria incompleta",
        "Primaria completa",
        "Secundaria (Bachillerato) incompleta",
        "Secundaria (Bachillerato) completa",
        "Técnica o tecnológica incompleta",
        "Técnica o tecnológica completa",
        "Educación profesional incompleta",
        "Educación profesional completa",
        "Postgrado",
        "No Aplica",
        "No sabe",
    ],
}

# Numeric columns (median-imputed and standardized by make_preprocessor)
NUMERIC_COLUMNS = [
    "PERIODO_ACADEMICO",
    "INDICADOR_1",
    "INDICADOR_2",
    "INDICADOR_3",
    "INDICADOR_4",
]

# === CLASS IDENTIFICATION ===

# The position in CLASS_NAMES is the integer label used for training;
# CLASS2IDX / IDX2CLASS convert between class name and index.
CLASS_NAMES = ["alto", "medio-alto", "medio-bajo", "bajo"]
CLASS2IDX = {cls: idx for idx, cls in enumerate(CLASS_NAMES)}
IDX2CLASS = {idx: cls for cls, idx in CLASS2IDX.items()}

# === MODEL CONFIGURATION ===

N_SPLITS = 5
RANDOM_STATE = 42
# Directory where the per-fold models are written; created on import of this cell.
ARTIFACT_DIR = Path("./artifacts_xgboost")
ARTIFACT_DIR.mkdir(exist_ok=True, parents=True)

# torch is only used here to detect whether a CUDA device is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device hint: {device} | Artifacts: {ARTIFACT_DIR.resolve()}")

Device hint: cuda | Artifacts: /content/artifacts_xgboost


## Creación del Dataset

In [None]:
# Point the Kaggle CLI at the current directory for its configuration
# (presumably a kaggle.json uploaded to the working directory — confirm).
os.environ["KAGGLE_CONFIG_DIR"] = "."
# Download the competition archive and extract it quietly into the working directory.
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia
!unzip -q udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.39GB/s]


In [None]:
def load_dataset(path: Path) -> pd.DataFrame:
    """Read the CSV file at *path* (decoded as latin-1) into a DataFrame."""
    return pd.read_csv(path, encoding="latin-1")

## Pipeline de Preprocesamiento

### Clases Helpers


In [None]:
class DataFrameImputer(BaseEstimator, TransformerMixin):
    """
    Constant-value imputer that keeps its output as a DataFrame.

    Missing entries are replaced with ``fill_value`` and the column names
    seen during ``fit`` are preserved, so later steps can still address
    columns by name.
    """

    def __init__(self, fill_value: str = "Desconocido") -> None:
        self.fill_value = fill_value
        # Column names recorded by fit(); None until fit() has seen named columns.
        self.columns_: Optional[List[str]] = None

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> "DataFrameImputer":
        """Record the column names of *X* (when it has any) and return self."""
        if hasattr(X, "columns"):
            self.columns_ = list(X.columns)
        else:
            self.columns_ = None
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return *X* as a DataFrame with missing values replaced by ``fill_value``."""
        if self.columns_ is None:
            raise RuntimeError("Se debe llamar fit antes de transform.")
        # Array-like input is wrapped using the column names learned in fit().
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X, columns=self.columns_)
        return frame.fillna(self.fill_value)

    def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> np.ndarray:
        """Return the output feature names: *input_features* if given, else the fitted columns."""
        names = input_features if input_features is not None else self.columns_
        if names is None:
            raise RuntimeError("Llamar fit antes de solicitar los nombres.")
        return np.asarray(names, dtype=object)


In [None]:
@dataclass
class FeatureConfig:
    """Grouping of the dataset columns by the kind of preprocessing they receive."""
    # Columns removed before preprocessing
    drop: List[str]
    # High-cardinality categorical columns
    high_card: List[str]
    # Low-cardinality categorical columns
    low_card: List[str]
    # Ordinal categorical columns
    ordinal: List[str]
    # Numeric columns
    numeric: List[str]
    # Ordered category lists, aligned position-by-position with `ordinal`
    ordinal_categories: List[List[str]]

    @classmethod
    def from_dataframe(cls, df: pd.DataFrame) -> "FeatureConfig":
        """Build a config that keeps only the known columns present in *df*."""
        present = set(df.columns)
        # The target is never treated as a feature.
        features = present - {TARGET_COL}

        def keep(candidates, allowed):
            # Preserve the declaration order of the module-level constants.
            return [name for name in candidates if name in allowed]

        ordinal_cols = keep(ORDINAL_MAP, features)
        return cls(
            keep(DROP_CANDIDATES, present),
            keep(HIGH_CARDINALITY, features),
            keep(LOW_CARDINALITY, features),
            ordinal_cols,
            keep(NUMERIC_COLUMNS, features),
            [ORDINAL_MAP[name] for name in ordinal_cols],
        )

### Construcción

In [None]:
def make_preprocessor(config: FeatureConfig, random_state: int = RANDOM_STATE) -> Pipeline:
    """
    Assemble the (unfitted) preprocessing pipeline.

    One sub-pipeline is created per non-empty column group in *config*; the
    groups are combined in a ColumnTransformer that drops every other column,
    and a VarianceThreshold step follows. ``random_state`` is accepted for
    interface symmetry but is not used by any step built here.
    """
    missing_label = "Desconocido"
    branches = []

    # High-cardinality columns: fill gaps with "Desconocido" while keeping the
    # DataFrame structure, then target-encode the categories.
    if config.high_card:
        target_encoder = TargetEncoder(
            cols=config.high_card,
            smoothing=0.5,
            handle_unknown="value",
            handle_missing="value",
        )
        high_card_steps = [
            ("imputer", DataFrameImputer(fill_value=missing_label)),
            ("encoder", target_encoder),
        ]
        branches.append(("high_card", Pipeline(steps=high_card_steps), config.high_card))

    # Ordinal columns: fill gaps with "Desconocido", then map each category to
    # its position in the configured order (unseen categories become -1).
    if config.ordinal:
        ordinal_encoder = OrdinalEncoder(
            categories=config.ordinal_categories,
            dtype=float,
            handle_unknown="use_encoded_value",
            unknown_value=-1,
        )
        ordinal_steps = [
            ("imputer", SimpleImputer(strategy="constant", fill_value=missing_label)),
            ("encoder", ordinal_encoder),
        ]
        branches.append(("ordinal", Pipeline(steps=ordinal_steps), config.ordinal))

    # Low-cardinality columns: fill gaps with "Desconocido", then one-hot
    # encode into dense binary columns (unseen categories are ignored).
    if config.low_card:
        one_hot_steps = [
            ("imputer", SimpleImputer(strategy="constant", fill_value=missing_label)),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
        branches.append(("one_hot", Pipeline(steps=one_hot_steps), config.low_card))

    # Numeric columns: fill gaps with the MEDIAN, then standardize.
    if config.numeric:
        numeric_steps = [
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
        branches.append(("numeric", Pipeline(steps=numeric_steps), config.numeric))

    combined = ColumnTransformer(
        transformers=branches,
        remainder="drop",
        verbose_feature_names_out=False,
    )

    # The final step removes features whose variance is at most 1e-5.
    return Pipeline(
        steps=[
            ("preprocess", combined),
            ("variance", VarianceThreshold(threshold=1e-5)),
        ]
    )

### Procesar Datasets

In [None]:
def process_dataset(
    df: pd.DataFrame,
    config: FeatureConfig,
    random_state: int = RANDOM_STATE,
) -> pd.DataFrame:
    """
    Fit the preprocessing pipeline on *df* and return the transformed data.

    Args:
        df: Raw dataset; must contain ``TARGET_COL``.
        config: Column grouping used to build the preprocessor.
        random_state: Forwarded to ``make_preprocessor``.

    Returns:
        A DataFrame indexed like *df* with one column per feature kept by the
        variance filter, plus the untouched ``TARGET_COL`` column.

    Raises:
        KeyError: If ``TARGET_COL`` is missing from *df*.
    """
    # NOTE(review): the pipeline (including the target encoder) is fitted on
    # every row of df. If the result is later split into CV folds, validation
    # rows have already influenced the encoding — confirm this is acceptable.
    X = df.drop(columns=[TARGET_COL] + config.drop, errors="ignore")
    y = df[TARGET_COL]

    preprocessing = make_preprocessor(config, random_state=random_state)
    preprocessing.fit(X, y)

    # Names produced by the ColumnTransformer, filtered by the variance mask so
    # they line up with the columns of the transformed matrix.
    feature_names = preprocessing.named_steps["preprocess"].get_feature_names_out()
    mask = preprocessing.named_steps["variance"].get_support()
    selected_feature_names = feature_names[mask]
    transformed = preprocessing.transform(X)

    processed_df = pd.DataFrame(transformed, columns=selected_feature_names, index=df.index)
    # Assign by position (.values) so the labels cannot be misaligned by index.
    processed_df[TARGET_COL] = y.values

    return processed_df


In [None]:
# Parse train.csv once and reuse the raw frame for both the column
# configuration and the preprocessing (it used to be read from disk twice).
_raw_train_df = load_dataset('train.csv')
train_df = process_dataset(_raw_train_df, FeatureConfig.from_dataframe(_raw_train_df))
# Only the processed frame is used afterwards; release the raw copy.
del _raw_train_df

## Modelo

### Utilidades

In [None]:
def collect_fold_metrics(name: str, fold_scores: list[dict]) -> tuple[pd.DataFrame, dict]:
    """
    Tabulate per-fold scores and summarize their accuracy.

    Args:
        name: Model name stored under the ``"model"`` key of the summary.
        fold_scores: One dict per fold; each must contain an ``"accuracy"`` entry.

    Returns:
        A ``(df, summary)`` pair: ``df`` has one row per fold, and ``summary``
        holds the mean, population standard deviation (``ddof=0``), minimum
        and maximum of the fold accuracies.

    Raises:
        KeyError: If no fold provides an ``"accuracy"`` value (including an
            empty ``fold_scores``).
    """
    df = pd.DataFrame(fold_scores)
    accuracy = df["accuracy"]
    summary = {
        "model": name,
        "mean_acc": accuracy.mean(),
        "std_acc": accuracy.std(ddof=0),
        "min_acc": accuracy.min(),
        "max_acc": accuracy.max(),
    }
    return df, summary


def describe_class_balance(labels: np.ndarray):
    """Display how many samples fall in each class, ordered by class index."""
    per_class = pd.Series(labels).value_counts().sort_index()
    # Show class names instead of integer indices.
    per_class = per_class.rename(index=IDX2CLASS)
    display(per_class)

### Folds Estratificados

In [None]:
# Report the size of the processed training frame before converting it to arrays.
print(f"Shape: {train_df.shape} | Memoria ~{train_df.memory_usage().sum() / 1e6:.1f} MB")

# Labels: class names mapped to integer indices through CLASS2IDX.
y = train_df[TARGET_COL].map(CLASS2IDX).to_numpy(dtype=np.int64)
# Features: every remaining column as a float32 matrix.
X = train_df.drop(columns=[TARGET_COL]).to_numpy(dtype=np.float32)

describe_class_balance(y)
print(f"Feature dims: {X.shape[1]}")

# Only X and y are used from here on; drop the DataFrame and collect garbage.
del train_df
_ = gc.collect()

Shape: (692500, 28) | Memoria ~155.1 MB


Unnamed: 0,count
alto,175619
medio-alto,171619
medio-bajo,172275
bajo,172987


Feature dims: 27


In [None]:
# Stratified folds (K = N_SPLITS), shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
# Materialize the (train_idx, val_idx) pairs so tuning and training reuse the same splits.
folds = list(skf.split(X, y))
print(f"Folds preparados: {len(folds)}")

Folds preparados: 5


### Optimización

In [None]:
# Optuna + entrenamiento para XGBoost

def tune_xgboost(X, y, folds, n_trials: int = 200, seed: int = 42, n_tuning_folds: int = 2):
    """
    Search XGBoost hyperparameters with Optuna, maximizing mean fold accuracy.

    Args:
        X: Feature matrix, indexable by the index arrays in *folds*.
        y: Integer class labels aligned with *X*.
        folds: Sequence of ``(train_idx, val_idx)`` pairs.
        n_trials: Number of Optuna trials.
        seed: Seed for the TPE sampler.
        n_tuning_folds: How many of the leading folds each trial is evaluated
            on (default 2, to keep a trial cheap).

    Returns:
        ``(study, best_params)``. ``best_params`` contains only the suggested
        values; the number of boosting rounds is stored under ``"n_estimators"``.

    Raises:
        ValueError: If ``n_tuning_folds < 1`` or *folds* is empty.
    """
    if n_tuning_folds < 1:
        raise ValueError("n_tuning_folds debe ser >= 1.")
    tuning_folds = list(folds[:n_tuning_folds])
    if not tuning_folds:
        raise ValueError("Se requiere al menos un fold para la optimización.")
    last_step = len(tuning_folds) - 1

    # Sampler and pruner.
    sampler = TPESampler(n_startup_trials=20, multivariate=True, seed=seed)
    # The pruner's steps are fold indices (0 .. n_tuning_folds - 1), so no
    # warm-up steps are used: a larger warm-up than the number of folds would
    # disable pruning altogether.
    pruner = MedianPruner(n_startup_trials=20, n_warmup_steps=0, interval_steps=1)

    study = optuna.create_study(
        direction="maximize",
        study_name="xgboost_study",
        sampler=sampler,
        pruner=pruner,
    )

    def objective(trial: optuna.Trial) -> float:
        params = {
            "objective": "multi:softprob",
            "n_jobs": 2,
            "tree_method": "hist",
            # Use the device detected by the notebook instead of assuming a GPU.
            "device": device,
            "eval_metric": "mlogloss",
            "num_class": len(CLASS_NAMES),
            "max_depth": trial.suggest_int("max_depth", 5, 15),
            "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
            "min_child_weight": trial.suggest_float("min_child_weight", 0.1, 10.0),
            "gamma": trial.suggest_float("gamma", 0.0, 10.0),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 20.0, log=True),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 20.0, log=True),
        }

        # Named "n_estimators" so best_params can be passed to XGBClassifier.
        n_rounds = trial.suggest_int("n_estimators", 800, 4000, step=200)

        fold_accs = []
        for step, (tr_idx, val_idx) in enumerate(tuning_folds):
            dtrain = xgb.DMatrix(X[tr_idx], label=y[tr_idx])
            dval = xgb.DMatrix(X[val_idx], label=y[val_idx])
            evals = [(dval, "validation_0")]

            bst = xgb.train(
                params,
                dtrain,
                num_boost_round=n_rounds,
                evals=evals,
                early_stopping_rounds=200,
                verbose_eval=False,
            )

            preds = np.argmax(bst.predict(dval), axis=1)
            fold_accs.append(accuracy_score(y[val_idx], preds))

            # should_prune() only acts on reported intermediate values, so the
            # running mean is reported after every fold.
            trial.report(float(np.mean(fold_accs)), step=step)
            # After the last fold the trial is complete; pruning it would only
            # throw away a finished result.
            if step < last_step and trial.should_prune():
                raise optuna.TrialPruned()

        return float(np.mean(fold_accs))

    # NOTE(review): trials run in two parallel workers, so the sampler seed
    # alone presumably does not make the search reproducible — confirm.
    study.optimize(objective, n_trials=n_trials, n_jobs=2, show_progress_bar=True)
    return study, study.best_params

In [None]:
# Run the XGBoost hyperparameter search and print the best parameters as JSON.
N_TRIALS_XGB = 120
xgb_study, xgb_best_params = tune_xgboost(X, y, folds, n_trials=N_TRIALS_XGB)
print(f"XGBoost best params: {json.dumps(xgb_best_params, indent=2)}")

In [None]:
xgb_best_params

{'max_depth': 12,
 'learning_rate': 0.007787475681129683,
 'min_child_weight': 9.80461085537906,
 'gamma': 1.6010131040205766,
 'subsample': 0.8009882111793881,
 'colsample_bytree': 0.553584361973649,
 'reg_lambda': 1.120469233675485,
 'reg_alpha': 0.0016458428280811846,
 'n_estimators': 1800}

### Entrenamiento

In [None]:
def train_xgboost_cv(best_params: dict, X: np.ndarray, y: np.ndarray, folds):
    """
    Train one XGBoost classifier per fold and collect out-of-fold predictions.

    Args:
        best_params: Hyperparameters passed to ``xgb.XGBClassifier``; they take
            precedence over the fixed base settings on a key collision.
        X: Feature matrix.
        y: Integer class labels aligned with *X*.
        folds: Sequence of ``(train_idx, val_idx)`` pairs.

    Returns:
        A dict with the model name, the full parameter set, per-fold metrics,
        the out-of-fold probability matrix and the saved model paths.

    Side effects:
        Writes ``xgboost_fold{k}.json`` into ``ARTIFACT_DIR`` for every fold.
    """
    # Fixed settings shared by every fold. The former "predictor" entry was
    # removed: XGBoost reported it as an unused parameter on every fit.
    # NOTE(review): no "device" is set here, so training runs on XGBoost's
    # default device even though tuning asked for a GPU — confirm intent.
    base_kwargs = {
        "objective": "multi:softprob",
        "num_class": len(CLASS_NAMES),
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "early_stopping_rounds": 200,
    }
    # Merge once: passing both dicts as **kwargs would raise a TypeError if
    # best_params repeated any base key.
    model_kwargs = {**base_kwargs, **best_params}

    oof_pred = np.zeros((len(y), len(CLASS_NAMES)), dtype=np.float32)
    fold_scores = []
    model_paths = []

    for fold_id, (tr_idx, val_idx) in enumerate(folds):
        print(f"[XGBoost] Fold {fold_id}")
        # A fresh estimator per fold keeps the folds independent.
        model = xgb.XGBClassifier(**model_kwargs)
        model.fit(
            X[tr_idx],
            y[tr_idx],
            eval_set=[(X[val_idx], y[val_idx])],
            verbose=100,
        )
        probs = model.predict_proba(X[val_idx])
        # Keep the full probability vectors so they can feed later ensembles.
        oof_pred[val_idx] = probs
        preds = probs.argmax(axis=1)
        fold_scores.append({
            "fold": fold_id,
            "accuracy": accuracy_score(y[val_idx], preds),
            "best_iteration": getattr(model, "best_iteration", model.n_estimators),
        })
        model_path = ARTIFACT_DIR / f"xgboost_fold{fold_id}.json"
        # Persist the fold model so inference can reload it later.
        model.save_model(model_path)
        model_paths.append(model_path)
        gc.collect()

    # Everything needed for analysis and for building a submission.
    return {
        "name": "XGBoost",
        "best_params": model_kwargs,
        "fold_metrics": fold_scores,
        "oof_predictions": oof_pred,
        "model_paths": model_paths,
    }

In [None]:
# Best hyperparameters copied from a previous tuning run, hard-coded so the
# training cells can be executed without repeating the Optuna search.
xgb_best_params = {'max_depth': 12,
 'learning_rate': 0.007787475681129683,
 'min_child_weight': 9.80461085537906,
 'gamma': 1.6010131040205766,
 'subsample': 0.8009882111793881,
 'colsample_bytree': 0.553584361973649,
 'reg_lambda': 1.120469233675485,
 'reg_alpha': 0.0016458428280811846,
 'n_estimators': 1800}

In [None]:
# Train XGBoost on every fold, then summarize the per-fold accuracy.
xgb_results = train_xgboost_cv(xgb_best_params, X, y, folds)
xgb_fold_df, xgb_summary = collect_fold_metrics(xgb_results["name"], xgb_results["fold_metrics"])
print(f"Promedio accuracy XGBoost: {xgb_summary['mean_acc']:.4f}")
# Last expression of the cell: the per-fold table is displayed by the notebook.
xgb_fold_df

[XGBoost] Fold 0


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation_0-mlogloss:1.38524
[100]	validation_0-mlogloss:1.30019
[200]	validation_0-mlogloss:1.25939
[300]	validation_0-mlogloss:1.23829
[400]	validation_0-mlogloss:1.22544
[500]	validation_0-mlogloss:1.21777
[600]	validation_0-mlogloss:1.21211
[700]	validation_0-mlogloss:1.20872
[800]	validation_0-mlogloss:1.20639
[900]	validation_0-mlogloss:1.20469
[1000]	validation_0-mlogloss:1.20346
[1100]	validation_0-mlogloss:1.20251
[1200]	validation_0-mlogloss:1.20181
[1300]	validation_0-mlogloss:1.20126
[1400]	validation_0-mlogloss:1.20083
[1500]	validation_0-mlogloss:1.20053
[1600]	validation_0-mlogloss:1.20026
[1700]	validation_0-mlogloss:1.20003
[1799]	validation_0-mlogloss:1.19989
[XGBoost] Fold 1


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation_0-mlogloss:1.38525
[100]	validation_0-mlogloss:1.30052
[200]	validation_0-mlogloss:1.25956
[300]	validation_0-mlogloss:1.23821
[400]	validation_0-mlogloss:1.22516
[500]	validation_0-mlogloss:1.21729
[600]	validation_0-mlogloss:1.21160
[700]	validation_0-mlogloss:1.20813
[800]	validation_0-mlogloss:1.20569
[900]	validation_0-mlogloss:1.20389
[1000]	validation_0-mlogloss:1.20263
[1100]	validation_0-mlogloss:1.20165
[1200]	validation_0-mlogloss:1.20091
[1300]	validation_0-mlogloss:1.20032
[1400]	validation_0-mlogloss:1.19988
[1500]	validation_0-mlogloss:1.19962
[1600]	validation_0-mlogloss:1.19935
[1700]	validation_0-mlogloss:1.19911
[1799]	validation_0-mlogloss:1.19895
[XGBoost] Fold 2


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation_0-mlogloss:1.38526
[100]	validation_0-mlogloss:1.30087
[200]	validation_0-mlogloss:1.26043
[300]	validation_0-mlogloss:1.23958
[400]	validation_0-mlogloss:1.22679
[500]	validation_0-mlogloss:1.21917
[600]	validation_0-mlogloss:1.21366
[700]	validation_0-mlogloss:1.21030
[800]	validation_0-mlogloss:1.20797
[900]	validation_0-mlogloss:1.20628
[1000]	validation_0-mlogloss:1.20505
[1100]	validation_0-mlogloss:1.20409
[1200]	validation_0-mlogloss:1.20338
[1300]	validation_0-mlogloss:1.20281
[1400]	validation_0-mlogloss:1.20231
[1500]	validation_0-mlogloss:1.20199
[1600]	validation_0-mlogloss:1.20174
[1700]	validation_0-mlogloss:1.20149
[1799]	validation_0-mlogloss:1.20132
[XGBoost] Fold 3


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation_0-mlogloss:1.38525
[100]	validation_0-mlogloss:1.30060
[200]	validation_0-mlogloss:1.26000
[300]	validation_0-mlogloss:1.23895
[400]	validation_0-mlogloss:1.22613
[500]	validation_0-mlogloss:1.21848
[600]	validation_0-mlogloss:1.21291
[700]	validation_0-mlogloss:1.20950
[800]	validation_0-mlogloss:1.20713
[900]	validation_0-mlogloss:1.20538
[1000]	validation_0-mlogloss:1.20414
[1100]	validation_0-mlogloss:1.20319
[1200]	validation_0-mlogloss:1.20245
[1300]	validation_0-mlogloss:1.20187
[1400]	validation_0-mlogloss:1.20145
[1500]	validation_0-mlogloss:1.20114
[1600]	validation_0-mlogloss:1.20091
[1700]	validation_0-mlogloss:1.20069
[1799]	validation_0-mlogloss:1.20053
[XGBoost] Fold 4


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation_0-mlogloss:1.38526
[100]	validation_0-mlogloss:1.30081
[200]	validation_0-mlogloss:1.26020
[300]	validation_0-mlogloss:1.23913
[400]	validation_0-mlogloss:1.22628
[500]	validation_0-mlogloss:1.21862
[600]	validation_0-mlogloss:1.21307
[700]	validation_0-mlogloss:1.20964
[800]	validation_0-mlogloss:1.20724
[900]	validation_0-mlogloss:1.20546
[1000]	validation_0-mlogloss:1.20422
[1100]	validation_0-mlogloss:1.20323
[1200]	validation_0-mlogloss:1.20251
[1300]	validation_0-mlogloss:1.20194
[1400]	validation_0-mlogloss:1.20150
[1500]	validation_0-mlogloss:1.20121
[1600]	validation_0-mlogloss:1.20095
[1700]	validation_0-mlogloss:1.20068
[1799]	validation_0-mlogloss:1.20054
Promedio accuracy XGBoost: 0.4349


Unnamed: 0,fold,accuracy,best_iteration
0,0,0.435668,1799
1,1,0.435242,1799
2,2,0.434137,1799
3,3,0.435271,1799
4,4,0.434123,1799


### Evaluación final

In [None]:
# Summary and confusion matrix.
# A bare expression in the middle of a cell is not displayed by the notebook,
# so the summary is printed explicitly.
print(xgb_summary)

# Global out-of-fold accuracy: every sample is predicted by the model of the
# fold in which it was held out.
y_pred = xgb_results["oof_predictions"].argmax(axis=1)
acc = accuracy_score(y, y_pred)
print(f"Accuracy OOF global XGBoost: {acc:.4f}")

conf_mat = confusion_matrix(y, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues",
            xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES)
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.title("Matriz de confusión XGBoost")
plt.tight_layout()
plt.show()

print(classification_report(y, y_pred, target_names=CLASS_NAMES))

### Exportación e Inferencia

In [None]:
def process_dataset_test(
    df: pd.DataFrame,
    config: FeatureConfig,
    random_state: int = RANDOM_STATE,
    preprocessing: Optional[Pipeline] = None,
    train_path: str = "train.csv",
) -> pd.DataFrame:
    """
    Transform a test dataset (without ``TARGET_COL``) into model features.

    Args:
        df: Raw test dataset.
        config: Column grouping of *df*; only ``config.drop`` is used, to
            remove unused columns before transforming.
        random_state: Forwarded to ``make_preprocessor`` when a pipeline has
            to be fitted here.
        preprocessing: An already fitted preprocessing pipeline. When omitted,
            a new one is fitted on the training data read from *train_path*.
        train_path: Location of the training CSV used for that fit.

    Returns:
        A DataFrame indexed like *df* with one column per feature kept by the
        variance filter.
    """
    if preprocessing is None:
        # The encoders must learn from the training data only, so the pipeline
        # is configured and fitted on the training set. This re-reads and
        # re-fits on every call; pass `preprocessing` to avoid the cost.
        train_raw = load_dataset(train_path)
        train_config = FeatureConfig.from_dataframe(train_raw)
        preprocessing = make_preprocessor(train_config, random_state=random_state)
        train_X = train_raw.drop(columns=[TARGET_COL] + train_config.drop, errors="ignore")
        preprocessing.fit(train_X, train_raw[TARGET_COL])

    # Remove the columns that are not used as features.
    X = df.drop(columns=config.drop, errors="ignore")
    transformed = preprocessing.transform(X)

    # Names produced by the ColumnTransformer, filtered by the variance mask so
    # they line up with the columns of the transformed matrix.
    feature_names = preprocessing.named_steps["preprocess"].get_feature_names_out()
    mask = preprocessing.named_steps["variance"].get_support()
    selected_feature_names = feature_names[mask]

    return pd.DataFrame(
        transformed,
        columns=selected_feature_names,
        index=df.index,
    )

In [None]:
# Export and inference with the XGBoost ensemble.
# Paths of the per-fold models saved during training, as plain strings.
xgb_model_paths = [str(p) for p in xgb_results["model_paths"]]
print("Modelos XGBoost:", xgb_model_paths)

def predict_with_xgboost(processed: pd.DataFrame, raw: pd.DataFrame) -> pd.DataFrame:
    """
    Predict classes by averaging the probabilities of the saved fold models.

    Args:
        processed: Preprocessed features, one row per sample.
        raw: Raw dataset with an ``"ID"`` column, row-aligned with *processed*.

    Returns:
        A DataFrame with the columns ``ID`` and ``RENDIMIENTO_GLOBAL``
        (predicted class name).

    Raises:
        ValueError: If ``xgb_model_paths`` is empty. Averaging over zero models
            would otherwise produce NaN probabilities and a meaningless
            constant prediction.
    """
    if not xgb_model_paths:
        raise ValueError("No hay modelos XGBoost disponibles para el ensamble.")

    features = np.asarray(processed, dtype=np.float32)
    # The input matrix is the same for every model, so it is built only once.
    dmatrix = xgb.DMatrix(features)
    probas = np.zeros((features.shape[0], len(CLASS_NAMES)), dtype=np.float32)

    # Ensemble: accumulate each fold model's class probabilities, then average.
    for model_path in xgb_model_paths:
        booster = xgb.Booster()
        booster.load_model(model_path)
        probas += booster.predict(dmatrix)
    probas /= len(xgb_model_paths)

    # Final prediction: the most probable class of the averaged distribution.
    preds = probas.argmax(axis=1)

    # Result frame with only the ID and the predicted class name.
    result = pd.DataFrame({
        "ID": raw["ID"],
        "RENDIMIENTO_GLOBAL": [IDX2CLASS[idx] for idx in preds]
    })

    return result

# Load the raw test set and transform it into model features.
raw_test_df = load_dataset('test.csv')
test_df = process_dataset_test(raw_test_df, FeatureConfig.from_dataframe(raw_test_df))

# Predict with the fold ensemble and write the submission file (ID + predicted class).
df_result = predict_with_xgboost(test_df, raw_test_df)
df_result.to_csv("submission_xgboost.csv", index=False, encoding="utf-8")

Modelos XGBoost: ['artifacts_xgboost/xgboost_fold0.json', 'artifacts_xgboost/xgboost_fold1.json', 'artifacts_xgboost/xgboost_fold2.json', 'artifacts_xgboost/xgboost_fold3.json', 'artifacts_xgboost/xgboost_fold4.json']
