## Dependencias

In [3]:
!pip install -q kaggle category_encoders

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
# Librerias Básicas
import pandas as pd
import numpy as np
import joblib
import json
from pathlib import Path
import os

# Anotaciones, clases, tipos
from dataclasses import dataclass
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Dict, Iterable, List, Optional, Tuple

# Encoders
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

# Evaluación
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

## Configuración

In [10]:
# === VARIABLES DE CONFIGURACIÓN ===
SEED = 42
TARGET_COLUMN = "RENDIMIENTO_GLOBAL"
DATA_PATH = Path("train.csv")
EXPORT_PATH = Path("processed_train.parquet")
PIPELINE_PATH = Path("preprocessing_pipeline.joblib")

# === COLUMNAS IDENTIFICADAS EN EL ANÁLISIS EXPLORATORIO ===

# Columnas que no vamos a usar
DROP_CANDIDATES = [
    "ID",  # No hay señal predictiva
    "F_TIENEINTERNET.1",  # Información duplicada de F_TIENEINTERNET
    "E_PRIVADO_LIBERTAD",  # Varianza MUY baja (≈0.005%)
]

# Columnas con alta cardinalidad
HIGH_CARDINALITY = [
    "E_PRGM_ACADEMICO",  # Más de 900 categorías
    "E_PRGM_DEPARTAMENTO",  # Más de 30 categorías
]

# Columnas con baja cardinalidad
LOW_CARDINALITY = [
    "F_TIENEINTERNET",
    "F_TIENECOMPUTADOR",
    "F_TIENEAUTOMOVIL",
    "F_TIENELAVADORA",
    "E_PAGOMATRICULAPROPIO",
]

# Mapeo para columnas ordinales
ORDINAL_MAP = {
    "F_ESTRATOVIVIENDA": [
        "Desconocido",
        "Sin Estrato",
        "Estrato 1",
        "Estrato 2",
        "Estrato 3",
        "Estrato 4",
        "Estrato 5",
        "Estrato 6",
    ],
    "E_HORASSEMANATRABAJA": [
        "Desconocido",
        "0",
        "Menos de 10 horas",
        "Entre 11 y 20 horas",
        "Entre 21 y 30 horas",
        "Más de 30 horas",
    ],
    "E_VALORMATRICULAUNIVERSIDAD": [
        "Desconocido",
        "No pagó matrícula",
        "Menos de 500 mil",
        "Entre 500 mil y menos de 1 millón",
        "Entre 1 millón y menos de 2.5 millones",
        "Entre 2.5 millones y menos de 4 millones",
        "Entre 4 millones y menos de 5.5 millones",
        "Entre 5.5 millones y menos de 7 millones",
        "Más de 7 millones",
    ],
    "F_EDUCACIONPADRE": [
        "Desconocido",
        "Ninguno",
        "Primaria incompleta",
        "Primaria completa",
        "Secundaria (Bachillerato) incompleta",
        "Secundaria (Bachillerato) completa",
        "Técnica o tecnológica incompleta",
        "Técnica o tecnológica completa",
        "Educación profesional incompleta",
        "Educación profesional completa",
        "Postgrado",
        "No Aplica",
        "No sabe",
    ],
    "F_EDUCACIONMADRE": [
        "Desconocido",
        "Ninguno",
        "Primaria incompleta",
        "Primaria completa",
        "Secundaria (Bachillerato) incompleta",
        "Secundaria (Bachillerato) completa",
        "Técnica o tecnológica incompleta",
        "Técnica o tecnológica completa",
        "Educación profesional incompleta",
        "Educación profesional completa",
        "Postgrado",
        "No Aplica",
        "No sabe",
    ],
}

# Columnas numéricas
NUMERIC_COLUMNS = [
    "PERIODO_ACADEMICO",
    "INDICADOR_1",
    "INDICADOR_2",
    "INDICADOR_3",
    "INDICADOR_4",
]

## Dataset

In [11]:
os.environ["KAGGLE_CONFIG_DIR"] = "."
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia
!unzip udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
replace submission_example.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [12]:
def load_dataset(path: Path = DATA_PATH) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="latin-1")
    return df

def profile_dataset(df: pd.DataFrame) -> Dict[str, object]:
    summary = {
        "shape": df.shape,
        "target_distribution": df[TARGET_COLUMN].value_counts().to_dict(),
        "dtypes": df.dtypes.astype(str).to_dict(),
        "missing_fraction": df.isna().mean().sort_values(ascending=False).head(10).to_dict(),
    }
    return summary

## Clases de Ayuda

### Imputer

In [20]:
class DataFrameImputer(BaseEstimator, TransformerMixin):
    """
    Rellenar datos faltantes, manteniendo la estructura DataFrame
    con los nombres de las columnas.
    """

    def __init__(self, fill_value: str = "Desconocido") -> None:
        self.fill_value = fill_value
        self.columns_: Optional[List[str]] = None

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> "DataFrameImputer":
        self.columns_ = list(X.columns) if hasattr(X, "columns") else None
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        if self.columns_ is None:
            raise RuntimeError("Se debe llamar fit antes de transform.")
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.columns_)
        return X.fillna(self.fill_value)

    def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> np.ndarray:
        """Devolver nombre de los features/columnas para los transformers."""
        if input_features is not None:
            return np.asarray(input_features, dtype=object)
        if self.columns_ is None:
            raise RuntimeError("Llamar fit antes de solicitar los nombres.")
        return np.asarray(self.columns_, dtype=object)


### Configuración de Features

In [21]:
@dataclass
class FeatureConfig:
    """Clase para almacenar la categorización de columnas"""

    drop: List[str]
    high_card: List[str]
    low_card: List[str]
    ordinal: List[str]
    numeric: List[str]
    ordinal_categories: List[List[str]]

    @classmethod
    def from_dataframe(cls, df: pd.DataFrame) -> "FeatureConfig":
        available_columns = set(df.columns)
        drop = [c for c in DROP_CANDIDATES if c in available_columns]
        usable = available_columns - {TARGET_COLUMN}
        high_card = [c for c in HIGH_CARDINALITY if c in usable]
        low_card = [c for c in LOW_CARDINALITY if c in usable]
        ordinal = [c for c in ORDINAL_MAP if c in usable]
        numeric = [c for c in NUMERIC_COLUMNS if c in usable]
        ordinal_categories = [ORDINAL_MAP[c] for c in ordinal]
        return cls(drop, high_card, low_card, ordinal, numeric, ordinal_categories)

## Pipeline de Preprocesamiento




### Construcción

In [22]:
def make_preprocessor(config: FeatureConfig, random_state: int = SEED) -> Pipeline:
    """
    Construir el pipeline de preprocesamiento, con las transformaciones para
    cada columna.
    """
    transformers = []

    # Para las columnas de alta cardinalidad:
    # 1. Rellenamos datos vacíos manteniendo la estructura
    # 2. Convertimos las categorías en números usando la media de RENDIMIENTO_GLOBAL
    # 3. Agregamos este pipeline al conjunto de transformaciones
    if config.high_card:
        high_card_pipeline = Pipeline(
            steps=[
                ("imputer", DataFrameImputer(fill_value="Desconocido")),
                (
                    "encoder",
                    TargetEncoder(
                        cols=config.high_card,
                        smoothing=0.5,
                        handle_unknown="value",
                        handle_missing="value",
                    ),
                ),
            ]
        )
        transformers.append(("high_card", high_card_pipeline, config.high_card))

    # Para las columnas ordinales:
    # 1. Rellenamos datos vacíos con "Desconocido"
    # 2. Convierte las categorías en números, respetando el orden lógico
    # 3. Agregamos este pipeline al conjunto de transformaciones
    if config.ordinal:
        ordinal_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="constant", fill_value="Desconocido")),
                (
                    "encoder",
                    OrdinalEncoder(
                        categories=config.ordinal_categories,
                        dtype=float,
                        handle_unknown="use_encoded_value",
                        unknown_value=-1,
                    ),
                ),
            ]
        )
        transformers.append(("ordinal", ordinal_pipeline, config.ordinal))

    # Para las columnas de baja cardinalidad:
    # 1. Rellenamos datos vacíos con "Desconocido"
    # 2. Convierte las categorías en columnas binarias
    # 3. Agregamos este pipeline al conjunto de transformaciones
    if config.low_card:
        low_card_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="constant", fill_value="Desconocido")),
                (
                    "encoder",
                    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                ),
            ]
        )
        transformers.append(("one_hot", low_card_pipeline, config.low_card))

    # Para las columnas numéricas:
    # 1. Rellenamos datos vacíos con la MEDIANA
    # 2. Transformamos los valores para que tengan media 0 y desv. std 1
    # 3. Agregamos este pipeline al conjunto de transformaciones
    if config.numeric:
        numeric_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )
        transformers.append(("numeric", numeric_pipeline, config.numeric))

    column_transformer = ColumnTransformer(
        transformers=transformers,
        remainder="drop",
        verbose_feature_names_out=False,
    )

    preprocessing_pipeline = Pipeline(
        steps=[
            ("preprocess", column_transformer),
            ("variance", VarianceThreshold(threshold=1e-5)),
        ]
    )
    return preprocessing_pipeline

### Exportar Pipeline y Dataset preprocesado

In [23]:
def export_transformed_dataset(
    df: pd.DataFrame,
    config: FeatureConfig,
    output_path: Path = EXPORT_PATH,
    pipeline_path: Path = PIPELINE_PATH,
    random_state: int = SEED,
) -> None:
    X = df.drop(columns=[TARGET_COLUMN] + config.drop, errors="ignore")
    y = df[TARGET_COLUMN]

    preprocessing = make_preprocessor(config, random_state=random_state)
    preprocessing.fit(X, y)

    feature_names = preprocessing.named_steps["preprocess"].get_feature_names_out()
    mask = preprocessing.named_steps["variance"].get_support()
    selected_feature_names = feature_names[mask]
    transformed = preprocessing.transform(X)

    processed_df = pd.DataFrame(transformed, columns=selected_feature_names, index=df.index)
    processed_df[TARGET_COLUMN] = y.values
    processed_df.to_parquet(output_path, index=False)

    joblib.dump(preprocessing, pipeline_path)

    return processed_df


In [24]:
processed_df = export_transformed_dataset(load_dataset(), FeatureConfig.from_dataframe(load_dataset()))
print(processed_df.head(10))

   E_PRGM_ACADEMICO  E_PRGM_DEPARTAMENTO  F_ESTRATOVIVIENDA  \
0          1.662742             1.458950                4.0   
1          1.441590             1.481302                4.0   
2          1.741652             1.458950                4.0   
3          1.622211             1.404329                5.0   
4          1.543373             1.407370                4.0   
5          1.465016             1.407370                6.0   
6          1.145641             1.657729                3.0   
7          1.707842             1.458950                3.0   
8          1.396686             1.481302                2.0   
9          1.622211             1.407370                6.0   

   E_HORASSEMANATRABAJA  E_VALORMATRICULAUNIVERSIDAD  F_EDUCACIONPADRE  \
0                   2.0                          7.0              -1.0   
1                   1.0                          5.0              -1.0   
2                  -1.0                          5.0               5.0   
3         

## Evaluación con un modelo preliminar

In [25]:
def make_model_pipeline(config: FeatureConfig, random_state: int = SEED) -> Pipeline:
    estimator = HistGradientBoostingClassifier(
        random_state=random_state,
        max_depth=6,
        learning_rate=0.12,
        l2_regularization=0.0,
    )
    preprocessing = make_preprocessor(config, random_state=random_state)
    model_pipeline = Pipeline(
        steps=[
            ("features", preprocessing),
            ("model", estimator),
        ]
    )
    return model_pipeline

In [26]:
def stratified_sample(df: pd.DataFrame, per_class: int, random_state: int = SEED) -> pd.DataFrame:
    return df.groupby(TARGET_COLUMN, group_keys=False).sample(
        n=min(per_class, df.groupby(TARGET_COLUMN).size().min()),
        random_state=random_state,
    )

In [28]:
def evaluate_pipeline(
    df: pd.DataFrame,
    config: FeatureConfig,
    sample_per_class: Optional[int] = 5000,
    cv_splits: int = 3,
    random_state: int = SEED,
):
    if sample_per_class is not None:
        # Usando un sample (más liviano)
        df_evaluation = stratified_sample(df, per_class=sample_per_class, random_state=random_state)
    else:
        # Usar todo el dataset
        df_evaluation = df

    X = df_evaluation.drop(columns=[TARGET_COLUMN] + config.drop, errors="ignore")
    y = df_evaluation[TARGET_COLUMN]

    pipeline = make_model_pipeline(config, random_state=random_state)
    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=random_state)
    scores = cross_val_score(pipeline, X, y, scoring="accuracy", cv=cv, n_jobs=1)

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_valid)

    evaluation = {
        "cv_scores": scores.tolist(),
        "cv_mean": float(scores.mean()),
        "holdout_accuracy": float(accuracy_score(y_valid, y_pred)),
        "holdout_report": classification_report(y_valid, y_pred, output_dict=True),
    }

    return evaluation


In [29]:
evaluation = evaluate_pipeline(load_dataset(), FeatureConfig.from_dataframe(load_dataset()), sample_per_class=None)


{'cv_scores': [0.428896956254278, 0.4272829274843718, 0.427841773056712], 'cv_mean': 0.4280072189317872, 'holdout_accuracy': 0.4292129963898917, 'holdout_report': {'alto': {'precision': 0.5395825128003151, 'recall': 0.6240747067532172, 'f1-score': 0.5787611554100438, 'support': 35124.0}, 'bajo': {'precision': 0.4500742432895488, 'recall': 0.5694713414457901, 'f1-score': 0.5027816056754963, 'support': 34597.0}, 'medio-alto': {'precision': 0.3246186551299743, 'recall': 0.2535834984267568, 'f1-score': 0.2847375566350982, 'support': 34324.0}, 'medio-bajo': {'precision': 0.33421284080914687, 'recall': 0.26469307792773183, 'f1-score': 0.2954181040765755, 'support': 34455.0}, 'accuracy': 0.4292129963898917, 'macro avg': {'precision': 0.4121220630072463, 'recall': 0.42795565613837405, 'f1-score': 0.4154246054493035, 'support': 138500.0}, 'weighted avg': {'precision': 0.4128594146097269, 'recall': 0.4292129963898917, 'f1-score': 0.4164267487659137, 'support': 138500.0}}}


In [30]:
print(json.dumps(evaluation, indent=4))

{
    "cv_scores": [
        0.428896956254278,
        0.4272829274843718,
        0.427841773056712
    ],
    "cv_mean": 0.4280072189317872,
    "holdout_accuracy": 0.4292129963898917,
    "holdout_report": {
        "alto": {
            "precision": 0.5395825128003151,
            "recall": 0.6240747067532172,
            "f1-score": 0.5787611554100438,
            "support": 35124.0
        },
        "bajo": {
            "precision": 0.4500742432895488,
            "recall": 0.5694713414457901,
            "f1-score": 0.5027816056754963,
            "support": 34597.0
        },
        "medio-alto": {
            "precision": 0.3246186551299743,
            "recall": 0.2535834984267568,
            "f1-score": 0.2847375566350982,
            "support": 34324.0
        },
        "medio-bajo": {
            "precision": 0.33421284080914687,
            "recall": 0.26469307792773183,
            "f1-score": 0.2954181040765755,
            "support": 34455.0
        },
        "a