## **Instaladores**

In [None]:
!pip install pandas scikit-learn matplotlib seaborn >nul 2>&1
!pip install catboost >nul 2>&1

In [None]:
!pip install optuna >nul 2>&1

## **Librerías**

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.base import BaseEstimator, ClassifierMixin
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import xgboost as xgb
# Mount Google Drive at /content/drive so the CSV files stored there can be read.
drive.mount('/content/drive')
# Remove the column limit so displayed DataFrames show every column.
pd.set_option('display.max_columns', None)

Mounted at /content/drive


## **Funciones**

In [None]:
# Summary table
def resumen(dataframe):
  """Display a styled per-column summary of a DataFrame.

  For every column the table shows its dtype, number of distinct values,
  non-null count, null count and percentage of nulls, sorted by descending
  percentage of nulls. A one-line total of columns and rows is printed after
  the table.

  Args:
    dataframe: pandas DataFrame to summarise.

  Returns:
    None. The table is rendered with ``display`` and the totals are printed.
  """
  # Summary statistics
  total_counts = len(dataframe)
  null_counts = dataframe.isnull().sum()
  non_null_counts = dataframe.notnull().sum()
  data_types = dataframe.dtypes
  unique_counts = dataframe.nunique()
  # A frame without rows has nothing to divide by; report 0% instead of NaN.
  if total_counts:
    null_pct = (null_counts / total_counts) * 100
  else:
    null_pct = null_counts.astype(float)
  # Build the summary DataFrame
  summary_df = pd.DataFrame({
      'Tipo de Dato': data_types,
      'Valores Únicos': unique_counts,
      'No Nulos': non_null_counts,
      'Nulos': null_counts,
      '% Nulos': null_pct
  })
  # Sort by percentage of nulls
  summary_df = summary_df.sort_values(by='% Nulos', ascending=False)
  # Apply formatting with Styler.
  # The bar scale is pinned to 0-100: without vmin/vmax the bar is scaled
  # between the column's min and max, which divides by zero (RuntimeWarning)
  # whenever every column has the same percentage of nulls, e.g. all 0%.
  styled_summary = summary_df.style \
      .format({'% Nulos': '{:.2f}%'}) \
      .bar(subset='% Nulos', color='#f08080', vmin=0, vmax=100) \
      .set_properties(**{'text-align': 'center'}) \
      .set_caption("Resumen de Columnas del DataFrame") \
      .set_table_styles([
          # Column header colours
          {'selector': 'th', 'props': [
              ('text-align', 'center'),
              ('background-color', '#1f4e79'),
              ('color', 'white'),
              ('font-weight', 'bold')
          ]},
          # Cell padding
          {'selector': 'td', 'props': [('padding', '5px')]},
          # Table caption style
          {'selector': 'caption', 'props': [
              ('color', '#1f4e79'),
              ('font-size', '18px'),
              ('text-align', 'center'),
              ('font-weight', 'bold')
          ]}
      ])
  # Render the table
  display(styled_summary)
  # Closing totals
  print(f"\n🧾 El DataFrame tiene un total de **{len(dataframe.columns)}** columnas y **{len(dataframe)}** filas.")

In [None]:
# Header preview helper
def encabezado(dataframe, filas=5):
  """Display the first ``filas`` rows of ``dataframe`` as a styled table.

  Args:
    dataframe: pandas DataFrame to preview.
    filas: number of leading rows to show (default 5).

  Returns:
    None. The styled preview is rendered with ``display``.
  """
  # CSS applied to every data cell
  estilo_celdas = {
      'background-color': '#f9f9f9',
      'border': '1px solid #ddd',
      'text-align': 'center'
  }
  # CSS applied to the header cells
  estilo_encabezado = [
      {'selector': 'th', 'props': [('background-color', '#4B8BBE'), ('color', 'white'), ('text-align', 'center')]}
  ]
  vista = dataframe.head(filas).style
  vista = vista.set_properties(**estilo_celdas)
  vista = vista.set_table_styles(estilo_encabezado)
  # Mark missing values so they stand out in the preview
  vista = vista.highlight_null()
  vista = vista.set_caption("🔍 Vista Previa de los Primeros Registros")
  display(vista)

## **Cargar Datos**

In [None]:
# Read the training features, training target, test features and test IDs
# from the CSV files stored in the mounted Google Drive.
X_train = pd.read_csv('/content/drive/MyDrive/X_train.csv')
Y_train = pd.read_csv('/content/drive/MyDrive/y_train.csv')
X_test = pd.read_csv('/content/drive/MyDrive/X_test.csv')
id_test = pd.read_csv('/content/drive/MyDrive/id_test.csv')

In [None]:
# Per-column summary (dtype, unique values, null counts) of the training features.
resumen(X_train)

  end = (x - left) / (right - left)


Unnamed: 0,Tipo de Dato,Valores Únicos,No Nulos,Nulos,% Nulos
ESTU_PRGM_DEPARTAMENTO,float64,688698,692500,0,0.00%
ESTU_VALORMATRICULAUNIVERSIDAD,float64,692310,692500,0,0.00%
ESTU_HORASSEMANATRABAJA,float64,692436,692500,0,0.00%
FAMI_ESTRATOVIVIENDA,float64,692422,692500,0,0.00%
FAMI_TIENEINTERNET,int64,2,692500,0,0.00%
FAMI_EDUCACIONPADRE,float64,692050,692500,0,0.00%
FAMI_TIENELAVADORA,int64,2,692500,0,0.00%
FAMI_TIENEAUTOMOVIL,int64,2,692500,0,0.00%
ESTU_PAGOMATRICULAPROPIO,int64,2,692500,0,0.00%
FAMI_TIENECOMPUTADOR,int64,2,692500,0,0.00%



🧾 El DataFrame tiene un total de **25** columnas y **692500** filas.


In [None]:
# Preview the first rows of the training features.
encabezado(X_train)

Unnamed: 0,ESTU_PRGM_DEPARTAMENTO,ESTU_VALORMATRICULAUNIVERSIDAD,ESTU_HORASSEMANATRABAJA,FAMI_ESTRATOVIVIENDA,FAMI_TIENEINTERNET,FAMI_EDUCACIONPADRE,FAMI_TIENELAVADORA,FAMI_TIENEAUTOMOVIL,ESTU_PAGOMATRICULAPROPIO,FAMI_TIENECOMPUTADOR,FAMI_EDUCACIONMADRE,coef_1,coef_2,coef_3,coef_4,EDUMADRE_DISCRIMINANTE,EDUPADRE_DISCRIMINANTE,GRUPO_DEPARTAMENTO,ISE,ESFUERZO_FAMILIAR,ACCESO_TECNOLOGICO,NIVEL_SOCIOFAMILIAR,DESAJUSTE_ECONOMICO,INCONGRUENCIA_ISE_MATRICULA,ESFUERZO_NORM
0,1.491769,1.491769,1.491769,1.491769,1,1.491769,1,1,0,1,1.491769,0.437002,-0.556223,0.813978,0.060296,1.491769,1.491769,1.491769,1.395273,0.320585,0.415248,1.13056,1.137621,-1.202997,-0.652458
1,1.491769,1.491769,1.491769,1.745884,0,1.491769,1,0,0,1,1.491769,0.346934,-0.481341,0.50818,0.016142,1.491769,1.491769,1.491769,0.380067,-1.022363,-0.76121,0.418709,-0.004977,-0.361375,-0.936679
2,1.745884,1.245884,1.491769,1.497256,1,1.491769,1,0,0,0,1.491769,0.232301,-0.492038,0.729034,0.016142,1.491769,1.245884,1.745884,-0.055021,0.768234,-1.937669,-0.115179,-0.004977,0.143598,0.275999
3,1.491769,1.491769,1.245884,1.491769,1,1.491769,1,0,0,1,1.245884,1.583325,-0.941332,-0.171371,-0.690324,1.245884,1.745884,1.497256,-0.34508,-0.574714,0.415248,-0.649068,-0.004977,0.648571,-0.354025
4,1.491769,1.163923,1.491769,1.372942,1,1.491769,1,1,0,1,1.491769,0.387874,-0.299484,0.389259,0.457683,1.491769,1.491769,1.122942,-0.34508,0.320585,0.415248,-0.649068,-0.004977,0.480247,0.242839


In [None]:
# Per-column summary (dtype, unique values, null counts) of the test features.
resumen(X_test)

  end = (x - left) / (right - left)


Unnamed: 0,Tipo de Dato,Valores Únicos,No Nulos,Nulos,% Nulos
ESTU_PRGM_DEPARTAMENTO,float64,31,296786,0,0.00%
ESTU_VALORMATRICULAUNIVERSIDAD,float64,8,296786,0,0.00%
ESTU_HORASSEMANATRABAJA,float64,5,296786,0,0.00%
FAMI_ESTRATOVIVIENDA,float64,7,296786,0,0.00%
FAMI_TIENEINTERNET,int64,2,296786,0,0.00%
FAMI_EDUCACIONPADRE,float64,12,296786,0,0.00%
FAMI_TIENELAVADORA,int64,2,296786,0,0.00%
FAMI_TIENEAUTOMOVIL,int64,2,296786,0,0.00%
ESTU_PAGOMATRICULAPROPIO,int64,2,296786,0,0.00%
FAMI_TIENECOMPUTADOR,int64,2,296786,0,0.00%



🧾 El DataFrame tiene un total de **25** columnas y **296786** filas.


In [None]:
# Preview the first rows of the test features.
encabezado(X_test)

Unnamed: 0,ESTU_PRGM_DEPARTAMENTO,ESTU_VALORMATRICULAUNIVERSIDAD,ESTU_HORASSEMANATRABAJA,FAMI_ESTRATOVIVIENDA,FAMI_TIENEINTERNET,FAMI_EDUCACIONPADRE,FAMI_TIENELAVADORA,FAMI_TIENEAUTOMOVIL,ESTU_PAGOMATRICULAPROPIO,FAMI_TIENECOMPUTADOR,FAMI_EDUCACIONMADRE,coef_1,coef_2,coef_3,coef_4,EDUMADRE_DISCRIMINANTE,EDUPADRE_DISCRIMINANTE,GRUPO_DEPARTAMENTO,ACCESO_TECNOLOGICO,ESFUERZO_FAMILIAR,ISE,NIVEL_SOCIOFAMILIAR,DESAJUSTE_ECONOMICO,INCONGRUENCIA_ISE_MATRICULA,ESFUERZO_NORM
0,1.576757,1.391822,1.396823,1.482493,1,1.453422,1,0,1,1,1.67429,0.492496,-0.433304,0.907947,-0.22859,1.67429,1.517896,1.579579,0.366753,-1.463543,-0.384764,0.022671,-1.535605,0.014923,-0.961952
1,1.407371,1.678276,1.547282,1.609556,1,1.548989,1,0,0,1,1.451805,-0.339797,0.24326,0.555805,0.910679,1.524045,1.517896,1.436167,0.366753,0.31971,0.20378,0.202456,0.622868,-0.156366,-0.137839
2,1.458951,1.684564,1.243946,1.482493,1,1.609116,1,0,0,1,1.568799,0.138153,-0.338162,0.522267,-0.22859,1.524045,1.519429,1.436167,0.366753,-1.463543,-0.237628,-0.157114,-0.456368,0.186211,-1.017887
3,1.716829,1.684564,1.547282,1.657122,0,1.68885,1,0,0,0,1.696341,-0.891912,1.564675,-0.735383,0.466808,1.687228,1.682998,1.579579,-3.721459,-0.126103,-1.561853,-1.77518,0.622868,1.727805,1.890747
4,1.481302,1.678276,1.49193,1.609556,1,1.548989,1,0,0,1,1.568799,-0.488126,0.24326,0.723491,0.348442,1.524045,1.517896,1.579579,0.366753,-0.126103,-0.090492,-0.157114,0.622868,0.186211,-0.222363


In [None]:
# Preview the first rows of the training target.
encabezado(Y_train)

Unnamed: 0,RENDIMIENTO_GLOBAL
0,medio-alto
1,bajo
2,bajo
3,alto
4,medio-bajo


In [None]:
# Ordinal encoding of the target (important for ordinal-type models): the
# integer codes keep the order bajo < medio-bajo < medio-alto < alto.
ordinal_Y={
    'alto':3,
    'medio-alto':2,
    'medio-bajo':1,
    'bajo':0
}
# Encode only while the column still holds the text labels, so re-running this
# cell is a no-op instead of mapping the integer codes to NaN.
if not Y_train['RENDIMIENTO_GLOBAL'].isin(list(ordinal_Y.values())).all():
    _codificado = Y_train['RENDIMIENTO_GLOBAL'].map(ordinal_Y)
    # Fail loudly on labels missing from the mapping rather than carrying NaN
    # targets into training.
    if _codificado.isnull().any():
        _desconocidas = Y_train.loc[_codificado.isnull(), 'RENDIMIENTO_GLOBAL'].unique().tolist()
        raise ValueError(f"Etiquetas de RENDIMIENTO_GLOBAL no reconocidas: {_desconocidas}")
    Y_train['RENDIMIENTO_GLOBAL'] = _codificado.astype('int64')

In [None]:
# Preview the training target again, after the ordinal encoding cell.
encabezado(Y_train)

Unnamed: 0,RENDIMIENTO_GLOBAL
0,2
1,0
2,0
3,3
4,1


In [None]:
# Per-column summary (dtype, unique values, null counts) of the training target.
resumen(Y_train)

  end = (x - left) / (right - left)


Unnamed: 0,Tipo de Dato,Valores Únicos,No Nulos,Nulos,% Nulos
RENDIMIENTO_GLOBAL,int64,4,692500,0,0.00%



🧾 El DataFrame tiene un total de **1** columnas y **692500** filas.


In [None]:
# Hold out 20% of the training data as a validation split, stratified on the
# target, with a fixed seed so the split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=42,
    stratify=Y_train,
)

## **Modelos**

### **Fase 1**

In [None]:
def objective(trial):
    """Optuna objective: fit an XGBoost classifier and return validation accuracy.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``xgb.XGBClassifier`` on ``x_train``/``y_train`` with early stopping
    monitored on ``x_val``/``y_val``, and scores it on that same split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted model's predictions on ``x_val``/``y_val``.
    """
    # NOTE(review): the validation split drives early stopping AND the returned
    # score, so the reported accuracy is likely optimistic; consider a separate
    # hold-out set for the final estimate.
    params = {
        'objective': 'multi:softmax',      # Multiclass classification (predicts the most probable class directly)
        'num_class': 4,                    # Number of target classes (4 performance levels)
        # NOTE(review): 'gpu_hist' / 'gpu_predictor' are deprecated in
        # XGBoost >= 2.0 in favour of device='cuda' — confirm the installed version.
        'tree_method': 'gpu_hist',         # Train on the GPU
        'predictor': 'gpu_predictor',      # Predict on the GPU as well

        # Hyperparameters tuned by Optuna:
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),  # Learning rate (log-uniform)
        'max_depth': trial.suggest_int('max_depth', 7, 14),                           # Maximum tree depth
        'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),                # Upper bound on trees (early stopping may use fewer)

        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),             # Minimum child weight per leaf (higher = more conservative)
        'gamma': trial.suggest_float('gamma', 0, 5),                                  # Minimum gain required to split (regularisation)
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),                      # Fraction of rows sampled per tree
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),        # Fraction of features sampled per tree

        'lambda': trial.suggest_float('lambda', 0.01, 10.0, log=True),                # L2 regularisation
        'alpha': trial.suggest_float('alpha', 0.01, 10.0, log=True),                  # L1 regularisation

        # 'scale_pos_weight' is intentionally not tuned: it only applies to
        # binary objectives, and XGBoost reported it as "not used" for this
        # multiclass model, so it was a dead search dimension.

        'eval_metric': 'mlogloss',               # Multiclass log loss, monitored for early stopping
        'verbosity': 0,                          # Silence training logs
        'early_stopping_rounds': 50,             # Stop when the metric has not improved for 50 rounds
        'random_state': 42,                      # Reproducibility
        'disable_default_eval_metric': True      # Only evaluate the metric set above
    }

    # Train the model with the sampled configuration
    model = xgb.XGBClassifier(**params)
    model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)

    # Score the model on the validation split
    preds = model.predict(x_val)
    return accuracy_score(y_val, preds)



[I 2025-07-02 18:11:25,752] A new study created in memory with name: no-name-ba753ce6-2580-42b9-8d58-ad401f160e40
[I 2025-07-02 18:12:11,984] Trial 0 finished with value: 0.4225270758122744 and parameters: {'learning_rate': 0.018044870641697953, 'max_depth': 8, 'n_estimators': 2654, 'min_child_weight': 9, 'gamma': 2.7151200411316245, 'subsample': 0.772203608344223, 'colsample_bytree': 0.7101415586740398, 'lambda': 2.3432413548517586, 'alpha': 0.021588506942800633, 'scale_pos_weight': 0.9586504024183446}. Best is trial 0 with value: 0.4225270758122744.
[I 2025-07-02 18:12:51,159] Trial 1 finished with value: 0.42313357400722024 and parameters: {'learning_rate': 0.028542261527164714, 'max_depth': 12, 'n_estimators': 1095, 'min_child_weight': 9, 'gamma': 1.860937306224311, 'subsample': 0.834485703907346, 'colsample_bytree': 0.7460321836054405, 'lambda': 9.530080828082307, 'alpha': 0.01104146392643576, 'scale_pos_weight': 0.8526756473686905}. Best is trial 1 with value: 0.42313357400722024

🚀 Mejores hiperparámetros encontrados:
{'learning_rate': 0.014078117522367692, 'max_depth': 13, 'n_estimators': 2409, 'min_child_weight': 8, 'gamma': 2.1019294258203285, 'subsample': 0.9823973280803207, 'colsample_bytree': 0.627330872741747, 'lambda': 1.3269292043998044, 'alpha': 0.3234236193934336, 'scale_pos_weight': 0.9838534387226989}


Parameters: { "scale_pos_weight" } are not used.



In [None]:
# Predict on the test data.
# Reorder the test columns to match the training column order before
# predicting (the two files list some columns in a different order).
X_test = X_test.loc[:, X_train.columns]
y_pred_xgboos = final_model.predict(X_test)

In [None]:
# Attach the predictions to the ID table.
# The array is assigned positionally: wrapping it in a pd.Series would align on
# the index and silently fill NaN if id_test's index were not 0..n-1, whereas a
# raw array raises on a length mismatch.
id_test['RENDIMIENTO_GLOBAL'] = y_pred_xgboos

In [None]:
# Ordinal decoding: turn the integer class codes back into the text labels.
# (This heading was previously a bare line without `#`, which is a SyntaxError.)
_etiquetas_Y = {
    0:'bajo',
    1:'medio-bajo',
    2:'medio-alto',
    3:'alto'
}
# Decode only while the column still holds integer codes, so re-running this
# cell is a no-op instead of mapping the text labels to NaN.
if not id_test['RENDIMIENTO_GLOBAL'].isin(list(_etiquetas_Y.values())).all():
    id_test['RENDIMIENTO_GLOBAL'] = id_test['RENDIMIENTO_GLOBAL'].map(_etiquetas_Y)

In [None]:
# Per-column summary of the ID table, now including the predicted labels.
resumen(id_test)

  end = (x - left) / (right - left)


Unnamed: 0,Tipo de Dato,Valores Únicos,No Nulos,Nulos,% Nulos
ID,int64,296786,296786,0,0.00%
RENDIMIENTO_GLOBAL,object,4,296786,0,0.00%



🧾 El DataFrame tiene un total de **2** columnas y **296786** filas.


In [None]:
# Preview the first rows of the ID table with the predicted labels.
encabezado(id_test)

Unnamed: 0,ID,RENDIMIENTO_GLOBAL
0,550236,bajo
1,98545,medio-alto
2,499179,medio-bajo
3,782980,bajo
4,785185,bajo


In [None]:
# Save the ID table as a CSV file without the index column.
# The path is relative, so the file is written to the current working directory.
id_test.to_csv('id_test.csv', index=False)