In [4]:
# ============================================================
# 99 - modelo solución
# ============================================================

# 1. Importaciones
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
!pip install opendatasets
import opendatasets as od
import scipy

# 2. Data loading
# Fetch the Kaggle competition files with opendatasets and read the
# train/test tables from the folder it creates.
dataset_link="https://www.kaggle.com/competitions/udea-ai-4-eng-20252-pruebas-saber-pro-colombia/overview"
od.download(dataset_link)

data_path = "udea-ai-4-eng-20252-pruebas-saber-pro-colombia/"
train, test = (
    pd.read_csv(f"{data_path}{split_name}.csv")
    for split_name in ("train", "test")
)

# Keep the test identifiers aside; they are needed for the submission file.
test_ids = test['ID']

# Separate the target from the features. 'ID' is removed from both feature
# tables so it is never fed to the model.
target_col = 'RENDIMIENTO_GLOBAL'
y = train[target_col]
X = train.drop(columns=[target_col, 'ID'])
X_test = test.drop(columns='ID')

# 3. Preprocessor configuration
# Partition the feature columns by dtype: numeric columns and object-typed
# (categorical) columns each get their own pipeline below.
categorical_features = list(X.select_dtypes(include=['object']).columns)
numeric_features = list(X.select_dtypes(include=np.number).columns)

# When PERIODO_ACADEMICO was detected as numeric, treat it as a category
# instead: move it to the categorical list and cast it to string in both
# feature tables so the one-hot encoder sees consistent labels.
period_col = 'PERIODO_ACADEMICO'
if period_col in numeric_features:
    numeric_features.remove(period_col)
    categorical_features.append(period_col)
    for frame in (X, X_test):
        frame[period_col] = frame[period_col].astype(str)

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time, and the
# encoder output is kept sparse to save memory.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
])

# Route each column group to its pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 4. MANUAL PREPROCESSING (NO .toarray())
print("Transformando datos (manteniendo formato disperso para ahorrar RAM)...")

# Fit the preprocessor on the training features only, then reuse the fitted
# transformer on the test features. The result is expected to be a sparse
# (compressed) matrix.
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# Sort the sparse indices in place; per the original author's note this lets
# Keras read the matrices faster.
for sparse_matrix in (X_processed, X_test_processed):
    sparse_matrix.sort_indices()

# Diagnostics: the type printed here should be scipy.sparse.csr_matrix.
train_shape = X_processed.shape
print(f"Dimensiones de X train: {train_shape}")
print("Tipo de dato:", type(X_processed))

# 5. Target encoding
# The class order is given explicitly so the integer codes follow the listed
# order: 'bajo' -> 0, 'medio-bajo' -> 1, 'medio-alto' -> 2, 'alto' -> 3.
categories_order = [['bajo', 'medio-bajo', 'medio-alto', 'alto']]
target_encoder = OrdinalEncoder(categories=categories_order)

# The encoder expects 2-D input, hence the single-column frame.
target_frame = y.to_frame()
target_encoder.fit(target_frame)
y_int = target_encoder.transform(target_frame)

# One-hot version of the integer codes, as required by the
# categorical cross-entropy loss used for training.
y_categorical = to_categorical(y_int)

# 6. NEURAL NETWORK DEFINITION
# Fully connected classifier: three Dense blocks (512 -> 256 -> 128), each
# followed by batch normalization and dropout, and a softmax output layer.
input_dim = X_processed.shape[1]  # number of columns produced by the preprocessor
num_classes = 4                   # one output unit per target category

model_nn = Sequential([
    # Explicit input declaration. Passing `input_shape=` to the first Dense
    # layer of a Sequential model is discouraged by Keras and emits a
    # UserWarning; an Input object is the recommended form.
    tf.keras.Input(shape=(input_dim,)),

    # --- Block 1: wide first layer ---
    # 512 units to capture more detail from the wide one-hot encoded input.
    Dense(512, activation='relu'),
    BatchNormalization(),  # normalizes activations inside the network
    Dropout(0.3),          # drops 30% of units to limit memorization

    # --- Block 2: intermediate processing ---
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    # --- Block 3: refinement ---
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    # --- Output layer: one probability per class ---
    Dense(num_classes, activation='softmax')
])

# 0.001 is Adam's default learning rate; it is spelled out here so it is easy
# to tune. ReduceLROnPlateau may lower it further during training.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

# categorical_crossentropy matches the one-hot encoded target.
model_nn.compile(optimizer=opt,
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])

# 7. TRAINING WITH CALLBACKS
# Stop when 'val_loss' has not improved for 5 epochs and roll the model back
# to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate when 'val_loss' stalls for 2 epochs, to fine-tune.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
)

callbacks_list = [early_stopping, lr_on_plateau]

print("Entrenando Red Neuronal Mejorada...")
# Up to 30 epochs; early stopping ends the run sooner if validation loss
# stops improving. 20% of the training rows are held out for validation.
fit_options = dict(
    epochs=30,
    batch_size=128,
    validation_split=0.2,
    callbacks=callbacks_list,
)
history = model_nn.fit(X_processed, y_categorical, **fit_options)

# 8. Prediction
print("Generando predicciones...")

# The preprocessed (sparse) test matrix is passed to .predict() directly.
y_pred_probs = model_nn.predict(X_test_processed)

# Pick the most probable class index for every test row.
y_pred_int = y_pred_probs.argmax(axis=1)

# 9. Build the submission file
# Map the predicted class indices back to their original string labels.
# The encoder works on 2-D input, so the indices go in as a single column
# and the result is flattened afterwards.
pred_column = y_pred_int.reshape(-1, 1)
y_pred_labels = target_encoder.inverse_transform(pred_column).ravel()

# Two-column table: test identifier and predicted label.
submission = pd.DataFrame({'ID': test_ids})
submission['RENDIMIENTO_GLOBAL'] = y_pred_labels

# Write without the index column and show a short preview.
submission.to_csv('submission.csv', index=False)
print("Archivo 'submission.csv' generado exitosamente.")
print(submission.head())

Skipping, found downloaded files in "./udea-ai-4-eng-20252-pruebas-saber-pro-colombia" (use force=True to force download)
Transformando datos (manteniendo formato disperso para ahorrar RAM)...
Dimensiones de X train: (692500, 1050)
Tipo de dato: <class 'scipy.sparse._csr.csr_matrix'>


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Entrenando Red Neuronal Mejorada...
Epoch 1/30
[1m4329/4329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 30ms/step - accuracy: 0.3956 - loss: 1.3026 - val_accuracy: 0.4321 - val_loss: 1.2013 - learning_rate: 0.0010
Epoch 2/30
[1m4329/4329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 30ms/step - accuracy: 0.4318 - loss: 1.2036 - val_accuracy: 0.4381 - val_loss: 1.1949 - learning_rate: 0.0010
Epoch 3/30
[1m4329/4329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 29ms/step - accuracy: 0.4383 - loss: 1.1933 - val_accuracy: 0.4383 - val_loss: 1.1910 - learning_rate: 0.0010
Epoch 4/30
[1m4329/4329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 29ms/step - accuracy: 0.4428 - loss: 1.1876 - val_accuracy: 0.4426 - val_loss: 1.1865 - learning_rate: 0.0010
Epoch 5/30
[1m4329/4329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 29ms/step - accuracy: 0.4434 - loss: 1.1842 - val_accuracy: 0.4418 - val_loss: 1.1864 - learning_rate: 0.0010
Epoch 6/3