In [1]:
# ============================================================
# 04 - EXPERIMENTO: PCA + REGRESIÓN LOGÍSTICA
# ============================================================

# 1. Importaciones
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA # Para reducción de dimensionalidad
from sklearn.linear_model import LogisticRegression # Modelo lineal simple

!pip install opendatasets
import opendatasets as od

# 2. Carga de Datos
dataset_link="https://www.kaggle.com/competitions/udea-ai-4-eng-20252-pruebas-saber-pro-colombia/overview"
od.download(dataset_link)

data_path = "udea-ai-4-eng-20252-pruebas-saber-pro-colombia/"
train = pd.read_csv(data_path + "train.csv")
test = pd.read_csv(data_path + "test.csv")

# Keep the test identifiers aside; they are reused when building the submission.
test_ids = test['ID']

# 3. X / y split
# The target and the row identifier are removed from the training features;
# only the identifier is removed from the test features.
target_col = 'RENDIMIENTO_GLOBAL'
y = train[target_col]
X = train.drop(columns=[target_col, 'ID'])
X_test = test.drop(columns='ID')

# 4. Preprocessing (same as in the previous submission)
# Split the feature columns by dtype: numeric columns vs. object (string) columns.
numeric_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# If PERIODO_ACADEMICO was read as a number, treat it as a category instead:
# move it to the categorical list and cast it to string in both frames.
if 'PERIODO_ACADEMICO' in numeric_features:
    numeric_features.remove('PERIODO_ACADEMICO')
    categorical_features.append('PERIODO_ACADEMICO')
    for frame in (X, X_test):
        frame['PERIODO_ACADEMICO'] = frame['PERIODO_ACADEMICO'].astype(str)

# Numeric branch: median imputation followed by standardization.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group to its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 5. Target encoding
# The category order is given explicitly so the numeric codes follow the
# listed order of the performance levels ('bajo' first, 'alto' last).
categories_order = [['bajo', 'medio-bajo', 'medio-alto', 'alto']]
target_encoder = OrdinalEncoder(categories=categories_order)

# The encoder works on 2-D input, so the target Series is wrapped in a
# one-column frame and the encoded result is flattened back to 1-D.
y_frame = y.to_frame()
y_encoded = target_encoder.fit_transform(y_frame).ravel()

# 6. MODEL DEFINITION (this is where this experiment differs)
# Pipeline: preprocessing -> PCA (reduce to 100 dimensions) -> logistic regression
pca_step = PCA(n_components=100, random_state=42)

# solver='saga' is a good fit for large datasets
logreg_step = LogisticRegression(solver='saga', max_iter=1000, random_state=42)

model_pca = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', pca_step),
    ('classifier', logreg_step),
])

# 7. Training
# Fit the whole pipeline (preprocessing, PCA, classifier) on the training data.
print("Entrenando PCA + Regresión Logística...")
model_pca.fit(X, y_encoded)
print("¡Entrenamiento completado!")

# 8. Prediction
print("Generando predicciones...")
y_pred_encoded = model_pca.predict(X_test)

# 9. Build the submission
# The target encoder expects 2-D input: turn the predictions into a single
# column, map the codes back to their original labels, then flatten again.
encoded_column = y_pred_encoded.reshape(-1, 1)
y_pred_labels = target_encoder.inverse_transform(encoded_column).ravel()

submission = pd.DataFrame({'ID': test_ids, 'RENDIMIENTO_GLOBAL': y_pred_labels})

# Write the file without the index and show a short preview.
submission.to_csv('submission_pca_logreg.csv', index=False)
print("Archivo 'submission_pca_logreg.csv' generado.")
print(submission.head())

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: simoncorrearios
Your Kaggle Key: ··········
Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to ./udea-ai-4-eng-20252-pruebas-saber-pro-colombia


100%|██████████| 29.9M/29.9M [00:00<00:00, 1.30GB/s]


Extracting archive ./udea-ai-4-eng-20252-pruebas-saber-pro-colombia/udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to ./udea-ai-4-eng-20252-pruebas-saber-pro-colombia





Entrenando PCA + Regresión Logística...
¡Entrenamiento completado!
Generando predicciones...
Archivo 'submission_pca_logreg.csv' generado.
       ID RENDIMIENTO_GLOBAL
0  550236               bajo
1   98545         medio-alto
2  499179               alto
3  782980               bajo
4  785185               bajo
