In [None]:
# ============================================================
# 📘 PROYECTO: Predicción de Desempeño en las Pruebas Saber Pro
# ============================================================
# 🧾 Overview:
# Las Pruebas Saber Pro son exámenes estandarizados aplicados en Colombia
# para evaluar la calidad de la educación superior.
# Evalúan cinco componentes: Inglés, Lectura Crítica, Competencias Ciudadanas,
# Razonamiento Cuantitativo y Comunicación Escrita.
#
# 🎯 Objetivo:
# Crear un modelo de clasificación que prediga el desempeño de un estudiante
# (bajo, medio-bajo, medio-alto o alto) a partir de sus características.
#
# En este notebook se muestra cómo se carga el archivo `train.csv`,
# se limpian los datos y se realiza el preprocesamiento necesario.
# ============================================================

# 1Ô∏è‚É£ IMPORTAR LIBRER√çAS
# ------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import csv

# 2. LOAD train.csv
# ------------------------------------------------------------
# Auto-detect the field separator (the CSV may use ";" instead of ",").
# Only the first 2048 characters are read for the detection.
with open("train.csv", 'r', encoding='utf-8') as f:
    sample = f.read(2048)

try:
    # Limit the sniffer to plausible CSV separators: with free-text fields an
    # unconstrained sniff can settle on a space or another frequent character.
    dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
    sep = dialect.delimiter
except csv.Error:
    # csv.Error is what the sniffer raises when it cannot decide; fall back
    # to a comma. Any other exception is a real problem and is not hidden.
    sep = ','

print(f"üß≠ Separador detectado: '{sep}'")

# low_memory=False makes pandas parse the file in a single pass instead of in
# internal chunks.
train = pd.read_csv("train.csv", sep=sep, low_memory=False)
print("‚úÖ Archivo cargado correctamente.")
print("Forma del dataset:", train.shape)
display(train.head())

# 3. INITIAL EXPLORATION
# ------------------------------------------------------------
print("\nüìä Informaci√≥n general:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
train.info()

# Fraction of missing values per column, ten worst columns first.
print("\nüîç Porcentaje de valores nulos:")
display(train.isnull().mean().sort_values(ascending=False).head(10))

# Summary statistics for every column (numeric and non-numeric), one row per
# column thanks to the transpose.
print("\nüìà Estad√≠sticas descriptivas:")
display(train.describe(include='all').T)

# 4. BASIC CLEANING
# ------------------------------------------------------------
# Drop identifier columns ('id' / 'ID') when present.
cols_to_drop = [c for c in ['id', 'ID'] if c in train.columns]
if cols_to_drop:
    train.drop(columns=cols_to_drop, inplace=True)
    print(f"üßπ Columnas eliminadas: {cols_to_drop}")

# Remove fully duplicated rows (checked after the identifier columns are gone).
duplicados = train.duplicated().sum()
if duplicados > 0:
    # ignore_index=True renumbers the surviving rows 0..n-1. Without it the
    # index keeps gaps, and any later label-based alignment with a frame that
    # carries a fresh RangeIndex would silently pair up the wrong rows.
    train.drop_duplicates(inplace=True, ignore_index=True)
    print(f"üßπ Se eliminaron {duplicados} filas duplicadas.")
else:
    print("‚úÖ No hay duplicados en el dataset.")

# Drop columns in which every value is missing.
train.dropna(axis=1, how='all', inplace=True)

# 5. DETECT NUMERIC AND CATEGORICAL FEATURES
# ------------------------------------------------------------
# The label must stay out of the feature matrix. If it were one-hot encoded
# along with the predictors, the model would see the answer (target leakage)
# and the fitted preprocessor would demand the label column at inference time.
target_col = 'RENDIMIENTO_GLOBAL'
feature_frame = train.drop(columns=[target_col], errors='ignore')

num_cols = feature_frame.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = feature_frame.select_dtypes(include=['object', 'category']).columns.tolist()

print("\nüìà Variables num√©ricas:", len(num_cols))
print("üß© Variables categ√≥ricas:", len(cat_cols))

# 6. DEFINE THE PREPROCESSING PIPELINES
# ------------------------------------------------------------
# - Numeric columns: impute with the median, then standardise.
# - Categorical columns: fill missing values with an explicit '__missing__'
#   category, then one-hot encode.

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# handle_unknown='ignore' lets the encoder accept categories at transform time
# that were not seen during fit. sparse_output=False returns a dense array.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='__missing__')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine both pipelines; numeric outputs come first, then the encoded
# categorical ones.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# 7. APPLY THE TRANSFORMATIONS
# ------------------------------------------------------------
X_processed = preprocessor.fit_transform(feature_frame)
print("\n‚úÖ Transformaci√≥n completada correctamente.")
print("Dimensiones del dataset transformado:", X_processed.shape)

# 8. BUILD THE RESULTING COLUMN NAMES
# ------------------------------------------------------------
num_features = num_cols
if cat_cols:
    cat_features = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(cat_cols)
else:
    # With no categorical columns the 'cat' pipeline is never fitted, so there
    # are no encoded names to ask for.
    cat_features = np.array([], dtype=object)
final_columns = np.concatenate([num_features, cat_features])

# Build the final DataFrame (features only at this point).
train_processed = pd.DataFrame(X_processed, columns=final_columns)

# Re-attach the untouched label as a single column. It is assigned by position
# (to_numpy) because `train` may carry a non-contiguous index after cleaning,
# while `train_processed` has a fresh 0..n-1 index. Rows whose label is
# missing are kept as-is; filter them out before fitting a classifier.
if target_col in train.columns:
    train_processed[target_col] = train[target_col].to_numpy()

print("\n‚úÖ DataFrame final creado.")
print("Tama√±o:", train_processed.shape)
display(train_processed.head())

# 9. SAVE RESULTS
# ------------------------------------------------------------
# Write the processed table to CSV without the row index, then serialise the
# fitted preprocessor with joblib.
train_processed.to_csv(path_or_buf="train_processed.csv", index=False)
joblib.dump(value=preprocessor, filename="preprocessor.joblib")

# One print call, one line per argument.
print(
    "\nüíæ Archivos generados correctamente:",
    "- train_processed.csv  ‚Üí Dataset limpio y listo para modelar",
    "- preprocessor.joblib  ‚Üí Pipeline de preprocesamiento guardado",
    sep="\n",
)

# 10. CONCLUSION
# ------------------------------------------------------------
# Closing summary of the steps carried out by this cell.
print("""
üèÅ Proceso completado exitosamente.
Se ha realizado:
- Carga y revisi√≥n del dataset `train.csv`
- Limpieza de columnas vac√≠as, duplicadas y no relevantes
- Imputaci√≥n de valores faltantes
- Escalado de variables num√©ricas
- Codificaci√≥n One-Hot de variables categ√≥ricas
- Exportaci√≥n de dataset procesado y pipeline
""")


üß≠ Separador detectado: ','
‚úÖ Archivo cargado correctamente.
Forma del dataset: (272814, 21)


Unnamed: 0,ID,PERIODO_ACADEMICO,E_PRGM_ACADEMICO,E_PRGM_DEPARTAMENTO,E_VALORMATRICULAUNIVERSIDAD,E_HORASSEMANATRABAJA,F_ESTRATOVIVIENDA,F_TIENEINTERNET,F_EDUCACIONPADRE,F_TIENELAVADORA,...,E_PRIVADO_LIBERTAD,E_PAGOMATRICULAPROPIO,F_TIENECOMPUTADOR,F_TIENEINTERNET.1,F_EDUCACIONMADRE,RENDIMIENTO_GLOBAL,INDICADOR_1,INDICADOR_2,INDICADOR_3,INDICADOR_4
0,904256,20212,ENFERMERIA,BOGOT√Å,Entre 5.5 millones y menos de 7 millones,Menos de 10 horas,Estrato 3,Si,T√©cnica o tecnol√≥gica incompleta,Si,...,N,No,Si,Si,Postgrado,medio-alto,0.322,0.208,0.31,0.267
1,645256,20212,DERECHO,ATLANTICO,Entre 2.5 millones y menos de 4 millones,0,Estrato 3,No,T√©cnica o tecnol√≥gica completa,Si,...,N,No,Si,No,T√©cnica o tecnol√≥gica incompleta,bajo,0.311,0.215,0.292,0.264
2,308367,20203,MERCADEO Y PUBLICIDAD,BOGOT√Å,Entre 2.5 millones y menos de 4 millones,M√°s de 30 horas,Estrato 3,Si,Secundaria (Bachillerato) completa,Si,...,N,No,No,Si,Secundaria (Bachillerato) completa,bajo,0.297,0.214,0.305,0.264
3,470353,20195,ADMINISTRACION DE EMPRESAS,SANTANDER,Entre 4 millones y menos de 5.5 millones,0,Estrato 4,Si,No sabe,Si,...,N,No,Si,Si,Secundaria (Bachillerato) completa,alto,0.485,0.172,0.252,0.19
4,989032,20212,PSICOLOGIA,ANTIOQUIA,Entre 2.5 millones y menos de 4 millones,Entre 21 y 30 horas,Estrato 3,Si,Primaria completa,Si,...,N,No,Si,Si,Primaria completa,medio-bajo,0.316,0.232,0.285,0.294



üìä Informaci√≥n general:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272814 entries, 0 to 272813
Data columns (total 21 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   ID                           272814 non-null  int64  
 1   PERIODO_ACADEMICO            272814 non-null  int64  
 2   E_PRGM_ACADEMICO             272814 non-null  object 
 3   E_PRGM_DEPARTAMENTO          272813 non-null  object 
 4   E_VALORMATRICULAUNIVERSIDAD  270332 non-null  object 
 5   E_HORASSEMANATRABAJA         260689 non-null  object 
 6   F_ESTRATOVIVIENDA            260231 non-null  object 
 7   F_TIENEINTERNET              262380 non-null  object 
 8   F_EDUCACIONPADRE             263651 non-null  object 
 9   F_TIENELAVADORA              257200 non-null  object 
 10  F_TIENEAUTOMOVIL             255690 non-null  object 
 11  E_PRIVADO_LIBERTAD           272813 non-null  object 
 12  E_PAGOMATRICULAPROPIO        2

Unnamed: 0,0
F_TIENEAUTOMOVIL,0.062768
F_TIENELAVADORA,0.057233
F_TIENECOMPUTADOR,0.054818
F_ESTRATOVIVIENDA,0.046123
E_HORASSEMANATRABAJA,0.044444
F_TIENEINTERNET.1,0.038246
F_TIENEINTERNET,0.038246
F_EDUCACIONMADRE,0.034177
F_EDUCACIONPADRE,0.033587
E_PAGOMATRICULAPROPIO,0.009398



üìà Estad√≠sticas descriptivas:


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
ID,272814.0,,,,494618.513779,285442.540862,11.0,247622.75,494578.5,741484.5,989281.0
PERIODO_ACADEMICO,272814.0,,,,20198.367232,10.530992,20183.0,20195.0,20195.0,20203.0,20213.0
E_PRGM_ACADEMICO,272814.0,924.0,DERECHO,21049.0,,,,,,,
E_PRGM_DEPARTAMENTO,272813.0,31.0,BOGOT√Å,111090.0,,,,,,,
E_VALORMATRICULAUNIVERSIDAD,270332.0,8.0,Entre 1 mill√≥n y menos de 2.5 millones,80299.0,,,,,,,
E_HORASSEMANATRABAJA,260689.0,5.0,M√°s de 30 horas,98334.0,,,,,,,
F_ESTRATOVIVIENDA,260231.0,7.0,Estrato 2,92153.0,,,,,,,
F_TIENEINTERNET,262380.0,2.0,Si,233324.0,,,,,,,
F_EDUCACIONPADRE,263651.0,12.0,Secundaria (Bachillerato) completa,50864.0,,,,,,,
F_TIENELAVADORA,257200.0,2.0,Si,221934.0,,,,,,,


üßπ Columnas eliminadas: ['ID']
‚úÖ No hay duplicados en el dataset.

üìà Variables num√©ricas: 5
üß© Variables categ√≥ricas: 15

‚úÖ Transformaci√≥n completada correctamente.
Dimensiones del dataset transformado: (272814, 1036)

‚úÖ DataFrame final creado.
Tama√±o: (272814, 1036)


Unnamed: 0,PERIODO_ACADEMICO,INDICADOR_1,INDICADOR_2,INDICADOR_3,INDICADOR_4,E_PRGM_ACADEMICO_3¬∞ CICLO PROFESIONAL NEGOCIOS INTERNACIONALES,E_PRGM_ACADEMICO_ACTIVIDAD FISICA Y DEPORTE,E_PRGM_ACADEMICO_ACUICULTURA,E_PRGM_ACADEMICO_ADMINISTRACION,E_PRGM_ACADEMICO_ADMINISTRACION FINANCIERA,...,F_EDUCACIONMADRE_Secundaria (Bachillerato) completa,F_EDUCACIONMADRE_Secundaria (Bachillerato) incompleta,F_EDUCACIONMADRE_T√©cnica o tecnol√≥gica completa,F_EDUCACIONMADRE_T√©cnica o tecnol√≥gica incompleta,F_EDUCACIONMADRE___missing__,RENDIMIENTO_GLOBAL___missing__,RENDIMIENTO_GLOBAL_alto,RENDIMIENTO_GLOBAL_bajo,RENDIMIENTO_GLOBAL_medio-alto,RENDIMIENTO_GLOBAL_medio-bajo
0,1.29454,0.438545,-0.559018,0.813828,0.057959,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.29454,0.348286,-0.483954,0.507265,0.013715,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.439918,0.233411,-0.494678,0.728672,0.013715,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.319746,1.776018,-0.945057,-0.173985,-1.077631,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.29454,0.389313,-0.301658,0.388047,0.456152,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0



üíæ Archivos generados correctamente:
- train_processed.csv  ‚Üí Dataset limpio y listo para modelar
- preprocessor.joblib  ‚Üí Pipeline de preprocesamiento guardado

üèÅ Proceso completado exitosamente.
Se ha realizado:
- Carga y revisi√≥n del dataset `train.csv`
- Limpieza de columnas vac√≠as, duplicadas y no relevantes
- Imputaci√≥n de valores faltantes
- Escalado de variables num√©ricas
- Codificaci√≥n One-Hot de variables categ√≥ricas
- Exportaci√≥n de dataset procesado y pipeline

