In [1]:
# ============================================================
# PREPROCESADO COMPLETO Y ROBUSTO
# Compatible con remainder='passthrough' o 'drop'
# ============================================================

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# 1Ô∏è‚É£ CARGA DE DATOS
train = pd.read_csv("train.csv")
print("Archivo cargado correctamente ‚úÖ")
print("Forma inicial:", train.shape)

# 2Ô∏è‚É£ EXPLORACI√ìN INICIAL
print("\nInformaci√≥n general:")
print(train.info())

print("\nPorcentaje de valores nulos por columna:")
missing = train.isnull().mean().sort_values(ascending=False)
display(missing[missing > 0].head(10))

# 3Ô∏è‚É£ LIMPIEZA B√ÅSICA
cols_to_drop = []
if 'id' in train.columns:
    cols_to_drop.append('id')
if 'ID' in train.columns:
    cols_to_drop.append('ID')

if cols_to_drop:
    train = train.drop(columns=cols_to_drop)
    print(f"Se eliminaron las columnas: {cols_to_drop}")

duplicates = train.duplicated().sum()
if duplicates > 0:
    train = train.drop_duplicates()
    print(f"Se eliminaron {duplicates} duplicados.")
else:
    print("No se encontraron duplicados.")

# 4Ô∏è‚É£ SEPARAR VARIABLES
cat_cols = train.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = train.select_dtypes(include=[np.number]).columns.tolist()

print("\nColumnas num√©ricas:", num_cols)
print("Columnas categ√≥ricas:", cat_cols)

# 5Ô∏è‚É£ PIPELINES
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='__missing__')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
], remainder='passthrough')  # üëà dejamos passthrough, ahora manejado correctamente

# 6Ô∏è‚É£ TRANSFORMACI√ìN
X_processed = preprocessor.fit_transform(train)
print("\nTransformaci√≥n completada ‚úÖ")
print("Forma resultante:", X_processed.shape)

# 7Ô∏è‚É£ NOMBRES DE COLUMNAS
# ------------------------------------------------------------
# Creamos nombres correctos seg√∫n el contenido del preprocessor.
# Si hay 'passthrough', se a√±aden las columnas originales restantes.
# ------------------------------------------------------------
feature_names = []

# Columnas num√©ricas
if num_cols:
    feature_names.extend(num_cols)

# Columnas categ√≥ricas (One-Hot)
if cat_cols:
    ohe = preprocessor.named_transformers_['cat'].named_steps['encoder']
    ohe_names = ohe.get_feature_names_out(cat_cols).tolist()
    feature_names.extend(ohe_names)

# Si passthrough est√° activo, agregamos las columnas que no se transformaron
if preprocessor.remainder == 'passthrough':
    passthrough_cols = [c for c in train.columns if c not in num_cols + cat_cols]
    feature_names.extend(passthrough_cols)

print("\nN√∫mero esperado de columnas:", len(feature_names))
print("N√∫mero real en X_processed:", X_processed.shape[1])

# Si hay desajuste, generamos nombres gen√©ricos para evitar error
if len(feature_names) != X_processed.shape[1]:
    print("‚ö†Ô∏è Aviso: n√∫mero de nombres y columnas no coincide, ajustando autom√°ticamente.")
    feature_names = [f"feature_{i}" for i in range(X_processed.shape[1])]

# 8Ô∏è‚É£ CREAR DATAFRAME FINAL
train_processed = pd.DataFrame(X_processed, columns=feature_names)
print("\n‚úÖ DataFrame procesado creado correctamente:")
display(train_processed.head())

# 9Ô∏è‚É£ GUARDADO
output_data_path = "train_processed.csv"
output_model_path = "preprocessor.joblib"

train_processed.to_csv(output_data_path, index=False)
joblib.dump(preprocessor, output_model_path)

print(f"\n‚úÖ Dataset procesado guardado como: {output_data_path}")
print(f"‚úÖ Pipeline guardado como: {output_model_path}")

# üîü CONCLUSI√ìN
print("\nüéØ Preprocesamiento finalizado correctamente.")
print("El archivo 'train_processed.csv' est√° listo para usarse en modelos.")


Archivo cargado correctamente ‚úÖ
Forma inicial: (85879, 21)

Informaci√≥n general:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85879 entries, 0 to 85878
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           85879 non-null  int64  
 1   PERIODO_ACADEMICO            85879 non-null  int64  
 2   E_PRGM_ACADEMICO             85879 non-null  object 
 3   E_PRGM_DEPARTAMENTO          85879 non-null  object 
 4   E_VALORMATRICULAUNIVERSIDAD  85096 non-null  object 
 5   E_HORASSEMANATRABAJA         82027 non-null  object 
 6   F_ESTRATOVIVIENDA            81866 non-null  object 
 7   F_TIENEINTERNET              82573 non-null  object 
 8   F_EDUCACIONPADRE             82994 non-null  object 
 9   F_TIENELAVADORA              80972 non-null  object 
 10  F_TIENEAUTOMOVIL             80489 non-null  object 
 11  E_PRIVADO_LIBERTAD           85878 non-null  obj

Unnamed: 0,0
F_TIENEAUTOMOVIL,0.062763
F_TIENELAVADORA,0.057139
F_TIENECOMPUTADOR,0.055101
F_ESTRATOVIVIENDA,0.046729
E_HORASSEMANATRABAJA,0.044854
F_TIENEINTERNET.1,0.038508
F_TIENEINTERNET,0.038496
F_EDUCACIONMADRE,0.034327
F_EDUCACIONPADRE,0.033594
E_PAGOMATRICULAPROPIO,0.009385


Se eliminaron las columnas: ['ID']
No se encontraron duplicados.

Columnas num√©ricas: ['PERIODO_ACADEMICO', 'INDICADOR_1', 'INDICADOR_2', 'INDICADOR_3', 'INDICADOR_4']
Columnas categ√≥ricas: ['E_PRGM_ACADEMICO', 'E_PRGM_DEPARTAMENTO', 'E_VALORMATRICULAUNIVERSIDAD', 'E_HORASSEMANATRABAJA', 'F_ESTRATOVIVIENDA', 'F_TIENEINTERNET', 'F_EDUCACIONPADRE', 'F_TIENELAVADORA', 'F_TIENEAUTOMOVIL', 'E_PRIVADO_LIBERTAD', 'E_PAGOMATRICULAPROPIO', 'F_TIENECOMPUTADOR', 'F_TIENEINTERNET.1', 'F_EDUCACIONMADRE', 'RENDIMIENTO_GLOBAL']

Transformaci√≥n completada ‚úÖ
Forma resultante: (85879, 987)

N√∫mero esperado de columnas: 987
N√∫mero real en X_processed: 987

‚úÖ DataFrame procesado creado correctamente:


Unnamed: 0,PERIODO_ACADEMICO,INDICADOR_1,INDICADOR_2,INDICADOR_3,INDICADOR_4,E_PRGM_ACADEMICO_3¬∞ CICLO PROFESIONAL NEGOCIOS INTERNACIONALES,E_PRGM_ACADEMICO_ACTIVIDAD FISICA Y DEPORTE,E_PRGM_ACADEMICO_ACUICULTURA,E_PRGM_ACADEMICO_ADMINISTRACION,E_PRGM_ACADEMICO_ADMINISTRACION FINANCIERA,...,F_EDUCACIONMADRE_Secundaria (Bachillerato) completa,F_EDUCACIONMADRE_Secundaria (Bachillerato) incompleta,F_EDUCACIONMADRE_T√©cnica o tecnol√≥gica completa,F_EDUCACIONMADRE_T√©cnica o tecnol√≥gica incompleta,F_EDUCACIONMADRE___missing__,RENDIMIENTO_GLOBAL___missing__,RENDIMIENTO_GLOBAL_alto,RENDIMIENTO_GLOBAL_bajo,RENDIMIENTO_GLOBAL_medio-alto,RENDIMIENTO_GLOBAL_medio-bajo
0,1.296064,0.433439,-0.550771,0.810301,0.063928,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.296064,0.34334,-0.47559,0.505218,0.01988,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.440145,0.22867,-0.486331,0.725556,0.01988,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.320672,1.768533,-0.937413,-0.172744,-1.06663,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.296064,0.384294,-0.293009,0.386575,0.460357,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0



‚úÖ Dataset procesado guardado como: train_processed.csv
‚úÖ Pipeline guardado como: preprocessor.joblib

üéØ Preprocesamiento finalizado correctamente.
El archivo 'train_processed.csv' est√° listo para usarse en modelos.
