#  Pipelines de Preprocesamiento

In [1]:
import importlib
import subprocess
import sys
from pathlib import Path

# Resolve the project root. Only the *last* component of the working directory
# is inspected, so an unrelated ancestor folder whose name happens to contain
# "notebooks" no longer shifts the root one level up.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd
sys.path.insert(0, str(PROJECT_ROOT))

# pip distribution name -> importable module name. They differ for
# scikit-learn, which is installed as "scikit-learn" but imported as "sklearn";
# probing the distribution name would fail on every run and re-trigger pip.
REQUIRED_PACKAGES = {
    'pandas': 'pandas',
    'numpy': 'numpy',
    'scikit-learn': 'sklearn',
}

for pip_name, module_name in REQUIRED_PACKAGES.items():
    try:
        importlib.import_module(module_name)
    except ImportError:
        # Install only when the module is genuinely missing; any other error
        # (e.g. KeyboardInterrupt, a broken install) is allowed to surface.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', pip_name])

In [2]:
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# The project's `src` directory must be on sys.path before the local modules
# below can be imported.
sys.path.insert(0, str(PROJECT_ROOT / 'src'))
from preprocessing import PreprocessingPipeline, compare_pca_components
from io_utils import identify_column_types

# From this point on every warning is suppressed in this kernel.
warnings.filterwarnings('ignore')

# Features and target are read from the processed-data folder; the target is
# the 'GRUPO' column of its own parquet file.
DATA_PROCESSED = PROJECT_ROOT.joinpath('data', 'processed')
X = pd.read_parquet(DATA_PROCESSED.joinpath('imputed.parquet'))
y = pd.read_parquet(DATA_PROCESSED.joinpath('target.parquet'))['GRUPO']

## Pipeline con PCA

In [3]:
# Split the feature columns by kind using the project helper; the result is
# indexed by the 'numeric' and 'categorical' keys.
col_types = identify_column_types(X)
numeric_cols, categorical_cols = col_types['numeric'], col_types['categorical']

# Compare PCA at three explained-variance thresholds (90%, 95%, 99%).
# NOTE(review): the recorded output shows an identical F1 (0.840729) for the
# 0.90 and 0.95 thresholds -- confirm the evaluated model is not predicting a
# single class before drawing conclusions from this table.
pca_comparison = compare_pca_components(
    X,
    numeric_cols,
    categorical_cols,
    y,
    [0.90, 0.95, 0.99],
)
print(pca_comparison)


📊 Probando PCA con 90.0% varianza explicada...
  Componentes: 12
  Varianza real: 90.68%
  F1-Score: 0.8407 (±0.0000)

📊 Probando PCA con 95.0% varianza explicada...
  Componentes: 23
  Varianza real: 95.04%
  F1-Score: 0.8407 (±0.0000)

📊 Probando PCA con 99.0% varianza explicada...
  Componentes: 91
  Varianza real: 99.01%
  F1-Score: 0.8408 (±0.0001)
   variance_threshold  n_components  actual_variance  f1_score    f1_std  \
0                0.90            12         0.906847  0.840729  0.000043   
1                0.95            23         0.950387  0.840729  0.000043   
2                0.99            91         0.990125  0.840775  0.000107   

   transform_time  eval_time  
0        0.884194   9.676269  
1        0.432174   7.897848  
2        0.787330   9.984867  


In [None]:
# Preprocessing pipeline: standard scaling plus PCA keeping 95% of the variance.
prep = PreprocessingPipeline(
    numeric_cols,
    categorical_cols,
    scaler_type='standard',
    use_pca=True,
    pca_variance=0.95,
)

# Hold out 20% of the rows for testing, stratified on the target, with a fixed
# seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit on the training split only; the test split is transformed with the same
# `prep` object afterwards, never fitted on.
X_train_transformed, y_train_encoded = prep.fit_transform(X_train, y_train)
X_test_transformed = prep.transform(X_test)

# Plain assignment: the previous `..., _ = ..., None` form bound the global `_`
# to None, which shadows IPython's last-output variable in the notebook.
# NOTE(review): presumably `label_encoder` is fitted inside fit_transform --
# confirm in src/preprocessing.
y_test_encoded = prep.label_encoder.transform(y_test)

# Persist the pipeline and the transformed splits next to the input data.
prep.save(DATA_PROCESSED / 'preprocessor.joblib')
np.save(DATA_PROCESSED / 'X_train.npy', X_train_transformed)
np.save(DATA_PROCESSED / 'X_test.npy', X_test_transformed)
np.save(DATA_PROCESSED / 'y_train.npy', y_train_encoded)
np.save(DATA_PROCESSED / 'y_test.npy', y_test_encoded)
print(' Guardado')

✓ Pipeline guardado en: c:\Proyecto_Enfermedades_Alto_Costo completo\Proyecto_Enfermedades_Alto_Costo completo\data\processed\preprocessor.joblib
✅ Guardado


Análisis: Se implementó un pipeline con codificación, escalado y PCA opcional; para el 90% de varianza se requieren 12 componentes (F1 ≈ 0.8407), para el 95%, 23 componentes (F1 ≈ 0.8407) y para el 99%, 91 componentes (F1 ≈ 0.8408). Los resultados indican una pérdida mínima de F1 al reducir la dimensionalidad.

Conclusiones: El pipeline estandarizado garantiza reproducibilidad; aplicar PCA con 90–95% de varianza explicada permite reducir la dimensionalidad sin degradar significativamente el F1.