#  Balanceo SMOTEENN

In [1]:
import subprocess, sys
from pathlib import Path
PROJECT_ROOT = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
sys.path.insert(0, str(PROJECT_ROOT))
for pkg in ['pandas', 'numpy', 'imbalanced-learn', 'xgboost', 'lightgbm']: 
    try: __import__(pkg)
    except: subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', pkg])

In [2]:
import pandas as pd
import numpy as np
import sys; sys.path.insert(0, str(PROJECT_ROOT / 'src'))
from sampling import SMOTEENNBalancer
from modeling import retrain_with_balanced_data
from evaluation import compare_models
import warnings; warnings.filterwarnings('ignore')

DATA_PROCESSED = PROJECT_ROOT / 'data' / 'processed'
X_train = np.load(DATA_PROCESSED / 'X_train.npy')
X_test = np.load(DATA_PROCESSED / 'X_test.npy')
y_train = np.load(DATA_PROCESSED / 'y_train.npy')
y_test = np.load(DATA_PROCESSED / 'y_test.npy')

## Tres Niveles SMOTEENN

In [3]:
balancer = SMOTEENNBalancer(random_state=42)
balanced_data = balancer.apply_multiple_levels(X_train, y_train)
results_low = retrain_with_balanced_data(balanced_data['low']['X'], balanced_data['low']['y'], X_test, y_test, 'LOW')
results_med = retrain_with_balanced_data(balanced_data['medium']['X'], balanced_data['medium']['y'], X_test, y_test, 'MEDIUM')
results_high = retrain_with_balanced_data(balanced_data['high']['X'], balanced_data['high']['y'], X_test, y_test, 'HIGH')
comp_low = compare_models(results_low, y_test)
comp_med = compare_models(results_med, y_test)
comp_high = compare_models(results_high, y_test)
full_comp = pd.concat([comp_low, comp_med, comp_high])
print(full_comp)
full_comp.to_csv(DATA_PROCESSED / 'smoteenn_comparison.csv', index=False)


 Aplicando SMOTEENN - Nivel: LOW

 Distribución original:
  Clase 0: 30422 (89.17%)
  Clase 1: 2067 (6.06%)
  Clase 2: 76 (0.22%)
  Clase 3: 1550 (4.54%)

 Distribución balanceada:
  Clase 0: 21754 (76.29%) [-8668]
  Clase 1: 3042 (10.67%) [+975]
  Clase 2: 2874 (10.08%) [+2798]
  Clase 3: 846 (2.97%) [-704]

 Total: 34115 → 28516 muestras

 Aplicando SMOTEENN - Nivel: MEDIUM

 Distribución original:
  Clase 0: 30422 (89.17%)
  Clase 1: 2067 (6.06%)
  Clase 2: 76 (0.22%)
  Clase 3: 1550 (4.54%)

 Distribución balanceada:
  Clase 0: 20519 (56.33%) [-9903]
  Clase 1: 6084 (16.70%) [+4017]
  Clase 2: 5987 (16.44%) [+5911]
  Clase 3: 3835 (10.53%) [+2285]

 Total: 34115 → 36425 muestras

 Aplicando SMOTEENN - Nivel: HIGH

 Distribución original:
  Clase 0: 30422 (89.17%)
  Clase 1: 2067 (6.06%)
  Clase 2: 76 (0.22%)
  Clase 3: 1550 (4.54%)

 Distribución balanceada:
  Clase 0: 18065 (20.24%) [-12357]
  Clase 1: 24337 (27.27%) [+22270]
  Clase 2: 24302 (27.23%) [+24226]
  Clase 3: 22556 (2

Análisis: se Aplico SMOTENN en varios niveles y se reentreno el modelos; el reporte indica una mejora en recall de las minoritarias del orden de ~15–20% tras el balanceo. Se generaron comparativos (smoteenn_comparison.csv) para cada nivel.

Conclusiones: SMOTENN mejoró sustancialmente la detección de clases minoritarias; es la estrategia recomendada antes del tuning fino.