# Report Práctica Predicción Abandono

___________________________________________________________________

En las siguientes celdas adjunto el código que he usado para entrenar el módelo (Gradient Boosting) con el que he obtenido los mejores resultados en balanced accuracy. No obstante, ha habido muchas más fases en este proyecto en las que el código no resultaba así. 

Debajo de las conclusiones y resultados podrá encontrar las diferentes pruebas tanto de modelos como parámetros o limpieza/preprocesado que se han hecho

___________________________________________________________________

### Imports

In [23]:
import numpy as np
import random
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

# Preprocesado
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Prueba de Modelos
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
# Evaluacion
from sklearn.metrics import (balanced_accuracy_score, classification_report,
                              confusion_matrix, roc_auc_score, accuracy_score,
                              precision_score, recall_score, f1_score)
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

# Fijamos Semilla
random.seed(100473223)
np.random.seed(100473223)


### Clase de Feature Engineering

In [24]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Crea features derivadas para capturar patrones no lineales"""
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        
        # Evitar división por cero
        def safe_divide(a, b):
            return np.where(b != 0, a / b, 0)
        
        # Estabilidad laboral
        X['Years_per_Company_Ratio'] = safe_divide(
            X['Total Active Years'], 
            X['Number of Other Companies'] + 1
        )
        
        # Retraso en promoción
        X['Promotion_Lag'] = X['Years at Current Company'] - X['Years Since Last Promotion']
        
        # Ingreso por año de experiencia
        X['Income_per_Year'] = safe_divide(
            X['Yearly Income'], 
            X['Total Active Years'] + 1
        )
        
        # Estabilidad con manager actual
        X['Manager_Stability'] = safe_divide(
            X['Years with Current Manager'], 
            X['Years at Current Company'] + 1
        )
        
        # Satisfacción general (promedio de satisfacciones)
        satisfaction_cols = ['Job Satisfaction', 'Environment Satisfaction', 
                            'Work Life Balance Satisfaction']
        
        # Manejar valores faltantes temporalmente para el cálculo
        X['Overall_Satisfaction'] = X[satisfaction_cols].mean(axis=1, skipna=True)
        
        # Baja satisfacción general
        X['Low_Satisfaction'] = (X['Overall_Satisfaction'] < 3).astype(int)
        
        # Empleado recién contratado
        X['Recent_Hire'] = (X['Years at Current Company'] < 2).astype(int)
        
        # Promoción atrasada
        X['Overdue_Promotion'] = (X['Years Since Last Promotion'] > 3).astype(int)
        
        # Commute largo (si la columna existe)
        if 'Miles from Home to Work' in X.columns:
            X['Long_Commute'] = (X['Miles from Home to Work'] > 20).astype(int)
        
        # Edad vs. Nivel de trabajo (senior joven o junior viejo = señal)
        X['Age_x_JobLevel'] = X['Age'] * pd.factorize(X.get('Job Level', pd.Series([0]*len(X))))[0]
        
        # Satisfacción baja + salario bajo = alto riesgo
        X['LowSat_LowIncome'] = (
            (X['Overall_Satisfaction'] < 3) & 
            (X['Yearly Income'] < X['Yearly Income'].median())
        ).astype(int)
        
        return X

### 1. Carga y Exploración de Datos

In [25]:
data = pd.read_csv('train.csv')

print(f"\nDimensiones del dataset: {data.shape}")
print(f"\nDistribución de la clase objetivo (Attrition):")
print(data['Attrition'].value_counts())
attrition_dist = data['Attrition'].value_counts(normalize=True) * 100
print(f"\nPorcentaje de Attrition:\n{attrition_dist}")

# Ratio de desbalanceo (solo informativo)
class_counts = data['Attrition'].value_counts()
imbalance_ratio = class_counts['No'] / class_counts['Yes']
print(f"\n⚠ Ratio de desbalanceo: {imbalance_ratio:.2f}:1")
print(f"Clase minoritaria (Yes): {attrition_dist['Yes']:.1f}%")
print("→ Se usará SMOTE en el pipeline de entrenamiento y CV.")




Dimensiones del dataset: (3528, 27)

Distribución de la clase objetivo (Attrition):
Attrition
No     2956
Yes     572
Name: count, dtype: int64

Porcentaje de Attrition:
Attrition
No     83.786848
Yes    16.213152
Name: proportion, dtype: float64

⚠ Ratio de desbalanceo: 5.17:1
Clase minoritaria (Yes): 16.2%
→ Se usará SMOTE en el pipeline de entrenamiento y CV.


### 2. División en Entrenamiento y Test

In [26]:
X = data.drop(['Attrition', 'ID'], axis=1)
y = data['Attrition'].map({'No': 0, 'Yes': 1})

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y
)

print(f"\nConjunto de entrenamiento: {X_train.shape[0]} instancias")
print(f"Conjunto de test: {X_test.shape[0]} instancias")
print(f"\nDistribución en entrenamiento:\n{y_train.value_counts()}")
print(f"\nDistribución en test:\n{y_test.value_counts()}")




Conjunto de entrenamiento: 2822 instancias
Conjunto de test: 706 instancias

Distribución en entrenamiento:
Attrition
0    2364
1     458
Name: count, dtype: int64

Distribución en test:
Attrition
0    592
1    114
Name: count, dtype: int64


### 3. Preprocesado de datos

In [27]:
numerical_cols = [
    'Age', 'Miles from Home to Work', 'Yearly Income', 'Absences per Year',
    'Performance Rating', 'Job Satisfaction', 'Environment Satisfaction',
    'Work Life Balance Satisfaction', 'Last Salary Increase (%)',
    'Number of Training Sessions Last Year', 'Number of Other Companies',
    'Total Active Years', 'Years at Current Company',
    'Years Since Last Promotion', 'Years with Current Manager'
]

ordinal_cols = {
    'Education Level': [['High School', 'College', 'Bachelor', 'Master', 'Doctor']],
    'Job Level': [['Entry Level', 'Mid Level', 'Senior Level', 'Director', 'Executive']],
    'Job Involvement': [['Low', 'Medium', 'High', 'Very High']]
}

categorical_cols = [
    'Gender', 'Marital Status', 'Education Field', 'Department Name',
    'Job Role Name', 'Business Travel Frequency', 'Amount of Stock Option'
]

print(f"\nColumnas numéricas: {len(numerical_cols)}")
print(f"Columnas ordinales: {len(ordinal_cols)}")
print(f"Columnas categóricas: {len(categorical_cols)}")


# Análisis de valores faltantes
print("\n" + "="*50)
print("ANÁLISIS DE VALORES FALTANTES")
print("="*50)
missing_values = X_train.isnull().sum()
missing_percent = (missing_values / len(X_train)) * 100
missing_df = pd.DataFrame({
    'Columna': missing_values.index,
    'Valores_Faltantes': missing_values.values,
    'Porcentaje': missing_percent.values
})
missing_df = missing_df[missing_df['Valores_Faltantes'] > 0].sort_values('Valores_Faltantes', ascending=False)
if len(missing_df) > 0:
    print(missing_df.to_string(index=False))
else:
    print("✓ No hay valores faltantes en el dataset")
print(f"\nTotal de columnas con valores faltantes: {len(missing_df)}")
    
# Eliminar columnas con >30% de valores faltantes
cols_to_drop = missing_df[missing_df['Porcentaje'] > 30]['Columna'].tolist()
if cols_to_drop:
    print(f"\n⚠ Eliminando columnas con >30% de valores faltantes: {cols_to_drop}")
    X_train = X_train.drop(columns=cols_to_drop)
    X_test = X_test.drop(columns=cols_to_drop)
    
    # Actualizar listas de columnas
    numerical_cols = [col for col in numerical_cols if col not in cols_to_drop]
    categorical_cols = [col for col in categorical_cols if col not in cols_to_drop]
    ordinal_cols = {k: v for k, v in ordinal_cols.items() if k not in cols_to_drop}
    
    print(f"✓ Columnas eliminadas. Nuevas dimensiones: {X_train.shape}")
else:
    print("✓ No hay valores faltantes en el dataset")
# === PREPROCESADO ===

# 1) Transformadores por tipo de columna
num_transformer = ImbPipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Un pipeline ordinal por cada columna ordinal (imputar + codificar)
ord_transformers = []
for col, categories in ordinal_cols.items():
    ord_transformers.append((
        f'ord_{col}',
        ImbPipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder(
                categories=categories,
                handle_unknown='use_encoded_value',
                unknown_value=-1
            ))        
        ]),
        [col]
    ))

cat_transformer = ImbPipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# 2) Un único ColumnTransformer como preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        *ord_transformers,
        ('cat', cat_transformer, categorical_cols)
    ],
    remainder='drop',
    verbose_feature_names_out=False
)

# 3) Pipeline FINAL con SMOTE (Sin nested Pipeline en pasos intermedios)
# MEJORA #2: Usar BorderlineSMOTE en lugar de SMOTE
ml_pipe_improved = ImbPipeline(steps=[
    ('feature_engineer', FeatureEngineer()),  # "Juego de features"
    ('preprocessor', preprocessor),
    ('smote', BorderlineSMOTE(random_state=42, kind='borderline-1')),  # Tiene en cuenta los bordes
    ('classifier', LogisticRegression())  # placeholder
])





Columnas numéricas: 15
Columnas ordinales: 3
Columnas categóricas: 7

ANÁLISIS DE VALORES FALTANTES
                       Columna  Valores_Faltantes  Porcentaje
        Amount of Stock Option               1213   42.983700
       Miles from Home to Work                572   20.269313
                Marital Status                419   14.847626
Work Life Balance Satisfaction                168    5.953225
              Job Satisfaction                154    5.457123
      Environment Satisfaction                139    4.925585
     Number of Other Companies                 11    0.389794
            Total Active Years                  4    0.141743

Total de columnas con valores faltantes: 8

⚠ Eliminando columnas con >30% de valores faltantes: ['Amount of Stock Option']
✓ Columnas eliminadas. Nuevas dimensiones: (2822, 24)


### Modelos a probar: 
(Las pruebas descartadas están incluidas debajo de los resultados oficiales)

In [28]:
grid= [
    # Gradient Boosting - Excelente para balanced accuracy
    {
        'classifier': [GradientBoostingClassifier(random_state=42)],
        'classifier__n_estimators': [600],
        'classifier__learning_rate': [0.15],
        'classifier__max_depth': [5],
        'classifier__subsample': [0.9],
        'classifier__min_samples_leaf': [7]
    }
]
# La mejor combinación será (en mi experiencia): {'classifier': GradientBoostingClassifier(random_state=42), 'classifier__learning_rate': 0.15, 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 7, 'classifier__n_estimators': 600, 'classifier__subsample': 0.9}


### Inicializamos la búsqueda

In [29]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
search_smote = GridSearchCV(
    estimator=ml_pipe_improved,   # ImbPipeline
    param_grid=grid,
    scoring='balanced_accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
    verbose = 1,
)


print("\nIniciando búsqueda CON SMOTE integrado (CV estratificada)...")
search_smote.fit(X_train, y_train)
print("✓ Búsqueda CON SMOTE completada!")

print(f"\nMejor modelo CON SMOTE:")
print(f"  Clasificador: {search_smote.best_estimator_['classifier'].__class__.__name__}")
print(f"  Balanced Accuracy (CV): {search_smote.best_score_:.4f}")

best_model = search_smote.best_estimator_

def find_optimal_threshold(model, X_val, y_val):
    """Encuentra el threshold óptimo para balanced accuracy"""
    y_proba = model.predict_proba(X_val)[:, 1]
    
    thresholds = np.arange(0.1, 0.9, 0.01)
    scores = []
    
    for thresh in thresholds:
        y_pred = (y_proba >= thresh).astype(int)
        score = balanced_accuracy_score(y_val, y_pred)
        scores.append(score)
    
    optimal_idx = np.argmax(scores)
    optimal_threshold = thresholds[optimal_idx]
    optimal_score = scores[optimal_idx]
    
    print(f"Threshold por defecto (0.5): {balanced_accuracy_score(y_val, (y_proba >= 0.5).astype(int)):.4f}")
    print(f"Threshold óptimo encontrado: {optimal_threshold:.3f}")
    print(f"Balanced Accuracy con threshold óptimo: {optimal_score:.4f}")
    print(f"Mejora: +{(optimal_score - balanced_accuracy_score(y_val, (y_proba >= 0.5).astype(int))):.4f}")
    
    return optimal_threshold

# Encontrar threshold óptimo en train
optimal_threshold = find_optimal_threshold(best_model, X_train, y_train)



Iniciando búsqueda CON SMOTE integrado (CV estratificada)...
Fitting 5 folds for each of 1 candidates, totalling 5 fits


✓ Búsqueda CON SMOTE completada!

Mejor modelo CON SMOTE:
  Clasificador: GradientBoostingClassifier
  Balanced Accuracy (CV): 0.9021
Threshold por defecto (0.5): 1.0000
Threshold óptimo encontrado: 0.100
Balanced Accuracy con threshold óptimo: 1.0000
Mejora: +0.0000


### Mejor Modelo:
{'classifier': GradientBoostingClassifier(random_state=42), 'classifier__learning_rate': 0.15, 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 7, 'classifier__n_estimators': 600, 'classifier__subsample': 0.9}

### 5. Evaluación en test

In [None]:
# Predicciones con threshold por defecto (0.5)
y_pred_default = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Predicciones con threshold óptimo
y_pred_optimal = (y_proba >= optimal_threshold).astype(int)

# Métricas con threshold por defecto
ba_default = balanced_accuracy_score(y_test, y_pred_default)
acc_default = accuracy_score(y_test, y_pred_default)
prec_default = precision_score(y_test, y_pred_default)
rec_default = recall_score(y_test, y_pred_default)
f1_default = f1_score(y_test, y_pred_default)

# Métricas con threshold óptimo
ba_optimal = balanced_accuracy_score(y_test, y_pred_optimal)
acc_optimal = accuracy_score(y_test, y_pred_optimal)
prec_optimal = precision_score(y_test, y_pred_optimal)
rec_optimal = recall_score(y_test, y_pred_optimal)
f1_optimal = f1_score(y_test, y_pred_optimal)
auc = roc_auc_score(y_test, y_proba)

# Comparación de métricas
metrics_comparison = pd.DataFrame({
    'Métrica': ['Balanced Accuracy', 'Accuracy', 'Precision (Yes)', 'Recall (Yes)', 'F1-Score (Yes)', 'AUC-ROC'],
    'Threshold 0.5': [ba_default, acc_default, prec_default, rec_default, f1_default, auc],
    'Threshold Óptimo': [ba_optimal, acc_optimal, prec_optimal, rec_optimal, f1_optimal, auc],
    'Mejora': [
        ba_optimal - ba_default,
        acc_optimal - acc_default,
        prec_optimal - prec_default,
        rec_optimal - rec_default,
        f1_optimal - f1_default,
        0
    ]
})

print("\n📊 COMPARACIÓN DE RESULTADOS:")
print(metrics_comparison.to_string(index=False))

print(f"\n🎉 Balanced Accuracy Final: {ba_optimal:.4f}")
print(f"   Mejora vs threshold 0.5: +{(ba_optimal - ba_default):.4f}")

# Matriz de confusión con threshold óptimo
cm = confusion_matrix(y_test, y_pred_optimal)
tn, fp, fn, tp = cm.ravel()
print("\n📋 Matriz de Confusión (Threshold Óptimo):")
print(cm)
print(f"  TP={tp}, FP={fp}, FN={fn}, TN={tn}")

print("\n📄 REPORTE DE CLASIFICACIÓN (Threshold Óptimo):")
print(classification_report(y_test, y_pred_optimal, target_names=['No Attrition', 'Attrition']))



📊 COMPARACIÓN DE RESULTADOS:
          Métrica  Threshold 0.5  Threshold Óptimo    Mejora
Balanced Accuracy       0.949384          0.965238  0.015855
         Accuracy       0.974504          0.977337  0.002833
  Precision (Yes)       0.928571          0.915254 -0.013317
     Recall (Yes)       0.912281          0.947368  0.035088
   F1-Score (Yes)       0.920354          0.931034  0.010681
          AUC-ROC       0.976307          0.976307  0.000000

🎉 Balanced Accuracy Final: 0.9652
   Mejora vs threshold 0.5: +0.0159

📋 Matriz de Confusión (Threshold Óptimo):
[[582  10]
 [  6 108]]
  TP=108, FP=10, FN=6, TN=582

📄 REPORTE DE CLASIFICACIÓN (Threshold Óptimo):
              precision    recall  f1-score   support

No Attrition       0.99      0.98      0.99       592
   Attrition       0.92      0.95      0.93       114

    accuracy                           0.98       706
   macro avg       0.95      0.97      0.96       706
weighted avg       0.98      0.98      0.98       706




### 6. Guardar Modelo y Resultados

In [None]:
model_data = {
    'model': best_model,
    'optimal_threshold': optimal_threshold,
    'feature_names': X_train.columns.tolist()
}
with open('best_model_improved.pkl', 'wb') as f:
    pkl.dump(model_data, f)
print("✓ Modelo mejorado guardado: best_model_improved.pkl")
print(f"  (incluye threshold óptimo: {optimal_threshold:.3f})")

✓ Modelo mejorado guardado: best_model_improved.pkl
  (incluye threshold óptimo: 0.100)


## Parte Adicional del Proyecto: Intentos Fallidos

### Pruebas de Modelos

In [32]:
grid= [
    # 1) Logistic Regression (l1/l2, con y sin class_weight, distintos solvers)
{
    'classifier': [LogisticRegression(random_state=42, max_iter=2000)],
    'classifier__penalty': ['l2', 'l1'],
    'classifier__solver': ['liblinear', 'saga'],   # l1 soportado por liblinear/saga
    'classifier__C': [0.1, 1, 3, 10],
    'classifier__class_weight': ['balanced'],
},
   ## 2) Árbol de decisión (más profundo, min_samples_* y max_features)
{
    'classifier': [DecisionTreeClassifier(random_state=42)],
    'classifier__criterion': ['gini', 'entropy', 'log_loss'],
    'classifier__max_depth': [None, 5, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 5],
    'classifier__class_weight': ['balanced']
},
   ## 3) KNN (más vecinos, distancia y leaf_size)
{
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3, 5, 7, 9, 11, 15],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2],                # manhattan / euclid
    'classifier__leaf_size': [15, 30, 45]
},
  # Random Forest optimizado para balanced accuracy
   ## 5) Random Forest (más n_estimators, max_features, bootstrap)
{
    'classifier': [RandomForestClassifier(random_state=42)],
    'classifier__n_estimators': [100, 300, 500],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__class_weight': ['balanced']
},
   ## 6) Gradient Boosting
{
    'classifier': [GradientBoostingClassifier(random_state=42)],
    'classifier__n_estimators': [100, 200, 400, 600],
    'classifier__learning_rate': [0.02, 0.05, 0.1, 0.2],
    'classifier__max_depth': [2, 3, 5],
    'classifier__min_samples_leaf': [1, 2, 5],
    'classifier__subsample': [0.6, 0.8, 0.9, 1.0]
},
  ## 7) Extra Trees
{
    'classifier': [ExtraTreesClassifier(random_state=42)],
    'classifier__n_estimators': [200, 500],
    'classifier__criterion': ['gini', 'entropy', 'log_loss'],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__class_weight': ['balanced']
},
  ## 8) AdaBoost
{
   'classifier': [AdaBoostClassifier(
       random_state=42,
       estimator=DecisionTreeClassifier(random_state=42)
   )],
   'classifier__algorithm': ['SAMME'],
   'classifier__n_estimators': [100, 300],
   'classifier__learning_rate': [0.1, 0.2, 0.5],
   'classifier__estimator__max_depth': [1, 2, 3],
   'classifier__estimator__min_samples_leaf': [1, 2, 5]
},
  # multi-layer Perceptron
  {
      'classifier': [MLPClassifier(random_state=42, max_iter=1000)],
      'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
      'classifier__activation': ['relu', 'tanh'],
      'classifier__alpha': [0.001, 0.01]
  }
]





### Plot de los resultados con varios modelos

In [33]:
# 7. RESUMEN COMPLETO DE LA BÚSQUEDA + PLOT

#cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#search_smote = GridSearchCV(
#    estimator=ml_pipe_smote,   # ImbPipeline
#    param_grid=grid,
#    scoring='balanced_accuracy',
#    cv=cv,
#    n_jobs=-1,
#    error_score='raise',
#    verbose = 1,
#)
from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(
    estimator=ml_pipe_smote,
    param_distributions=grid,
    n_iter=100,  # Prueba 100 combinaciones aleatorias
    scoring='balanced_accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42
)
search_smote.fit(X_train, y_train)
cv_results_df = pd.DataFrame(search_smote.cv_results_)
score_cols = [
    col for col in cv_results_df.columns
    if col.startswith('split') and col.endswith('_test_score')
]
summary_cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score'] + score_cols
cv_results_summary = cv_results_df[summary_cols].copy()
cv_results_summary = cv_results_summary.sort_values('rank_test_score')

def _format_params(params: dict) -> str:
    return ', '.join(f"{key}={value}" for key, value in params.items())

cv_results_summary['params'] = cv_results_summary['params'].apply(_format_params)
cv_results_summary = cv_results_summary.rename(columns={
    'params': 'Parámetros',
    'mean_test_score': 'Balanced Accuracy media',
    'std_test_score': 'Desviación estándar',
    'rank_test_score': 'Ranking'
})

print("\nResultados completos de la búsqueda (ordenados por ranking):")
print(cv_results_summary.to_string(index=False))

cv_results_summary.to_csv('cv_results_smote.csv', index=False)
print("\n✓ Resultados de la búsqueda guardados en: cv_results_smote.csv")

if len(cv_results_summary) > 0:
    top_n = min(10, len(cv_results_summary))
    top_results = cv_results_summary.head(top_n)
    plt.figure(figsize=(12, max(6, top_n * 0.5)))
    sns.barplot(
        data=top_results,
        x='Balanced Accuracy media',
        y='Parámetros',
        palette='viridis'
    )
    plt.title(f'Mejores {top_n} combinaciones - Balanced Accuracy (CV)')
    plt.xlabel('Balanced Accuracy media (CV)')
    plt.ylabel('Parámetros')
    plt.tight_layout()
    plt.savefig('cv_results_smote_top.png', dpi=150, bbox_inches='tight')
    print("✓ Gráfico guardado en: cv_results_smote_top.png")
    plt.close()
else:
    print("No hay resultados disponibles para graficar.")

NameError: name 'ml_pipe_smote' is not defined

## Parte Adicional del Proyecto: Posibles Mejoras