In [8]:
import time
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import parallel_coordinates
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.metrics import (
    balanced_accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.model_selection import (
    StratifiedKFold,
    ParameterGrid,
    cross_validate,
    train_test_split
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

# Seed NumPy's global random generator so stochastic steps repeat across runs.
np.random.seed(42)

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Confirmation banner, including the scikit-learn version in use.
sklearn_version = __import__('sklearn').__version__
print("Librerías importadas correctamente")
print(f"Versión de scikit-learn: {sklearn_version}")

Librerías importadas correctamente
Versión de scikit-learn: 1.7.2


In [2]:
# Read the pre-processed diabetes sample (path is relative to the notebook's
# working directory) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/processed/diabetes_sample.csv')
df.head(n=5)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0.0,0.0,1.0,42.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,3.0,0.0,0.0,0.0,0.0,3.0,4.0,2.0
1,0.0,0.0,0.0,1.0,19.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,30.0,0.0,0.0,0.0,7.0,5.0,8.0
2,0.0,0.0,0.0,1.0,23.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,5.0,6.0,8.0
3,0.0,0.0,0.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0
4,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,6.0,8.0


In [3]:
# Label column; every remaining column of `df` is used as a predictor.
target_col = 'Diabetes_012'
y = df[target_col]
X = df.drop(target_col, axis=1)

# Summary of the separation: matrix shapes, number of classes, feature list.
print("Variables separadas:")
print(f"   X: {X.shape} → {X.shape[1]} features")
print(f"   y: {y.shape} → {y.nunique()} clases")
print("Features:")
for position, feature_name in enumerate(X.columns, start=1):
    print(f"   {position:2d}. {feature_name}")

Variables separadas:
   X: (38052, 21) → 21 features
   y: (38052,) → 3 clases
Features:
    1. HighBP
    2. HighChol
    3. CholCheck
    4. BMI
    5. Smoker
    6. Stroke
    7. HeartDiseaseorAttack
    8. PhysActivity
    9. Fruits
   10. Veggies
   11. HvyAlcoholConsump
   12. AnyHealthcare
   13. NoDocbcCost
   14. GenHlth
   15. MentHlth
   16. PhysHlth
   17. DiffWalk
   18. Sex
   19. Age
   20. Education
   21. Income


In [4]:
# Hold out 20% of the rows for final evaluation. Stratifying on y keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split repeatable. `train_test_split` comes from sklearn.model_selection and
# must be imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Sanity checks: the two partitions must cover every row exactly once and
# features/labels must stay aligned within each partition.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"
assert len(X_train) == len(y_train), "X_train and y_train are misaligned"
assert len(X_test) == len(y_test), "X_test and y_test are misaligned"

print("Datos divididos en entrenamiento y prueba:")
print(f"   X_train: {X_train.shape[0]} samples")   
print(f"   X_test : {X_test.shape[0]} samples")
print(f"   y_train: {y_train.shape[0]} samples")
print(f"   y_test : {y_test.shape[0]} samples")

Datos divididos en entrenamiento y prueba:
   X_train: 30441 samples
   X_test : 7611 samples
   y_train: 30441 samples
   y_test : 7611 samples


# BalancedDecisionForest: clase auxiliar para construir el ensamble de árboles de decisión balanceados

In [9]:
class BalancedDecisionForest(BaseEstimator):
    """
    Ensemble of decision trees, each trained on a class-balanced bootstrap sample.

    For every tree, ``N`` rows are drawn with replacement from each class,
    where ``N`` is the size of the smallest class, so each tree sees the same
    number of examples per class. Predictions are made by majority vote.

    Parameters
    ----------
    n_estimators : int, default=50
        Number of trees to train. Must be at least 1.
    base_estimator : estimator or None, default=None
        Prototype that is cloned for every tree. When ``None``, ``fit`` uses
        ``DecisionTreeClassifier(max_depth=5, min_samples_leaf=5,
        random_state=42)``. The parameter itself is stored untouched so that
        ``get_params`` / ``clone`` round-trip it unchanged.
    random_state : int, numpy.random.RandomState or None, default=None
        Source of the per-class bootstrap seeds. ``None`` draws the seeds from
        NumPy's global random state; an int gives a repeatable fit that does
        not depend on global state (for example inside parallel workers).

    Attributes
    ----------
    models : list
        Fitted trees, in training order. Empty until ``fit`` is called.
    classes_ : ndarray or None
        Sorted unique labels seen in ``fit``; ``None`` before fitting.
    """

    def __init__(self, n_estimators=50, base_estimator=None, random_state=None):
        # Hyper-parameters are stored exactly as received; the default base
        # estimator is resolved in fit() rather than here.
        self.n_estimators = n_estimators
        self.base_estimator = base_estimator
        self.random_state = random_state
        self.models = []
        self.classes_ = None

    def _seed_source(self):
        """Return an object whose ``randint`` supplies the bootstrap seeds."""
        if self.random_state is None:
            # Module-level np.random functions use NumPy's global state.
            return np.random
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def fit(self, X, y):
        """
        Train ``n_estimators`` trees, each on its own balanced bootstrap sample.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels; row ``i`` of ``y`` belongs to row ``i`` of ``X``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_estimators < 1``, the input is empty, or ``X`` and ``y``
            have different lengths.
        """
        self.models = []

        if self.n_estimators < 1:
            raise ValueError(f"n_estimators must be >= 1, got {self.n_estimators}")

        X_is_frame = isinstance(X, pd.DataFrame)
        if not X_is_frame:
            X = np.asarray(X)
        y_values = np.asarray(y).ravel()

        if y_values.shape[0] == 0:
            raise ValueError("Cannot fit BalancedDecisionForest on an empty dataset")
        if y_values.shape[0] != X.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y_values.shape[0]}"
            )

        # 1. Size of the minority class: every class contributes this many rows.
        self.classes_, class_counts = np.unique(y_values, return_counts=True)
        minority_size = int(class_counts.min())

        # 2. Row POSITIONS of each class. Working with positions instead of
        #    index labels keeps X and y aligned even when the pandas index is
        #    non-unique or X and y carry different indexes.
        rows_by_class = [np.flatnonzero(y_values == label) for label in self.classes_]

        prototype = self.base_estimator
        if prototype is None:
            prototype = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, random_state=42)
        seed_source = self._seed_source()

        # 3. Train the trees.
        for _ in range(self.n_estimators):
            # One fresh seed per class, drawn in class order.
            sampled_rows = np.concatenate([
                resample(
                    class_rows,
                    replace=True,
                    n_samples=minority_size,
                    random_state=seed_source.randint(0, 10000)
                )
                for class_rows in rows_by_class
            ])

            X_tree = X.iloc[sampled_rows] if X_is_frame else X[sampled_rows]
            y_tree = y_values[sampled_rows]

            # Clone so every tree is an independent, unfitted copy of the prototype.
            tree = clone(prototype)
            tree.fit(X_tree, y_tree)
            self.models.append(tree)

        return self

    def _vote_counts(self, X):
        """
        Count, for every sample, how many trees voted for each class.

        Returns an integer array of shape (n_samples, n_classes) whose columns
        follow the order of ``classes_``.
        """
        if not self.models:
            raise ValueError("This BalancedDecisionForest is not fitted yet; call fit() first")

        # votes[t, i] is the label predicted by tree t for sample i.
        votes = np.stack([np.asarray(model.predict(X)) for model in self.models])
        # classes_ is sorted (np.unique), so searchsorted maps label -> column.
        vote_columns = np.searchsorted(self.classes_, votes)

        n_classes = len(self.classes_)
        counts = np.zeros((votes.shape[1], n_classes), dtype=np.int64)
        for column in range(n_classes):
            counts[:, column] = (vote_columns == column).sum(axis=0)
        return counts

    def predict(self, X):
        """
        Predict by majority vote of the trees.

        Returns an array of shape (n_samples,) holding labels taken from
        ``classes_``. Ties go to the class that comes first in ``classes_``.
        """
        winning_columns = self._vote_counts(X).argmax(axis=1)
        return self.classes_[winning_columns]

    def predict_proba(self, X):
        """
        Estimate class probabilities as the fraction of trees voting for each class.

        Returns an array of shape (n_samples, n_classes), columns ordered as
        ``classes_``; each row sums to 1.
        """
        # Divide by the number of trees actually fitted, which stays correct
        # even if n_estimators is changed after fit().
        return self._vote_counts(X) / len(self.models)

In [None]:
# Stratified 5-fold CV, shuffled with a fixed seed so every candidate is
# scored on the same folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space: tree hyper-parameters plus the ensemble size.
param_grid_bf = {
    'max_depth': [3, 5, 7, 8],
    'min_samples_leaf': [20, 30, 50, 100],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [50, 100]
}
results_list = []

# NOTE(review): the output saved under this cell reports max_depth=10 and
# min_samples_leaf=15, values that are not in this grid, so it appears to come
# from an earlier version of the cell. Re-run to refresh it.
candidate_grid = ParameterGrid(param_grid_bf)
print(f"Iniciando búsqueda en grilla con {len(candidate_grid)} combinaciones...")
start_time = time.time()

for params in candidate_grid:
    # Base tree configured from the current grid point.
    base_tree = DecisionTreeClassifier(
        criterion=params['criterion'],
        max_depth=params['max_depth'],
        min_samples_leaf=params['min_samples_leaf'],
        random_state=42
    )

    # Balanced ensemble built around that tree.
    model = BalancedDecisionForest(
        base_estimator=base_tree,
        n_estimators=params['n_estimators']
    )

    # Cross-validated balanced accuracy on the training partition; train
    # scores are kept so the train/test gap can be reported.
    scores = cross_validate(
        model,
        X_train, y_train,
        scoring='balanced_accuracy',
        cv=skf,
        n_jobs=-1,
        return_train_score=True
    )

    # One record per grid point, averaged over the folds.
    results_list.append({
        'params': params,
        'mean_test_score': scores['test_score'].mean(),
        'mean_train_score': scores['train_score'].mean()
    })

end_time = time.time()
print(f"Tiempo total de búsqueda en grilla: {end_time - start_time:.2f} segundos.")

# Pick the grid point with the highest mean validation score.
results_df = pd.DataFrame(results_list)
best_position = results_df['mean_test_score'].idxmax()
best_model_row = results_df.loc[best_position]
train_test_gap = best_model_row['mean_train_score'] - best_model_row['mean_test_score']

print("\n--- MEJOR MODELO ENCONTRADO ---")
print(f"Mejores Parámetros: {best_model_row['params']}")
print(f"Balanced Accuracy (CV Mean): {best_model_row['mean_test_score']:.4f}")
print(f"Gap (Train - Test): {train_test_gap:.4f}")

Iniciando búsqueda en grilla con 64 combinaciones...
Tiempo total de búsqueda en grilla: 91.95 segundos.

--- MEJOR MODELO ENCONTRADO ---
Mejores Parámetros: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 15, 'n_estimators': 50}
Balanced Accuracy (CV Mean): 0.5281
Gap (Train - Test): 0.0920
