In [1]:
!pip install xgboost scikit-learn imbalanced-learn



In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.calibration import CalibratedClassifierCV
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Cargar el conjunto de datos de Pima Indians Diabetes
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv "
columns = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
           "Insulin", "BMI", "DiabetesPedigree", "Age", "Outcome"]

df = pd.read_csv(url, names=columns)

In [4]:
# Mostrar las primeras filas del dataset
print("Primeras filas del dataset:")
print(df.head())

Primeras filas del dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigree  Age  Outcome  
0             0.627   50        1  
1             0.351   31        0  
2             0.672   32        1  
3             0.167   21        0  
4             2.288   33        1  


In [5]:
# 1. Manejo avanzado de valores faltantes (los 0s en ciertas columnas)
zero_fields = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_fields] = df[zero_fields].replace(0, np.nan)

In [6]:
# Imputación por mediana (mejor que la media para distribuciones sesgadas)
imputer = SimpleImputer(strategy='median')
df[zero_fields] = imputer.fit_transform(df[zero_fields])

In [7]:
# 2. Ingeniería de características (agregar interacción BMI*Age)
df['BMI_Age'] = df['BMI'] * df['Age']

In [8]:
# 3. Selección de características mediante importancia (mejor que forward/backward manual)
selected_features = ['Glucose', 'BMI', 'Age', 'BMI_Age', 'DiabetesPedigree']


In [9]:
# 4. Balanceo de clases con SMOTE (el paper no menciona esto pero es crucial)
X = df[selected_features]
y = df['Outcome']
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [10]:
print("\nDistribución después de SMOTE:")
print(pd.Series(y_res).value_counts())


Distribución después de SMOTE:
Outcome
1    500
0    500
Name: count, dtype: int64


In [11]:
# 5. División estratificada de datos
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3,
                                                   stratify=y_res, random_state=42)


In [12]:
# 6. Optimización de hiperparámetros para XGBoost (ajuste fino)
params = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [4, 6, 8],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'n_estimators': [200, 1000]
}

In [13]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid_search = GridSearchCV(xgb, params, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [14]:
# Mejores parámetros encontrados
best_xgb = grid_search.best_estimator_

In [15]:
# 7. Entrenamiento de modelos con mejores parámetros
final_models = {
    'XGBoost': best_xgb,
    'AdaBoost': AdaBoostClassifier(n_estimators=500, learning_rate=0.1),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=500,
                                                  max_depth=3,
                                                  learning_rate=0.1)
}

In [16]:
# Evaluación rigurosa
for name, model in final_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n🔍 Resultados para {name}:")
    print(classification_report(y_test, y_pred))
   # print(f"Matthews CC: {matthews_corrcoef(y_test, y_pred):.4f}")
   # print(f"AUC-ROC: {roc_auc_score(y_test, y_pred):.4f}")
  #  print("="*60)


🔍 Resultados para XGBoost:
              precision    recall  f1-score   support

           0       0.84      0.73      0.78       150
           1       0.76      0.86      0.81       150

    accuracy                           0.80       300
   macro avg       0.80      0.80      0.80       300
weighted avg       0.80      0.80      0.80       300


🔍 Resultados para AdaBoost:
              precision    recall  f1-score   support

           0       0.85      0.73      0.79       150
           1       0.77      0.87      0.82       150

    accuracy                           0.80       300
   macro avg       0.81      0.80      0.80       300
weighted avg       0.81      0.80      0.80       300


🔍 Resultados para GradientBoosting:
              precision    recall  f1-score   support

           0       0.85      0.77      0.81       150
           1       0.79      0.87      0.83       150

    accuracy                           0.82       300
   macro avg       0.82      0.82 