In [1]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA, KernelPCA
from sklearn.decomposition import IncrementalPCA
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet, RANSACRegressor, HuberRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from utils.paths import DATA_RAW_DIR

In [2]:
# Build the dataset location under the raw-data directory, keep it as a plain
# string for the loading cell, and print whether the file is present on disk.
heart_csv = DATA_RAW_DIR / "heart.csv"
path_heart = str(heart_csv)
print(Path(path_heart).exists())

True


In [3]:
# Load the heart dataset (comma-separated, UTF-8) and preview the first five rows.
df_heart = pd.read_csv(
    path_heart,
    encoding="utf-8",
    sep=",",
)
df_heart.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
# Separate the predictors from the "target" label column.
# `columns=` already names the axis, so the redundant `axis=1` is not passed.
X = df_heart.drop(columns=["target"])
y = df_heart["target"]

# Hold out 35% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=42
)

In [5]:
# Baseline: KNN with default hyperparameters, fitted on the training split
# exactly as produced above (no scaling is applied in this cell).
model_knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)

# Report accuracy, the confusion matrix and the per-class report, in that order.
for metric_fn in (accuracy_score, confusion_matrix, classification_report):
    print(metric_fn(y_test, y_pred_knn))

0.6908077994428969
[[127  53]
 [ 58 121]]
              precision    recall  f1-score   support

           0       0.69      0.71      0.70       180
           1       0.70      0.68      0.69       179

    accuracy                           0.69       359
   macro avg       0.69      0.69      0.69       359
weighted avg       0.69      0.69      0.69       359



In [6]:
# Bagging ensemble of 50 KNN classifiers built from the baseline KNN.
# `random_state` is fixed so the resampling, and therefore the scores printed
# below, are identical on every Restart & Run All (same seed as the data split).
model_bagging = BaggingClassifier(
    estimator=model_knn,
    n_estimators=50,
    random_state=42,
)
model_bagging.fit(X_train, y_train)
y_pred_bagging = model_bagging.predict(X_test)

# Same three reports as for the single KNN, so the two cells compare directly.
print(accuracy_score(y_test, y_pred_bagging))
print(confusion_matrix(y_test, y_pred_bagging))
print(classification_report(y_test, y_pred_bagging))

0.7186629526462396
[[131  49]
 [ 52 127]]
              precision    recall  f1-score   support

           0       0.72      0.73      0.72       180
           1       0.72      0.71      0.72       179

    accuracy                           0.72       359
   macro avg       0.72      0.72      0.72       359
weighted avg       0.72      0.72      0.72       359



# ChatGPT Recommendation

In [7]:
# Tuned KNN: standardise the features inside a pipeline and grid-search the
# neighbour settings with 5-fold cross-validation.
#
# Every name used here (train_test_split, GridSearchCV, Pipeline, StandardScaler,
# KNeighborsClassifier, accuracy_score, classification_report) comes from the
# import cell at the top of the notebook. The blanket
# `warnings.filterwarnings("ignore")` was dropped on purpose: it silenced every
# warning for the rest of the kernel session, not just the ones from this cell.

# 1. Separate features and target.
X = df_heart.drop(columns=["target"])
y = df_heart["target"]

# 2. Split the data, stratifying on the target to keep the class balance.
#    NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
#    earlier unstratified split; every cell below works on this split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, stratify=y, random_state=42
)

# 3. Pipeline with scaling followed by the base KNN model. Keeping the scaler
#    inside the pipeline means it is re-fitted on each CV training fold.
pipe_knn = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier())
])

# 4. Hyperparameters to tune.
param_grid_knn = {
    "knn__n_neighbors": [3, 5, 7],
    "knn__weights": ["uniform", "distance"],
    "knn__p": [1, 2]  # 1 = Manhattan, 2 = Euclidean
}

# 5. Grid search with cross-validation.
grid_knn = GridSearchCV(pipe_knn, param_grid_knn, cv=5, scoring="accuracy", n_jobs=-1)
grid_knn.fit(X_train, y_train)

# 6. Evaluate the best KNN on the held-out split. Predict once and reuse the
#    predictions for both the accuracy and the per-class report.
# NOTE(review): if df_heart contains duplicated rows, copies of a test row can
# also sit in the training set and inflate this score. Check
# df_heart.duplicated().sum() before trusting the number.
y_pred_grid_knn = grid_knn.predict(X_test)
print("🔍 Best KNN Parameters:", grid_knn.best_params_)
print("✅ KNN Accuracy:", accuracy_score(y_test, y_pred_grid_knn))
print(classification_report(y_test, y_pred_grid_knn))

🔍 Best KNN Parameters: {'knn__n_neighbors': 5, 'knn__p': 1, 'knn__weights': 'distance'}
✅ KNN Accuracy: 0.9721448467966574
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       175
           1       0.98      0.97      0.97       184

    accuracy                           0.97       359
   macro avg       0.97      0.97      0.97       359
weighted avg       0.97      0.97      0.97       359



In [8]:
# 7. Bagging on top of the tuned KNN: take the "knn" step of the best pipeline
#    found by the previous grid search and use it as the base estimator.
best_knn = grid_knn.best_estimator_.named_steps["knn"]

pipe_bagging = Pipeline([
    ("scaler", StandardScaler()),
    ("bagging", BaggingClassifier(estimator=best_knn, random_state=42))
])

# 8. Bagging hyperparameters.
#    The grid is a list of dicts so that bootstrap=False is only paired with
#    max_samples=0.8. With bootstrap=False and max_samples=1.0 every member would
#    be trained on the identical full training set, so the "ensemble" would just
#    be n_estimators copies of one KNN and add nothing but fitting time.
param_grid_bagging = [
    {
        "bagging__n_estimators": [10, 30, 50],
        "bagging__max_samples": [0.8, 1.0],
        "bagging__bootstrap": [True],
    },
    {
        "bagging__n_estimators": [10, 30, 50],
        "bagging__max_samples": [0.8],
        "bagging__bootstrap": [False],
    },
]

grid_bagging = GridSearchCV(pipe_bagging, param_grid_bagging, cv=5, scoring="accuracy", n_jobs=-1)
grid_bagging.fit(X_train, y_train)

# 9. Evaluate the best Bagging model on the held-out split. Predict once and
#    reuse the predictions for both the accuracy and the per-class report.
y_pred_grid_bagging = grid_bagging.predict(X_test)
print("🔍 Best Bagging Parameters:", grid_bagging.best_params_)
print("✅ Bagging Accuracy:", accuracy_score(y_test, y_pred_grid_bagging))
print(classification_report(y_test, y_pred_grid_bagging))

🔍 Best Bagging Parameters: {'bagging__bootstrap': False, 'bagging__max_samples': 0.8, 'bagging__n_estimators': 30}
✅ Bagging Accuracy: 0.9721448467966574
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       175
           1       0.98      0.97      0.97       184

    accuracy                           0.97       359
   macro avg       0.97      0.97      0.97       359
weighted avg       0.97      0.97      0.97       359

