In [31]:
pip install xgboost


### install XGBoost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.1/124.9 MB 944.1 kB/s eta 0:02:13
   ---------------------------------------- 0.4/124.9 MB 2.2 MB/s eta 0:00:56
   ---------------------------------------- 0.7/124.9 MB 3.0 MB/s eta 0:00:42
   ---------------------------------------- 0.9/124.9 MB 3.5 MB/s eta 0:00:36
   ---------------------------------------- 1.2/124.9 MB 3.9 MB/s eta 0:00:33
   ---------------------------------------- 1.5/124.9 MB 4.3 MB/s eta 0:00:30
    --------------------------------------- 1.6/124.9 MB 4.0 MB/s eta 0:00:31
    ----------------

In [19]:
###Load Dataset and Preprocess Data


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


df = pd.read_csv('cleaned_heart_disease_dataset.csv')


X = df.drop(columns=['HeartDisease'])  
y = df['HeartDisease']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


np.save("X_train_scaled.npy", X_train_scaled)
np.save("X_test_scaled.npy", X_test_scaled)
np.save("y_train.npy", y_train)
np.save("y_test.npy", y_test)


In [21]:
###Train Baseline Models

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


X_train_scaled = np.load("X_train_scaled.npy")
X_test_scaled = np.load("X_test_scaled.npy")
y_train = np.load("y_train.npy")
y_test = np.load("y_test.npy")


models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naïve Bayes": GaussianNB(),
    "Support Vector Machine": SVC(probability=True)
}


best_models = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    best_models[name] = model


In [37]:
### Train and Tune Advanced Models with GridSearchCV

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd


X_train_scaled = np.load("X_train_scaled.npy")
X_test_scaled = np.load("X_test_scaled.npy")
y_train = np.load("y_train.npy")
y_test = np.load("y_test.npy")


models_tuned = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss')
}


param_grid = {
    "Decision Tree": {'max_depth': [None, 5, 10, 20]},
    "Random Forest": {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    "Gradient Boosting": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]},
    "XGBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
}


best_models = {}  
for name, model in models_tuned.items():
    grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='accuracy')
    grid_search.fit(X_train_scaled, y_train)
    best_models[name] = grid_search.best_estimator_


model_params = {}

for name, model in best_models.items():
    if hasattr(model, "get_params"):  
        model_params[name] = model.get_params()
    else:
        model_params[name] = "No hyperparameters available"


params_df = pd.DataFrame(model_params).T
print("\nModel Hyperparameters:")
print(params_df)




Model Hyperparameters:
                  ccp_alpha class_weight     criterion max_depth max_features  \
Decision Tree           0.0         None          gini         5         None   
Random Forest           0.0         None          gini      None         sqrt   
Gradient Boosting       0.0          NaN  friedman_mse         3         None   
XGBoost                 NaN          NaN           NaN      None          NaN   

                  max_leaf_nodes min_impurity_decrease min_samples_leaf  \
Decision Tree               None                   0.0                1   
Random Forest               None                   0.0                1   
Gradient Boosting           None                   0.0                1   
XGBoost                      NaN                   NaN              NaN   

                  min_samples_split min_weight_fraction_leaf  ...  \
Decision Tree                     2                      0.0  ...   
Random Forest                     2                     

In [39]:
### save trained models

import joblib


for name, model in best_models.items():
    filename = f"{name.replace(' ', '_').lower()}_model.pkl"
    joblib.dump(model, filename)

print("All models saved successfully.")


All models saved successfully.
