## Applying machine learning models to our data

Before we start with any ML, or even with evaluating those models, we need to be aware of our baseline: how far is guessing going to get us? Over 20.5 million adults in the United States aged 20 and older, or 7.2%, have had CAD alone (CDC, 2022). Our specific baseline comes down to an accuracy of 90.19% by guessing that nobody has had a heart attack (because our target value includes multiple kinds of heart attack), meaning that 9.81% of the survey's participants have had some kind of heart attack. For our models to be of any help, we need an accuracy greater than 90.19%.

In [None]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import stats

In [None]:
# Load the cleaned dataset; raise the column display limit to 125 so wide frames are shown in full
pd.options.display.max_columns = 125
df = pd.read_csv('Datasets/CleanedData.csv')

### Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Separate the dataset into the target column (y) and the remaining feature columns (X)
y = df['_MICHD']
X = df.drop(columns='_MICHD')

# Train/test split; no test_size is given, so scikit-learn's default proportion applies.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Standardize the features: the scaling statistics are learned from the training
# split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

### K-nearest neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline KNN: default hyperparameters, fitted on the unscaled features
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

knn_train_acc = knn.score(X_train, y_train)
knn_test_acc = knn.score(X_test, y_test)

print('The accuracy of the KNN model on the training data is {:.4f}'.format(knn_train_acc))
print('The accuracy of the KNN model on the test data is {:.4f}'.format(knn_test_acc))

In [None]:
# Sweep the number of neighbours to see how k affects train and test accuracy.
# n_neighbors must be a positive integer, so the sweep starts at 1 rather than 0.
for k in range(1, 100):
    # Pass k to the classifier; without it every iteration would refit the same default model.
    # A separate name is used so the baseline `knn` model is not overwritten by the sweep.
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # A classifier's .score() is its mean accuracy on the given data, so no
    # additional metrics import is needed here.
    print("k=", k, "Train Accuracy:", knn_k.score(X_train, y_train))
    print("k=", k, "Test Accuracy:", knn_k.score(X_test, y_test))

In [None]:
# KNN with the chosen number of neighbours (k=65).
# The model has to be fitted before scoring, and the scores must come from
# knn_optimal itself rather than from the earlier `knn` model.
knn_optimal = KNeighborsClassifier(n_neighbors=65).fit(X_train, y_train)

print('The accuracy of the KNN model on the training data is {:.4f}'.format(knn_optimal.score(X_train, y_train)))
print('The accuracy of the KNN model on the test data is {:.4f}'.format(knn_optimal.score(X_test, y_test)))

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline logistic regression: default hyperparameters, fitted on the unscaled features
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)

logreg_train_acc = logisticRegr.score(X_train, y_train)
logreg_test_acc = logisticRegr.score(X_test, y_test)

print('The accuracy of the LogisticRegression model on the training data is {:.4f}'.format(logreg_train_acc))
print('The accuracy of the LogisticRegression model on the test data is {:.4f}'.format(logreg_test_acc))

In [None]:
# Define the parameter values to search
solver_options = ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
C_values = [0.1, 1, 5, 10]
iter_values = [100, 1000]

# Create the grid as a list of sub-grids so that only solver/penalty pairs that
# LogisticRegression accepts are tried. One cross-product of every penalty with
# every solver contains pairs the estimator rejects ('l1' with anything but saga,
# 'elasticnet' without an l1_ratio); those fits fail and are scored as NaN.
param_grid = [
    # all of the listed solvers support the l2 penalty
    dict(penalty=['l2'], C=C_values, solver=solver_options, max_iter=iter_values),
    # of the listed solvers, only saga supports l1
    dict(penalty=['l1'], C=C_values, solver=['saga'], max_iter=iter_values),
    # elasticnet is saga-only and requires an explicit l1_ratio
    dict(penalty=['elasticnet'], l1_ratio=[0.5], C=C_values, solver=['saga'], max_iter=iter_values),
]

# 5-fold cross-validated search ranked by accuracy, run on the standardized training features
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train_std, y_train)

# Check the results
print(grid.cv_results_)
print('\n')
print(grid.best_score_)
print('\n')
print(grid.best_params_)
print('\n')
print(grid.best_estimator_)

In [None]:
# Fine-tuned logistic regression (l1 penalty, saga solver), fitted on the standardized features
logisticRegr = LogisticRegression(
    penalty='l1',
    C=1,
    solver='saga',
    max_iter=1000,
    n_jobs=-1,
)
logisticRegr.fit(X_train_std, y_train)

tuned_train_acc = logisticRegr.score(X_train_std, y_train)
tuned_test_acc = logisticRegr.score(X_test_std, y_test)

print('The accuracy of the fine-tuned LogisticRegression model on the scaled training data is {:.4f}'.format(tuned_train_acc))
print('The accuracy of the fine-tuned LogisticRegression model on the scaled test data is {:.4f}'.format(tuned_test_acc))

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Baseline Gaussian naive Bayes: default settings, fitted on the unscaled features
clf = GaussianNB()
clf.fit(X_train, y_train)

nb_train_acc = clf.score(X_train, y_train)
nb_test_acc = clf.score(X_test, y_test)

print("Accuracy on training set: {:.4f}".format(nb_train_acc))
print("Accuracy on test set: {:.4f}".format(nb_test_acc))

In [None]:
# Try each candidate var_smoothing value on the standardized features and
# report train and test accuracy side by side.
smoothing_values = [25, 22.5, 20, 10, 5, 2, 1, 0.5, 0.1, 0.01, 1e-3, 1e-6, 1e-9, 1e-12]

for s in smoothing_values:
    # Fit a fresh GaussianNB for this smoothing value
    clf = GaussianNB(var_smoothing=s)
    clf.fit(X_train_std, y_train)

    sweep_train_acc = clf.score(X_train_std, y_train)
    sweep_test_acc = clf.score(X_test_std, y_test)
    print('var_smoothing: {}, train accuracy: {:.4f}, test accuracy: {:.4f}'.format(s, sweep_train_acc, sweep_test_acc))


In [None]:
# Gaussian naive Bayes with the chosen smoothing value, fitted on the standardized features
clf = GaussianNB(var_smoothing=20)
clf.fit(X_train_std, y_train)

nb_tuned_train_acc = clf.score(X_train_std, y_train)
nb_tuned_test_acc = clf.score(X_test_std, y_test)

print("Accuracy on training set: {:.4f}".format(nb_tuned_train_acc))
print("Accuracy on test set: {:.4f}".format(nb_tuned_test_acc))

### Support vector classifier

In [None]:
from sklearn.svm import SVC

In [None]:
# Baseline SVC: default hyperparameters, fitted on the unscaled features
svc = SVC()
svc.fit(X_train, y_train)

# Accuracy of the model without scaling and fine-tuning
svc_train_acc = svc.score(X_train, y_train)
svc_test_acc = svc.score(X_test, y_test)
print("Accuracy on training set: {:.4f}".format(svc_train_acc))
print("Accuracy on test set: {:.4f}".format(svc_test_acc))

In [None]:
# Candidate hyperparameter values for the SVC search
kernel_options = ['linear', 'poly', 'rbf', 'sigmoid']
C_values = [0.1, 1, 5, 10]
gamma_values = ['auto', 'scale']

# Grid: every combination of kernel, C and gamma
param_grid = {'kernel': kernel_options, 'C': C_values, 'gamma': gamma_values}

# 5-fold cross-validated search ranked by accuracy, run on the standardized training features
grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train_std, y_train)

# Results: CV table, best score and best parameters, each followed by a blank
# separator, then the best estimator
for report in (grid.cv_results_, grid.best_score_, grid.best_params_):
    print(report)
    print('\n')
print(grid.best_estimator_)

In [None]:
# Fine-tuned SVC. The hyperparameter search ran on the standardized features, so
# the final model is fitted and scored on the same standardized splits; fitting on
# the raw features would evaluate the parameters on data at a different scale from
# the one they were selected on.
svc = SVC(C=2.65, gamma='scale', kernel='rbf').fit(X_train_std, y_train)

print("Accuracy on training set: {:.4f}".format(svc.score(X_train_std, y_train)))
print("Accuracy on test set: {:.4f}".format(svc.score(X_test_std, y_test)))

### Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Baseline decision tree with no depth limit set (seeded for reproducibility)
dtree = DecisionTreeClassifier(random_state=0)
dtree.fit(X_train, y_train)

dtree_train_acc = dtree.score(X_train, y_train)
dtree_test_acc = dtree.score(X_test, y_test)

print("Accuracy on training set: {:.4f}".format(dtree_train_acc))
print("Accuracy on test set: {:.4f}".format(dtree_test_acc))

In [None]:
# Decision tree limited to a maximum depth of 6 (same seed as the baseline tree)
dtree = DecisionTreeClassifier(max_depth=6, random_state=0)
dtree.fit(X_train, y_train)

dtree_pruned_train_acc = dtree.score(X_train, y_train)
dtree_pruned_test_acc = dtree.score(X_test, y_test)

print("Accuracy on training set: {:.4f}".format(dtree_pruned_train_acc))
print("Accuracy on test set: {:.4f}".format(dtree_pruned_test_acc))

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest with default hyperparameters (seeded for reproducibility)
forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the model fitted above. `rand_forest` is only created in a later cell, so
# referring to it here fails on a fresh kernel or reports a different model's accuracy.
print("Accuracy on training set: {:.4f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.4f}".format(forest.score(X_test, y_test)))

In [None]:
# Candidate values: number of trees (n), max features per split (f), max tree depth (d)
n_values = [2, 5, 7, 10, 12, 15, 20]
f_values = [7, 9, 11, 13, 15, 17]
d_values = [2, 4, 6, 8, 10]

# Exhaustive sweep: fit one forest per (n, f, d) combination and report both accuracies
for n in n_values:
    for f in f_values:
        for d in d_values:
            forest = RandomForestClassifier(
                n_estimators=n,
                random_state=42,
                n_jobs=-1,
                max_features=f,
                max_depth=d,
            )
            forest.fit(X_train, y_train)

            # Shared prefix identifying the combination on both output lines
            combo_label = ("n=", n, ", f=", f, "d=", d)
            print(*combo_label, "Accuracy on training set: {:.4f}".format(forest.score(X_train, y_train)))
            print(*combo_label, "Accuracy on test set: {:.4f}".format(forest.score(X_test, y_test)))

In [None]:
# Random forest with the chosen hyperparameters from the sweep
rand_forest = RandomForestClassifier(
    n_estimators=12,
    random_state=42,
    n_jobs=-1,
    max_features=11,
    max_depth=10,
)
rand_forest.fit(X_train, y_train)

rf_train_acc = rand_forest.score(X_train, y_train)
rf_test_acc = rand_forest.score(X_test, y_test)

print("Accuracy on training set: {:.4f}".format(rf_train_acc))
print("Accuracy on test set: {:.4f}".format(rf_test_acc))

### Gradient boosted regression tree

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Baseline gradient boosting classifier with default hyperparameters (seeded)
gbrt = GradientBoostingClassifier(random_state=42)
gbrt.fit(X_train, y_train)

gbrt_train_acc = gbrt.score(X_train, y_train)
gbrt_test_acc = gbrt.score(X_test, y_test)

print("Accuracy on training set: {:.4f}".format(gbrt_train_acc))
print("Accuracy on test set: {:.4f}".format(gbrt_test_acc))

In [None]:
# Candidate hyperparameter values for the gradient boosting search
learning_options = [0.001, 0.01, 0.1, 0.25, 0.5, 1]
Depth_options = [1, 2, 3, 5, 7]
random = [0]  # single seed value, passed through the grid as random_state
n_est = [75, 100, 150, 200, 350, 500]

# Grid: every combination of the values above
param_grid = {
    'learning_rate': learning_options,
    'max_depth': Depth_options,
    'random_state': random,
    'n_estimators': n_est,
}

# 3-fold cross-validated search ranked by accuracy, run on the unscaled training features
grid = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=3, scoring='accuracy')
grid = grid.fit(X_train, y_train)

# Results: CV table, best score and best parameters, each followed by a blank
# separator, then the best estimator
for report in (grid.cv_results_, grid.best_score_, grid.best_params_):
    print(report)
    print('\n')
print(grid.best_estimator_)

In [None]:
# Gradient boosting classifier with the chosen hyperparameters
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.1, max_depth=3, n_estimators=100).fit(X_train, y_train)

print("Accuracy on training set: {:.4f}".format(gbrt.score(X_train, y_train)))
# The closing parenthesis of print() was missing here, which made the cell a SyntaxError
print("Accuracy on test set: {:.4f}".format(gbrt.score(X_test, y_test)))

### Gradient boosted trees using XGBoost

In [None]:
import xgboost as xgb

In [None]:
# XGBoost expects binary values [0 1] from the y.
# Shift the labels by the current minimum of the training labels instead of a
# hard-coded 1: once the labels already start at 0 the offset is 0, so re-running
# this cell leaves them unchanged rather than shifting them a second time.
# The offset is taken from y_train only and applied to both splits so they stay aligned.
label_offset = y_train.min()
y_train = y_train - label_offset
y_test = y_test - label_offset

In [None]:
# Baseline XGBoost classifier with default hyperparameters
xg_cl = xgb.XGBClassifier()
xg_cl.fit(X_train, y_train)

xgb_train_acc = xg_cl.score(X_train, y_train)
xgb_test_acc = xg_cl.score(X_test, y_test)

print("Accuracy on training set: {:.4f}".format(xgb_train_acc))
print("Accuracy on test set: {:.4f}".format(xgb_test_acc))

In [None]:
# XGBoost classifier with hand-picked hyperparameters (binary logistic objective, seeded)
xg_cl = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=20,
    max_depth=5,
    eta=0.30,
    seed=123,
)
xg_cl.fit(X_train, y_train)

xgb_tuned_train_acc = xg_cl.score(X_train, y_train)
xgb_tuned_test_acc = xg_cl.score(X_test, y_test)

# Reported with six decimals, unlike the four used elsewhere in the notebook
print("Accuracy on training set: {:.6f}".format(xgb_tuned_train_acc))
print("Accuracy on test set: {:.6f}".format(xgb_tuned_test_acc))