In [16]:
# Import dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
import warnings
warnings.simplefilter('ignore')
import os
import numpy as np

In [17]:
# Load the pre-cleaned cardio dataset from the working directory
cardio_csv = "cleaned_cardio.csv"
heart = pd.read_csv(cardio_csv)

# Preview the first five rows to sanity-check the columns
heart.head(n=5)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cardio,BMI,gender_1,gender_2,cholesterol_1,...,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1
0,50.0,66.0,136.0,110,80,0,21.948577,0,1,1,...,0,1,0,0,1,0,1,0,0,1
1,55.0,61.0,187.0,140,90,1,35.329481,1,0,0,...,1,1,0,0,1,0,1,0,0,1
2,52.0,65.0,141.0,130,70,1,23.461065,1,0,0,...,1,1,0,0,1,0,1,0,1,0
3,48.0,67.0,180.0,150,100,1,28.188906,0,1,1,...,0,1,0,0,1,0,1,0,0,1
4,48.0,61.0,123.0,100,60,0,23.238108,1,0,1,...,0,1,0,0,1,0,1,0,1,0


In [21]:
# Define variables
# Target vector: the "cardio" column; target_names holds display labels for the two classes.
y = heart["cardio"]
target_names = ["negative", "positive"]

# Feature frame: everything except the target.
# pandas labels a header-less CSV column "Unnamed: N" (typically a saved row index).
# Such a column shows up in the head() preview, but it is not a real feature and the
# 20-value sample scored at the end of the notebook has no slot for it. Drop it here
# explicitly so a fresh Restart & Run All builds the same feature matrix every time
# instead of relying on an earlier, no-longer-present cell having removed it.
index_artifacts = [col for col in heart.columns if str(col).startswith("Unnamed:")]
X = heart.drop(columns=["cardio", *index_artifacts])

# Plain NumPy matrix consumed by the train/test split below
X2 = X.values
X2

array([[ 50.,  66., 136., ...,   0.,   0.,   1.],
       [ 55.,  61., 187., ...,   0.,   0.,   1.],
       [ 52.,  65., 141., ...,   0.,   1.,   0.],
       ...,
       [ 52.,  72., 231., ...,   1.,   1.,   0.],
       [ 61.,  64., 158., ...,   0.,   1.,   0.],
       [ 56.,  67., 158., ...,   0.,   0.,   1.]])

In [22]:
# Hold out a test set (scikit-learn's default split fraction);
# random_state pins the shuffle so the partition is repeatable across runs.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X2, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [23]:
# Perform XG Boost
import xgboost as xgb
from xgboost import XGBClassifier

# Baseline classifier with library-default hyper-parameters.
# NOTE: `probability=True` is an sklearn.svm.SVC option, not an XGBoost parameter.
# XGBClassifier always provides predict_proba, so the stray keyword is removed
# rather than being forwarded to the booster as an unknown setting (and then
# cloned into every GridSearchCV candidate below).
model = XGBClassifier()

# fit model on training data
model.fit(X_train, y_train)

# Mean accuracy on the held-out test split
model.score(X_test, y_test)

0.7431203358208955

In [None]:
# GridSearch for XGBoost
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Search space: 3 * 5 * 3 * 3 * 3 = 405 parameter combinations, each one
# cross-validated, so this cell is by far the most expensive in the notebook.
params_xg = {'min_child_weight': [1, 5, 10],
            'gamma': [0.5, 1, 1.5, 2, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'max_depth': [3, 4, 5]
            }

# n_jobs=-1 spreads the fits over all available cores; cv and scoring are left
# at GridSearchCV's defaults.
grid_xg = GridSearchCV(model, param_grid = params_xg, n_jobs=-1)
grid_xg.fit(X_train, y_train)
print("Best Hyper Parameters:\n", grid_xg.best_params_)

# predict() delegates to the estimator refit with the best parameters
prediction_xg = grid_xg.predict(X_test)

# Print Accuracy
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
accuracy_xg = metrics.accuracy_score(y_test, prediction_xg)
print("Accuracy: ", accuracy_xg)

In [None]:
# Confusion matrix for the tuned XGBoost predictions on the test split
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes
row_labels = ['Actual:0','Actual:1']
col_labels = ['Predicted:0','Predicted:1']

cm = confusion_matrix(y_test, prediction_xg)
conf_matrix = pd.DataFrame(cm, index=row_labels, columns=col_labels)

# Draw the integer counts as an annotated heat map on a fresh 8x6 figure
plt.figure(figsize=(8, 6))
sn.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

In [None]:
# Find the best model
# best_estimator_ is the estimator GridSearchCV refit (refit is left at its
# default when grid_xg is built) using the best-scoring parameter combination.
best_xg = grid_xg.best_estimator_
# Display the chosen estimator and its hyper-parameters as the cell output
best_xg

In [None]:
# Persist the tuned model to disk with pickle, then load it back so the
# saved artifact (rather than the in-memory estimator) is what gets used below.
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code from an untrusted file.
import pickle

pkl_xg_filename = "pickle_xg.pkl"

# Serialize the best estimator
with open(pkl_xg_filename, mode="wb") as out_file:
    pickle.dump(best_xg, out_file)

# Round-trip: restore the estimator from the file just written
with open(pkl_xg_filename, mode="rb") as in_file:
    pickle_model_xg = pickle.load(in_file)

pickle_model_xg

In [None]:
# Test the model
# (numpy is already imported as np in the notebook's import cell.)

# One hand-entered sample: a single row of 20 positional feature values.
# NOTE(review): presumably ordered like the training columns (age, height, weight,
# ap_hi, ap_lo, BMI, then the one-hot gender/cholesterol/gluc/smoke/alco/active
# flags) - TODO confirm against X.columns before trusting the prediction.
new_user_input = np.array([[61,62,205,130,80,37.49,1,0,0,0,1,1,0,0,1,0,1,0,0,1]])

# The sample is positional, so fail early with a readable message if its width
# does not match the matrix the model was trained and scored on.
if new_user_input.shape[1] != X_test.shape[1]:
    raise ValueError(
        "new_user_input has {} features but the model was trained on {}".format(
            new_user_input.shape[1], X_test.shape[1]
        )
    )

# Accuracy of the reloaded (unpickled) model on the held-out split, as a percentage
best_xg_score = pickle_model_xg.score(X_test, y_test)
perc_score_xg = 100*best_xg_score
print("Test Score: {:.2f} %".format(perc_score_xg))

# Predicted class for the hand-entered sample
predict_xg = pickle_model_xg.predict(new_user_input)
predict_xg