In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import os
import csv
import gzip
import pickle
from sklearn.linear_model import LinearRegression ,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC ,SVR
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('heart_2020_cleaned.csv')
df.drop(columns=['Race'], inplace=True)
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,No,No,Good,12.0,No,No,No


In [3]:
cols_yes__no_values = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity',
                       'Asthma', 'KidneyDisease', 'SkinCancer']
yes__no_values = {'No':0, 'Yes':1}
for i in range(0, len(cols_yes__no_values)):
    df[cols_yes__no_values[i]] = df[cols_yes__no_values[i]].replace(yes__no_values)

In [4]:
sex = {'Female':0, 'Male':1}
df['Sex'] = df['Sex'].replace(sex)

ageCategory= {'18-24':0, '25-29':1, '30-34':2, '35-39':3, '40-44':4, '45-49':5, '50-54':6, '55-59':7, '60-64':8,
             '65-69':9, '70-74':10, '75-79':11, '80 or older':12}
df['AgeCategory'] = df['AgeCategory'].replace(ageCategory)

genHealth = {'Poor':0, 'Fair':1, 'Good':2, 'Very good':3, 'Excellent':4}
df['GenHealth'] = df['GenHealth'].replace(genHealth)

diabetic = {'No':0, 'No, borderline diabetes':1, 'Yes (during pregnancy)':2, 'Yes':3}
df['Diabetic'] = df['Diabetic'].replace(diabetic)

In [5]:
df_majority_0 = df[(df['HeartDisease']==0)] 
df_minority_1 = df[(df['HeartDisease']==1)] 

df_minority_upsampled = resample(df_minority_1, 
                                 replace=True,    
                                 n_samples= 1000, 
                                 random_state=42) 

df_upsampled = pd.concat([df_minority_upsampled, df_majority_0])

In [6]:
X = df_upsampled.drop(columns='HeartDisease')
y = df_upsampled['HeartDisease']

In [7]:
X.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
276535,35.43,0,1,1,0.0,0.0,1,0,12,0,0,2,12.0,0,0,0
183941,23.06,0,0,0,0.0,0.0,0,1,12,3,1,1,8.0,0,0,0
8829,23.49,1,0,0,5.0,0.0,0,1,12,3,1,1,7.0,0,0,0
60254,26.63,1,0,0,4.0,3.0,1,0,12,3,1,1,10.0,0,0,0
251583,28.07,0,1,0,0.0,0.0,0,1,11,0,1,4,8.0,0,0,1


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.98, shuffle=True, random_state=33)

In [13]:
rf = RandomForestClassifier(n_jobs=-1).fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(classification_report(y_test,y_pred))
print('RandomForest Train Score is : ' , rf.score(X_train, y_train))
print('RandomForest Test Score is : ' , rf.score(X_test, y_test))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97      5808
           1       0.95      1.00      0.97      5885

    accuracy                           0.97     11693
   macro avg       0.97      0.97      0.97     11693
weighted avg       0.97      0.97      0.97     11693

RandomForest Train Score is :  0.9958321261392712
RandomForest Test Score is :  0.9721200718378517


In [21]:
clf = DecisionTreeClassifier(n_jobs=-1)

# Train Decision Tree Classifer
clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))
print('DecisionTree Train Score is : ' , clf.score(X_train, y_train))
print('DecisionTree Test Score is : ' , clf.score(X_test, y_test))

TypeError: DecisionTreeClassifier.__init__() got an unexpected keyword argument 'n_jobs'

In [22]:
logreg = LogisticRegression(n_jobs=-1)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
print(classification_report(y_test,y_pred))
print('LogisticRegression Train Score is : ' , logreg.score(X_train, y_train))
print('LogisticRegression Test Score is : ' , logreg.score(X_test, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.77      0.75      0.76      5808
           1       0.76      0.77      0.77      5885

    accuracy                           0.76     11693
   macro avg       0.76      0.76      0.76     11693
weighted avg       0.76      0.76      0.76     11693

LogisticRegression Train Score is :  0.7630263511555901
LogisticRegression Test Score is :  0.7621653981014282


In [8]:
xgb_model = xgb.XGBClassifier(n_estimators=150, max_depth=18, learning_rate=0.5, nthread=-1, random_state=44)
xgb_model.fit(X_train, y_train)
print('XGBModel Train Score is : ' , xgb_model.score(X_train, y_train))
print('XGBModel Test Score is : ' , xgb_model.score(X_test, y_test))

GBCModel Train Score is :  0.9974056956456723
GBCModel Test Score is :  0.9899471800988243


In [9]:
with open('GBCModel.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

In [12]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

df = pd.read_csv('heart_2020_cleaned.csv')
df.drop(columns=['Race'], inplace=True)

cols_yes__no_values = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity',
                       'Asthma', 'KidneyDisease', 'SkinCancer']
yes__no_values = {'No':0, 'Yes':1}

for i in range(0, len(cols_yes__no_values)):
    df[cols_yes__no_values[i]] = df[cols_yes__no_values[i]].replace(yes__no_values)

sex = {'Female':0, 'Male':1}
df['Sex'] = df['Sex'].replace(sex)

ageCategory= {'18-24':0, '25-29':1, '30-34':2, '35-39':3, '40-44':4, '45-49':5, '50-54':6, '55-59':7, '60-64':8,
             '65-69':9, '70-74':10, '75-79':11, '80 or older':12}
df['AgeCategory'] = df['AgeCategory'].replace(ageCategory)

genHealth = {'Poor':0, 'Fair':1, 'Good':2, 'Very good':3, 'Excellent':4}
df['GenHealth'] = df['GenHealth'].replace(genHealth)

diabetic = {'No':0, 'No, borderline diabetes':1, 'Yes (during pregnancy)':2, 'Yes':3}
df['Diabetic'] = df['Diabetic'].replace(diabetic)

df_majority_0 = df[(df['HeartDisease']==0)] 
df_minority_1 = df[(df['HeartDisease']==1)] 

df_minority_upsampled = resample(df_minority_1, 
                                 replace=True,    
                                 n_samples= 292225, 
                                 random_state=42) 

df_upsampled = pd.concat([df_minority_upsampled, df_majority_0])
X = df_upsampled.drop(columns='HeartDisease')
y = df_upsampled['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.98, shuffle=True, random_state=33)
xgb_model = xgb.XGBClassifier(n_estimators=150, max_depth=18, learning_rate=0.5, nthread=-1, random_state=44)
xgb_model.fit(X_train, y_train)
print('XGBModel Train Score is : ' , xgb_model.score(X_train, y_train))
print('XGBModel Test Score is : ' , xgb_model.score(X_test, y_test))

XGBModel Train Score is :  0.9948966234636637
XGBModel Test Score is :  0.968357136748482
