In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('data/eda_heart_data.csv')

In [3]:
df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,GenHlth,DiffWalk,Sex,Mental_Health_Category,Physical_Health_Category
0,0,1,1,Obese,1,0,0,0,0,1,0,1,5,1,0,Moderate,Good
1,0,0,0,Overweight,1,0,0,1,0,0,0,0,3,0,0,Low,Poor
2,0,1,1,Overweight,0,0,0,0,1,0,0,1,5,1,0,Severe,Excellent
3,0,1,0,Overweight,0,0,0,1,1,1,0,1,2,0,0,Low,Poor
4,0,1,1,Normal,0,0,0,1,1,1,0,1,2,0,0,Low,Poor


In [4]:
X = df[['HighBP','Smoker','Stroke', 'Diabetes', 'GenHlth','DiffWalk']]
y = df.HeartDiseaseorAttack

In [5]:
# Standardising the Input Dataset

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
cat_features = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth']
cat_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OHEncoder", cat_transformer, cat_features),

    ]
)

In [6]:
preprocessor

In [7]:
# X = preprocessor.fit_transform(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.2)

In [9]:
models = {
    'LogisticRegression' : LogisticRegression(),
    'DecisionTreeClassifier' : DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier' : GradientBoostingClassifier(),
    'KNeighborsClassifier' : KNeighborsClassifier(),
    # 'Support Vector Classifier' : SVC(),
    'CatBoostClassifier' : CatBoostClassifier(task_type='GPU', devices='0', depth=6),
    'XGBClassifier' : XGBClassifier(tree_method='gpu_hist', max_depth=6, max_bin=256)
}

model_list = []
cross_v_score = []

for model,instance in models.items():
    instance.fit(X_train,y_train)

    y_train_pred = instance.predict(X_train)
    y_test_pred = instance.predict(X_test)


    model_list.append(model)

    print(model)
    from sklearn.model_selection import cross_val_score

    scores = cross_val_score(instance, X_train, y_train, cv=5, scoring='accuracy')
    print(f'Cross-Validation Accuracy: {scores.mean()}')
    cross_v_score.append(scores.mean())
    print('='*35)
    print('\n')

LogisticRegression
Cross-Validation Accuracy: 0.8980872978954586


DecisionTreeClassifier
Cross-Validation Accuracy: 0.8976684218383506


RandomForestClassifier
Cross-Validation Accuracy: 0.8976738617948309


AdaBoostClassifier
Cross-Validation Accuracy: 0.8980492180521266


GradientBoostingClassifier
Cross-Validation Accuracy: 0.8979785002455483


KNeighborsClassifier
Cross-Validation Accuracy: 0.8885890757286955


Learning rate set to 0.025306
0:	learn: 0.6653446	total: 105ms	remaining: 1m 45s
1:	learn: 0.6389382	total: 113ms	remaining: 56.2s
2:	learn: 0.6145863	total: 119ms	remaining: 39.6s
3:	learn: 0.5924873	total: 126ms	remaining: 31.4s
4:	learn: 0.5716916	total: 132ms	remaining: 26.3s
5:	learn: 0.5520923	total: 139ms	remaining: 23.1s
6:	learn: 0.5337263	total: 147ms	remaining: 20.9s
7:	learn: 0.5172713	total: 153ms	remaining: 18.9s
8:	learn: 0.5013810	total: 161ms	remaining: 17.8s
9:	learn: 0.4878569	total: 167ms	remaining: 16.5s
10:	learn: 0.4745831	total: 174ms	remaining: 15.6


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"



XGBClassifier



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Cross-Validation Accuracy: 0.8976521022648487





    E.g. tree_method = "hist", device = "cuda"



In [10]:
pd.DataFrame(list(zip(model_list,cross_v_score)), columns=['Model Name','Cross_Val_Score']).sort_values(by=['Cross_Val_Score'], ascending = False).reset_index()

Unnamed: 0,index,Model Name,Cross_Val_Score
0,0,LogisticRegression,0.898087
1,3,AdaBoostClassifier,0.898049
2,4,GradientBoostingClassifier,0.897979
3,6,CatBoostClassifier,0.897761
4,2,RandomForestClassifier,0.897674
5,1,DecisionTreeClassifier,0.897668
6,7,XGBClassifier,0.897652
7,5,KNeighborsClassifier,0.888589
