In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('data/heart_disease_health_indicators_BRFSS2015.csv')

In [3]:
df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [4]:
X = df[df['HighBP','Smoker','Stroke', 'Diabetes', 'GenHlth','Age','DiffWalk']]
y = df.HeartDiseaseorAttack

In [5]:
# Standardising the Input Dataset

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
num_features = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age']
transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("StandardScaler", transformer, num_features)
    ]
)

In [6]:
preprocessor

In [7]:
# X = preprocessor.fit_transform(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.2)

In [9]:
def evaluation_matrix(real,pred):
    mae = mean_absolute_error(real,pred)
    mse = mean_squared_error(real,pred)
    r2_square = r2_score(real,pred)
    rmse = np.sqrt(mse)
    return mae, rmse, r2_square


In [10]:
models = {
    'LogisticRegression' : LogisticRegression(),
    'DecisionTreeClassifier' : DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier' : GradientBoostingClassifier(),
    'KNeighborsClassifier' : KNeighborsClassifier(),
    'Support Vector Classifier' : SVC(),
    # 'CatBoostClassifier' : CatBoostClassifier(task_type='GPU', depth=6),
    # 'XGBClassifier' : XGBClassifier(tree_method='gpu_hist', max_depth=6, max_bin=256)
}

model_list = []
r2_list = []
cross_v_score = []

for model,instance in models.items():
    instance.fit(X_train,y_train)

    y_train_pred = instance.predict(X_train)
    y_test_pred = instance.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluation_matrix(y_train,y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluation_matrix(y_test,y_test_pred)

    model_list.append(model)
    r2_list.append(model_test_r2)

    print(model)
    print('Model Performance for training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Square Error: {:.4f}".format(model_train_r2))

    print('-------------------------------------')

    print('Model Performance for test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Square Error: {:.4f}".format(model_test_r2))

    from sklearn.model_selection import cross_val_score

    scores = cross_val_score(instance, X_train, y_train, cv=5, scoring='accuracy')
    print(f'Cross-Validation Accuracy: {scores.mean()}')
    cross_v_score.append(scores.mean())
    print('='*35)
    print('\n')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression
Model Performance for training set
- Root Mean Squared Error: 0.3063
- Mean Absolute Error: 0.0938
- R2 Square Error: -0.0988
-------------------------------------
Model Performance for test set
- Root Mean Squared Error: 0.3053
- Mean Absolute Error: 0.0932
- R2 Square Error: -0.0947


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-Validation Accuracy: 0.9061465261318891


DecisionTreeClassifier
Model Performance for training set
- Root Mean Squared Error: 0.0639
- Mean Absolute Error: 0.0041
- R2 Square Error: 0.9522
-------------------------------------
Model Performance for test set
- Root Mean Squared Error: 0.3860
- Mean Absolute Error: 0.1490
- R2 Square Error: -0.7500
Cross-Validation Accuracy: 0.8499536872766218


RandomForestClassifier
Model Performance for training set
- Root Mean Squared Error: 0.0641
- Mean Absolute Error: 0.0041
- R2 Square Error: 0.9518
-------------------------------------
Model Performance for test set
- Root Mean Squared Error: 0.3113
- Mean Absolute Error: 0.0969
- R2 Square Error: -0.1384
Cross-Validation Accuracy: 0.9022833912065937


AdaBoostClassifier
Model Performance for training set
- Root Mean Squared Error: 0.3049
- Mean Absolute Error: 0.0930
- R2 Square Error: -0.0891
-------------------------------------
Model Performance for test set
- Root Mean Squared Error:

In [13]:
pd.DataFrame(list(zip(model_list,r2_list,cross_v_score)), columns=['Model Name','R2_Score','Cross_Val_Score']).sort_values(by=['R2_Score'], ascending = False).reset_index()

Unnamed: 0,index,Model Name,R2_Score,Cross_Val_Score
0,0,LogisticRegression,-0.073785,0.896863
1,3,AdaBoostClassifier,-0.103612,0.88762
2,6,Support Vector Classifier,-0.123497,0.898565
3,4,GradientBoostingClassifier,-0.203037,0.895161
4,2,RandomForestClassifier,-0.332289,0.881782
5,5,KNeighborsClassifier,-0.352174,0.888349
6,1,DecisionTreeClassifier,-0.590793,0.843103


In [None]:
9979218099