In [74]:
from functools import reduce
from collections import Counter
import warnings
import pandas as pd 
import re
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
import ast
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")

# Read and prepare data

In [33]:
data = pd.read_csv('heart.csv')
data['output'] = data['output'].map(lambda x: 'seek' if x == 1 else 'not seek')
data

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,seek
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,seek
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,seek
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,seek
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,seek
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,not seek
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,not seek
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,not seek
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,not seek


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


# Classification

## Train test split

In [69]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[: ,:-1], data.iloc[: ,-1:], test_size = 0.2)

## Linear classifiers

### SVC

In [83]:
# instantiating the model
svm = make_pipeline(RobustScaler(), SVC())

# parameters to grid
parameters = {"svc__C":np.arange(1,10,1),'svc__gamma':[0.00001,0.00005, 0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5,1,5]}

# instantiating the GridSearchCV object
searcher = GridSearchCV(svm, parameters)

# fitting
searcher.fit(X_train, y_train)

# predicting the values
y_pred = searcher.predict(X_test)

# print chosen estimator
print(searcher.best_estimator_)

# printing the test accuracy
print("Accuracy: ", accuracy_score(y_test, y_pred))

Pipeline(steps=[('robustscaler', RobustScaler()),
                ('svc', SVC(C=6, gamma=0.05))])
Accuracy:  0.8852459016393442


In [84]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    not seek       0.90      0.87      0.89        31
        seek       0.87      0.90      0.89        30

    accuracy                           0.89        61
   macro avg       0.89      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61



### Logistic regression

In [79]:
# instantiating the model
logreg = LogisticRegression()

# fitting
logreg.fit(X_train, y_train)

# finding the predicted valued
y_pred = logreg.predict(X_test)

# printing the test accuracy
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.9016393442622951


In [80]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    not seek       1.00      0.81      0.89        31
        seek       0.83      1.00      0.91        30

    accuracy                           0.90        61
   macro avg       0.92      0.90      0.90        61
weighted avg       0.92      0.90      0.90        61



## Tree models

### Decision Tree

In [85]:
# instantiating the model
dt = make_pipeline(RobustScaler(), DecisionTreeClassifier())

# parameters to grid
parameters = {'decisiontreeclassifier__max_depth': [2, 3, 5, 10, 12, 15, 20, 50, 100]}

# instantiating the GridSearchCV object
searcher = GridSearchCV(dt, parameters)

# fitting
searcher.fit(X_train, y_train)

# predicting the values
y_pred = searcher.predict(X_test)

# print chosen estimator
print(searcher.best_estimator_)

# printing the test accuracy
print("Accuracy: ", accuracy_score(y_test, y_pred))

Pipeline(steps=[('robustscaler', RobustScaler()),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=3))])
Accuracy:  0.819672131147541


In [86]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    not seek       0.86      0.77      0.81        31
        seek       0.79      0.87      0.83        30

    accuracy                           0.82        61
   macro avg       0.82      0.82      0.82        61
weighted avg       0.82      0.82      0.82        61



### Random Forest

In [87]:
# instantiating the model
rf = make_pipeline(RobustScaler(), RandomForestClassifier())

# parameters to grid
parameters = {'randomforestclassifier__max_depth': [2, 3, 5, 10, 12, 15, 20, 50, 100],
             'randomforestclassifier__n_estimators': [100, 200, 400]}

# instantiating the GridSearchCV object
searcher = GridSearchCV(rf, parameters)

# fitting
searcher.fit(X_train, y_train)

# predicting the values
y_pred = searcher.predict(X_test)

# print chosen estimator
print(searcher.best_estimator_)

# printing the test accuracy
print("Accuracy: ", accuracy_score(y_test, y_pred))

Pipeline(steps=[('robustscaler', RobustScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=3))])
Accuracy:  0.9016393442622951


In [88]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    not seek       0.96      0.84      0.90        31
        seek       0.85      0.97      0.91        30

    accuracy                           0.90        61
   macro avg       0.91      0.90      0.90        61
weighted avg       0.91      0.90      0.90        61



### Gradient Boosting Classifier

In [89]:
# instantiating the model
gb = make_pipeline(RobustScaler(), GradientBoostingClassifier(n_estimators = 300))

# parameters to grid
parameters = {'gradientboostingclassifier__max_depth': [1, 2, 5, 10],
             'gradientboostingclassifier__subsample': [0.5, 0.8, 1]}

# instantiating the GridSearchCV object
searcher = GridSearchCV(gb, parameters)

# fitting
searcher.fit(X_train, y_train)

# predicting the values
y_pred = searcher.predict(X_test)

# print chosen estimator
print(searcher.best_estimator_)

# printing the test accuracy
print("Accuracy: ", accuracy_score(y_test, y_pred))

Pipeline(steps=[('robustscaler', RobustScaler()),
                ('gradientboostingclassifier',
                 GradientBoostingClassifier(max_depth=1, subsample=1))])
Accuracy:  0.9180327868852459


In [90]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    not seek       0.93      0.90      0.92        31
        seek       0.90      0.93      0.92        30

    accuracy                           0.92        61
   macro avg       0.92      0.92      0.92        61
weighted avg       0.92      0.92      0.92        61

