

#### Dataset link: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

This database contains 76 attributes, but all published experiment refers to using a subset of 14 of them.

### Features: 
1. age: age in years
2. sex: sex (1 = male; 0 = female)
3. cp: chest pain type
-- Value 1: typical angina
-- Value 2: atypical angina
-- Value 3: non-anginal pain
-- Value 4: asymptomatic
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5. chol: serum cholestoral in mg/dl
6. fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. restecg: resting electrocardiographic results
-- Value 0: normal
-- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
-- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
8. thalach: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak = ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
-- Value 1: upsloping
-- Value 2: flat
-- Value 3: downsloping
12. ca: number of major vessels (0-3) colored by flourosopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
14. num: diagnosis of heart disease (angiographic disease status)
-- Value 0: < 50% diameter narrowing
-- Value 1: > 50% diameter narrowing
(in any major vessel: attributes 59 through 68 are vessels)

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style="whitegrid")

In [2]:
heart = pd.read_csv("heart.csv")

In [3]:
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
numerical_Attributes = ["age", "trestbps", "chol", "thalach", "oldpeak"]

categorical_Attributes = ["sex", "cp", "fbs", "restecg", "exang", 
                           "slope", "ca", "thal"]

In [5]:
heart.shape

(303, 14)

In [6]:
heart.isna().any()

age         False
sex         False
cp          False
trestbps    False
chol        False
fbs         False
restecg     False
thalach     False
exang       False
oldpeak     False
slope       False
ca          False
thal        False
target      False
dtype: bool

In [7]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [8]:
heart.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [9]:
heart["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [10]:
for n in categorical_Attributes :
    print(heart[n].value_counts())
    print("-------------")

1    207
0     96
Name: sex, dtype: int64
-------------
0    143
2     87
1     50
3     23
Name: cp, dtype: int64
-------------
0    258
1     45
Name: fbs, dtype: int64
-------------
1    152
0    147
2      4
Name: restecg, dtype: int64
-------------
0    204
1     99
Name: exang, dtype: int64
-------------
2    142
1    140
0     21
Name: slope, dtype: int64
-------------
0    175
1     65
2     38
3     20
4      5
Name: ca, dtype: int64
-------------
2    166
3    117
1     18
0      2
Name: thal, dtype: int64
-------------


In [11]:
corr_matrix = heart.corr()

corr_matrix["target"].sort_values(ascending = False)

target      1.000000
cp          0.433798
thalach     0.421741
slope       0.345877
restecg     0.137230
fbs        -0.028046
chol       -0.085239
trestbps   -0.144931
age        -0.225439
sex        -0.280937
thal       -0.344029
ca         -0.391724
oldpeak    -0.430696
exang      -0.436757
Name: target, dtype: float64

s = sns.pairplot(vars =numerical_Attributes, data = heart, hue = "target")

plt.show()

In [12]:
X = heart.drop(["target"], axis=1)
y = heart["target"]

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                     test_size = 0.2, random_state = 5)

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 

In [15]:
num_pipeline_with_scale= Pipeline([
        ("imputer", SimpleImputer(strategy="median")),   
        ('std_scaler', StandardScaler()),
    ])

In [16]:
num_pipeline_without_scale= Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
    ])

In [17]:
cat_pipeline = Pipeline([
        ("cat_encoder", OneHotEncoder()),
    ])

In [18]:
numerical_Attributes = ["age", "trestbps", "chol", "thalach", "oldpeak"]

categorical_Attributes = ["sex", "cp", "fbs", "restecg", "exang", 
                           "slope", "ca", "thal"]

In [19]:
from sklearn.compose import ColumnTransformer

full_pipeline_with_scale = ColumnTransformer([
        ("num", num_pipeline_with_scale, numerical_Attributes),
        ("cat", cat_pipeline, categorical_Attributes),
    ])

full_pipeline_without_scale = ColumnTransformer([
        ("num", num_pipeline_without_scale, numerical_Attributes),
        ("cat", cat_pipeline, categorical_Attributes),
    ])

In [20]:
X_train_prepared = full_pipeline_with_scale.fit_transform(X_train)
X_test_prepared = full_pipeline_with_scale.fit_transform(X_test)

In [21]:
X_train_prepared_not_scaled = full_pipeline_without_scale.fit_transform(X_train)
X_test_prepared_not_scaled = full_pipeline_without_scale.fit_transform(X_test)

In [22]:
X_train_prepared.shape, X_test_prepared.shape

((242, 30), (61, 30))

In [23]:
y_train.shape, y_test.shape

((242,), (61,))

.

In [24]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score, f1_score

In [25]:
def cv_func(model, i, X, y):

    cross_accu_knn = cross_val_score(model, X, y, cv=i, scoring="accuracy")

    cross_precision_knn= cross_val_score(model, X, y, cv=i, scoring="precision")

    cross_recall_knn = cross_val_score(model, X, y, cv=i, scoring="recall")

    cross_f1_knn = cross_val_score(model, X, y, cv=i, scoring="f1")


    print("CV accuracy :" ,round(cross_accu_knn.mean(), 4))
    print("CV precision :", cross_precision_knn.mean())
    print("CV recall : ", cross_recall_knn.mean())
    print("CV f1 score : ", cross_f1_knn.mean())

#### Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state = 42)

cv_func(log_reg, 3, X_train_prepared, y_train)

CV accuracy : 0.8309
CV precision : 0.8401234567901233
CV recall :  0.866161616161616
CV f1 score :  0.8498770476298566


#### Decision Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier 

tree_clf = DecisionTreeClassifier(random_state = 42)

cv_func(tree_clf, 3, X_train_prepared_not_scaled, y_train)

CV accuracy : 0.6861
CV precision : 0.7243280290839741
CV recall :  0.7020202020202021
CV f1 score :  0.7082816635738801


#### Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(max_depth=3, max_leaf_nodes=5, n_estimators=10,
                       random_state=42)

cv_func(rnd_clf, 3, X_train_prepared_not_scaled, y_train)

CV accuracy : 0.8183
CV precision : 0.8230769230769232
CV recall :  0.8653198653198654
CV f1 score :  0.8413301294473298


#### KNN Classifier

In [29]:
from sklearn.neighbors import KNeighborsClassifier 

knn_clf = KNeighborsClassifier(n_neighbors = 3, metric='euclidean')

cv_func(knn_clf, 3, X_train_prepared, y_train)

CV accuracy : 0.8061
CV precision : 0.8181397306397306
CV recall :  0.8361952861952862
CV f1 score :  0.8268762897795158


#### Linear SVC

In [30]:
from sklearn.svm import LinearSVC

lin_svm_clf = LinearSVC(C=1, loss="hinge", max_iter=100*100, random_state=42)

cv_func(lin_svm_clf, 3, X_train_prepared, y_train)

CV accuracy : 0.806
CV precision : 0.8204610800355482
CV recall :  0.8437710437710438
CV f1 score :  0.8265362279996427


#### Polynomial SVC

In [31]:
from sklearn.preprocessing import PolynomialFeatures

polynomial_svm_clf = Pipeline([
        ("poly_features", PolynomialFeatures(degree=2)),
        ("scaler", StandardScaler()),
        ("svm_clf", LinearSVC(C=1, max_iter=100*100, random_state=42))
    ])

cv_func(polynomial_svm_clf, 3, X_train_prepared, y_train)

CV accuracy : 0.7563
CV precision : 0.7946299732014017
CV recall :  0.7614478114478115
CV f1 score :  0.7744764429161592


#### Fine tuning of Random Forest Model

In [32]:
### n_estimators, max_depth, max_leaf_nodes, max_features

#### Grid Search CV

In [33]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_depth': [3, 5, 7], "max_leaf_nodes":[5, 7, 9]},
    {'bootstrap': [False], 'n_estimators': [3, 10, 30], 'max_depth': [3, 5, 7], "max_leaf_nodes":[5, 7, 9]},
  ]

forest_clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(forest_clf, param_grid, cv=3,
                           scoring='precision',
                           return_train_score=True)

grid_search.fit(X_train_prepared, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
             param_grid=[{'max_depth': [3, 5, 7], 'max_leaf_nodes': [5, 7, 9],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_depth': [3, 5, 7],
                          'max_leaf_nodes': [5, 7, 9],
                          'n_estimators': [3, 10, 30]}],
             return_train_score=True, scoring='precision')

In [34]:
grid_search.best_params_

{'max_depth': 3, 'max_leaf_nodes': 5, 'n_estimators': 10}

In [35]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=3, max_leaf_nodes=5, n_estimators=10,
                       random_state=42)

In [36]:
cvres = grid_search.cv_results_
for precision, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(precision, params)

0.7725524992605738 {'max_depth': 3, 'max_leaf_nodes': 5, 'n_estimators': 3}
0.8230769230769232 {'max_depth': 3, 'max_leaf_nodes': 5, 'n_estimators': 10}
0.8045454545454546 {'max_depth': 3, 'max_leaf_nodes': 5, 'n_estimators': 30}
0.7819561273297048 {'max_depth': 3, 'max_leaf_nodes': 7, 'n_estimators': 3}
0.8093954745018576 {'max_depth': 3, 'max_leaf_nodes': 7, 'n_estimators': 10}
0.8013963161021985 {'max_depth': 3, 'max_leaf_nodes': 7, 'n_estimators': 30}
0.7715228526398739 {'max_depth': 3, 'max_leaf_nodes': 9, 'n_estimators': 3}
0.8093954745018576 {'max_depth': 3, 'max_leaf_nodes': 9, 'n_estimators': 10}
0.8013963161021985 {'max_depth': 3, 'max_leaf_nodes': 9, 'n_estimators': 30}
0.7820707849385 {'max_depth': 5, 'max_leaf_nodes': 5, 'n_estimators': 3}
0.8135802469135802 {'max_depth': 5, 'max_leaf_nodes': 5, 'n_estimators': 10}
0.80291146761735 {'max_depth': 5, 'max_leaf_nodes': 5, 'n_estimators': 30}
0.757037037037037 {'max_depth': 5, 'max_leaf_nodes': 7, 'n_estimators': 3}
0.79988410

.

In [37]:
rnd_clf_fine = RandomForestClassifier(max_depth=3, max_leaf_nodes=5, n_estimators=10,
                       random_state=42)

In [38]:
rnd_clf_fine.fit(X_train_prepared_not_scaled, y_train)

RandomForestClassifier(max_depth=3, max_leaf_nodes=5, n_estimators=10,
                       random_state=42)

In [39]:
y_test_pred = rnd_clf_fine.predict(X_test_prepared)

In [40]:
accuracy_score(y_test, y_test_pred)

0.8524590163934426

In [41]:
precision_score(y_test, y_test_pred)

0.8666666666666667

In [42]:
recall_score(y_test, y_test_pred)

0.8387096774193549

In [43]:
f1_score(y_test, y_test_pred)

0.8524590163934426

### Dimensionality Reduction

In [44]:
X_train_prepared.shape, X_test_prepared.shape

((242, 30), (61, 30))

In [45]:
from sklearn.decomposition import PCA

#pca = PCA(n_components=0.95)             # 0.0 - 1.0
pca = PCA(n_components=15)
X_train_reduced = pca.fit_transform(X_train_prepared)
X_test_reduced = pca.fit_transform(X_test_prepared)

print(X_train_reduced.shape, X_test_reduced.shape)

(242, 15) (61, 15)


In [46]:
pca = PCA()
pca.fit(X_train_prepared)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

print("Maximum no for components to reduce to 95% Variance : ", d)

Maximum no for components to reduce to 95% Variance :  15


In [47]:
pca.explained_variance_ratio_

array([2.47591517e-01, 1.30332334e-01, 1.07029108e-01, 8.94987519e-02,
       6.76835263e-02, 5.82436866e-02, 5.25510819e-02, 3.85385166e-02,
       3.42448610e-02, 2.82739019e-02, 2.81463921e-02, 2.57441691e-02,
       2.37666961e-02, 1.81333619e-02, 1.31081133e-02, 1.01963683e-02,
       9.05288684e-03, 7.43390983e-03, 5.99748271e-03, 2.07909114e-03,
       1.78507577e-03, 5.69168491e-04, 1.37766193e-33, 1.37766193e-33,
       1.37766193e-33, 1.37766193e-33, 1.37766193e-33, 1.37766193e-33,
       1.37766193e-33, 1.37766193e-33])

In [48]:
cumsum 

array([0.24759152, 0.37792385, 0.48495296, 0.57445171, 0.64213524,
       0.70037892, 0.75293   , 0.79146852, 0.82571338, 0.85398728,
       0.88213368, 0.90787785, 0.93164454, 0.9497779 , 0.96288602,
       0.97308239, 0.98213527, 0.98956918, 0.99556666, 0.99764576,
       0.99943083, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])