In [1]:
import logging
import os

logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

InteractiveShell.ast_node_interactivity = "all"

## Data set source: https://www.kaggle.com/nareshbhat/health-care-data-set-on-heart-attack-possibility

In [2]:
df = pd.read_csv("heart.csv")
df.head()
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [3]:
## normalize (z-score) data
X = df.drop(columns=["target"])
y = df["target"]

transformer = StandardScaler()
_ = transformer.fit(X)
X = transformer.transform(X)
X

array([[ 0.9521966 ,  0.68100522,  1.97312292, ..., -2.27457861,
        -0.71442887, -2.14887271],
       [-1.91531289,  0.68100522,  1.00257707, ..., -2.27457861,
        -0.71442887, -0.51292188],
       [-1.47415758, -1.46841752,  0.03203122, ...,  0.97635214,
        -0.71442887, -0.51292188],
       ...,
       [ 1.50364073,  0.68100522, -0.93851463, ..., -0.64911323,
         1.24459328,  1.12302895],
       [ 0.29046364,  0.68100522, -0.93851463, ..., -0.64911323,
         0.26508221,  1.12302895],
       [ 0.29046364, -1.46841752,  0.03203122, ..., -0.64911323,
         0.26508221, -0.51292188]])

In [4]:
def get_new_keras_model(x_train):
    nn = Sequential()

    nn.add(Dense(30, input_shape=(x_train.shape[1],), activation='relu'))
    nn.add(Dense(30, activation='relu'))
    nn.add(Dense(30, activation='relu'))
    nn.add(Dense(1, activation='sigmoid'))

    nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return nn

In [5]:
def train(splitter):
    for index, (train, test) in enumerate(splitter.split(X), 1):

        print(f'Split {index}:\n')

        x_train = X[train]
        x_test = X[test]
        y_train = y[train]
        y_test = y[test]

        nn = get_new_keras_model(x_train)
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        _ = nn.fit(x_train, y_train, epochs=200, validation_data=(x_test, y_test), verbose=False, callbacks=[early_stopping])

        y_pred = np.round(nn.predict(x_test))
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))

        print('\n')

### KFold

In [6]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, random_state=42, shuffle=True)
train(kf)

Split 1:

[[25  4]
 [ 3 29]]
              precision    recall  f1-score   support

           0       0.89      0.86      0.88        29
           1       0.88      0.91      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61



Split 2:

[[18  5]
 [ 3 35]]
              precision    recall  f1-score   support

           0       0.86      0.78      0.82        23
           1       0.88      0.92      0.90        38

    accuracy                           0.87        61
   macro avg       0.87      0.85      0.86        61
weighted avg       0.87      0.87      0.87        61



Split 3:

[[24  6]
 [ 5 26]]
              precision    recall  f1-score   support

           0       0.83      0.80      0.81        30
           1       0.81      0.84      0.83        31

    accuracy                           0.82        61
   macro avg       0.82      0.82    

### Repeated KFold

In [7]:
from sklearn.model_selection import RepeatedKFold
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=42)
train(rkf)

Split 1:

[[56 14]
 [16 66]]
              precision    recall  f1-score   support

           0       0.78      0.80      0.79        70
           1       0.82      0.80      0.81        82

    accuracy                           0.80       152
   macro avg       0.80      0.80      0.80       152
weighted avg       0.80      0.80      0.80       152



Split 2:

[[50 18]
 [14 69]]
              precision    recall  f1-score   support

           0       0.78      0.74      0.76        68
           1       0.79      0.83      0.81        83

    accuracy                           0.79       151
   macro avg       0.79      0.78      0.78       151
weighted avg       0.79      0.79      0.79       151



Split 3:

[[57 19]
 [12 64]]
              precision    recall  f1-score   support

           0       0.83      0.75      0.79        76
           1       0.77      0.84      0.81        76

    accuracy                           0.80       152
   macro avg       0.80      0.80    

### shuffle split

In [8]:
from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
train(ss)

Split 1:

[[26  3]
 [ 3 29]]
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        29
           1       0.91      0.91      0.91        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61



Split 2:

[[24  8]
 [ 5 24]]
              precision    recall  f1-score   support

           0       0.83      0.75      0.79        32
           1       0.75      0.83      0.79        29

    accuracy                           0.79        61
   macro avg       0.79      0.79      0.79        61
weighted avg       0.79      0.79      0.79        61



Split 3:

[[26  5]
 [ 3 27]]
              precision    recall  f1-score   support

           0       0.90      0.84      0.87        31
           1       0.84      0.90      0.87        30

    accuracy                           0.87        61
   macro avg       0.87      0.87    