In [30]:
import pandas as pd
import numpy as np
import time
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import seaborn as sns
import matplotlib.pyplot as plt

heart_df = pd.read_csv('heart.csv')

heart_df = heart_df.drop(labels=['exng', 'oldpeak'], axis=1)

x = heart_df.iloc[:, :-1]
y = heart_df.iloc[:, -1]
x

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,slp,caa,thall
0,63,1,3,145,233,1,0,150,0,0,1
1,37,1,2,130,250,0,1,187,0,0,2
2,41,0,1,130,204,0,0,172,2,0,2
3,56,1,1,120,236,0,1,178,2,0,2
4,57,0,0,120,354,0,1,163,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0,3
299,45,1,3,110,264,0,1,132,1,0,3
300,68,1,0,144,193,1,1,141,1,2,3
301,57,1,0,130,131,0,1,115,1,1,3


In [43]:
bins = [25, 35, 45, 55, 100]
category = ['young', 'adult', 'old', 'senior']

heart_df['age_grps'] = pd.cut(heart_df['age'], bins, labels=category)

In [45]:
heart_df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,slp,caa,thall,output,age_grps
0,63,1,3,145,233,1,0,150,0,0,1,1,senior
1,37,1,2,130,250,0,1,187,0,0,2,1,adult
2,41,0,1,130,204,0,0,172,2,0,2,1,adult
3,56,1,1,120,236,0,1,178,2,0,2,1,senior
4,57,0,0,120,354,0,1,163,2,0,2,1,senior
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0,3,0,senior
299,45,1,3,110,264,0,1,132,1,0,3,0,adult
300,68,1,0,144,193,1,1,141,1,2,3,0,senior
301,57,1,0,130,131,0,1,115,1,1,3,0,senior


In [36]:
heart_df.describe()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,slp,caa,thall,output
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,2.0,4.0,3.0,1.0


## Accuracy before data enhancement

In [31]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

x_train = StandardScaler().fit_transform(x_train)
x_test = StandardScaler().fit_transform(x_test)

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Skl GBM": GradientBoostingClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC()
}

for model_name, model in classifiers.items():
    start_time = time.time()
    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    total_time = time.time() - start_time

    results = results.append({"Model": model_name,
                              "Accuracy": metrics.accuracy_score(y_test, pred) * 100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_test, pred) * 100,
                              "Time": total_time},
                             ignore_index=True)

results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Logistic Regression,88.52459,88.180828,0.021142
2,SVM,85.245902,84.858388,0.017721
3,Random Forest,83.606557,83.769063,0.291019
4,Skl GBM,83.606557,83.3878,0.139584
5,Decision Tree,77.04918,77.124183,0.029986


## Data Enhancement w/ Standard Deviation

In [32]:
np.random.seed(0)

def data_enhancement(data):
    gen_data = data.copy()

    for cp in data['cp'].unique():
        cp_data = gen_data[gen_data['cp'] == cp]
        trtbps_std = cp_data['trtbps'].std()
        age_std = cp_data['age'].std()
        chol_std = cp_data['chol'].std()
        thalachh_std = cp_data['thalachh'].std()

        for i in gen_data[gen_data['cp'] == cp].index:
            if np.random.randint(2) == 1:
                gen_data['trtbps'].values[i] += trtbps_std/10
            else:
                gen_data['trtbps'].values[i] -= trtbps_std/10
            if np.random.randint(2) == 1:
                gen_data['age'].values[i] += age_std/10
            else:
                gen_data['age'].values[i] -= age_std/10
            if np.random.randint(2) == 1:
                gen_data['chol'].values[i] += chol_std/10
            else:
                gen_data['chol'].values[i] -= chol_std/10
            if np.random.randint(2) == 1:
                gen_data['thalachh'].values[i] += thalachh_std/10
            else:
                gen_data['thalachh'].values[i] -= thalachh_std/10
    return gen_data

gen = data_enhancement(heart_df)
# print(heart_df.head())
gen

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,slp,caa,thall,output
0,64,1,3,143,236,1,0,147,0,0,1,1
1,36,1,2,128,256,0,1,185,0,0,2,1
2,40,0,1,131,208,0,0,170,2,0,2,1
3,56,1,1,121,240,0,1,176,2,0,2,1
4,56,0,0,121,348,0,1,160,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
298,56,0,0,138,246,0,1,125,1,0,3,0
299,43,1,3,111,267,0,1,134,1,0,3,0
300,68,1,0,145,187,1,1,138,1,2,3,0
301,56,1,0,128,125,0,1,112,1,1,3,0


In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

extra_sample = gen.sample(gen.shape[0] // 3)
x_train = pd.concat([x_train, extra_sample.drop(['output'], axis=1)])
y_train = pd.concat([y_train, extra_sample['output']])


## Accuracy with Data Enhancement

In [35]:


x_train = StandardScaler().fit_transform(x_train)
x_test = StandardScaler().fit_transform(x_test)

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Skl GBM": GradientBoostingClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC()
}

for model_name, model in classifiers.items():
    start_time = time.time()
    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    total_time = time.time() - start_time

    results = results.append({"Model": model_name,
                              "Accuracy": metrics.accuracy_score(y_test, pred) * 100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_test, pred) * 100,
                              "Time": total_time},
                             ignore_index=True)

results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Random Forest,90.163934,90.413943,0.28304
2,SVM,88.52459,87.799564,0.013004
3,Logistic Regression,86.885246,86.328976,0.003003
4,Skl GBM,83.606557,83.769063,0.116009
5,Decision Tree,80.327869,80.065359,0.00399
