## Adding new feature

In [64]:
import pandas as pd
import numpy as np
import time
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import pipeline
from sklearn import impute, compose, metrics, preprocessing
heart_df = pd.read_csv('heart.csv')

In [65]:
bins = [25, 35, 45, 55, 100]
category = ['young', 'adult', 'old', 'senior']

nf = pd.cut(heart_df['age'], bins, labels=category)
heart_df.insert(1, 'age_grps', nf)
heart_df = heart_df.drop(['age'], axis=1)

x = heart_df.iloc[:, :-1]
y = heart_df.iloc[:, -1]

In [85]:
heart_df

Unnamed: 0,age_grps,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,senior,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,adult,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,adult,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,senior,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,senior,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,senior,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,adult,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,senior,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,senior,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [80]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)


cs = [1,2,3,4,5,6,7,8,9,10,11,12]
ce = [0]

p = pipeline.Pipeline(
    [("coltransformer", compose.ColumnTransformer(
        transformers=[
            ("varied", pipeline.Pipeline([("scale", StandardScaler())]), cs),
            ("category", pipeline.Pipeline([("encode", OrdinalEncoder())]), ce),
        ]),
    )]
)

x_train = p.fit_transform(x_train)
x_test = p.transform(x_test)


classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Skl GBM": GradientBoostingClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(kernel='linear')
}

classifiers = {name: pipeline.make_pipeline(p, model) for name, model in classifiers.items()}


results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

for model_name, model in classifiers.items():
    start_time = time.time()
    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    total_time = time.time() - start_time

    results = results.append({"Model": model_name,
                              "Accuracy": metrics.accuracy_score(y_test, pred) * 100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_test, pred) * 100,
                              "Time": total_time},
                             ignore_index=True)

results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Logistic Regression,86.885246,86.525974,0.015998
2,SVM,85.245902,84.74026,0.018005
3,Random Forest,80.327869,80.194805,0.367085
4,Skl GBM,80.327869,80.465368,0.161788
5,Decision Tree,77.04918,76.352814,0.017011


## Enhancement w/ Std

In [86]:
np.random.seed(0)

def data_enhancement(data):
    gen_data = data.copy()

    for cp in data['cp'].unique():
        cp_data = gen_data[gen_data['cp'] == cp]
        trtbps_std = cp_data['trtbps'].std()
        #age_std = cp_data['age'].std()
        chol_std = cp_data['chol'].std()
        thalachh_std = cp_data['thalachh'].std()

        for i in gen_data[gen_data['cp'] == cp].index:
            if np.random.randint(2) == 1:
                gen_data['trtbps'].values[i] += trtbps_std/10
            else:
                gen_data['trtbps'].values[i] -= trtbps_std/10
            # if np.random.randint(2) == 1:
            #     gen_data['age'].values[i] += age_std/10
            # else:
            #     gen_data['age'].values[i] -= age_std/10
            if np.random.randint(2) == 1:
                gen_data['chol'].values[i] += chol_std/10
            else:
                gen_data['chol'].values[i] -= chol_std/10
            if np.random.randint(2) == 1:
                gen_data['thalachh'].values[i] += thalachh_std/10
            else:
                gen_data['thalachh'].values[i] -= thalachh_std/10
    return gen_data

gen = data_enhancement(heart_df)
# print(heart_df.head())
#gen

In [70]:
x_h = gen.iloc[:,:-1]
y_h = gen.iloc[:,-1]

In [71]:
x_trainh, x_testh, y_trainh, y_testh = train_test_split(x_h, y_h, test_size=0.2, random_state=0, stratify=y)


csh = [1,2,3,4,5,6,7,8,9,10,11,12]
ceh = [0]

p = pipeline.Pipeline(
    [("coltransformer", compose.ColumnTransformer(
        transformers=[
            ("varied", pipeline.Pipeline([("scale", StandardScaler())]), csh),
            ("category", pipeline.Pipeline([("encode", OrdinalEncoder())]), ceh),
        ]),
    )]
)



In [72]:
extra_sample = gen.sample(gen.shape[0] // 3)
x_trainh = pd.concat([x_trainh, extra_sample.drop(['output'], axis=1)])
y_trainh = pd.concat([y_trainh, extra_sample['output']])

In [74]:
x_trainh = p.fit_transform(x_trainh)
x_testh = p.transform(x_testh)

In [78]:
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Skl GBM": GradientBoostingClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC()
}

for model_name, model in classifiers.items():
    start_time = time.time()
    model.fit(x_trainh, y_trainh)
    pred = model.predict(x_testh)

    total_time = time.time() - start_time

    results = results.append({"Model": model_name,
                              "Accuracy": metrics.accuracy_score(y_testh, pred) * 100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_testh, pred) * 100,
                              "Time": total_time},
                             ignore_index=True)

results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Random Forest,90.163934,90.638528,0.494036
2,Decision Tree,88.52459,88.852814,0.006
3,SVM,86.885246,86.796537,0.012985
4,Logistic Regression,85.245902,85.010823,0.009016
5,Skl GBM,83.606557,83.766234,0.147012
