In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix

In [2]:
# Read data/heart.csv and one-hot encode the multi-level categorical columns.
# drop_first=True leaves out the first level of each encoded column.
df = pd.get_dummies(
    pd.read_csv('data/heart.csv'),
    columns=['cp', 'restecg', 'slp', 'caa', 'thall'],
    drop_first=True,
)

In [3]:
# Show the frame after one-hot encoding (a bare last expression is rendered as the cell output).
df

Unnamed: 0,age,sex,trtbps,chol,fbs,thalachh,exng,oldpeak,output,cp_1,...,restecg_2,slp_1,slp_2,caa_1,caa_2,caa_3,caa_4,thall_1,thall_2,thall_3
0,63,1,145,233,1,150,0,2.3,1,0,...,0,0,0,0,0,0,0,1,0,0
1,37,1,130,250,0,187,0,3.5,1,0,...,0,0,0,0,0,0,0,0,1,0
2,41,0,130,204,0,172,0,1.4,1,1,...,0,0,1,0,0,0,0,0,1,0
3,56,1,120,236,0,178,0,0.8,1,1,...,0,0,1,0,0,0,0,0,1,0
4,57,0,120,354,0,163,1,0.6,1,0,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,0,123,1,0.2,0,0,...,0,1,0,0,0,0,0,0,0,1
299,45,1,110,264,0,132,0,1.2,0,0,...,0,1,0,0,0,0,0,0,0,1
300,68,1,144,193,1,141,0,3.4,0,0,...,0,1,0,0,1,0,0,0,0,1
301,57,1,130,131,0,115,1,1.2,0,0,...,0,1,0,1,0,0,0,0,0,1


In [4]:
# Stratified 80/20 train/test split on the target column, with a fixed seed.
random_seed = 777
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=random_seed,
    stratify=df['output'],
)

# Report the split sizes, then the class balance of each split
# (an empty string argument produces the blank separator line).
print(df_train.shape, df_test.shape, sep='\n')
print(
    '',
    df_train['output'].value_counts(normalize=True),
    '',
    df_test['output'].value_counts(normalize=True),
    sep='\n',
)

(242, 23)
(61, 23)

1    0.545455
0    0.454545
Name: output, dtype: float64

1    0.540984
0    0.459016
Name: output, dtype: float64


In [9]:
# Columns that get standardized.
num_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
# Every remaining column except the target, in alphabetical order.
cat_cols = sorted(set(df.columns).difference(num_cols, ['output']))

In [10]:
# Standardize the numeric columns; the scaler is fitted on the training split only.
scaler = StandardScaler().fit(df_train[num_cols])

def get_features_and_target_arrays(df, num_cols, cat_cols, scaler, target_col='output'):
    """Build the feature matrix and target for one split of the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the target column.
    num_cols : list of str
        Columns passed through ``scaler.transform``.
    cat_cols : list of str
        Columns used without scaling; they must already be numeric or boolean.
    scaler : object
        Already-fitted transformer exposing ``transform``.
    target_col : str, default 'output'
        Name of the target column in ``df``.

    Returns
    -------
    X : numpy.ndarray
        The ``cat_cols`` values followed by the scaled ``num_cols`` values,
        stacked column-wise.
    y : pandas.Series
        The ``target_col`` column of ``df``.
    """
    X_num_scaled = scaler.transform(df[num_cols])
    # Cast explicitly: a mix of boolean dummy columns and integer columns
    # would otherwise come back as an object array and make X object-typed.
    X_cat = df[cat_cols].to_numpy(dtype=float)
    X = np.hstack((X_cat, X_num_scaled))
    y = df[target_col]
    return X, y

# Feature matrix and target for the training split.
X, y = get_features_and_target_arrays(
    df=df_train,
    num_cols=num_cols,
    cat_cols=cat_cols,
    scaler=scaler,
)
