# Heart Disease Prediction

## Import packages

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn import compose

## Data Preprocessing

### Loading the data

In [8]:
# Read the heart-disease records from the CSV file in the working directory.
data_df = pd.read_csv(
    'heart.csv',
)
# Bare trailing expression so the notebook renders the frame as the cell output.
data_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


### Encoding Features

In [4]:
# One-hot encode the nominal categorical features. Each category becomes its own
# 0/1 indicator column, named with a feature-specific prefix.
sex_oh_enc = pd.get_dummies(data_df["Sex"], dtype="int").rename(columns={"F": "Female", "M": "Male"})
chestPain_oh_enc = pd.get_dummies(data_df["ChestPainType"], dtype="int").add_prefix("ChestPain_")
restECG_oh_enc = pd.get_dummies(data_df["RestingECG"], dtype="int").add_prefix("RestECG_")
st_slope_oh_enc = pd.get_dummies(data_df["ST_Slope"], dtype="int").add_prefix("ST_Slope_")

# Encode the binary ExerciseAngina flag with an explicit mapping (N -> 0, Y -> 1).
# pd.factorize would number the labels by order of first appearance, making the
# encoding depend on row order; the explicit map is deterministic and also keeps
# data_df's index so the join below aligns row-for-row.
exerciseAngina_ord_enc = data_df["ExerciseAngina"].map({"N": 0, "Y": 1}).to_frame()
assert exerciseAngina_ord_enc["ExerciseAngina"].notna().all(), "unexpected ExerciseAngina label (expected 'N' or 'Y')"

# Swap the raw categorical columns for their encoded counterparts.
data_df_encoded = data_df.drop(columns=["Sex", "ChestPainType", "RestingECG", "ST_Slope", "ExerciseAngina"])
data_df_encoded = data_df_encoded.join([sex_oh_enc, chestPain_oh_enc, restECG_oh_enc, st_slope_oh_enc, exerciseAngina_ord_enc])

# Fix the column order: features first, target ("HeartDisease") last. Later cells
# address columns by position, so this order must not change.
data_df_encoded = data_df_encoded[["Age",
                                   "Male", "Female",
                                   "ChestPain_TA", "ChestPain_ATA", "ChestPain_NAP", "ChestPain_ASY",
                                   "RestingBP",
                                   "Cholesterol",
                                   "FastingBS",
                                   "RestECG_Normal", "RestECG_ST", "RestECG_LVH",
                                   "MaxHR",
                                   "ExerciseAngina",
                                   "Oldpeak",
                                   "ST_Slope_Up", "ST_Slope_Flat", "ST_Slope_Down",
                                   "HeartDisease"]]

# Trailing expression (instead of print) so the notebook shows the rich table view.
data_df_encoded

     Age  Male  Female  ChestPain_TA  ChestPain_ATA  ChestPain_NAP   
0     40     1       0             0              1              0  \
1     49     0       1             0              0              1   
2     37     1       0             0              1              0   
3     48     0       1             0              0              0   
4     54     1       0             0              0              1   
..   ...   ...     ...           ...            ...            ...   
913   45     1       0             1              0              0   
914   68     1       0             0              0              0   
915   57     1       0             0              0              0   
916   57     0       1             0              1              0   
917   38     1       0             0              0              1   

     ChestPain_ASY  RestingBP  Cholesterol  FastingBS  RestECG_Normal   
0                0        140          289          0               1  \
1            

### Separating features from target

In [5]:
# Convert the encoded frame to a NumPy array once, then split it by position:
# every column except the last is a feature, the last column is the target.
encoded_values = data_df_encoded.to_numpy()
X = encoded_values[:, :-1]
# reshape(-1, 1) yields a column vector and infers the row count from the data,
# so the cell keeps working if the number of records changes.
y = encoded_values[:, -1].reshape(-1, 1)

### Splitting data into train and test sets

In [6]:
# Hold out 20% of the rows for testing and keep 80% for training.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=8,
)

### Feature Scaling/Normalization

In [7]:
# Scale the continuous features: Age, RestingBP, Cholesterol, MaxHR and Oldpeak
# (positions 0, 7, 8, 13 and 15 of the encoded feature matrix).
continuous_cols = [0, 7, 8, 13, 15]

# Alternative -- standardization (z-score normalization): zero-centered & unit variance
# ct = compose.ColumnTransformer([("standard_scaler", preprocessing.StandardScaler(), continuous_cols)], remainder="passthrough")

# Min/max normalization (linear scaling): [0, 1]
ct = compose.ColumnTransformer([("min_max_scaler", preprocessing.MinMaxScaler(), continuous_cols)], remainder="passthrough")

# NOTE: ColumnTransformer places the transformed columns first and appends the
# passthrough columns to their right, so the column order of the outputs below
# differs from that of X_train / X_test.

# Fit the scaler on the training set only, then reuse those fitted parameters on
# the test set. Re-fitting on the test set would scale the two sets differently
# and leak test-set statistics into preprocessing. As a consequence, scaled test
# values may fall slightly outside [0, 1].
X_train_normalized = ct.fit_transform(X_train)
X_test_normalized = ct.transform(X_test)

### Add column of ones to data

In [8]:
# Prepend a column of ones (intercept term) to the *normalized* feature matrices.
# Stacking onto the raw X_train / X_test would silently discard the scaling
# computed in the previous step.
X_train_normalized_1 = np.hstack((np.ones((X_train_normalized.shape[0], 1)), X_train_normalized))
X_test_normalized_1 = np.hstack((np.ones((X_test_normalized.shape[0], 1)), X_test_normalized))

## Logistic Regression

### Feature Transformations

### Regularization

### Hyperparameter Tuning