In [1]:
# Import required libraries and dependencies
import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import hvplot.pandas
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import tensorflow as tf

from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from tabulate import tabulate


# 1. Loading the Dataset from CSV

In [2]:
# Read Resources/heart_2022_cleaned.csv into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="Resources/heart_2022_cleaned.csv")
# Show a preview of the loaded rows (pandas truncates the display for large frames)
df

Unnamed: 0,Sex,Age,Race,BMI,GenHealth,PhysicalHealth,MentalHealth,PhysicalActivity,ChestScan,SleepHours,...,ECigaretteUsage,Drinking,HeartAttack,Angina,Stroke,Asthma,SkinCancer,Depressed,KidneyDisease,Diabetes
0,0,12,0,26.57,4,0.0,0.0,0,0,6.0,...,0,0,0,0,0,0,0,0,0,0
1,0,7,0,25.61,3,2.0,3.0,1,0,5.0,...,0,0,0,0,0,0,0,0,0,0
2,0,4,0,21.77,1,2.0,0.0,1,1,9.0,...,0,1,0,0,0,0,1,0,0,0
3,1,12,0,26.08,0,1.0,0.0,0,0,7.0,...,0,0,1,0,1,0,1,0,0,1
4,0,12,1,22.96,3,0.0,0.0,1,0,7.0,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318929,1,9,3,29.86,2,0.0,15.0,1,0,7.0,...,0,1,0,0,1,0,1,0,0,1
318930,1,2,0,31.19,2,0.0,0.0,1,0,8.0,...,0,1,0,0,0,0,1,0,0,0
318931,0,6,1,28.66,4,2.0,2.0,1,0,7.0,...,0,0,0,0,0,0,1,0,0,0
318932,1,10,1,32.55,3,0.0,0.0,0,1,5.0,...,0,0,1,0,0,1,1,0,0,0


# 5. Machine Learning

### 5.3. Deep Learning - Standard Scaler Module (Neural Network)

#### 5.3.1. Loading and Preprocessing the Dataset

In [3]:
# Work on an independent deep copy so the loaded frame `df` is left untouched
df2 = df.copy(deep=True)

In [4]:
# Count the distinct non-null values held by every column of df2
print(df2.nunique(axis=0))

Sex                    2
Age                   13
Race                   5
BMI                 3748
GenHealth              5
PhysicalHealth        31
MentalHealth          31
PhysicalActivity       2
ChestScan              2
SleepHours            23
DiffWalking            2
Smoking                4
ECigaretteUsage        4
Drinking               2
HeartAttack            2
Angina                 2
Stroke                 2
Asthma                 2
SkinCancer             2
Depressed              2
KidneyDisease          2
Diabetes               4
dtype: int64


In [5]:
# Tally how often each distinct BMI value occurs (most frequent first);
# infrequent values are the candidates to be grouped under "Other"
bmi_counts = df2.loc[:, "BMI"].value_counts()
bmi_counts

BMI
26.63    3438
27.46    2642
24.41    2601
27.44    2495
27.12    2468
         ... 
60.15       1
59.13       1
20.17       1
47.28       1
28.39       1
Name: count, Length: 3748, dtype: int64

In [6]:
# Group every BMI value that occurs fewer than BMI_MIN_COUNT times under "Other".
# (The cutoff actually applied is 150 occurrences.)
BMI_MIN_COUNT = 150

# use the variable name `bmi_to_replace`
bmi_to_replace = list(bmi_counts[bmi_counts < BMI_MIN_COUNT].index)

# Replace all rare values in a single vectorized pass instead of issuing one
# full-column `replace` per rare value (thousands of passes over the column).
# Casting to object first lets the column hold both floats and the "Other" label.
is_rare_bmi = df2['BMI'].isin(bmi_to_replace)
df2['BMI'] = df2['BMI'].astype(object).mask(is_rare_bmi, "Other")

# Check to make sure replacement was successful
df2['BMI'].value_counts()

BMI
Other    72175
26.63     3438
27.46     2642
24.41     2601
27.44     2495
         ...  
25.29      152
26.94      151
26.46      150
20.48      150
26.17      150
Name: count, Length: 451, dtype: int64

In [7]:
# One-hot encode the non-numeric columns with `pd.get_dummies`;
# columns that are already numeric pass through unchanged
num_df4 = pd.get_dummies(data=df2)
num_df4.head(n=5)

Unnamed: 0,Sex,Age,Race,GenHealth,PhysicalHealth,MentalHealth,PhysicalActivity,ChestScan,SleepHours,DiffWalking,...,BMI_42.07,BMI_42.51,BMI_42.57,BMI_42.91,BMI_43.05,BMI_43.27,BMI_44.29,BMI_44.3,BMI_44.63,BMI_Other
0,0,12,0,4,0.0,0.0,0,0,6.0,0,...,False,False,False,False,False,False,False,False,False,False
1,0,7,0,3,2.0,3.0,1,0,5.0,0,...,False,False,False,False,False,False,False,False,False,False
2,0,4,0,1,2.0,0.0,1,1,9.0,0,...,False,False,False,False,False,False,False,False,False,False
3,1,12,0,0,1.0,0.0,0,0,7.0,0,...,False,False,False,False,False,False,False,False,False,True
4,0,12,1,3,0.0,0.0,1,0,7.0,0,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Separate the target column (y) from the predictor columns (X) as NumPy arrays
y = num_df4["HeartAttack"].values
X = num_df4.drop(columns=["HeartAttack"]).values

In [9]:
# Dealing with Imbalanced Data
# Assuming X is feature DataFrame and y is target variable. Scale features to [0, 1] range
X_scaled = MinMaxScaler().fit_transform(X)

# SMOTE for Resampling to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# *** X_resampled and y_resampled are now new balanced datasets

In [10]:
# Split the data using train_test_split # stratify=y
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, 
    y_resampled, 
    stratify = y_resampled,  # to maintain the same proportion of classes in both train and test sets. 
    random_state = 78
)

In [11]:
# Further split the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    stratify=y_train, 
    random_state=78, 
    test_size=0.2
)


#### 5.3.2. Fitting the Scaler and Standardizing the Features

In [12]:
# Standardize the numerical features for the Neural Network

# Create a StandardScaler instance and fit it on the training split only
scaler = StandardScaler().fit(X_train)

# Apply the training-derived transformation to the train, validation and test splits
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

#### 5.3.3. Compile, Train and Evaluate the Model

In [13]:
# Seed the Python, NumPy and TensorFlow random generators so weight initialisation
# and batch shuffling are repeatable when the notebook is re-run from a fresh kernel.
# NOTE: this also reseeds the global NumPy / `random` generators for later cells.
tf.keras.utils.set_random_seed(42)

# Define the model - deep learning neural network, i.e., the number of input features and hidden nodes for each layer.
number_input_features = X_train.shape[1]

nn_model = tf.keras.models.Sequential(
    [
        # Explicit input layer; passing `input_dim=` to a Dense layer is deprecated in current Keras
        tf.keras.Input(shape=(number_input_features,)),
        # First hidden layer
        tf.keras.layers.Dense(units=80, activation="relu"),
        # Second hidden layer
        tf.keras.layers.Dense(units=80, activation="relu"),
        # Third hidden layer
        tf.keras.layers.Dense(units=80, activation="tanh"),
        # Output layer: one sigmoid unit for the binary target
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    ]
)

# Check the structure of the model
nn_model.summary()

In [14]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
nn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [15]:
# Train the model for 100 epochs, scoring the validation split after every epoch;
# the History object returned by `fit` is kept in `fit_model`
fit_model = nn_model.fit(
    x=X_train_scaled,
    y=y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=100,
)

Epoch 1/100
[1m11325/11325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 934us/step - accuracy: 0.8127 - loss: 0.4049 - val_accuracy: 0.8499 - val_loss: 0.3399
Epoch 2/100
[1m11325/11325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 853us/step - accuracy: 0.8556 - loss: 0.3265 - val_accuracy: 0.8677 - val_loss: 0.3037
Epoch 3/100
[1m11325/11325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 842us/step - accuracy: 0.8726 - loss: 0.2902 - val_accuracy: 0.8754 - val_loss: 0.2857
Epoch 4/100
[1m11325/11325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 866us/step - accuracy: 0.8844 - loss: 0.2679 - val_accuracy: 0.8816 - val_loss: 0.2735
Epoch 5/100
[1m11325/11325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 872us/step - accuracy: 0.8913 - loss: 0.2516 - val_accuracy: 0.8891 - val_loss: 0.2613
Epoch 6/100
[1m11325/11325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 865us/step - accuracy: 0.8967 - loss: 0.2394 - val_accuracy: 0.8929

In [16]:
# Score the trained network on the test split and report its loss and accuracy
test_loss, test_accuracy = nn_model.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {test_loss}, Accuracy: {test_accuracy}")

4719/4719 - 2s - 496us/step - accuracy: 0.9302 - loss: 0.1936
Loss: 0.1935613602399826, Accuracy: 0.9302422404289246


In [17]:
# Compute the TensorFlow Confusion Matrix

# Predict on the test set (one sigmoid probability per row)
y_pred = nn_model.predict(X_test_scaled)

# Convert predictions to binary classes
y_pred_classes = (y_pred > 0.5).astype(int).flatten() # a threshold of 0.5 to convert probabilities to binary class labels.

# Compute confusion matrix (rows = true class, columns = predicted class).
# The result is stored under its own name so the `confusion_matrix` function imported from
# sklearn.metrics is not overwritten; num_classes=2 keeps the matrix 2x2 even if the model
# predicts a single class.
nn_confusion_matrix = tf.math.confusion_matrix(y_test, y_pred_classes, num_classes=2)

print('Confusion Matrix:')
print(nn_confusion_matrix.numpy())

[1m4719/4719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 544us/step
Confusion Matrix:
[[69690  5807]
 [ 4726 70771]]


##### Calculate Losses and Accuracies

In [18]:
# Pull the final-epoch training/validation loss and accuracy out of the training history.
# Index [-1] selects the value recorded for the last epoch.
train_loss, val_loss, train_accuracy, val_accuracy = (
    fit_model.history[metric_name][-1]
    for metric_name in ("loss", "val_loss", "accuracy", "val_accuracy")
)

# Report loss and accuracy for the training, validation and test splits
print(
    f"Training Loss: {train_loss:.4f}",
    f"Validation Loss: {val_loss:.4f}",
    f"Test Loss: {test_loss:.4f}",
    "------------------------------------",
    f"Training Accuracy: {train_accuracy:.4f}",
    f"Validation Accuracy: {val_accuracy:.4f}",
    f"Test Accuracy: {test_accuracy:.4f}",
    sep="\n",
)

Training Loss: 0.1176
Validation Loss: 0.1972
Test Loss: 0.1936
------------------------------------
Training Accuracy: 0.9542
Validation Accuracy: 0.9309
Test Accuracy: 0.9302


##### Calculate Errors

In [19]:
# Error rate is the complement of accuracy for each of the three splits
train_error, val_error, test_error = (
    1 - accuracy for accuracy in (train_accuracy, val_accuracy, test_accuracy)
)

print(
    f"Training Error: {train_error:.4f}",
    f"Validation Error: {val_error:.4f}",
    f"Test Error: {test_error:.4f}",
    sep="\n",
)

Training Error: 0.0458
Validation Error: 0.0691
Test Error: 0.0698
