In [None]:
# Import required libraries and dependencies
import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import hvplot.pandas
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno 

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.model_selection import GridSearchCV
from tabulate import tabulate

### Oversampling using SMOTE

In [None]:
# Import required libraries and dependencies
import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import hvplot.pandas
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno 

from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.model_selection import GridSearchCV
from tabulate import tabulate

# SMOTE experiment: tune a Random Forest on a SMOTE-balanced training set and
# evaluate it on validation/test splits that keep the original class balance.
from sklearn.metrics import log_loss  # kept at the top of the cell instead of mid-cell

# Load the data into a Pandas DataFrame
df5 = pd.read_csv("heart_2022_cleaned.csv")

# Define features set X
X = df5.drop("HeartAttack", axis=1)
# Define target vector y
y = df5["HeartAttack"].values

# Split BEFORE any fitting or resampling (70% train / 15% validation / 15% test).
# Resampling first would let synthetic points interpolated from validation/test
# rows end up in the training set and would make the held-out sets artificially
# balanced, which inflates every reported metric.
X_train_raw, X_temp, y_train_raw, y_temp = train_test_split(
    X,
    y,
    stratify=y,  # keep the original class proportions in every split
    test_size=0.3,
    random_state=78
)

X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.5,
    random_state=78
)

# Scale features to the [0, 1] range so that SMOTE's nearest-neighbour search is
# not dominated by wide-range columns. The scaler is fitted on the training
# split only and then applied unchanged to validation and test.
minmax_scaler = MinMaxScaler()
X_train_minmax = minmax_scaler.fit_transform(X_train_raw)
X_val = minmax_scaler.transform(X_val_raw)
X_test = minmax_scaler.transform(X_test_raw)

# SMOTE oversampling of the minority class, applied to the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_minmax, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Standardize using statistics of the (balanced) training set only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Base Random Forest; n_estimators is supplied by the grid below
rf_model = RandomForestClassifier(random_state=78)

# Define a grid of hyperparameters for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Exhaustive grid search with 3-fold cross-validation (no early stopping is
# involved: every combination in param_grid is fitted on every fold).
# NOTE(review): the CV folds are drawn from the already-resampled training set,
# so best_score_ is optimistic; rely on the validation/test metrics below.
# Wrapping SMOTE and the classifier in imblearn.pipeline.Pipeline would resample
# inside each fold instead.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    n_jobs=-1,
    scoring='accuracy',
    verbose=2
)

# Fitting the model
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and best cross-validation score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_}")

# Best model refitted by GridSearchCV on the whole training set
best_rf_model = grid_search.best_estimator_

# Class predictions, computed once per split and reused below
y_train_pred = best_rf_model.predict(X_train_scaled)
y_val_pred = best_rf_model.predict(X_val_scaled)
y_test_pred = best_rf_model.predict(X_test_scaled)

# Names kept for the reporting code below
val_predictions = y_val_pred
test_predictions = y_test_pred

# Evaluate the best model on the validation set
val_acc_score = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy Score: {val_acc_score}")

# Predicted probability of the positive class (column 1 of predict_proba)
y_train_prob = best_rf_model.predict_proba(X_train_scaled)[:, 1]
y_val_prob = best_rf_model.predict_proba(X_val_scaled)[:, 1]
y_test_prob = best_rf_model.predict_proba(X_test_scaled)[:, 1]

# Calculate losses
train_loss = log_loss(y_train, y_train_prob)
val_loss = log_loss(y_val, y_val_prob)
test_loss = log_loss(y_test, y_test_prob)

# Calculate errors (misclassification rate = 1 - accuracy)
train_error = 1 - accuracy_score(y_train, y_train_pred)
val_error = 1 - accuracy_score(y_val, y_val_pred)
test_error = 1 - accuracy_score(y_test, y_test_pred)

# Print losses and errors
print(f"Training Loss: {train_loss:.4f}")
print(f"Validation Loss: {val_loss:.4f}")
print(f"Test Loss: {test_loss:.4f}")
print(f"Training Error: {train_error:.4f}")
print(f"Validation Error: {val_error:.4f}")
print(f"Test Error : {test_error:.4f}")

# Calculating the confusion matrix on the untouched test split
cm = confusion_matrix(y_test, test_predictions)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating the accuracy score
acc_score = accuracy_score(y_test, test_predictions)

# Displaying results
print("Confusion Matrix")
print(cm_df)
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, test_predictions))


### Undersampling using RandomUnderSampler

In [None]:
# Undersampling experiment: tune a Random Forest on a randomly undersampled
# (balanced) training set and evaluate it on validation/test splits that keep
# the original class balance.
from sklearn.metrics import log_loss  # kept at the top of the cell instead of mid-cell

# Load the data into a Pandas DataFrame
df5 = pd.read_csv("heart_2022_cleaned.csv")

# Define features set X
X = df5.drop("HeartAttack", axis=1)
# Define target vector y
y = df5["HeartAttack"].values

# Split BEFORE any fitting or resampling (70% train / 15% validation / 15% test).
# Balancing first would make the held-out sets artificially balanced and let the
# scalers see validation/test rows, so the reported metrics would not reflect
# the real class distribution.
X_train_raw, X_temp, y_train_raw, y_temp = train_test_split(
    X,
    y,
    stratify=y,  # keep the original class proportions in every split
    test_size=0.3,
    random_state=78
)

X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.5,
    random_state=78
)

# Scale features to the [0, 1] range. The scaler is fitted on the training
# split only and then applied unchanged to validation and test.
minmax_scaler = MinMaxScaler()
X_train_minmax = minmax_scaler.fit_transform(X_train_raw)
X_val = minmax_scaler.transform(X_val_raw)
X_test = minmax_scaler.transform(X_test_raw)

# Random undersampling of the majority class, applied to the training split only.
# (The previous call passed X_smote / y_smote, which are never defined and raised
# a NameError on a fresh kernel.)
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train_minmax, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Standardize using statistics of the (balanced) training set only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Base Random Forest; n_estimators is supplied by the grid below
rf_model = RandomForestClassifier(random_state=78)

# Define a grid of hyperparameters for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Exhaustive grid search with 3-fold cross-validation (no early stopping is
# involved: every combination in param_grid is fitted on every fold).
# NOTE(review): the CV folds come from the balanced training set, so best_score_
# is measured on balanced data; the validation/test metrics below are measured
# on the original class distribution.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    n_jobs=-1,
    scoring='accuracy',
    verbose=2
)

# Fitting the model
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and best cross-validation score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_}")

# Best model refitted by GridSearchCV on the whole training set
best_rf_model = grid_search.best_estimator_

# Class predictions, computed once per split and reused below
y_train_pred = best_rf_model.predict(X_train_scaled)
y_val_pred = best_rf_model.predict(X_val_scaled)
y_test_pred = best_rf_model.predict(X_test_scaled)

# Names kept for the reporting code below
val_predictions = y_val_pred
test_predictions = y_test_pred

# Evaluate the best model on the validation set
val_acc_score = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy Score: {val_acc_score}")

# Predicted probability of the positive class (column 1 of predict_proba)
y_train_prob = best_rf_model.predict_proba(X_train_scaled)[:, 1]
y_val_prob = best_rf_model.predict_proba(X_val_scaled)[:, 1]
y_test_prob = best_rf_model.predict_proba(X_test_scaled)[:, 1]

# Calculate losses
train_loss = log_loss(y_train, y_train_prob)
val_loss = log_loss(y_val, y_val_prob)
test_loss = log_loss(y_test, y_test_prob)

# Calculate errors (misclassification rate = 1 - accuracy)
train_error = 1 - accuracy_score(y_train, y_train_pred)
val_error = 1 - accuracy_score(y_val, y_val_pred)
test_error = 1 - accuracy_score(y_test, y_test_pred)

# Print losses and errors
print(f"Training Loss: {train_loss:.4f}")
print(f"Validation Loss: {val_loss:.4f}")
print(f"Test Loss: {test_loss:.4f}")
print(f"Training Error: {train_error:.4f}")
print(f"Validation Error: {val_error:.4f}")
print(f"Test Error : {test_error:.4f}")

# Calculating the confusion matrix on the untouched test split
cm = confusion_matrix(y_test, test_predictions)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating the accuracy score
acc_score = accuracy_score(y_test, test_predictions)

# Displaying results
print("Confusion Matrix")
print(cm_df)
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, test_predictions))