## Project Overview 

This project explores various machine learning algorithms for a binary classification task using the Heart Failure Prediction dataset from Kaggle. The dataset contains 918 observations with 12 attributes, where the target variable (y) is "HeartDisease":
* 0: No heart failure
* 1: Heart failure

Link to the dataset in [Kaggle](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction)

### Objective
The goal is to compare the performance of different machine learning models in predicting heart failure.


### Machine Learning Models Considered
The following classification algorithms will be implemented and evaluated:
* Logistic Regression Classifier
* Support Vector Machine (SVM) Classifier
* Random Forest Classifier
* Gradient Boosting Classifier
* Multi-Layer Perceptron (MLP) Classifier


In [16]:
# Libraries for the Project
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 
from sklearn.neural_network import MLPClassifier

# Configure the seaborn theme (dark grid background, pastel colors) for the plots
sns.set_theme(
    palette="pastel",
    style="darkgrid",
)

### Heart Failure Prediction Dataset overview 

In [None]:
# Read the dataset file into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='heart.csv')
df.head(n=5)

In [None]:
# Summary statistics of the DataFrame (pandas describes the numeric columns by default)
df.describe()

In [None]:
# Pearson correlation between the numeric columns only (categorical columns are skipped)
corr_matrix_numeric_values = df.corr(numeric_only=True, method='pearson')

# Draw the matrix as a heatmap with each coefficient printed to two decimals
sns.heatmap(
    data=corr_matrix_numeric_values,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
)
plt.title(label='Correlation Matrix')
plt.show()

The correlation matrix shows that none of the numeric attributes are highly correlated with one another.

### Data Preprocessing

The dataset includes categorical variables that need to be one-hot encoded. To accomplish this, we will use the `get_dummies()` function from pandas to encode the categorical attributes.


In [None]:
# One-hot encode the categorical columns as 0/1 integers.
# drop_first=True drops the first dummy column of each encoded feature.
df_encoded = pd.get_dummies(
    data=df,
    columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
    drop_first=True,
    dtype=int,
)
df_encoded.head(n=5)

In [None]:
# Separate the features from the target column
y = df_encoded['HeartDisease']
X = df_encoded.drop(columns='HeartDisease')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale the data using MinMaxScaler: the scaler is fitted on the training rows only
# and then applied unchanged to the test rows
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.shape, X_test.shape)

### Logistic Regression Model

In [None]:
# Logistic Regression: grid-search the regularization strength and penalty type
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
}

# The liblinear solver is used for every combination in the grid (both l1 and l2)
log_reg = LogisticRegression(solver='liblinear')
log_reg_cv = GridSearchCV(estimator=log_reg, param_grid=parameters, cv=5, n_jobs=-1)
log_reg_cv.fit(X_train, y_train)

# Evaluate the refitted best estimator on the held-out test set
predictions = log_reg_cv.predict(X_test)
acc_log_reg = accuracy_score(y_true=y_test, y_pred=predictions)

print(f'Best parameters: {log_reg_cv.best_params_}')
print(f'Best score: {log_reg_cv.best_score_}')
print(f'Accuracy: {acc_log_reg}')
print(f'Classification report: \n{classification_report(y_test, predictions)}')

# Confusion Matrix for Logistic Regression
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=predictions,
    values_format='d',
    cmap='Blues',
)
plt.title(label='Confusion Matrix for Logistic Regression')
plt.show()

### Support Vector Machine (SVM) Model 

In [None]:
# SVM Classifier: grid-search C and gamma for an RBF kernel
parameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

svc = SVC()
svc_cv = GridSearchCV(estimator=svc, param_grid=parameters, cv=5, n_jobs=-1)
svc_cv.fit(X_train, y_train)

# Evaluate the refitted best estimator on the held-out test set
predictions = svc_cv.predict(X_test)
acc_svc = accuracy_score(y_true=y_test, y_pred=predictions)

print(f'Best parameters: {svc_cv.best_params_}')
print(f'Best score: {svc_cv.best_score_}')
print(f'Accuracy: {acc_svc}')
print(f'Classification report: \n{classification_report(y_test, predictions)}')

# Confusion matrix of the test-set predictions
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=predictions,
    values_format='d',
    cmap='Blues',
)
plt.title(label='Confusion Matrix for SVM Classifier')
plt.show()

### Data Preprocessing Again

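The categorical variables are one-hot encoded again, this time with `drop_first=False` (the pandas default), so every category keeps its own dummy column.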

In [None]:
# One-hot encode the categorical columns as 0/1 integers.
# drop_first is left at its default, so every category gets its own column.
df_encoded = pd.get_dummies(
    data=df,
    columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
    dtype=int,
)
df_encoded.head(n=5)

In [None]:
# Separate the features from the target column of the re-encoded frame
y = df_encoded['HeartDisease']
X = df_encoded.drop(columns='HeartDisease')

# Same 80/20 split and seed as before
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale the data using MinMaxScaler: the scaler is fitted on the training rows only
# and then applied unchanged to the test rows
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.shape, X_test.shape)

### Random Forest Model 

In [None]:
# Random Forest: grid-search the number of trees and the maximum tree depth
parameters = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_depth': [3, 5, 7, 9, 11],
}

# Seed the forest so its random sampling is repeatable: without a fixed
# random_state the selected hyper-parameters and the reported scores change
# from run to run. 42 matches the seed used for the train/test split.
rf = RandomForestClassifier(random_state=42)
rf_cv = GridSearchCV(rf, parameters, cv=5, n_jobs=-1)
rf_cv.fit(X_train, y_train)

# Evaluate the refitted best estimator on the held-out test set
predictions = rf_cv.predict(X_test)
acc_rf = accuracy_score(y_test, predictions)

print(f'Best parameters: {rf_cv.best_params_}')
print(f'Best score: {rf_cv.best_score_}')
print(f'Accuracy: {acc_rf}')
print(f'Classification report: \n{classification_report(y_test, predictions)}')

# Confusion matrix of the test-set predictions
ConfusionMatrixDisplay.from_predictions(y_test, predictions, cmap='Blues', values_format='d')
plt.title('Confusion Matrix for Random Forest Classifier')
plt.show()

### Gradient Boosting Model

In [None]:
# Gradient Boosting: grid-search the learning rate, number of boosting stages
# and the maximum depth of the individual trees
parameters = {
    'learning_rate': [0.001, 0.01, 0.1],
    'n_estimators': [10, 50, 100, 200, 500],
    'max_depth': [3, 5, 7, 9, 11],
}

# Seed the estimator so the per-tree randomness (e.g. the feature permutation
# at each split) is repeatable and the reported scores do not drift between
# runs. 42 matches the seed used for the train/test split.
gradiant_boosting = GradientBoostingClassifier(random_state=42)
gradiant_boosting_cv = GridSearchCV(gradiant_boosting, parameters, cv=5, n_jobs=-1)
gradiant_boosting_cv.fit(X_train, y_train)

# Evaluate the refitted best estimator on the held-out test set
predictions = gradiant_boosting_cv.predict(X_test)
acc_gradiant_boosting = accuracy_score(y_test, predictions)

print(f'Best parameters: {gradiant_boosting_cv.best_params_}')
print(f'Best score: {gradiant_boosting_cv.best_score_}')
print(f'Accuracy: {acc_gradiant_boosting}')
print(f'Classification report: \n{classification_report(y_test, predictions)}')

# Confusion matrix of the test-set predictions
ConfusionMatrixDisplay.from_predictions(y_test, predictions, cmap='Blues', values_format='d')
plt.title('Confusion Matrix for Gradient Boosting Classifier')
plt.show()

### Multi-Layer Perceptron (MLP) Model

In [None]:
# MLP: grid-search two-hidden-layer architectures, the L2 penalty (alpha),
# the initial learning rate and the iteration cap; activation, solver and
# early stopping are held fixed via single-value lists
parameters = {
    'hidden_layer_sizes': [(20, 10), (30, 15), (40, 20), (50, 25), (64, 32)],
    'activation': ['relu'],
    'solver': ['adam'],
    'max_iter': [500, 1000, 1500],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate_init': [0.001, 0.005, 0.01],
    'early_stopping': [True],
}

# The network is seeded with random_state=42
mlp = MLPClassifier(random_state=42)
mlp_cv = GridSearchCV(estimator=mlp, param_grid=parameters, cv=5, n_jobs=-1)
mlp_cv.fit(X_train, y_train)

# Evaluate the refitted best estimator on the held-out test set
predictions = mlp_cv.predict(X_test)
acc_mlp = accuracy_score(y_true=y_test, y_pred=predictions)

print(f'Best parameters: {mlp_cv.best_params_}')
print(f'Best score: {mlp_cv.best_score_}')
print(f'Accuracy: {acc_mlp}')
print(f'Classification report: \n{classification_report(y_test, predictions)}')

# Confusion matrix of the test-set predictions
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=predictions,
    values_format='d',
    cmap='Blues',
)
plt.title(label='Confusion Matrix for Neural Network')
plt.show()

### Results

In [None]:
# Print the test-set accuracy of every model, one line per model
for message, value in (
    ('Accuracy for Logistic Regression:', acc_log_reg),
    ('Accuracy for SVM Classifier:', acc_svc),
    ('Accuracy for Random Forest Classifier:', acc_rf),
    ('Accuracy for Gradient Boosting Classifier:', acc_gradiant_boosting),
    ('Accuracy for MLP Classifier:', acc_mlp),
):
    print(message, value)