# Heart Disease Classification

## Data Preprocessing

In [3]:
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

from utils import *

ModuleNotFoundError: No module named 'utils'

In [None]:
# Load the dataset from a CSV file addressed by a relative path.
filename = "heart_cleveland_upload.csv"
df = pd.read_csv(filepath_or_buffer=filename)

# Preview the first five rows to confirm the file was parsed.
print(df.head(n=5))

In [None]:
# Show basic info
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()  # Check for missing values and data types
print(df.describe())  # Summary statistics

In [None]:
# Report whether any cell of the dataframe is missing.
has_missing = bool(df.isna().to_numpy().any())
message = "The Data has missing values" if has_missing else "The Data has no missing values"
print(message)

### Histograms

In [None]:
# Plot the distribution of every column (histogram with a KDE overlay) on one grid.
cols = 4
# Ceiling division so the grid always has room for every column; this yields the
# previous fixed 4x4 layout (and 18x12 figure) whenever df has 13-16 columns.
rows = max(1, -(-len(df.columns) // cols))
# squeeze=False keeps `axes` two-dimensional even for a single-row grid.
fig, axes = plt.subplots(rows, cols, figsize=(18, 3 * rows), squeeze=False)
axes = axes.flatten()

for ax, column in zip(axes, df.columns):
    sns.histplot(df[column], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')

# Remove the grid cells that did not receive a plot.
for ax in axes[len(df.columns):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# The categorical features are stored as integer codes, so pd.get_dummies would
# leave them untouched. Each code is therefore first replaced by a descriptive
# label; the labelled columns can then be one-hot encoded in a later step.

mapcp = {0: "typical Angina", 1: "atypical angina", 2: "non-anginal pain", 3: "asymptomatic"}
maprestcg = {0: "normal", 1: "ST-T", 2: "hypertrophy"}
mapslope = {0: "upsloping", 1: "flat", 2: "downsloping"}
mapthal = {0: "normal", 1: "fixed", 2: "reversable"}

for column, mapping in (
    ("cp", mapcp),
    ("restecg", maprestcg),
    ("slope", mapslope),
    ("thal", mapthal),
):
    # Series.map silently turns codes that are absent from the mapping into NaN,
    # which one-hot encoding would then represent as an all-zero row; fail loudly
    # instead so an unexpected code is noticed.
    unmapped = sorted(set(df[column].dropna().unique()) - set(mapping))
    if unmapped:
        raise ValueError(f"Unexpected codes in column '{column}': {unmapped}")
    # Add the labelled column, then drop the original integer-coded one.
    df[f"{column}_coded"] = df[column].map(mapping)
    df = df.drop(column, axis=1)

print(df)


In [None]:
# Now that the categories are labelled, one-hot encode them.
# dtype=int makes the dummy columns 0/1 integers directly. A blanket
# df.astype(int) is avoided because it would also truncate the fractional part
# of every float-valued feature (presumably e.g. `oldpeak` -- confirm against the CSV).
df = pd.get_dummies(df, drop_first=True, dtype=int)  # One-hot encoding
print(df.head())

In [None]:
# Check whether the target is balanced: rows with condition == 1 are counted
# against all remaining rows.
c1 = int((df["condition"] == 1).sum())
c0 = len(df["condition"]) - c1
print(f"{c1} pacients have a condition, {c0} have not")
print(f"Ratio: {c1/(c1+c0)}")

In [None]:
# The target looks reasonably balanced: there is no large disparity between healthy people and patients.
# The next step is normalization.



### Split data into Train and Test Set

In [None]:
# Separate the features from the label.
# The target is selected by name: the *_coded columns (and the dummy columns
# derived from them) are appended after "condition", so "condition" is no
# longer the last column. Positional slicing would use an encoded feature as
# the label while leaking the real label into X.
X = df.drop(columns="condition")
y = df["condition"]

print(X.head())
print()
print(y.head())

In [None]:
# Split data into train and test set: 80:20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalization: standardize every feature to zero mean and unit variance.
# The scaler is fitted on the training split only, so no statistics of the test
# split leak into training. The results are wrapped back into DataFrames so the
# column names and row indices stay available to the following cells.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# Report the shapes of the train and test partitions.
shape_report = "\n".join([
    f"Training Set:       X_train={X_train.shape},    y_train={y_train.shape}",
    f"Test Set:           X_test={X_test.shape},      y_test={y_test.shape}",
])
print(shape_report)

## All models

In [None]:
# Display names of the models this notebook sets out to compare.
model_names = (
    'Logistic Regression',
    'SVM',
    'Random forest',
    'KNN',
    'XGBoost',
    'Neural Network',
)
# Filled by the model sections below, each of which appends its fitted estimator.
# NOTE(review): the Random forest and XGBoost sections visible here contain no
# code, so this list may end up shorter than model_names -- confirm before
# pairing the two.
best_models = list()

## Logistic Regression

### Grid search to find the best parameter C

In [None]:
# Candidate values for LogisticRegression's C parameter.
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10, 100])

log_reg = LogisticRegression(max_iter=300, solver='liblinear')

# Exhaustive search over param_grid, scored by accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(log_reg, param_grid, scoring='accuracy', cv=5, verbose=1)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and record it with the other models.
best_log_reg = grid_search.best_estimator_
best_models.append(best_log_reg)
print("Best C from GridSearchCV:", grid_search.best_params_['C'])

### Metrics (Classification Report, Confusion Matrix and ROC Curve)

In [None]:
# Evaluate the tuned logistic regression on the test split.
# NOTE(review): print_metrics is not defined in this notebook; presumably it is
# provided by `from utils import *` and reports the metrics named in the section
# heading -- confirm against utils.
print_metrics(best_log_reg, X_test, y_test)

## Support Vector Machine

### Grid search to find the best parameters

In [None]:
# svc = SVC(probability=True)

# param_grid = {
#     'C': [0.01, 0.1, 1, 10, 100],
#     'kernel': ['linear', 'rbf'],
#     'gamma': ['scale', 'auto']      # for rbf
# }

# grid_search = GridSearchCV(
#     estimator=svc,
#     param_grid=param_grid,
#     cv=5,
#     scoring='accuracy',
#     verbose=1
# )

# grid_search.fit(X_train, y_train)

# best_svm = grid_search.best_estimator_
# best_models.append(best_svm)
# print("Best Parameters:", grid_search.best_params_)

In [None]:
# Fit an SVM with fixed hyper-parameters (linear kernel, C=10).
# NOTE(review): presumably these are the winners of the commented-out grid
# search above -- confirm. probability=True enables probability estimates.
best_svm = SVC(kernel='linear', probability=True, C=10).fit(X_train, y_train)
best_models.append(best_svm)

### Metrics (Classification Report, Confusion Matrix and ROC Curve)

In [None]:
# Evaluate the fitted SVM on the test split.
# NOTE(review): print_metrics is not defined in this notebook; presumably it is
# provided by `from utils import *` -- confirm against utils.
print_metrics(best_svm, X_test, y_test)

## Random Forest

### Grid search to find the best parameters

### Metrics (Classification Report, Confusion Matrix and ROC Curve)

## KNN

### Grid search to find the best parameters

In [None]:
# Hyper-parameter candidates for the k-nearest-neighbours classifier.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11, 15, 25, 30],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

knn = KNeighborsClassifier()

# Exhaustive search over param_grid, scored by accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(knn, param_grid, scoring='accuracy', cv=5, verbose=1)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and record it with the other models.
best_knn = grid_search.best_estimator_
best_models.append(best_knn)
print("Best Parameters:", grid_search.best_params_)

In [None]:
# best_knn = KNeighborsClassifier(n_neighbors=11, weights='distance', metric='manhattan')
# best_models.append(best_knn)
# print("Best Parameters:", grid_search.best_params_)

### Metrics (Classification Report, Confusion Matrix and ROC Curve)

In [None]:
# Evaluate the tuned KNN classifier on the test split.
# NOTE(review): print_metrics is not defined in this notebook; presumably it is
# provided by `from utils import *` -- confirm against utils.
print_metrics(best_knn, X_test, y_test)

## XGBoost

### Grid search to find the best parameters

### Metrics (Classification Report, Confusion Matrix and ROC Curve)

## Neural Network

### Architecture

In [None]:
def build_model(lr=0.001):
    """Build and compile the feed-forward binary classifier.

    The input width is read from the module-level ``X_train`` (its number of
    columns). The network stacks three ReLU layers of 12, 8 and 4 units and a
    single sigmoid output unit.

    Args:
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, accuracy
        metric).
    """
    n_features = X_train.shape[1]

    # Assemble the layer stack in order: input, hidden layers, output.
    layers = [Input(shape=(n_features,))]
    layers.extend(Dense(units, activation='relu') for units in (12, 8, 4))
    layers.append(Dense(1, activation='sigmoid'))

    network = Sequential(layers)
    network.compile(
        optimizer=Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

### Grid search to find the best parameters

In [None]:
# lr_values = [0.0001, 0.001, 0.01, 0.1]
# batch_sizes = [8, 16, 32, 64]
# epochs_list = [50, 100, 150, 200] 

# # Prepare the KFold cross-validation
# X_train = np.array(X_train)
# y_train = np.array(y_train)

# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# best_accuracy = 0
# best_params = {}

# for lr, batch_size, epochs in product(lr_values, batch_sizes, epochs_list):
#     fold_accuracies = []
    
#     for train_index, val_index in kf.split(X_train):
#         X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
#         y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
        
#         model = build_model(lr=lr)
#         model.fit(X_train_fold, y_train_fold, batch_size=batch_size, epochs=epochs, verbose=0)
        
#         y_pred_fold = (model.predict(X_val_fold) > 0.5).astype(int)
        
#         accuracy = accuracy_score(y_val_fold, y_pred_fold)
#         fold_accuracies.append(accuracy)
    
#     mean_accuracy = np.mean(fold_accuracies)
    
#     if mean_accuracy > best_accuracy:
#         best_accuracy = mean_accuracy
#         best_params = {'lr': lr, 'batch_size': batch_size, 'epochs': epochs}

# print("Best Parameters:", best_params)

### Metrics (Classification Report, Confusion Matrix and ROC Curve)

In [None]:
# best_nn = build_model(lr=best_params['lr'])
# best_nn.fit(X_train, y_train, batch_size=best_params['batch_size'], epochs=best_params['epochs'], verbose=0)
# best_models.append(best_nn)

# print_metrics(best_nn, X_test, y_test)

In [None]:
# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable across runs, in line with the random_state=42 used
# for the train/test split.
tf.keras.utils.set_random_seed(42)

# Train the network with fixed hyper-parameters (lr=0.001, batch size 16, 100 epochs).
# NOTE(review): presumably these come from the commented-out search above -- confirm.
best_nn = build_model(lr=0.001)
best_nn.fit(X_train, y_train, batch_size=16, epochs=100, verbose=0)
best_models.append(best_nn)

# NOTE(review): print_metrics is not defined in this notebook; presumably it is
# provided by `from utils import *` and accepts a Keras model -- confirm.
print_metrics(best_nn, X_test, y_test)