In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_curve,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    ConfusionMatrixDisplay
)
from imblearn.over_sampling import SMOTE

# Pre-req steps: from 01_exploration.ipynb

In [4]:
# read csv
df = pd.read_csv('../data/heart.csv')

# we split the data first, 
target_col = 'target'

X = df.drop(columns=target_col)
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2, stratify=y
)

# get categorical column
cat_cols = []
target_col = 'target'

for c in df.columns:
    if c == target_col:
        continue
    if df[c].dtype == "object":
        cat_cols.append(c)
    elif pd.api.types.is_integer_dtype(X_train[c]) and X_train[c].nunique() <= 15:
        cat_cols.append(c)

# encoded data frame
X_train_enc = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)
X_test_enc  = pd.get_dummies(X_test,  columns=cat_cols, drop_first=True)

X_test_enc = X_test_enc.reindex(columns=X_train_enc.columns, fill_value=0)

num_cols = [c for c in X_train_enc.columns if c not in
            [col for col in X_train_enc.columns if any(col.startswith(f"{cc}_") for cc in cat_cols)]
           ]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2, stratify=y
)

# scaling/normalizing the numerical values
scaler = StandardScaler()
X_train_enc[num_cols] = scaler.fit_transform(X_train_enc[num_cols])
X_test_enc[num_cols]  = scaler.transform(X_test_enc[num_cols])

# 2. Model Training

## 2.1 Logistic Regression

In [7]:
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_enc, y_train)

y_pred_log_reg = log_reg.predict(X_test_enc)

log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)

print(f'Logistic Regression Accuracy: {log_reg_accuracy:.4f}')

print(classification_report(y_test, y_pred_log_reg, target_names=['1','0']))

Logistic Regression Accuracy: 0.8488
              precision    recall  f1-score   support

           1       0.88      0.80      0.84       100
           0       0.82      0.90      0.86       105

    accuracy                           0.85       205
   macro avg       0.85      0.85      0.85       205
weighted avg       0.85      0.85      0.85       205



## 2.2 Decision Tree Classifier

In [8]:
tree_clf = DecisionTreeClassifier(random_state=42)

tree_clf.fit(X_train_enc, y_train)

y_pred_tree = tree_clf.predict(X_test_enc)

tree_accuracy = accuracy_score(y_test, y_pred_tree)

print(f'Decision Tree Accuracy: {tree_accuracy:.4f}')

print(classification_report(y_test, y_pred_tree, target_names=['1', '0']))

Decision Tree Accuracy: 0.9854
              precision    recall  f1-score   support

           1       0.97      1.00      0.99       100
           0       1.00      0.97      0.99       105

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205



## 2.3 KNN

In [12]:
knn = KNeighborsClassifier()

knn.fit(X_train_enc, y_train)

y_pred_knn = knn.predict(X_test_enc)

knn_accuracy = accuracy_score(y_test, y_pred_knn)

print(f'KNN Accuracy: {knn_accuracy:.4f}')

print(classification_report(y_test, y_pred_knn, target_names=['1', '0']))

KNN Accuracy: 0.8244
              precision    recall  f1-score   support

           1       0.81      0.84      0.82       100
           0       0.84      0.81      0.83       105

    accuracy                           0.82       205
   macro avg       0.82      0.82      0.82       205
weighted avg       0.83      0.82      0.82       205



# 3. Hyperparameter Tuning

## 3.1 GridSearchCV on KNN

In [14]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

param_grid = {
    'knn__n_neighbors': range(1, 31),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}


grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,          
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train_enc, y_train)

print("BEST HYPERPARAMETERS:")
best_params = grid_search.best_params_
print(best_params)

print("BEST F1 SCORE (from cross-validation):")
best_score = grid_search.best_score_
print(f"F1 Score: {best_score:.4f}")

Fitting 3 folds for each of 120 candidates, totalling 360 fits
BEST HYPERPARAMETERS:
{'knn__metric': 'euclidean', 'knn__n_neighbors': 16, 'knn__weights': 'distance'}
BEST F1 SCORE (from cross-validation):
F1 Score: 0.9832
