## Final Project

# 1. Imports

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import math as mt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from typing import List
from typing import Tuple

# 2. Data Formation and Cleaning

In [4]:
df = pd.read_csv("heart.csv")
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [7]:
def clean_and_encode_data(df: pd.DataFrame,
                        numerical_cols: List[str] = ['Age', 'RestingBP', 'Cholesterol','FastingBS', 'MaxHR', 'Oldpeak' ], 
                          categorical_cols: List[str] = ['Sex', 'RestingECG', 'ExerciseAngina',
                                                          'ST_Slope', 'ChestPainType']) -> pd.DataFrame:
    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())
    
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    df = df.astype(int)
    
    return df
sorted_df = clean_and_encode_data(df)
sorted_df.head(10)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,172,0,0,1,1,0,0,0,1,1,0,0
1,49,160,180,0,156,1,1,0,1,0,0,1,0,0,1,0
2,37,130,283,0,98,0,0,1,0,1,0,0,1,1,0,0
3,48,138,214,0,108,1,1,0,1,0,1,1,0,0,0,0
4,54,150,195,0,122,0,0,1,1,0,0,0,1,0,1,0
5,39,120,339,0,170,0,0,1,1,0,0,0,1,0,1,0
6,45,130,237,0,170,0,0,0,1,0,0,0,1,1,0,0
7,54,110,208,0,142,0,0,1,1,0,0,0,1,1,0,0
8,37,140,207,0,130,1,1,1,1,0,1,1,0,0,0,0
9,48,120,284,0,120,0,0,0,1,0,0,0,1,1,0,0


# 3. ML Model Predictions:
a. KNN

b. Linear Regression

In [13]:
def find_optimal_k_stan(ycol):

    X = sorted_df.drop(columns=[ycol])
    y = sorted_df[ycol]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    scaler = StandardScaler()
    
    X_train_standard = scaler.fit_transform(X_train)
    X_test_standard = scaler.transform(X_test)    
    
    best_k = 0
    best_score = 0
    for k in range(1,21):
        knn = KNeighborsClassifier(n_neighbors=((2*k)+1))
        knn.fit(X_train_standard, y_train)
        score = knn.score(X_test_standard, y_test)
        if score > best_score:
            best_k = k
            best_score = score

    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_train, y_train)
    
    
    y_pred = knn.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    return best_k, test_accuracy, conf_matrix, class_report

In [14]:
ycolstan = "HeartDisease"
best_k, test_accuracy, conf_matrix, class_report = find_optimal_k_stan(ycolstan)

print(f"Optimal k: {best_k}")
print(f"Test Accuracy: {test_accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Optimal k: 7
Test Accuracy: 0.6992753623188406
Confusion Matrix:
[[ 76  36]
 [ 47 117]]
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.68      0.65       112
           1       0.76      0.71      0.74       164

    accuracy                           0.70       276
   macro avg       0.69      0.70      0.69       276
weighted avg       0.71      0.70      0.70       276



In [17]:
def train_logistic_regression_stan(ycol) -> float:
    X = sorted_df.drop(columns=[ycol])
    y = sorted_df[ycol]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    
    X_train_standard = scaler.fit_transform(X_train)
    X_test_standard = scaler.transform(X_test)

    log_reg = LogisticRegression(max_iter=100)
    log_reg.fit(X_train_standard, y_train)

    y_pred = log_reg.predict(X_test_standard)

    test_accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    return test_accuracy, conf_matrix, class_report


In [18]:
input_stan = "HeartDisease"
test_accuracy, conf_matrix, class_report = train_logistic_regression_stan(input_stan)
print(f"Test Accuracy: {test_accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.8532608695652174
Confusion Matrix:
[[67 10]
 [17 90]]
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184

