In [15]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Logistic Regression
import itertools
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('heart.csv')

In [3]:
# 918 rows × 12 columns and no NaN values
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


# Logistic Regression

In [4]:
df['SexDummy'] = df['Sex'].replace({'M': 1, 'F': 0})
df['PainDummy'] = df['ChestPainType'].replace({'ASY': 0, 'NAP': 1, 'ATA': 2, 'TA': 3})
df['ECG_Dummy'] = df['RestingECG'].replace({'Normal': 0, 'ST': 1, 'LVH': 2})
df['ExerciseDummy'] = df['ExerciseAngina'].replace({'Y': 1, 'N': 0})
df['ST_SlopeDummy'] = df['ST_Slope'].replace({'Up': 1, 'Flat': 0, 'Down': -1})

In [5]:
def logreg(X, y, random_state=20, cv=5, n_features=10):
    predictors = df[X]
    to_predict = df[y]
    
    X_train, X_test, y_train, y_test = train_test_split(predictors, to_predict, test_size=0.2, random_state=random_state)

    pipeline = make_pipeline(StandardScaler(), SimpleImputer())
    X_train_preprocessed = pipeline.fit_transform(X_train)
    X_test_preprocessed = pipeline.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    
    rfe = RFE(estimator=model, n_features_to_select=n_features)
    X_train_preprocessed = rfe.fit_transform(X_train_preprocessed, y_train)
    X_test_preprocessed = rfe.transform(X_test_preprocessed)
    
    cv_scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
    mean_cv_score = np.mean(cv_scores)
    
    return mean_cv_score

In [6]:
columns = ['Age', 
           'SexDummy', 
           'PainDummy', 
           'RestingBP', 
           'Cholesterol', 
           'FastingBS', 
           'ECG_Dummy', 
           'MaxHR', 
           'ExerciseDummy', 
           'Oldpeak', 
           'ST_SlopeDummy']

logreg(columns, 'HeartDisease')

0.8555772994129158

In [7]:
# Best Subset Selection
def logreg(X, y, random_state=21, cv=5, n_features=10):
    predictors = df[X]
    to_predict = df[y]
    
    X_train, X_test, y_train, y_test = train_test_split(predictors, to_predict, test_size=0.2, random_state=random_state)

    pipeline = make_pipeline(StandardScaler(), SimpleImputer())
    X_train_preprocessed = pipeline.fit_transform(X_train)
    X_test_preprocessed = pipeline.transform(X_test)

    model = LogisticRegression(max_iter=1000)

    best_score = float('-inf')
    best_subset = None

    column_names = X_train.columns.tolist()  # Get the column names from X_train

    for k in range(1, min(n_features, len(column_names)) + 1):
        for subset in itertools.combinations(column_names, k):
            subset = list(subset)
            X_train_subset = X_train_preprocessed[:, [column_names.index(col) for col in subset]]  # Select columns by index
            X_test_subset = X_test_preprocessed[:, [column_names.index(col) for col in subset]]  # Select columns by index
            
            model.fit(X_train_subset, y_train)
            cv_scores = cross_val_score(model, X_train_subset, y_train, cv=cv)
            mean_cv_score = np.mean(cv_scores)
            
            if mean_cv_score > best_score:
                best_score = mean_cv_score
                best_subset = subset

    return best_score, best_subset

columns = ['Age', 
           'SexDummy', 
           'PainDummy', 
           'RestingBP', 
           'Cholesterol', 
           'FastingBS', 
           'ECG_Dummy', 
           'MaxHR', 
           'ExerciseDummy', 
           'Oldpeak', 
           'ST_SlopeDummy']

best_score, best_subset = logreg(columns, 'HeartDisease')
print("Best Subset:", best_subset)
print("Best Score:", best_score)

Best Subset: ['Age', 'SexDummy', 'PainDummy', 'Cholesterol', 'ECG_Dummy', 'Oldpeak', 'ST_SlopeDummy']
Best Score: 0.855595937004939


# Random Forest

In [26]:
# one hot encode the rest of the columns to see if i can get better accuracy
columns_to_encode = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

df_modified = df.copy()

df_encoded = pd.get_dummies(df_modified, columns=columns_to_encode)

df_binary = df_encoded.astype(bool).astype(int)

df_modified = df.drop(columns=columns_to_encode)
df_modified = pd.concat([df_modified, df_binary], axis=1)

X = df_modified.drop(['HeartDisease'], axis=1)
y = df_modified['HeartDisease']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

RandomForestClassifier()

In [29]:
y_pred = rf_classifier.predict(X_test)

In [30]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.88


In [31]:
print("I did get better accruacy, the original without one hot encoding did 77%, i got aroun 9% better with the one hot encoding")

I did get better accruacy, the original without one hot encoding did 77%, i got aroun 9% better with the one hot encoding
