In [66]:
# Features in the dataset (column names as they appear in heart.csv):
    # 1. age -> Age of the patient (Numerical Continuous)
    # 2. sex -> Gender of the patient (Binary Categorical)
    # 3. cp -> Chest Pain type (Ordinal Categorical)
    # 4. trtbps -> Resting Blood Pressure (Numerical Continuous)
    # 5. chol -> Serum Cholesterol (Numerical Continuous)
    # 6. fbs -> Fasting Blood Sugar (Binary Categorical)
    # 7. restecg -> Resting Electrocardiographic Results (Ordinal Categorical)
    # 8. thalachh -> Maximum Heart Rate Achieved (Numerical Continuous)
    # 9. exng -> Exercise Induced Angina (Binary Categorical)
    # 10. oldpeak -> ST Depression induced by exercise relative to rest (Numerical Continuous)
    # 11. slp -> Slope of the peak exercise ST segment (Ordinal Categorical)
    # 12. caa -> Number of major vessels (0-3) colored by Fluoroscopy (Ordinal Categorical)
    # 13. thall -> Thalassemia (Ordinal Categorical)
    # 14. output -> Heart Disease (Binary Categorical)

# Machine Learning

In [67]:
import pandas as pd
import numpy as np

## Read in the Data

In [68]:
# Load the heart-disease records from the relative path 'heart.csv'.
df = pd.read_csv(filepath_or_buffer='heart.csv')
# Print (rows, columns) as a quick sanity check, then preview the first five rows.
print(df.shape)
df.head(n=5)

(303, 14)


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## Preprocess the Data

### Split the Data into Feature/Target 

In [69]:
# Feature matrix: every column except the 'output' label.
X = df.drop(columns=['output'])
# Target vector: the 'output' label column on its own.
y = df['output']

### Splitting the Data into Training/Testing Sets

In [70]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [71]:
# Report the dimensions of each split, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X_train shape: (242, 13)
X_test shape: (61, 13)
y_train shape: (242,)
y_test shape: (61,)


In [72]:
# Categorical feature columns (binary and multi-level).
cat_cols = [
    'sex', 'cp', 'fbs', 'restecg',
    'exng', 'slp', 'caa', 'thall',
]
# Subset of the categorical columns that gets ordinal-encoded.
ordinal_cols = [
    'cp', 'restecg', 'slp', 'caa', 'thall',
]
# Numerical columns: whatever remains in df once the categorical columns
# and the 'output' label are excluded (original column order is kept).
num_cols = [
    name
    for name in df.columns
    if name != 'output' and name not in cat_cols
]

### Encode the Categorical Features

In [73]:
from sklearn.preprocessing import OrdinalEncoder

# X_train / X_test are row subsets of X produced by train_test_split. Take
# explicit copies before assigning columns so the writes land on independent
# frames and pandas does not emit SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

ord_encoder = OrdinalEncoder()
# Learn the category -> integer mapping from the training rows only, then
# reuse that mapping for the test rows so no test information leaks into it.
# NOTE(review): with the default handle_unknown='error', transform() raises
# if the test rows contain a category that was absent from the training rows.
X_train[ordinal_cols] = ord_encoder.fit_transform(X_train[ordinal_cols])
X_test[ordinal_cols] = ord_encoder.transform(X_test[ordinal_cols])

### Feature Scaling

In [74]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Estimate per-column mean and standard deviation from the training rows only.
sc.fit(X_train[num_cols])
# Apply those training statistics to both splits.
X_train[num_cols] = sc.transform(X_train[num_cols])
X_test[num_cols] = sc.transform(X_test[num_cols])

## Logistic Regression

In [75]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression model on the training split (fit returns the estimator).
classifier = LogisticRegression(random_state=42).fit(X_train, y_train)
# Predicted labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [76]:
# Put true and predicted labels side by side (positional index), naming the
# true-label column 'y_test'.
compare_result = pd.concat(
    [y_test.reset_index(drop=True), pd.Series(y_pred, name='y_pred')],
    axis=1,
).rename(columns={'output': 'y_test'})
# Show only the rows where the prediction disagrees with the true label.
compare_result.loc[compare_result['y_test'] != compare_result['y_pred']]

Unnamed: 0,y_test,y_pred
1,0,1
11,1,0
25,0,1
26,0,1
30,1,0
33,1,0
40,1,0
46,0,1


### Logistic Regression Produces an Accuracy Score of 86.89%

In [77]:
# Evaluate logistic regression on the held-out test split:
# print the confusion matrix, then display the accuracy as the cell output.
from sklearn.metrics import accuracy_score, confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[25  4]
 [ 4 28]]


0.8688524590163934

## Support Vector Machine (Classification)

In [78]:
from sklearn.svm import SVC

# Fit a support-vector classifier with a linear kernel on the training split.
classifier = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
# Predicted labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [79]:
# True vs. predicted labels side by side, with the true-label column named 'y_test'.
result = pd.concat(
    [y_test.reset_index(drop=True), pd.Series(y_pred, name='y_pred')],
    axis=1,
).rename(columns={'output': 'y_test'})
# Rows the SVM got wrong.
result.loc[result['y_test'] != result['y_pred']]

Unnamed: 0,y_test,y_pred
1,0,1
11,1,0
25,0,1
26,0,1
30,1,0
33,1,0
40,1,0
46,0,1


### SVM (Classification) Produces an Accuracy Score of 86.89%

In [80]:
# SVM evaluation on the test split: confusion matrix, then accuracy.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[25  4]
 [ 4 28]]


0.8688524590163934

## Decision Tree

In [81]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree that splits on entropy; the seed makes the fit repeatable.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
).fit(X_train, y_train)
# Predicted labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [82]:
# True vs. predicted labels side by side, with the true-label column named 'y_test'.
result = pd.concat(
    [y_test.reset_index(drop=True), pd.Series(y_pred, name='y_pred')],
    axis=1,
).rename(columns={'output': 'y_test'})
# Rows the decision tree got wrong.
result.loc[result['y_test'] != result['y_pred']]

Unnamed: 0,y_test,y_pred
2,1,0
4,1,0
9,1,0
13,0,1
20,1,0
22,0,1
25,0,1
30,1,0
36,1,0
43,1,0


### Decision Tree Produced an Accuracy Score of 81.97%

In [83]:
# Decision-tree evaluation on the test split: confusion matrix, then accuracy.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[26  3]
 [ 8 24]]


0.819672131147541

## Random Forest Classifier

In [84]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest that splits on entropy; the seed makes the fit repeatable.
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=42,
).fit(X_train, y_train)
# Predicted labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [85]:
# True vs. predicted labels side by side, with the true-label column named 'y_test'.
result = pd.concat(
    [y_test.reset_index(drop=True), pd.Series(y_pred, name='y_pred')],
    axis=1,
).rename(columns={'output': 'y_test'})
# Rows the random forest got wrong.
result.loc[result['y_test'] != result['y_pred']]

Unnamed: 0,y_test,y_pred
1,0,1
9,1,0
11,1,0
20,1,0
22,0,1
25,0,1
26,0,1
30,1,0
33,1,0
46,0,1


### Random Forest Produced an Accuracy Score of 83.61%

In [86]:
# Experiment notes from earlier runs: 10 trees gave 78.69% accuracy and
# 100 trees gave 80.33%.
# NOTE(review): the forest is currently built with n_estimators=100 and the
# displayed accuracy is 83.61%, so the figures above appear to come from an
# earlier version of the pipeline -- TODO re-verify.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[24  5]
 [ 5 27]]


0.8360655737704918

## KNN

In [87]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 7-nearest-neighbours classifier using the Minkowski metric with p=2.
classifier = KNeighborsClassifier(
    n_neighbors=7,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)
# Predicted labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [88]:
# True vs. predicted labels side by side, with the true-label column named 'y_test'.
result = pd.concat(
    [y_test.reset_index(drop=True), pd.Series(y_pred, name='y_pred')],
    axis=1,
).rename(columns={'output': 'y_test'})
# Rows KNN got wrong.
result.loc[result['y_test'] != result['y_pred']]

Unnamed: 0,y_test,y_pred
1,0,1
9,1,0
11,1,0
25,0,1
26,0,1
30,1,0
33,1,0
46,0,1


### KNN w/ 7 Neighbors Produced an Accuracy Score of 86.89%

In [89]:
# Experiment notes from earlier runs: 5 neighbors gave 86.89% accuracy and
# 7 neighbors gave 88.52%.
# NOTE(review): the classifier is currently built with n_neighbors=7 while the
# displayed accuracy is 86.89% -- re-run to confirm which k each figure
# belongs to.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[25  4]
 [ 4 28]]


0.8688524590163934

## Naive Bayes

In [90]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes model on the training split.
classifier = GaussianNB().fit(X_train, y_train)
# Predicted labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [91]:
# True vs. predicted labels side by side, with the true-label column named 'y_test'.
result = pd.concat(
    [y_test.reset_index(drop=True), pd.Series(y_pred, name='y_pred')],
    axis=1,
).rename(columns={'output': 'y_test'})
# Rows naive Bayes got wrong.
result.loc[result['y_test'] != result['y_pred']]

Unnamed: 0,y_test,y_pred
1,0,1
11,1,0
25,0,1
26,0,1
30,1,0
33,1,0
34,1,0
40,1,0


### Naive Bayes Produced an Accuracy Score of 86.89%

In [92]:
# Naive Bayes evaluation on the test split: confusion matrix, then accuracy.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[26  3]
 [ 5 27]]


0.8688524590163934