In [1]:
#purpose of the project - Predictive Analysis of Cardiac Events using ML Techniques
# Millions of people die every year due to heart diseases. The main reason for this is the lack of proper diagnosis and treatment.
# The main objective of this project is to predict the risk of heart disease in a person based on the given data.


In [2]:
#import required libraries:
import sklearn as sk, pandas as pd, numpy as np

Fetch the preprocessed full dataset:

In [3]:
#get the csv file and store it as a DataFrame
full_dset = pd.read_csv("C:\\Users\\kruth\\OneDrive\\Desktop\\Cardiac_Events_ML\\preprocessing\\final_dataset.csv")
#drop the index column
full_dset.drop(["index"], axis = 1, inplace = True) 
#print first 5 rows of the dataset
full_dset.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Target variable and feature columns separation

In [4]:
#this code snippet will separate the target variable from the features of the dataset.
y_s = full_dset["output"] #y holds target variable as a pandas Series.It has 300 rows.
y_df = pd.DataFrame(y_s) 
X_df = full_dset.drop("output", axis=1, inplace = False)# X holds only the 13 feature columns by droping output column. 

Some additional convertions before the split:

In [5]:
#convert X and y dataframes to numpy arrays:
X = X_df.values #X is a numpy array of shape (300,13)
y = y_df.values #y is a numpy array of shape (300,1)

#convert y to a 1D array:
from sklearn.utils import column_or_1d
y = column_or_1d(y, warn=False) #y is a numpy array of shape (300,)


DATA SPLIT: Random sampling technique

In [6]:
#generate training and testing sets:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#test_size = 0.2 means 20% of the dataset is used for testing and 80% for training.
#random_state is 42 => controls the randomness of the training and testing indices produced. 42 renders the same pair of sets everytime I run the code.

Feature Scaling- Standard Scaler transformation:

In [7]:
#scaling the feature dataset:
from sklearn.preprocessing import StandardScaler 
# fit the Scaler
scaler = StandardScaler() #creating an instance of the StandardScaler class.
#Scaling the values such that the mean is 0 and std deviation is 1.
scaler.fit(X_train) #fitting the scaler to the training set.
X_train = scaler.transform(X_train) #transforming the training set. 
X_test = scaler.transform(X_test) #transforming the testing set.

ALGORITHMS

Metrics discription

In [8]:
#precision: The ability of the classifier not to label as positive a sample that is negative.
#recall: The ability of the classifier to find all the positive samples.
#f1-score: The weighted average of the precision and recall.
#support: The number of occurrences of each class in y_test.
#macro avg: The average of the unweighted mean per label.
#weighted avg: The average of the support-weighted mean per label.

Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# make predictions on the test dataset
y_pred = model.predict(X_test)

# evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', accuracy)

# get the precision, recall, and f1-score
cr = classification_report(y_test, y_pred)
print(cr) 

Accuracy:  0.8666666666666667
              precision    recall  f1-score   support

           0       0.86      0.79      0.83        24
           1       0.87      0.92      0.89        36

    accuracy                           0.87        60
   macro avg       0.87      0.85      0.86        60
weighted avg       0.87      0.87      0.87        60



Random Forest

In [10]:
#NOTE: DON'T REQUIRE SCALING
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Initialize the model
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)

# Fit the model on the training data
rf_clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred1 = rf_clf.predict(X_test)

# Evaluate the model performance
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred1))
# get the precision, recall, and f1-score
cr1 = classification_report(y_test, y_pred1)
print(cr1)


Accuracy: 0.8666666666666667
              precision    recall  f1-score   support

           0       0.81      0.88      0.84        24
           1       0.91      0.86      0.89        36

    accuracy                           0.87        60
   macro avg       0.86      0.87      0.86        60
weighted avg       0.87      0.87      0.87        60



Decision Tree

In [11]:
#NOTE: DON'T REQUIRE SCALING
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
dt_clf = DecisionTreeClassifier(random_state=0)

# Fit the model on the training data
dt_clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = dt_clf.predict(X_test)

# Evaluate the model performance
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))
cr1 = classification_report(y_test, y_pred1)
print(cr1)

Accuracy: 0.8
              precision    recall  f1-score   support

           0       0.81      0.88      0.84        24
           1       0.91      0.86      0.89        36

    accuracy                           0.87        60
   macro avg       0.86      0.87      0.86        60
weighted avg       0.87      0.87      0.87        60



KNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the model
knn_clf = KNeighborsClassifier(n_neighbors=5)

# Fit the model on the training data
knn_clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn_clf.predict(X_test)


cr1 = classification_report(y_test, y_pred1)
print(cr1)
# Evaluate the model performance
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84        24
           1       0.91      0.86      0.89        36

    accuracy                           0.87        60
   macro avg       0.86      0.87      0.86        60
weighted avg       0.87      0.87      0.87        60

Accuracy: 0.85


SVM

In [18]:

#NOTE: WHEN THE SCALER IS NOT USED, THIS MODEL GIVES MORE ACCURACY!
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Initialize the model
svm = SVC(random_state=42)

# Fit the model to the training data
svm.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.79      0.84        24
           1       0.87      0.94      0.91        36

    accuracy                           0.88        60
   macro avg       0.89      0.87      0.88        60
weighted avg       0.88      0.88      0.88        60

Accuracy: 0.8833333333333333


Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Initialize the model
gnb = GaussianNB()

# Fit the model to the training data
gnb.fit(X_train, y_train)

# Make predictions on the test data
y_pred = gnb.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88        24
           1       0.92      0.92      0.92        36

    accuracy                           0.90        60
   macro avg       0.90      0.90      0.90        60
weighted avg       0.90      0.90      0.90        60

Accuracy: 0.9


Gradient Boosting

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Initialize the model
gbc = GradientBoostingClassifier()

# Fit the model to the training data
gbc.fit(X_train, y_train)

# Make predictions on the test data
y_pred = gbc.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.79      0.76        24
           1       0.85      0.81      0.83        36

    accuracy                           0.80        60
   macro avg       0.79      0.80      0.79        60
weighted avg       0.80      0.80      0.80        60

Accuracy: 0.8


AdaBoost

In [16]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_recall_fscore_support

# Initialize the AdaBoostClassifier
ada = AdaBoostClassifier()

# Fit the classifier to the training data
ada.fit(X_train, y_train)

# Make predictions on the test set
y_pred = ada.predict(X_test)

# Calculate precision, recall, and f1-score
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}')
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

Precision: 0.87, Recall: 0.75, F1-Score: 0.81
Accuracy: 0.7833333333333333
