In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Importing the dataset
df = pd.read_csv("HeartDisease.csv")
df.head()

Unnamed: 0,age,gender,chest_pain,rest_bps,cholestrol,fasting_blood_sugar,rest_ecg,thalach,exer_angina,old_peak,slope,ca,thalassemia,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
#Finding the shape of the dataset
df.shape

(303, 14)

In [4]:
# Finding the amount of null and na values in the dataset

df.isnull().sum()

age                    0
gender                 0
chest_pain             0
rest_bps               0
cholestrol             0
fasting_blood_sugar    0
rest_ecg               0
thalach                0
exer_angina            0
old_peak               0
slope                  0
ca                     0
thalassemia            0
target                 0
dtype: int64

In [5]:
df.isna().sum()

age                    0
gender                 0
chest_pain             0
rest_bps               0
cholestrol             0
fasting_blood_sugar    0
rest_ecg               0
thalach                0
exer_angina            0
old_peak               0
slope                  0
ca                     0
thalassemia            0
target                 0
dtype: int64

Splitting the Dataset

In [6]:
X = df.iloc[:,:-1]
y = df["target"]
X.head()
y.shape

(303,)

In [7]:
X.head()

Unnamed: 0,age,gender,chest_pain,rest_bps,cholestrol,fasting_blood_sugar,rest_ecg,thalach,exer_angina,old_peak,slope,ca,thalassemia
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [8]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=16)

In [10]:
X_train.head()

Unnamed: 0,age,gender,chest_pain,rest_bps,cholestrol,fasting_blood_sugar,rest_ecg,thalach,exer_angina,old_peak,slope,ca,thalassemia
149,42,1,2,130,180,0,1,150,0,0.0,2,0,2
201,60,1,0,125,258,0,0,141,1,2.8,1,1,3
88,54,0,2,110,214,0,1,158,0,1.6,1,0,2
277,57,1,1,124,261,0,1,141,0,0.3,2,0,3
241,59,0,0,174,249,0,1,143,1,0.0,1,0,2


In [11]:
X_test.head()

Unnamed: 0,age,gender,chest_pain,rest_bps,cholestrol,fasting_blood_sugar,rest_ecg,thalach,exer_angina,old_peak,slope,ca,thalassemia
59,57,0,0,128,303,0,0,159,0,0.0,2,1,2
282,59,1,2,126,218,1,1,134,0,2.2,1,1,1
183,58,1,2,112,230,0,0,165,0,2.5,1,1,3
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
262,53,1,0,123,282,0,1,95,1,2.0,1,2,3


In [12]:
X_train.shape

(227, 13)

In [13]:
X_test.shape

(76, 13)

In [14]:
y_train.shape

(227,)

In [15]:
y_test.shape

(76,)

Logistic Regression Model

Logistic regression is a simple and commonly used algorithm for binary classification problems like predicting heart disease.
It's easy to interpret and understand, making it a good starting point for beginners.
However, it might not capture complex relationships between features as well as some other algorithms.

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=16)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
precision = precision_score(y_pred, y_test)
recall = recall_score(y_pred, y_test)

In [18]:
accuracy*100


81.57894736842105

In [19]:
precision*100

88.57142857142857

In [20]:
recall*100

75.60975609756098

Random Forest Tree Model

Random forests are an ensemble learning method that builds multiple decision trees and combines their predictions.
They tend to generalize well and are less prone to overfitting compared to individual decision trees.
Random forests can handle large datasets with high dimensionality and are robust to outliers and noise.

In [21]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=400, random_state=16)
rf_model.fit(X_train, y_train)

In [22]:
rf_y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(rf_y_pred, y_test)
precision = precision_score(rf_y_pred, y_test)
recall = recall_score(rf_y_pred, y_test)

In [23]:
accuracy*100

82.89473684210526

In [24]:
precision*100

85.71428571428571

In [25]:
recall*100

78.94736842105263

Support Vector Machine Model

SVMs are powerful for classification tasks, especially when there is a clear margin of separation between classes.
They can handle high-dimensional data and are effective in cases where the number of features is greater than the number of samples.
However, SVMs can be sensitive to the choice of kernel parameters and might not perform well on very large datasets.

In [26]:
from sklearn.svm import SVC

sv_model = SVC(kernel="linear", random_state=16)
sv_model.fit(X_train, y_train)

In [27]:
sv_y_pred = sv_model.predict(X_test)
accuracy = accuracy_score(sv_y_pred, y_test)
precision = precision_score(sv_y_pred, y_test)
recall = recall_score(sv_y_pred, y_test)
classification = classification_report(sv_y_pred, y_test)
matrix = confusion_matrix(sv_y_pred, y_test)

In [28]:
accuracy*100

84.21052631578947

In [29]:
precision*100

91.42857142857143

In [30]:
recall*100

78.04878048780488

In [31]:
classification

'              precision    recall  f1-score   support\n\n           0       0.78      0.91      0.84        35\n           1       0.91      0.78      0.84        41\n\n    accuracy                           0.84        76\n   macro avg       0.85      0.85      0.84        76\nweighted avg       0.85      0.84      0.84        76\n'

In [32]:
matrix

array([[32,  3],
       [ 9, 32]], dtype=int64)

XGBoost Model

XGBoost (eXtreme Gradient Boosting) is a powerful and efficient implementation of gradient boosting, which is an ensemble learning technique that combines the predictions of multiple weak learners (usually decision trees) to produce a stronger predictive model. XGBoost is designed for speed and performance, making it popular for both classification and regression tasks. It includes several optimizations and enhancements over traditional gradient boosting methods.

In [33]:
from xgboost import XGBClassifier

In [34]:
model_xgboost = XGBClassifier()

In [36]:
model_xgboost.fit(X_train, y_train)

In [38]:
xgb_y_pred = model_xgboost.predict(X_test)
accuracy = accuracy_score(xgb_y_pred, y_test)
precision = precision_score(xgb_y_pred, y_test)
recall = recall_score(xgb_y_pred, y_test)
classification = classification_report(xgb_y_pred, y_test)
matrix = confusion_matrix(xgb_y_pred, y_test)

In [39]:
accuracy*100

82.89473684210526

In [40]:
precision*100

80.0

In [41]:
recall*100

82.35294117647058

In [42]:
matrix

array([[35,  7],
       [ 6, 28]], dtype=int64)

In [44]:
classification

'              precision    recall  f1-score   support\n\n           0       0.85      0.83      0.84        42\n           1       0.80      0.82      0.81        34\n\n    accuracy                           0.83        76\n   macro avg       0.83      0.83      0.83        76\nweighted avg       0.83      0.83      0.83        76\n'