<a href="https://colab.research.google.com/github/itsmarbo/ML-HeartDiseasePredictor/blob/main/heartdisease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data collection and processing

In [None]:
# Read the heart-disease CSV into a pandas DataFrame.
# NOTE(review): the path is absolute and Colab-specific; adjust it when running elsewhere.
heart_data = pd.read_csv(filepath_or_buffer='/content/data.csv')

In [None]:
# Preview the first five records of the dataset
heart_data.head(n=5)

Unnamed: 0,id,age,sex,cp,trestbps,chol,restecg,thalch,exang,oldpeak,num
0,1,63,1,1,145,233,1,150,0,2.3,0
1,2,67,1,0,160,286,1,108,1,1.5,1
2,3,67,1,0,120,229,1,129,1,2.6,1
3,4,37,1,2,130,250,0,187,0,3.5,0
4,5,41,0,3,130,204,1,172,0,1.4,0


In [None]:
# Preview the final five records of the dataset
heart_data.tail(n=5)

Unnamed: 0,id,age,sex,cp,trestbps,chol,restecg,thalch,exang,oldpeak,num
811,914,62,1,0,158,170,2,138,1,0.0,1
812,915,46,1,0,134,310,0,126,0,0.0,1
813,916,54,0,0,127,333,2,154,0,0.0,1
814,918,55,1,0,122,223,2,100,0,0.0,1
815,920,62,1,3,120,254,1,93,1,0.0,1


In [None]:
# Dataset dimensions as a (number of rows, number of columns) tuple
heart_data.shape

(816, 11)

In [None]:
# Print a per-column summary (non-null count and dtype) plus the frame's memory usage
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        816 non-null    int64  
 1   age       816 non-null    int64  
 2   sex       816 non-null    int64  
 3   cp        816 non-null    int64  
 4   trestbps  816 non-null    int64  
 5   chol      816 non-null    int64  
 6   restecg   816 non-null    int64  
 7   thalch    816 non-null    int64  
 8   exang     816 non-null    int64  
 9   oldpeak   816 non-null    float64
 10  num       816 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 70.2 KB


In [None]:
# Count the missing (NaN) entries in each column
heart_data.isna().sum()

id          0
age         0
sex         0
cp          0
trestbps    0
chol        0
restecg     0
thalch      0
exang       0
oldpeak     0
num         0
dtype: int64

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
# NOTE(review): a previous run reported a minimum of 0 for both trestbps and chol;
# those zeros look like placeholders for missing readings rather than real
# measurements -- confirm before relying on these columns.
heart_data.describe()

Unnamed: 0,id,age,sex,cp,trestbps,chol,restecg,thalch,exang,oldpeak,num
count,816.0,816.0,816.0,816.0,816.0,816.0,816.0,816.0,816.0,816.0,816.0
mean,427.551471,53.139706,0.773284,1.040441,132.090686,202.616422,0.551471,137.819853,0.392157,0.877941,0.547794
std,257.042552,9.300575,0.418964,1.233313,19.048188,108.889683,0.762494,25.917105,0.488531,1.096063,0.498016
min,1.0,28.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0
25%,204.75,46.0,1.0,0.0,120.0,180.0,0.0,120.0,0.0,0.0,0.0
50%,422.5,54.0,1.0,0.0,130.0,225.0,0.0,140.0,0.0,0.5,1.0
75%,641.25,60.0,1.0,2.0,140.0,268.0,1.0,158.0,1.0,1.5,1.0
max,920.0,77.0,1.0,3.0,200.0,603.0,2.0,202.0,1.0,6.2,1.0


In [None]:
# How many rows fall into each class of the target column 'num'
heart_data.loc[:, 'num'].value_counts()

1    447
0    369
Name: num, dtype: int64

1 ---> Heart disease

0 ---> Healthy

Splitting the features and the target (num)

In [None]:
# Target vector: the 'num' label column.
Y = heart_data['num']
# Feature matrix: every remaining column.
# NOTE(review): 'id' stays in X and is therefore used as a predictor. It looks like
# a row identifier rather than a clinical measurement -- consider dropping it (the
# prediction cell's input tuple would then need one value fewer).
X = heart_data.drop(columns=['num'])

In [None]:
print(X)

      id  age  sex  cp  trestbps  chol  restecg  thalch  exang  oldpeak
0      1   63    1   1       145   233        1     150      0      2.3
1      2   67    1   0       160   286        1     108      1      1.5
2      3   67    1   0       120   229        1     129      1      2.6
3      4   37    1   2       130   250        0     187      0      3.5
4      5   41    0   3       130   204        1     172      0      1.4
..   ...  ...  ...  ..       ...   ...      ...     ...    ...      ...
811  914   62    1   0       158   170        2     138      1      0.0
812  915   46    1   0       134   310        0     126      0      0.0
813  916   54    0   0       127   333        2     154      0      0.0
814  918   55    1   0       122   223        2     100      0      0.0
815  920   62    1   3       120   254        1      93      1      0.0

[816 rows x 10 columns]


In [None]:
print(Y)

0      0
1      1
2      1
3      0
4      0
      ..
811    1
812    1
813    1
814    1
815    1
Name: num, Length: 816, dtype: int64


Splitting the Data into Training and Test data

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class ratio the
# same in both partitions, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [None]:
# Show the dimensions of the full feature matrix followed by the train and test partitions
print(*(frame.shape for frame in (X, X_train, X_test)))

(816, 10) (652, 10) (164, 10)


Train Machine Learning Model

Logistic Regression

In [None]:
# Logistic-regression classifier for the binary target.
# With the default iteration cap the solver stopped before converging on these
# unscaled features (scikit-learn emitted "TOTAL NO. of ITERATIONS REACHED LIMIT"
# during fit), so the cap is raised. Standardising the features is the other
# remedy that warning suggests.
model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training partition (features X_train, labels Y_train)
model.fit(X=X_train, y=Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

Model Evaluation

Accuracy Score

In [None]:
# Accuracy on training data: predict the rows the model was fitted on and compare
# with their true labels.
X_train_prediction = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): ground truth first, predictions
# second. Accuracy itself is symmetric, so the value is unchanged, but the call now
# follows the scikit-learn convention.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the fraction of training rows the model classified correctly
print(
    'Accuracy on Training data : ',
    training_data_accuracy,
)

Accuracy on Training data :  0.8159509202453987


In [None]:
# Accuracy on test data: predict the held-out rows and compare with their true labels.
X_test_prediction = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth first, predictions
# second. Accuracy itself is symmetric, so the value is unchanged, but the call now
# follows the scikit-learn convention.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the fraction of held-out test rows the model classified correctly
print(
    'Accuracy on Test data: ',
    test_data_accuracy,
)

Accuracy on Test data:  0.8292682926829268


Building a Predictive System

In [None]:
# Feature values for a single patient, listed in the same column order as the
# training frame X: id, age, sex, cp, trestbps, chol, restecg, thalch, exang, oldpeak.
input_data = (3,67,1,0,120,229,1,129,1,2.6)

# Change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape to (1, n_features) because we are predicting for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The model was fitted on a DataFrame, so give it a one-row DataFrame that carries
# the same column names. Recent scikit-learn versions warn when a model fitted with
# feature names receives a bare array, and building the frame also raises
# immediately if the number of values does not match the training columns.
input_data_as_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = model.predict(input_data_as_frame)
print(prediction)

# Class 0 means healthy; any other label means heart disease
if prediction[0] == 0:
  print('Healthy patient')
else:
  print('Patient with Heart Disease')

[1]
Patient with Heart Disease


