# **Heart Disease Prediction**

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [5]:
df.shape

(1025, 14)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [7]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [8]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [9]:
df.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,526
0,499


Separating input features and output label 'target'

In [10]:
X = df.drop(columns='target', axis=1)
Y = df['target']
# '1' marks 'Heart Disease' while '0' marks 'No Heart Disease'

In [11]:
print(X.shape, Y.shape)

(1025, 13) (1025,)


Training Test Data Split

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=25)

In [13]:
print(X.shape, X_train.shape, X_test.shape)

(1025, 13) (820, 13) (205, 13)


Model Training and Evaluation

In [14]:
model = LogisticRegression()

In [15]:
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# Model accuracy evaluation on training dataset:
Y_pred = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_pred, Y_train)
print(f"Accuracy on training dataset: {training_data_accuracy:.2%}")

Accuracy on training dataset: 86.34%


In [17]:
# Model accuracy evaluation on testing dataset:
Y_pred = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_pred, Y_test)
print(f"Accuracy on testing dataset: {testing_data_accuracy:.2%}")

Accuracy on testing dataset: 83.41%


Making a Predictive System

In [18]:
input_data = (54,1,0,120,188,0,1,113,0,1.4,1,1,3)
input_data = np.asarray(input_data)
input_data = input_data.reshape(1, -1)

prediction = model.predict(input_data)
if prediction[0] == 0:
    print("The person does not have a heart disease")
else:
    print("The person has a heart disease")

The person does not have a heart disease




In [19]:
import pickle

In [20]:
filename = 'heart_disease_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [21]:
loaded_model = pickle.load(open(filename, 'rb'))

In [22]:
input_data = (54,1,0,120,188,0,1,113,0,1.4,1,1,3)
input_data = np.asarray(input_data)
input_data = input_data.reshape(1, -1)

prediction = loaded_model.predict(input_data)
if prediction[0] == 0:
    print("The person does not have a heart disease")
else:
    print("The person has a heart disease")

The person does not have a heart disease


