# Heart Disease Prediction

In [2]:
# Import the Required Library.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Data Collection and Processing Step.

In [3]:
dataset = pd.read_csv("heart_disease_data.csv")
dataset

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
# Check the number of qors and columns.
dataset.shape
# we have the 303 rows and the 14 columns.

(303, 14)

In [5]:
# check the Missing and null values into it.
dataset.isnull().sum()

# Its is good to know that we dont have the missing value into it.

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
# check the ditinct target Values into the dataset.
dataset["target"].value_counts()

target
1    165
0    138
Name: count, dtype: int64

In [7]:
# Check the feature information for the dataset.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [8]:
# Check the Statistical FUnction for the dataset.
dataset.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [9]:
#  Find the mean of all the columns into the dataset.
print(dataset.groupby("target").mean())
# print(dataset.groupby("Outcome").sum())

              age       sex        cp    trestbps        chol       fbs  \
target                                                                    
0       56.601449  0.826087  0.478261  134.398551  251.086957  0.159420   
1       52.496970  0.563636  1.375758  129.303030  242.230303  0.139394   

         restecg     thalach     exang   oldpeak     slope        ca      thal  
target                                                                          
0       0.449275  139.101449  0.550725  1.585507  1.166667  1.166667  2.543478  
1       0.593939  158.466667  0.139394  0.583030  1.593939  0.363636  2.121212  


## Split the data into the dependent and the independent Variable.

In [10]:
x = dataset.drop("target", axis=1)
y = dataset["target"]

## Feature Scaling and Standardization

In [11]:
# As we have the very minimum value and the maximum value that will be reduced the model performance for 
# Overcome this thing we need to do the feature scaling.
scaler = StandardScaler()

In [12]:
scaling_data = scaler.fit_transform(x)

In [13]:
scaling_data = pd.DataFrame(scaling_data)
scaling_data.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
298,0.290464,-1.468418,-0.938515,0.478391,-0.10173,-0.417635,0.898962,-1.165281,1.435481,-0.724323,-0.649113,-0.714429,1.123029
299,-1.033002,0.681005,1.973123,-1.234996,0.342756,-0.417635,0.898962,-0.771706,-0.696631,0.138373,-0.649113,-0.714429,1.123029
300,1.503641,0.681005,-0.938515,0.706843,-1.029353,2.394438,0.898962,-0.378132,-0.696631,2.036303,-0.649113,1.244593,1.123029
301,0.290464,0.681005,-0.938515,-0.092738,-2.227533,-0.417635,0.898962,-1.515125,1.435481,0.138373,-0.649113,0.265082,1.123029
302,0.290464,-1.468418,0.032031,-0.092738,-0.198357,-0.417635,-1.005832,1.064975,-0.696631,-0.896862,-0.649113,0.265082,-0.512922


In [14]:
# Now we have the new Scaling data means that the dependent Variables.
x = scaling_data
y = dataset["target"]

## Train Test Splitting Data

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

## Logistic Regression Model Implementation.

In [16]:
model = LogisticRegression()

In [17]:
model.fit(x_train, y_train)

## Evaluation of the model Performance

In [18]:
# Evaluation of the training and the testing accuracy.

#  finding the training accuracy.
training_data_pred = model.predict(x_train)
training_data_accu = accuracy_score(training_data_pred, y_train)
print("The Training accuracy of the model is:-", training_data_accu * 100)

#  finding the testing accuracy.
testing_data_pred = model.predict(x_test)
testing_data_accu = accuracy_score(testing_data_pred, y_test)
print("The Testing accuracy of the model is:-", testing_data_accu * 100)

The Training accuracy of the model is:- 82.64462809917356
The Testing accuracy of the model is:- 86.88524590163934


In [19]:
# Build the Model Prediction System.
# new_data = (63,1,3,145,233,1,0,150,0,2.3,0,0,1)
# new_data = (57,0,0,140,241,0,1,123,1,0.2,1,0,3)
# new_data = (0.952197, 0.681005, 1.973123, 0.763956, -0.256334, 2.394438, -1.005832, 0.015443, -0.696631, 1.087338, -2.274579, -0.714429, -2.148873)
new_data = (0.290464, -1.468418, -0.938515, 0.478391, -0.101730, -0.417635, 0.898962, -1.165281, 1.435481, -0.724323, -0.649113, -0.714429, 1.123029)
# We need to convert into the numpy array.
new_numpy_array = np.asarray(new_data)

#  also for the ML Model we need to give the data in the form of the 2D
new_shaping_array = new_numpy_array.reshape(1,-1)

prediction = model.predict(new_shaping_array)

if(prediction[0] == 0):
    print("The Person is not suffered by Heart Disease")
else:
    print("The Person is suffered by Heart Disease") 

The Person is not suffered by Heart Disease


In [None]:

  #  Head Data..
# (0.952197, 0.681005, 1.973123, 0.763956, -0.256334, 2.394438, -1.005832, 0.015443, -0.696631, 1.087338, -2.274579, -0.714429, -2.148873)
# (-1.915313, 0.681005, 1.002577, -0.092738, 0.072199, -0.417635, 0.898962, 1.633471, -0.696631, 2.122573, -2.274579, -0.714429, -0.512922)
# (-1.474158, -1.468418, 0.032031, -0.092738, -0.816773, -0.417635, -1.005832, 0.977514, -0.696631, 0.310912, 0.976352, -0.714429, -0.512922)
# (0.180175, 0.681005, 0.032031, -0.663867, -0.198357, -0.417635, 0.898962, 1.239897, -0.696631, -0.206705, 0.976352, -0.714429, -0.512922)
# (0.290464, -1.468418, -0.938515, -0.663867, 2.082050, -0.417635, 0.898962, 0.583939, 1.435481, -0.379244, 0.976352, -0.714429, -0.512922)


# Tail Data..
# (0.290464, -1.468418, -0.938515, 0.478391, -0.101730, -0.417635, 0.898962, -1.165281, 1.435481, -0.724323, -0.649113, -0.714429, 1.123029)
# (-1.033002, 0.681005, 1.973123, -1.234996, 0.342756, -0.417635, 0.898962, -0.771706, -0.696631, 0.138373, -0.649113, -0.714429, 1.123029)
# (1.503641, 0.681005, -0.938515, 0.706843, -1.029353, 2.394438, 0.898962, -0.378132, -0.696631, 2.036303, -0.649113, 1.244593, 1.123029)
# (0.290464, 0.681005, -0.938515, -0.092738, -2.227533, -0.417635, 0.898962, -1.515125, 1.435481, 0.138373, -0.649113, 0.265082, 1.123029)
# (0.290464, -1.468418, 0.032031, -0.092738, -0.198357, -0.417635, -1.005832, 1.064975, -0.696631, -0.896862, -0.649113, 0.265082, -0.512922)
