In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
#Data collection and Processing

In [4]:
#loading the csv data
import zipfile
zip_ref=zipfile.ZipFile('heart_diesease.zip','r')
zip_ref.extractall()
zip_ref.close()


In [5]:
df=pd.read_csv('heart_failure.csv')
df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,55.0,0,748,0,45,0,263358.03,1.3,137,1,1,88,0
1,65.0,0,56,0,25,0,305000.00,5.0,130,1,0,207,0
2,45.0,0,582,1,38,0,319000.00,0.9,140,0,0,244,0
3,60.0,1,754,1,40,1,328000.00,1.2,126,1,0,90,0
4,95.0,1,582,0,30,0,461000.00,2.0,132,1,0,50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,45.0,0,582,1,55,0,543000.00,1.0,132,0,0,250,0
4996,60.0,1,582,0,30,1,127000.00,0.9,145,0,0,95,0
4997,95.0,1,112,0,40,1,196000.00,1.0,138,0,0,24,1
4998,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1


In [6]:
#print last 5 rows of dataset
df.tail()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
4995,45.0,0,582,1,55,0,543000.0,1.0,132,0,0,250,0
4996,60.0,1,582,0,30,1,127000.0,0.9,145,0,0,95,0
4997,95.0,1,112,0,40,1,196000.0,1.0,138,0,0,24,1
4998,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1
4999,40.0,0,244,0,45,1,275000.0,0.9,140,0,0,174,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       5000 non-null   float64
 1   anaemia                   5000 non-null   int64  
 2   creatinine_phosphokinase  5000 non-null   int64  
 3   diabetes                  5000 non-null   int64  
 4   ejection_fraction         5000 non-null   int64  
 5   high_blood_pressure       5000 non-null   int64  
 6   platelets                 5000 non-null   float64
 7   serum_creatinine          5000 non-null   float64
 8   serum_sodium              5000 non-null   int64  
 9   sex                       5000 non-null   int64  
 10  smoking                   5000 non-null   int64  
 11  time                      5000 non-null   int64  
 12  DEATH_EVENT               5000 non-null   int64  
dtypes: float64(3), int64(10)
memory usage: 507.9 KB


In [8]:
#checking for missing values
df.isnull().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [9]:
#statistical measures about the data
df.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,60.288736,0.4744,586.7606,0.4394,37.7346,0.3648,265075.40437,1.369106,136.8082,0.6456,0.3118,130.6788,0.3136
std,11.697243,0.499394,976.733979,0.496364,11.514855,0.481422,97999.758622,1.00975,4.464236,0.478379,0.463275,77.325928,0.464002
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,50.0,0.0,121.0,0.0,30.0,0.0,215000.0,0.9,134.0,0.0,0.0,74.0,0.0
50%,60.0,0.0,248.0,0.0,38.0,0.0,263358.03,1.1,137.0,1.0,0.0,113.0,0.0
75%,68.0,1.0,582.0,1.0,45.0,1.0,310000.0,1.4,140.0,1.0,1.0,201.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [10]:
#checking how many people have heart disease how many don't have
df['DEATH_EVENT'].value_counts()

DEATH_EVENT
0    3432
1    1568
Name: count, dtype: int64

In [11]:
#3432 people does not have any heart disease
#the distribution is not quite similar means no of patients with heart disease is quite less than no of patients with no heart disease
#two class contain different number ratio so processing is required

In [12]:
# 1-->Defective heart
# 0-->healthy heart

In [13]:
#spliting the features and target:target means output and features means other like age,high blood pressure etc

In [14]:
x=df.drop(columns='DEATH_EVENT',axis=1)
y=df['DEATH_EVENT']

In [16]:
print(x)


       age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0     55.0        0                       748         0                 45   
1     65.0        0                        56         0                 25   
2     45.0        0                       582         1                 38   
3     60.0        1                       754         1                 40   
4     95.0        1                       582         0                 30   
...    ...      ...                       ...       ...                ...   
4995  45.0        0                       582         1                 55   
4996  60.0        1                       582         0                 30   
4997  95.0        1                       112         0                 40   
4998  65.0        1                       160         1                 20   
4999  40.0        0                       244         0                 45   

      high_blood_pressure  platelets  serum_creatinine  serum_s

In [17]:
print(y)

0       0
1       0
2       0
3       0
4       1
       ..
4995    0
4996    0
4997    1
4998    1
4999    0
Name: DEATH_EVENT, Length: 5000, dtype: int64


In [18]:
#spliting the data into training data and test data

In [19]:
#x_train contain features of all training data ,x_test contain features of all test data
#y_train contain features of target

In [20]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=2) #stratified is equal to y thest two  
#classes 0 or 1 which are distributed in an even manner throughtout training and testing data

#random can be any number u want

In [21]:
print(x.shape,x_train.shape,x_test.shape)

(5000, 12) (4000, 12) (1000, 12)


In [22]:
#Model training 

In [23]:
#logistic Regression also used for classification problem as well

In [24]:
model=LogisticRegression()

In [27]:
#training the logisticRegresion model with training data

model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
#accuracy on training data
x_train_prediction=model.predict(x_train)
training_data_acc=accuracy_score(x_train_prediction,y_train)

In [29]:
print('Accuracy on Training data: ',training_data_acc)

Accuracy on Training data:  0.84175


In [31]:
x_test_prediction=model.predict(x_test)
training_data_accuracy=accuracy_score(x_test_prediction,y_test)
print("Accuracy on testing data: ",training_data_accuracy)

Accuracy on testing data:  0.857


In [32]:
#note:accuracy on the training and testing data should be almost similar 
#if accuracy on training data is large and on test data is small it is overfitting


In [33]:
#building a predictive system

In [34]:
input_data=(65.0,0,56,0,25,0,305000.0,5.0,130,1,0,207)

#change the input data to a numpy array because it is easy to reshape in numpy array instead of tuple

data_numpy=np.asarray(input_data)

#reshape the numpy array as we are predicting for only on instance

input_data_reshaped=data_numpy.reshape(1,-1)

prediction=model.predict(input_data_reshaped)

print(prediction)



[0]




In [35]:
#it is in listed form 
if (prediction[0]==0):
    print("The person does not have heart disease")
else:
    print("The person has heart disease")

The person does not have heart disease


In [38]:
data2=(95.0,1,582,0,30,0,461000.0,2.0,132,1,0,50)
data=np.asarray(data2)
re_shape=data.reshape(1,-1)
pre=model.predict(re_shape)
print(pre)
if (pre==0):
    print("The person does not have heart disease")
else:
    print("The person has heart disease")


[1]
The person has heart disease




In [40]:
data3=(42.0,1,86,1,15,0,213000.0,1.3,136,0,0,65)

data3_num=np.asarray(data3)

re_shape3=data3_num.reshape(1,-1)

pre3=model.predict(re_shape3)

print(pre3)

if (pre3==0):
    print("The person does not have heart disease")
else:
    print("The person has heart disease")

[1]
The person has heart disease




In [43]:
data4=(55.0,0,2017,0,35,1,141000.0,1.0,140,1,0,206)

data4=np.asarray(data4)

data4_reshape=data4.reshape(1,-1)

data4_predict=model.predict(data4_reshape)

print(data4_predict)

if (data4_predict==0):
    print('The person does not have heart disease')
else:
    print("The person has heart disease")

[0]
The person does not have heart disease


