In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# importing dataset 
df = pd.read_csv('heart_failure_clinical_records_dataset.csv')

In [4]:
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


The last column named 'DEATH_EVENT' is our target columns, based on input values we have to predict whether a person got heart fail or not.

In [8]:
df['DEATH_EVENT'].unique()

array([1, 0], dtype=int64)

In [5]:
# finding datatype of all columns

df.dtypes

age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
DEATH_EVENT                   int64
dtype: object

In [7]:
df.shape

(299, 13)

We have 13 columns and 299 rows in our dataset

In [6]:
# finding missing values
df.isnull().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

We have no missing value in any column, its great !

In [9]:
# selecting 'input' and 'target' variables
X=df.drop(['DEATH_EVENT'],axis=1)
y=df['DEATH_EVENT']

In [10]:
# splitting dataset into training and testing 
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

We have some columns whose values can dominate other values in the dataset, so we need to do feature scaling of those columns..

In [11]:
# feature scaling using StandardScaler
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

In [22]:
X_train[0:5]

array([[-1.33194278,  1.11069566, -0.4576634 , -0.84818893, -0.24318058,
        -0.72269841, -0.89319445, -0.58084862,  0.51465589,  0.72941845,
         1.52297224, -0.10575054],
       [-0.4999137 , -0.90033664, -0.51894319, -0.84818893,  0.17737049,
        -0.72269841, -0.57954879, -0.39670905,  0.29500811,  0.72941845,
        -0.65661079,  1.33189826],
       [ 0.58172409,  1.11069566,  0.39546617,  1.17898261, -0.24318058,
        -0.72269841,  0.15589621, -0.30463927, -0.58358301,  0.72941845,
        -0.65661079,  0.86979686],
       [-0.08389917,  1.11069566, -0.02487488, -0.84818893, -0.66373165,
         1.38370306, -1.40151673, -0.48877884,  1.83254257, -1.37095517,
        -0.65661079, -0.43949044],
       [-0.08389917, -0.90033664, -0.35712624,  1.17898261,  0.00915007,
        -0.72269841,  0.78318753,  1.44468663,  1.17359923, -1.37095517,
        -0.65661079, -1.27384019]])

In [23]:
X_test[0:5]

array([[-0.91592824,  1.11069566, -0.47202585, -0.84818893, -1.5048338 ,
        -0.72269841, -0.73096394, -0.58084862,  0.51465589,  0.72941845,
        -0.65661079,  0.21515321],
       [-1.33194278, -0.90033664, -0.02487488,  1.17898261,  1.43902371,
        -0.72269841,  3.09767621, -0.39670905, -1.02287857, -1.37095517,
        -0.65661079,  1.55011281],
       [-0.66631952,  1.11069566, -0.49500577, -0.84818893, -1.5048338 ,
         1.38370306,  1.74575525, -0.02842992,  0.51465589, -1.37095517,
        -0.65661079, -1.10697024],
       [-0.74952242, -0.90033664, -0.4557484 , -0.84818893, -0.66373165,
        -0.72269841, -0.41731828, -0.67291841, -0.14428745,  0.72941845,
         1.52297224, -0.22127589],
       [ 0.33211537,  1.11069566, -0.45287591, -0.84818893, -0.24318058,
         1.38370306,  0.36138819, -0.58084862, -0.58358301,  0.72941845,
        -0.65661079,  0.83128841]])

### Decision Tree Model

In [12]:
# model fitting 
from sklearn.tree import DecisionTreeClassifier 
model=DecisionTreeClassifier()
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [13]:
y_pred=model.predict(X_test)

In [14]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm=confusion_matrix(y_test,y_pred)
score=accuracy_score(y_test,y_pred)

In [15]:
cm

array([[35, 11],
       [ 3, 11]], dtype=int64)

In [16]:
score

0.7666666666666667

### Random forest model

In [17]:
from sklearn.ensemble import RandomForestClassifier
random_forest=RandomForestClassifier()

In [18]:
random_forest.fit(X_train,y_train)

RandomForestClassifier()

In [19]:
y_pred_rfc=random_forest.predict(X_test)

In [20]:
random_forest_score=accuracy_score(y_test,y_pred_rfc)

In [21]:
random_forest_score

0.9333333333333333