In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

In [2]:
df = pd.read_csv('F://Uni/991/Machine_Learning/ML_991_Final/Dataset/5/heart_failure_clinical_records_dataset.csv')
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


# Understanding what are missing

In [3]:
#The reason for reindexing can be found in the report
#Creating new index
newindex = np.append(np.where(df.DEATH_EVENT==1),np.where(df.DEATH_EVENT==0))
#reindexing the dataframe
df = df.reindex(newindex).reset_index(drop=True)

In [4]:
#Understanding the missing data
print('Dataset Size:\n', df.shape)
print('\nNumber of missing values:\n', np.where(df=='?')[0].size)
print('Indices of missing values:\nrow number: ', np.where(df=='?')[0], '\ncolumn number: ', np.where(df=='?')[1])
print('\nNumber of rows with missing values:\n', np.unique(np.where(df=='?')[0]).size)
nlist = list()
for n in np.unique(np.where(df=='?')[0]):
    nlist.append(len(np.where(np.where(df=='?')[0]==n)[0]))
print('Number of missing values in each row with missing values:\n', nlist)
print('\nNumber of columns with missing values:\n', np.unique(np.where(df=='?')[1]).size)
nlist = list()
for n in np.unique(np.where(df=='?')[1]):
    nlist.append(len(np.where(np.where(df=='?')[1]==n)[0]))
print('Number of missing values in each column with missing values:\n', nlist)
print('\nNumber of each class data points:\n', df.groupby('DEATH_EVENT').count()['age'])
print('Class of rows with missing values:\n', df.loc[np.unique(np.where(df=='?')[0]),'DEATH_EVENT'])

Dataset Size:
 (300, 13)

Number of missing values:
 32
Indices of missing values:
row number:  [ 17  18  24  31  34  45  89  89  89  89  89  96  96  96  96  96  96  96
 223 223 227 230 242 242 242 242 242 274 274 274 274 286] 
column number:  [ 2  8  3  9  6  0  2  4  6  7  8  1  2  3  6  7  9 10  7  8  4 11  2  4
  6  8 11  4  6  7 11  2]

Number of rows with missing values:
 14
Number of missing values in each row with missing values:
 [1, 1, 1, 1, 1, 1, 5, 7, 2, 1, 1, 5, 4, 1]

Number of columns with missing values:
 11
Number of missing values in each column with missing values:
 [1, 1, 5, 2, 4, 5, 4, 4, 2, 1, 3]

Number of each class data points:
 DEATH_EVENT
0    203
1     97
Name: age, dtype: int64
Class of rows with missing values:
 17     1
18     1
24     1
31     1
34     1
45     1
89     1
96     1
223    0
227    0
230    0
242    0
274    0
286    0
Name: DEATH_EVENT, dtype: int64


# Dealing with the missing values

In [5]:
# Just checking if number 1000 exists in the dataframe
# The issue was that the features that included missing values were not type float
# This was the way I thought of correcting this
print('\nNumber of values equal to 1000:\n', np.where(df==1000)[0].size)


Number of values equal to 1000:
 0


In [6]:
# Replace ? with 1000, change the type, replace 1000 with nan and fill the missing values
df = df.replace('?', 1000).astype(float)
df = df.replace(1000, np.nan)
df = df.fillna(method='ffill')

In [7]:
# Checking if the missing values have been filled
print('Checking if the missing values have been filled')
print('df 18 creatinine_phosphokinase: ', df.loc[18,'creatinine_phosphokinase'])
print('df 35 sex: ', df.loc[35,'sex'])

Checking if the missing values have been filled
df 18 creatinine_phosphokinase:  582.0
df 35 sex:  1.0


# Preparing Data

In [8]:
# Normalizing the Data
df_norm = pd.DataFrame(MinMaxScaler().fit_transform(df),columns=df.columns)
# Preparing the Data
xdf_norm = df_norm.drop(columns=['DEATH_EVENT'])
ydf_norm = df_norm['DEATH_EVENT']

In [9]:
# Using KFold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Choosing best k features

In [10]:
# Computing Absolute Correlation between the Features
betweencorr=abs(xdf_norm.corr())
# Computing Absolute Correlation between the Features and the Output
outputcorr = abs(xdf_norm.corrwith(ydf_norm))
#Setting the Threshold
threshold=0.4

In [11]:
betweencorr

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
age,1.0,0.088265,0.080617,0.113457,0.042312,0.095094,0.047062,0.17648,0.077595,0.062955,0.022594,0.219105
anaemia,0.088265,1.0,0.191722,0.004375,0.038211,0.040231,0.048355,0.079922,0.073648,0.096696,0.110981,0.145386
creatinine_phosphokinase,0.080617,0.191722,1.0,0.010825,0.048141,0.06704,0.024412,0.013957,0.05652,0.079755,0.005155,0.009669
diabetes,0.113457,0.004375,0.010825,1.0,0.010415,0.019869,0.085329,0.036826,0.05829,0.164627,0.16054,0.038017
ejection_fraction,0.042312,0.038211,0.048141,0.010415,1.0,0.029327,0.077699,0.003983,0.198782,0.146988,0.061032,0.045391
high_blood_pressure,0.095094,0.040231,0.06704,0.019869,0.029327,1.0,0.05055,0.025316,0.037437,0.106227,0.059016,0.199418
platelets,0.047062,0.048355,0.024412,0.085329,0.077699,0.05055,1.0,0.026894,0.031429,0.118981,0.030165,0.008613
serum_creatinine,0.17648,0.079922,0.013957,0.036826,0.003983,0.025316,0.026894,1.0,0.168982,0.008371,0.053816,0.215189
serum_sodium,0.077595,0.073648,0.05652,0.05829,0.198782,0.037437,0.031429,0.168982,1.0,0.051992,0.006964,0.069028
sex,0.062955,0.096696,0.079755,0.164627,0.146988,0.106227,0.118981,0.008371,0.051992,1.0,0.447479,0.011471


In [12]:
outputcorr

age                         0.265040
anaemia                     0.061752
creatinine_phosphokinase    0.070709
diabetes                    0.015822
ejection_fraction           0.261201
high_blood_pressure         0.075451
platelets                   0.048986
serum_creatinine            0.325494
serum_sodium                0.153729
sex                         0.000747
smoking                     0.005536
time                        0.514728
dtype: float64

In [13]:
# Computing score for dataset with all of the features
scorelist = list()
for train_index, test_index in kf.split(xdf_norm):
    xtrain, xtest = xdf_norm.iloc[train_index], xdf_norm.iloc[test_index]
    ytrain, ytest = ydf_norm.iloc[train_index], ydf_norm.iloc[test_index]
    scorelist.append(LogisticRegression().fit(xtrain, ytrain).score(xtest,ytest))
allscore = sum(scorelist)/len(scorelist)
print('Score without Removing any Features: ', allscore)

Score without Removing any Features:  0.8133333333333332


In [14]:
#Setting up a mask to remove 1 from the diagonal of the correlation matrix
mask = np.eye(betweencorr.shape[0])*(-1)+1
maxdf=betweencorr*mask
#Removing features will stop if linear regressor's score doesn't get any better
t = True
# i is for figuring out how many features got removed
i=0
while t==True:
    print(t)
    if maxdf.max().max()>=threshold:
        maxwhere=np.where(maxdf.max()==maxdf.max().max())
        n = np.argmin(outputcorr[maxwhere[0]])
        print('Featurs with correlation more than Threshold:\n', maxdf.columns[maxwhere[0][0]], ', ', maxdf.columns[maxwhere[0][1]])
        d = betweencorr.columns[maxwhere[0][n]]
        print('Removing ', d)
        #keeping a copy for when the feature shouldn't have been removed
        newdf = xdf_norm.copy()
        #removing the chosen feature
        maxdf.drop(columns=[d],inplace=True)
        maxdf.drop(index=[d],inplace=True, axis=1)
        outputcorr.drop(index=[d],inplace=True, axis=1)
        xdf_norm.drop(columns=[d],inplace=True)
        scorelist = list()
        for train_index, test_index in kf.split(xdf_norm):
            xtrain, xtest = xdf_norm.iloc[train_index], xdf_norm.iloc[test_index]
            ytrain, ytest = ydf_norm.iloc[train_index], ydf_norm.iloc[test_index]
            scorelist.append(LogisticRegression().fit(xtrain, ytrain).score(xtest,ytest))
        score = sum(scorelist)/len(scorelist)
        print('Score After Feature Removal: ', score)
        #Checking if the score got better
        if score<allscore:
            #getting the copy
            xdf_norm = newdf
            t=False
        else:
            #the new score is gonna be set as the base score
            allscore=score
            i+=1
    else:
        t = False
print('Number of Features Removed: ', i)

True
Featurs with correlation more than Threshold:
 sex ,  smoking
Removing  sex
Score After Feature Removal:  0.8
Number of Features Removed:  0


# Voting Models

In [15]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

In [16]:
# I did put KFold Cross Validation in one function here, just for ease of use
# Else than the Classifier every other variable is globaly defined
def kfoldscore(classifier):
    scorelist = list()
    for train_index, test_index in kf.split(xdf_norm):
        xtrain, xtest = xdf_norm.iloc[train_index], xdf_norm.iloc[test_index]
        ytrain, ytest = ydf_norm.iloc[train_index], ydf_norm.iloc[test_index]
        scorelist.append(classifier.fit(xtrain, ytrain).score(xtest,ytest))
    score = sum(scorelist)/len(scorelist)
    return score

In [17]:
#Using three SVMs
svm1 = svm.SVC(probability=True, kernel='poly', degree=1)
svm2 = svm.SVC(probability=True, kernel='poly', degree=2)
svm3 = svm.SVC(probability=True, kernel='poly', degree=3)

eclf = VotingClassifier(estimators=[('svm1', svm1), ('svm2', svm2), ('svm3', svm3)],voting='soft')
score = kfoldscore(eclf)

print(score)

0.79


In [18]:
# Using three KNNs
knn1 = KNeighborsClassifier(n_neighbors=1)
knn2 = KNeighborsClassifier(n_neighbors=3)
knn3 = KNeighborsClassifier(n_neighbors=5)
eclf = VotingClassifier(estimators=[('knn1', knn1), ('knn2', knn2), ('knn3', knn3)],voting='hard')
score = kfoldscore(eclf)

print(score)

0.6933333333333334


In [19]:
# Using three Decision Trees
dt1 = DecisionTreeClassifier(max_depth = 3,min_samples_split=10,max_features=4)
dt2 = DecisionTreeClassifier(max_depth = 3,min_samples_split=30,max_features=4)
dt3 = DecisionTreeClassifier(max_depth = 3,min_samples_split=50,max_features=4)
eclf = VotingClassifier(estimators=[('dt1', dt1), ('dt2', dt2), ('dt3', dt3)],voting='soft')
score = kfoldscore(eclf)

print(score)

0.8233333333333333


In [24]:
# Using Logistic Regression, Decision Tree and GaussianNB
clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(max_depth = 3,min_samples_split=30,max_features=4)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('dt', clf2), ('gnb', clf3)],voting='soft')
score = kfoldscore(eclf)

print(score)

0.8033333333333333


# Base Models

In [21]:
# Decision Tree 1
score = kfoldscore(dt1)
print('Decision Tree 1 Score: ', score)

Decision Tree 1 Score:  0.7766666666666667


In [22]:
# Decision Tree 2
score = kfoldscore(dt2)
print('Decision Tree 2 Score: ',score)

Decision Tree 2 Score:  0.7166666666666666


In [23]:
# Decision Tree 3
score = kfoldscore(dt3)
print('Decision Tree 3 Score: ',score)

Decision Tree 3 Score:  0.7933333333333332
