In [2]:
import pandas as pd
import numpy as np
import glob, os
import scipy
import warnings
warnings.filterwarnings("ignore")
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score

 **Data preprocessing for the CSV file**

In [3]:
d_path = os.getcwd()
f = pd.read_csv(d_path + '/chronic_kidney_disease_full.csv',sep =",")
f = f.replace({'\t': ''}, regex=True)
# print(f.dtypes) --- use this to see datatypes of the columns
###### 
# As we see from the dtypes wbcc and rbcc should be float64 but are instead object so 
#  next line converts them to numerical (according to the info file)
# Same is the case with columns sg,Al and Su, which should be nominal but are read as float 64 here so we 
# change them too.
#######
f['wbcc'] = pd.to_numeric(f['wbcc'],errors='coerce')
f['rbcc'] = pd.to_numeric(f['rbcc'],errors='coerce')
f['sg'] = f['sg'].astype('object',errors='ignore')
f['al'] = f['al'].astype('object',errors='ignore')
f['su'] = f['su'].astype('object',errors='ignore')



print('=======================================')
print('Column types according the chronic_kidney_disease.info are')
print(f.dtypes)

Column types according the chronic_kidney_disease.info are
age      float64
bp       float64
sg        object
al        object
su        object
rbc       object
pc        object
pcc       object
ba        object
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
htn       object
dm        object
cad       object
appet     object
pe        object
ane       object
class     object
dtype: object


In [4]:
#### please run this cell only after the cell above is run or else this won't print value counts 

obj = f.select_dtypes(['object']).columns

##### In next loop we convert all columns with dtype object to numerical codes
for col in obj:
    f[col] = f[col].astype('category').cat.codes
    print(col ,"\n",f[col].value_counts())
###### Now we replace all Nan in float columns with their mean
f = f.fillna(f.mean())

f.head()

sg 
  3    106
 1     84
 4     81
 2     75
-1     47
 0      7
Name: sg, dtype: int64
al 
  0    199
-1     46
 1     44
 3     43
 2     43
 4     24
 5      1
Name: al, dtype: int64
su 
  0    290
-1     49
 2     18
 3     14
 4     13
 1     13
 5      3
Name: su, dtype: int64
rbc 
  1    201
-1    152
 0     47
Name: rbc, dtype: int64
pc 
  1    259
 0     76
-1     65
Name: pc, dtype: int64
pcc 
  0    354
 1     42
-1      4
Name: pcc, dtype: int64
ba 
  0    374
 1     22
-1      4
Name: ba, dtype: int64
htn 
  0    251
 1    147
-1      2
Name: htn, dtype: int64
dm 
  1    261
 2    136
-1      2
 0      1
Name: dm, dtype: int64
cad 
  0    364
 1     34
-1      2
Name: cad, dtype: int64
appet 
  0    317
 1     82
-1      1
Name: appet, dtype: int64
pe 
  0    323
 1     76
-1      1
Name: pe, dtype: int64
ane 
  0    339
 1     60
-1      1
Name: ane, dtype: int64
class 
 0    250
1    150
Name: class, dtype: int64


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,3,1,0,-1,1,0,0,121.0,...,44.0,7800.0,5.2,1,2,0,0,0,0,0
1,7.0,50.0,3,4,0,-1,1,0,0,148.036517,...,38.0,6000.0,4.707435,0,1,0,0,0,0,0
2,62.0,80.0,1,2,3,1,1,0,0,423.0,...,31.0,7500.0,4.707435,0,2,0,1,0,1,0
3,48.0,70.0,0,4,0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,1,0,1,1,1,0
4,51.0,80.0,1,2,0,1,1,0,0,106.0,...,35.0,7300.0,4.6,0,1,0,0,0,0,0


**SVM Linear**

In [5]:
train_lin, test_lin = train_test_split(f, test_size=0.2,stratify = f['class'])

clf_lin = SVC(kernel = 'linear')
clf_lin.fit(train_lin.loc[:, train_lin.columns != 'class'], train_lin['class'])

lin_train = clf_lin.predict(train_lin.loc[:, train_lin.columns != 'class'])
lin_test = clf_lin.predict(test_lin.loc[:, test_lin.columns != 'class'])


print('F1 score for svm linear kernel on training data is: ',f1_score(train_lin['class'], lin_train))
print('F1 score for svm linear kernel on testing data is: ',f1_score(test_lin['class'], lin_test))    

F1 score for svm linear kernel on training data is:  0.995850622406639
F1 score for svm linear kernel on testing data is:  0.9836065573770492


**SVM RBF**

In [7]:
train_rbf, test_rbf = train_test_split(f, test_size = 0.2, stratify = f['class'])

clf_rbf = SVC(kernel = 'rbf')

clf_rbf.fit(train_rbf.loc[:, train_rbf.columns != 'class'], train_rbf['class'])

rbf_train = clf_rbf.predict(train_rbf.loc[:, train_rbf.columns != 'class'])
rbf_test = clf_rbf.predict(test_rbf.loc[:, test_rbf.columns != 'class'])

print('F1 score for svm rbf kernel on training data is: ',f1_score(train_rbf['class'], rbf_train))
print('F1 score for svm rbf kernel on testing data is: ',f1_score(test_rbf['class'], rbf_test))  

F1 score for svm rbf kernel on training data is:  1.0
F1 score for svm rbf kernel on testing data is:  0.0


It looks like here the rbf kernel overfits the training data with default parameters, as most of the times we get f1 score as 1.0 on training data but 0.0 on test data.

**Randomforest**

In [11]:
train_rand, test_rand = train_test_split(f, test_size=0.2,stratify = f['class'])

clf_rand = RandomForestClassifier()
clf_rand.fit(train_rand.loc[:, train_rand.columns != 'class'], train_rand['class'])

rand_train = clf_rand.predict(train_rand.loc[:, train_rand.columns != 'class'])
rand_test = clf_rand.predict(test_rand.loc[:, test_rand.columns != 'class'])

print('F1 score for random forest on training data is: ',f1_score(train_rand['class'], rand_train))
print('F1 score for random forest on testing data is: ',f1_score(test_rand['class'], rand_test)) 

F1 score for random forest on training data is:  1.0
F1 score for random forest on testing data is:  0.983050847457627
