In [64]:
import pandas as pd
import numpy as np

# Show every column and every row when a DataFrame is displayed.
# Fully qualified option names are required: the short patterns 'max_columns'
# and 'max_rows' also match the 'styler.render.*' options in recent pandas
# releases, which makes set_option raise OptionError (multiple keys matched).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [36]:
# Load the whole file with every column typed as str; the cells below
# compare cells against the literal '?' marker.
raw = pd.read_csv(filepath_or_buffer='ckd.csv', dtype=str)
raw.head(n=5)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,49,100,1.01,3,0,abnormal,abnormal,notpresent,notpresent,129,158,11.8,122,3.2,8.1,24,9600,3.5,yes,yes,no,poor,yes,yes,ckd
1,51,80,1.02,0,0,normal,normal,notpresent,notpresent,94,15,1.2,144,3.7,15.5,46,9500,6.4,no,no,no,good,no,no,notckd
2,20,70,1.02,0,0,normal,normal,notpresent,notpresent,123,44,1.0,135,3.8,14.6,44,5500,4.8,no,no,no,good,no,no,notckd
3,29,70,1.02,0,0,normal,normal,notpresent,notpresent,127,44,1.2,145,5.0,14.8,48,?,?,no,no,no,good,no,no,notckd
4,65,80,1.015,2,1,normal,normal,present,notpresent,215,133,2.5,?,?,13.2,41,?,?,no,yes,no,good,no,no,ckd


### Missing Data Imputation and Encoding

Displaying the rows that have more than 10 missing values (marked with `?`).

In [37]:
# Count the '?' markers per row and keep the rows with more than 10 of them.
raw.loc[raw.eq('?').sum(axis=1).gt(10)]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
67,69,60,?,?,?,?,?,notpresent,notpresent,171,26,48.1,?,?,?,?,?,?,yes,no,no,poor,no,no,ckd
127,60,70,?,?,?,?,?,notpresent,notpresent,124,52,2.5,?,?,?,?,?,?,yes,no,no,good,no,no,ckd
130,56,80,?,?,?,?,?,notpresent,notpresent,415,37,1.9,?,?,?,?,?,?,no,yes,no,good,no,no,ckd
149,55,90,?,?,?,?,?,notpresent,notpresent,143,88,2,?,?,?,?,?,?,yes,yes,no,poor,yes,no,ckd
202,74,60,?,?,?,?,?,notpresent,notpresent,108,68,1.8,?,?,?,?,?,?,yes,yes,no,good,no,no,ckd
208,60,80,1.02,0,2,?,?,notpresent,notpresent,?,?,?,?,?,?,?,?,?,no,yes,no,good,no,no,ckd
388,59,100,?,?,?,?,?,notpresent,notpresent,?,96,6.4,?,?,6.6,?,?,?,yes,yes,no,good,no,yes,ckd


Imputing missing numeric values with the column mean computed over the rows that share the same class label.

In [38]:
# Convert the numeric columns from text to numbers; anything that cannot be
# parsed (such as the '?' marker) becomes NaN because of errors='coerce'.
raw_num = raw.loc[:, ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod',
                      'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)
raw_num.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,49.0,100.0,1.01,3.0,0.0,129.0,158.0,11.8,122.0,3.2,8.1,24.0,9600.0,3.5
1,51.0,80.0,1.02,0.0,0.0,94.0,15.0,1.2,144.0,3.7,15.5,46.0,9500.0,6.4
2,20.0,70.0,1.02,0.0,0.0,123.0,44.0,1.0,135.0,3.8,14.6,44.0,5500.0,4.8
3,29.0,70.0,1.02,0.0,0.0,127.0,44.0,1.2,145.0,5.0,14.8,48.0,,
4,65.0,80.0,1.015,2.0,1.0,215.0,133.0,2.5,,,13.2,41.0,,


In [67]:
# Keep the class label as its own Series so it can be re-attached later.
raw_label = raw.loc[:, 'class']
raw_label.head()

0       ckd
1    notckd
2    notckd
3    notckd
4       ckd
Name: class, dtype: object

In [70]:
# Put the label next to the numeric columns (as the last column) so the
# rows can be split by class.
raw_numlabel = pd.concat(objs=[raw_num, raw_label], axis='columns')
raw_numlabel.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,class
0,49.0,100.0,1.01,3.0,0.0,129.0,158.0,11.8,122.0,3.2,8.1,24.0,9600.0,3.5,ckd
1,51.0,80.0,1.02,0.0,0.0,94.0,15.0,1.2,144.0,3.7,15.5,46.0,9500.0,6.4,notckd
2,20.0,70.0,1.02,0.0,0.0,123.0,44.0,1.0,135.0,3.8,14.6,44.0,5500.0,4.8,notckd
3,29.0,70.0,1.02,0.0,0.0,127.0,44.0,1.2,145.0,5.0,14.8,48.0,,,notckd
4,65.0,80.0,1.015,2.0,1.0,215.0,133.0,2.5,,,13.2,41.0,,,ckd


In [75]:
# Numeric columns of the 'ckd' rows only; the label column is dropped again
# once it has served as the row filter.
num_positive = (
    raw_numlabel
    .loc[raw_numlabel['class'].eq('ckd')]
    .drop(columns='class')
)
num_positive.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,49.0,100.0,1.01,3.0,0.0,129.0,158.0,11.8,122.0,3.2,8.1,24.0,9600.0,3.5
4,65.0,80.0,1.015,2.0,1.0,215.0,133.0,2.5,,,13.2,41.0,,
5,,70.0,1.01,0.0,2.0,220.0,68.0,2.8,,,8.7,27.0,,
6,56.0,90.0,1.015,2.0,0.0,129.0,107.0,6.7,131.0,4.8,9.1,29.0,6400.0,3.4
7,4.0,,1.02,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,34.0,,


In [76]:
# Fill each column's NaNs with that column's mean over the 'ckd' rows.
new_num_positive = num_positive.fillna(num_positive.mean())
new_num_positive.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,49.0,100.0,1.01,3.0,0.0,129.0,158.0,11.8,122.0,3.2,8.1,24.0,9600.0,3.5
4,65.0,80.0,1.015,2.0,1.0,215.0,133.0,2.5,133.901786,4.878443,13.2,41.0,9069.536424,3.945238
5,54.541322,70.0,1.01,0.0,2.0,220.0,68.0,2.8,133.901786,4.878443,8.7,27.0,9069.536424,3.945238
6,56.0,90.0,1.015,2.0,0.0,129.0,107.0,6.7,131.0,4.8,9.1,29.0,6400.0,3.4
7,4.0,79.625,1.02,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,34.0,9069.536424,3.945238


In [77]:
# Numeric columns of the 'notckd' rows only; the label column is dropped
# again once it has served as the row filter.
num_negative = (
    raw_numlabel
    .loc[raw_numlabel['class'].eq('notckd')]
    .drop(columns='class')
)
num_negative.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
1,51.0,80.0,1.02,0.0,0.0,94.0,15.0,1.2,144.0,3.7,15.5,46.0,9500.0,6.4
2,20.0,70.0,1.02,0.0,0.0,123.0,44.0,1.0,135.0,3.8,14.6,44.0,5500.0,4.8
3,29.0,70.0,1.02,0.0,0.0,127.0,44.0,1.2,145.0,5.0,14.8,48.0,,
10,60.0,70.0,1.02,0.0,0.0,,,,,,16.4,43.0,10800.0,5.7
11,17.0,60.0,1.025,0.0,0.0,114.0,50.0,1.0,135.0,4.9,14.2,51.0,7200.0,5.9


In [78]:
# Fill each column's NaNs with that column's mean over the 'notckd' rows.
new_num_negative = num_negative.fillna(num_negative.mean())
new_num_negative.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
1,51.0,80.0,1.02,0.0,0.0,94.0,15.0,1.2,144.0,3.7,15.5,46.0,9500.0,6.4
2,20.0,70.0,1.02,0.0,0.0,123.0,44.0,1.0,135.0,3.8,14.6,44.0,5500.0,4.8
3,29.0,70.0,1.02,0.0,0.0,127.0,44.0,1.2,145.0,5.0,14.8,48.0,7705.594406,5.379021
10,60.0,70.0,1.02,0.0,0.0,107.722222,32.798611,0.868966,141.731034,4.337931,16.4,43.0,10800.0,5.7
11,17.0,60.0,1.025,0.0,0.0,114.0,50.0,1.0,135.0,4.9,14.2,51.0,7200.0,5.9


In [85]:
# Stack the two imputed halves and sort by index to restore the original
# row order.
num = pd.concat([new_num_positive, new_num_negative]).sort_index()
num.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,49.0,100.0,1.01,3.0,0.0,129.0,158.0,11.8,122.0,3.2,8.1,24.0,9600.0,3.5
1,51.0,80.0,1.02,0.0,0.0,94.0,15.0,1.2,144.0,3.7,15.5,46.0,9500.0,6.4
2,20.0,70.0,1.02,0.0,0.0,123.0,44.0,1.0,135.0,3.8,14.6,44.0,5500.0,4.8
3,29.0,70.0,1.02,0.0,0.0,127.0,44.0,1.2,145.0,5.0,14.8,48.0,7705.594406,5.379021
4,65.0,80.0,1.015,2.0,1.0,215.0,133.0,2.5,133.901786,4.878443,13.2,41.0,9069.536424,3.945238


Imputing categorical values with different strategies.

In [86]:
# Categorical columns plus the label; NaN cells are mapped onto the same
# '?' marker that the file already uses for missing entries.
cat = raw.loc[:, ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe',
                  'ane', 'class']].fillna('?')
cat.head()

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,abnormal,abnormal,notpresent,notpresent,yes,yes,no,poor,yes,yes,ckd
1,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
2,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
3,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
4,normal,normal,present,notpresent,no,yes,no,good,no,no,ckd


Treating missing values as a separate `unknown` category and applying one-hot encoding.

In [87]:
# The two columns whose '?' entries are kept as a category of their own.
cat1 = cat.loc[:, ['rbc', 'pc']]
cat1.head()

Unnamed: 0,rbc,pc
0,abnormal,abnormal
1,normal,normal
2,normal,normal
3,normal,normal
4,normal,normal


In [88]:
# One-hot encode 'rbc' and 'pc', keeping '?' as an explicit "unknown" level.
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version
# (pandas >= 2.0 would otherwise produce bool columns, i.e. True/False in the
# exported CSV).
bin1 = pd.get_dummies(cat1, dtype='uint8')
bin1 = bin1.rename(columns={'rbc_?': 'rbc_unknown', 'pc_?': 'pc_unknown'})
bin1.head()

Unnamed: 0,rbc_unknown,rbc_abnormal,rbc_normal,pc_unknown,pc_abnormal,pc_normal
0,0,1,0,0,1,0
1,0,0,1,0,0,1
2,0,0,1,0,0,1
3,0,0,1,0,0,1
4,0,0,1,0,0,1


Imputing with the mode computed over the rows with the same label, then encoding each column as a binary value.

In [89]:
# The remaining categorical columns, with the label kept for the per-class
# split below.
cat2 = cat.loc[:, ['pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']]
cat2.head()

Unnamed: 0,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,notpresent,notpresent,yes,yes,no,poor,yes,yes,ckd
1,notpresent,notpresent,no,no,no,good,no,no,notckd
2,notpresent,notpresent,no,no,no,good,no,no,notckd
3,notpresent,notpresent,no,no,no,good,no,no,notckd
4,present,notpresent,no,yes,no,good,no,no,ckd


In [90]:
# Work on an explicit copy, then mark two cells of row 283 as missing.
# NOTE(review): the row label 283 is hard-coded; presumably the 'appet' and
# 'pe' entries of that row are invalid in the source file. Confirm and record why.
cat2 = cat2.copy()
cat2.loc[283, ['appet', 'pe']] = '?'

There are no missing values in positive instances.

In [92]:
# Categorical rows labelled 'ckd'.
cat2_positive = cat2.loc[cat2['class'].eq('ckd')]
cat2_positive.head()

Unnamed: 0,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,notpresent,notpresent,yes,yes,no,poor,yes,yes,ckd
4,present,notpresent,no,yes,no,good,no,no,ckd
5,notpresent,notpresent,yes,yes,no,good,no,yes,ckd
6,notpresent,notpresent,yes,no,no,good,no,no,ckd
7,notpresent,notpresent,no,no,no,good,no,no,ckd


In [93]:
# Show every 'ckd' row that contains at least one '?'. The previous "> 1"
# threshold hid rows with exactly one missing value; those rows matter because
# the positive rows are concatenated later without any imputation.
cat2_positive[(cat2_positive == '?').any(axis=1)]

Unnamed: 0,pcc,ba,htn,dm,cad,appet,pe,ane,class


Eight negative instances have more than one missing value.

In [94]:
# Categorical rows labelled 'notckd'.
cat2_negative = cat2.loc[cat2['class'].eq('notckd')]
cat2_negative.head()

Unnamed: 0,pcc,ba,htn,dm,cad,appet,pe,ane,class
1,notpresent,notpresent,no,no,no,good,no,no,notckd
2,notpresent,notpresent,no,no,no,good,no,no,notckd
3,notpresent,notpresent,no,no,no,good,no,no,notckd
10,notpresent,notpresent,no,no,no,good,no,no,notckd
11,notpresent,notpresent,no,no,no,good,no,no,notckd


In [95]:
# 'notckd' rows that contain more than one '?' marker.
cat2_negative.loc[cat2_negative.eq('?').sum(axis=1).gt(1)]

Unnamed: 0,pcc,ba,htn,dm,cad,appet,pe,ane,class
59,?,?,no,no,no,good,no,no,notckd
86,notpresent,notpresent,?,?,?,good,no,no,notckd
121,?,?,no,no,no,good,no,no,notckd
182,notpresent,notpresent,?,?,?,good,no,no,notckd
191,?,?,no,no,no,good,no,no,notckd
261,?,?,no,no,no,good,no,no,notckd
274,notpresent,notpresent,no,no,no,?,?,?,notckd
283,notpresent,notpresent,no,?,no,?,?,no,notckd


Imputing with the most frequent value.

In [96]:
# Turn the '?' markers into real NaNs, then fill each column with its most
# frequent value among the 'notckd' rows (value_counts() sorts by descending
# count, so index[0] is the mode).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
new_cat2_negative = cat2_negative.replace('?', np.nan)
new_cat2_negative = new_cat2_negative.apply(lambda x: x.fillna(x.value_counts().index[0]))
new_cat2_negative.head()

Unnamed: 0,pcc,ba,htn,dm,cad,appet,pe,ane,class
1,notpresent,notpresent,no,no,no,good,no,no,notckd
2,notpresent,notpresent,no,no,no,good,no,no,notckd
3,notpresent,notpresent,no,no,no,good,no,no,notckd
10,notpresent,notpresent,no,no,no,good,no,no,notckd
11,notpresent,notpresent,no,no,no,good,no,no,notckd


In [97]:
# Recombine the untouched 'ckd' rows with the imputed 'notckd' rows, restore
# the original row order, and remove the label column.
new_cat2 = (
    pd.concat([cat2_positive, new_cat2_negative])
    .sort_index()
    .drop(columns='class')
)
new_cat2.head()

Unnamed: 0,pcc,ba,htn,dm,cad,appet,pe,ane
0,notpresent,notpresent,yes,yes,no,poor,yes,yes
1,notpresent,notpresent,no,no,no,good,no,no
2,notpresent,notpresent,no,no,no,good,no,no
3,notpresent,notpresent,no,no,no,good,no,no
4,present,notpresent,no,yes,no,good,no,no


Encoding with binary values.

In [98]:
# One-hot encode, then keep a single indicator per column so that each
# feature becomes one binary column.
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version
# (pandas >= 2.0 would otherwise produce bool columns).
bin2 = pd.get_dummies(new_cat2, dtype='uint8')
bin2 = bin2[['pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes', 'appet_good', 'pe_yes', 'ane_yes']]
bin2.head()

Unnamed: 0,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_good,pe_yes,ane_yes
0,0,0,1,1,0,0,1,1
1,0,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0
4,1,0,0,1,0,1,0,0


Encoding the label as a binary value.

In [99]:
# The label is the 'class' column, which is the last column of `cat`.
label = cat['class']
label.head()

0       ckd
1    notckd
2    notckd
3    notckd
4       ckd
Name: class, dtype: object

In [100]:
# Binary target: 1 for 'ckd', 0 otherwise, stored as uint8 in a Series named 'ckd'.
# It is derived directly from cat['class'] rather than from `label` itself, so
# re-running this cell cannot re-encode the 0/1 values and invert the labels.
# It also does not rely on 'ckd' being the first dummy column.
label = cat['class'].eq('ckd').astype('uint8').rename('ckd')
label.head()

0    1
1    0
2    0
3    0
4    1
Name: ckd, dtype: uint8

Merging together.

In [101]:
# Assemble the feature matrix: imputed numeric columns, the one-hot block
# and the binary block, side by side.
feature = pd.concat(objs=[num, bin1, bin2], axis='columns')
feature.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,rbc_unknown,rbc_abnormal,rbc_normal,pc_unknown,pc_abnormal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_good,pe_yes,ane_yes
0,49.0,100.0,1.01,3.0,0.0,129.0,158.0,11.8,122.0,3.2,8.1,24.0,9600.0,3.5,0,1,0,0,1,0,0,0,1,1,0,0,1,1
1,51.0,80.0,1.02,0.0,0.0,94.0,15.0,1.2,144.0,3.7,15.5,46.0,9500.0,6.4,0,0,1,0,0,1,0,0,0,0,0,1,0,0
2,20.0,70.0,1.02,0.0,0.0,123.0,44.0,1.0,135.0,3.8,14.6,44.0,5500.0,4.8,0,0,1,0,0,1,0,0,0,0,0,1,0,0
3,29.0,70.0,1.02,0.0,0.0,127.0,44.0,1.2,145.0,5.0,14.8,48.0,7705.594406,5.379021,0,0,1,0,0,1,0,0,0,0,0,1,0,0
4,65.0,80.0,1.015,2.0,1.0,215.0,133.0,2.5,133.901786,4.878443,13.2,41.0,9069.536424,3.945238,0,0,1,0,0,1,1,0,0,1,0,1,0,0


In [102]:
# Append the binary target as the last column.
result = pd.concat(objs=[feature, label], axis='columns')
result.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,rbc_unknown,rbc_abnormal,rbc_normal,pc_unknown,pc_abnormal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_good,pe_yes,ane_yes,ckd
0,49.0,100.0,1.01,3.0,0.0,129.0,158.0,11.8,122.0,3.2,8.1,24.0,9600.0,3.5,0,1,0,0,1,0,0,0,1,1,0,0,1,1,1
1,51.0,80.0,1.02,0.0,0.0,94.0,15.0,1.2,144.0,3.7,15.5,46.0,9500.0,6.4,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0
2,20.0,70.0,1.02,0.0,0.0,123.0,44.0,1.0,135.0,3.8,14.6,44.0,5500.0,4.8,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0
3,29.0,70.0,1.02,0.0,0.0,127.0,44.0,1.2,145.0,5.0,14.8,48.0,7705.594406,5.379021,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0
4,65.0,80.0,1.015,2.0,1.0,215.0,133.0,2.5,133.901786,4.878443,13.2,41.0,9069.536424,3.945238,0,0,1,0,0,1,1,0,0,1,0,1,0,0,1


In [104]:
# Write the imputed, encoded table to disk.
# NOTE(review): index=None is falsy, so the row index is presumably omitted
# from the file; index=False would state that intent explicitly.
result.to_csv(path_or_buf='ckd_imputed_9.csv', index=None)