In [43]:
import pandas as pd
import numpy as np

# Show every column and every row when a frame is displayed.
# The fully qualified 'display.*' keys are required: option names are matched as
# patterns, and the short forms 'max_columns' / 'max_rows' also match the
# 'styler.render.*' options on newer pandas, which raises an OptionError.
pd.set_option('display.max_columns', None, 'display.max_rows', None)

In [44]:
# Load the data set with every column kept as text, so the '?' markers used for
# missing values can be found by plain string comparison in the cells below.
raw1 = pd.read_csv(
    'ckd.csv',
    dtype=str,
)
raw1.head(5)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,49,100,1.01,3,0,abnormal,abnormal,notpresent,notpresent,129,158,11.8,122,3.2,8.1,24,9600,3.5,yes,yes,no,poor,yes,yes,ckd
1,51,80,1.02,0,0,normal,normal,notpresent,notpresent,94,15,1.2,144,3.7,15.5,46,9500,6.4,no,no,no,good,no,no,notckd
2,20,70,1.02,0,0,normal,normal,notpresent,notpresent,123,44,1.0,135,3.8,14.6,44,5500,4.8,no,no,no,good,no,no,notckd
3,29,70,1.02,0,0,normal,normal,notpresent,notpresent,127,44,1.2,145,5.0,14.8,48,?,?,no,no,no,good,no,no,notckd
4,65,80,1.015,2,1,normal,normal,present,notpresent,215,133,2.5,?,?,13.2,41,?,?,no,yes,no,good,no,no,ckd


### Missing Data Imputation and Encoding

Identify the features (columns) in which more than 100 values are missing (recorded as `?`).

In [45]:
# Number of '?' placeholders found in each column.
feature_stats = raw1.eq('?').sum()
# Names of the columns that contain more than 100 placeholders.
feature2drop = feature_stats.index[feature_stats > 100].tolist()
feature2drop

['rbc', 'wbcc', 'rbcc']

In [46]:
# Remove the sparsely populated columns selected above; raw1 itself is left intact.
raw2 = raw1.drop(columns=feature2drop)
raw2.head(5)

Unnamed: 0,age,bp,sg,al,su,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,htn,dm,cad,appet,pe,ane,class
0,49,100,1.01,3,0,abnormal,notpresent,notpresent,129,158,11.8,122,3.2,8.1,24,yes,yes,no,poor,yes,yes,ckd
1,51,80,1.02,0,0,normal,notpresent,notpresent,94,15,1.2,144,3.7,15.5,46,no,no,no,good,no,no,notckd
2,20,70,1.02,0,0,normal,notpresent,notpresent,123,44,1.0,135,3.8,14.6,44,no,no,no,good,no,no,notckd
3,29,70,1.02,0,0,normal,notpresent,notpresent,127,44,1.2,145,5.0,14.8,48,no,no,no,good,no,no,notckd
4,65,80,1.015,2,1,normal,present,notpresent,215,133,2.5,?,?,13.2,41,no,yes,no,good,no,no,ckd


In [47]:
# Keep only the rows that have no '?' placeholder in any remaining column.
# The original row labels are preserved, so the index has gaps afterwards.
raw3 = raw2.loc[~raw2.eq('?').any(axis='columns')]
raw3.head(5)

Unnamed: 0,age,bp,sg,al,su,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,htn,dm,cad,appet,pe,ane,class
0,49,100,1.01,3,0,abnormal,notpresent,notpresent,129,158,11.8,122,3.2,8.1,24,yes,yes,no,poor,yes,yes,ckd
1,51,80,1.02,0,0,normal,notpresent,notpresent,94,15,1.2,144,3.7,15.5,46,no,no,no,good,no,no,notckd
2,20,70,1.02,0,0,normal,notpresent,notpresent,123,44,1.0,135,3.8,14.6,44,no,no,no,good,no,no,notckd
3,29,70,1.02,0,0,normal,notpresent,notpresent,127,44,1.2,145,5.0,14.8,48,no,no,no,good,no,no,notckd
6,56,90,1.015,2,0,abnormal,notpresent,notpresent,129,107,6.7,131,4.8,9.1,29,yes,no,no,good,no,no,ckd


In [52]:
# Numeric measurements, selected by name in their original column order.
# The values are still text here because the file was read with dtype=str.
num = raw3.loc[:, [
    'age', 'bp', 'sg', 'al', 'su', 'bgr',
    'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv',
]]
num.head(5)

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv
0,49,100,1.01,3,0,129,158,11.8,122,3.2,8.1,24
1,51,80,1.02,0,0,94,15,1.2,144,3.7,15.5,46
2,20,70,1.02,0,0,123,44,1.0,135,3.8,14.6,44
3,29,70,1.02,0,0,127,44,1.2,145,5.0,14.8,48
6,56,90,1.015,2,0,129,107,6.7,131,4.8,9.1,29


In [55]:
# Categorical (two-level) features, selected by name in their original column order.
cat = raw3.loc[:, [
    'pc', 'pcc', 'ba',
    'htn', 'dm', 'cad',
    'appet', 'pe', 'ane',
]]
cat.head(5)

Unnamed: 0,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,abnormal,notpresent,notpresent,yes,yes,no,poor,yes,yes
1,normal,notpresent,notpresent,no,no,no,good,no,no
2,normal,notpresent,notpresent,no,no,no,good,no,no
3,normal,notpresent,notpresent,no,no,no,good,no,no
6,abnormal,notpresent,notpresent,yes,no,no,good,no,no


In [58]:
# Work on a copy so the assignments below do not write into `cat`.
cat2 = cat.copy()
# Overwrite 'appet' and 'pe' with the '?' placeholder for the row labelled 283.
# NOTE(review): why row 283 is singled out is not shown in this notebook —
# presumably its recorded values are invalid; confirm against ckd.csv.
# NOTE(review): if label 283 is not present in cat2's index, .loc will append a
# new row instead of editing an existing one — verify the label exists.
cat2.loc[283,'appet'] = '?'
cat2.loc[283,'pe'] = '?'

In [61]:
# One-hot encode every categorical column, producing one '<column>_<level>'
# indicator per observed level.
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version;
# pandas >= 2.0 would otherwise emit bool columns, and the exported CSV would
# contain True/False instead of 0/1.
bin1 = pd.get_dummies(cat2, dtype=np.uint8)
# Keep a single indicator per feature. Any other level of that feature
# (including the '?' placeholder) therefore shows up as 0 in the kept column.
bin1 = bin1[['pc_normal', 'pcc_present', 'ba_present', 'htn_yes', 'dm_yes',
             'cad_yes', 'appet_good', 'pe_yes', 'ane_yes']]
bin1.head()

Unnamed: 0,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_good,pe_yes,ane_yes
0,0,0,0,1,1,0,0,1,1
1,1,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,1,0,0
6,0,0,0,1,0,0,1,0,0


Encode the class label as a binary value (1 = `ckd`, 0 = `notckd`).

In [62]:
# The target is the last column of the cleaned frame.
label = raw3[raw3.columns[-1]]
label.head(5)

0       ckd
1    notckd
2    notckd
3    notckd
6       ckd
Name: class, dtype: object

In [63]:
# Binary target: 1 where the class (last column of raw3) is 'ckd', 0 otherwise.
# It is derived from raw3 rather than from the previous value of `label`, so
# re-running this cell cannot re-encode an already encoded series and flip the
# labels. The explicit comparison also removes the reliance on the alphabetical
# order of the dummy columns, and the uint8 cast keeps the values 0/1 regardless
# of the pandas version.
label = raw3.iloc[:, -1].eq('ckd').astype('uint8').rename('ckd')
label.head()

0    1
1    0
2    0
3    0
6    1
Name: ckd, dtype: uint8

Merge the numeric features, the binary-encoded categorical features, and the label into a single table.

In [64]:
# Place the numeric columns and the categorical indicators side by side,
# aligned on the row labels.
feature = pd.concat(objs=[num, bin1], axis='columns')
feature.head(5)

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_good,pe_yes,ane_yes
0,49,100,1.01,3,0,129,158,11.8,122,3.2,8.1,24,0,0,0,1,1,0,0,1,1
1,51,80,1.02,0,0,94,15,1.2,144,3.7,15.5,46,1,0,0,0,0,0,1,0,0
2,20,70,1.02,0,0,123,44,1.0,135,3.8,14.6,44,1,0,0,0,0,0,1,0,0
3,29,70,1.02,0,0,127,44,1.2,145,5.0,14.8,48,1,0,0,0,0,0,1,0,0
6,56,90,1.015,2,0,129,107,6.7,131,4.8,9.1,29,0,0,0,1,0,0,1,0,0


In [65]:
# Append the binary target as the final column, aligned on the row labels.
result = pd.concat(objs=[feature, label], axis='columns')
result.head(5)

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_good,pe_yes,ane_yes,ckd
0,49,100,1.01,3,0,129,158,11.8,122,3.2,8.1,24,0,0,0,1,1,0,0,1,1,1
1,51,80,1.02,0,0,94,15,1.2,144,3.7,15.5,46,1,0,0,0,0,0,1,0,0,0
2,20,70,1.02,0,0,123,44,1.0,135,3.8,14.6,44,1,0,0,0,0,0,1,0,0,0
3,29,70,1.02,0,0,127,44,1.2,145,5.0,14.8,48,1,0,0,0,0,0,1,0,0,0
6,56,90,1.015,2,0,129,107,6.7,131,4.8,9.1,29,0,0,0,1,0,0,1,0,0,1


In [67]:
# Export the cleaned, encoded table without the row labels.
# `index` is a boolean parameter; False states the intent explicitly instead of
# relying on None being treated as falsy.
result.to_csv('ckd_imputed_5.csv', index=False)