In [2]:
import pandas as pd
import numpy as np

pd.set_option('max_columns', None, 'max_rows', None)

### Training data 

In [3]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,20,60,1.025,0,0,normal,normal,notpresent,notpresent,?,?,?,137,4.7,14,41,4500,5.5,no,no,no,good,no,no,notckd
1,62,?,1.015,3,0,abnormal,?,notpresent,notpresent,?,?,?,?,?,14.3,42,10200,4.8,yes,yes,no,good,no,no,ckd
2,66,70,1.015,2,5,?,normal,notpresent,notpresent,447,41,1.7,131,3.9,12.5,33,9600,4.4,yes,yes,no,good,no,no,ckd
3,34,70,1.025,0,0,normal,normal,notpresent,notpresent,87,38,0.5,144,4.8,17.1,47,7400,6.1,no,no,no,good,no,no,notckd
4,72,90,?,?,?,?,?,notpresent,notpresent,308,36,2.5,131,4.3,?,?,?,?,yes,yes,no,poor,no,no,ckd


In [5]:
train.describe()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
count,250,250,250.0,250,250,250,250,250,250,250,250,250.0,250,250,250,250,250,250,250,249,250,250,250,250,250
unique,70,9,6.0,7,7,3,3,2,2,114,101,71.0,33,39,101,42,71,44,3,3,3,4,4,3,2
top,60,80,1.02,0,0,normal,normal,notpresent,notpresent,?,?,1.2,?,?,?,?,?,?,no,no,no,good,no,no,ckd
freq,12,78,61.0,118,184,122,154,226,234,32,12,29.0,52,52,35,49,72,87,155,167,233,193,200,205,158


In [3]:
train_num = train[['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 
                   'hemo', 'pcv', 'wbcc', 'rbcc']].apply(pd.to_numeric, errors='coerce')

In [4]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

In [5]:
train_fillna = pd.DataFrame(imputer.fit_transform(train_num), columns=train_num.columns)
train_fillna.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,20.0,60.0,1.025,0.0,0.0,143.243119,61.746218,3.044561,137.0,4.7,14.0,41.0,4500.0,5.5
1,62.0,76.182573,1.015,3.0,0.0,143.243119,61.746218,3.044561,137.681818,4.605051,14.3,42.0,10200.0,4.8
2,66.0,70.0,1.015,2.0,5.0,447.0,41.0,1.7,131.0,3.9,12.5,33.0,9600.0,4.4
3,34.0,70.0,1.025,0.0,0.0,87.0,38.0,0.5,144.0,4.8,17.1,47.0,7400.0,6.1
4,72.0,90.0,1.017292,1.023041,0.369159,308.0,36.0,2.5,131.0,4.3,12.388372,38.895522,8279.213483,4.713497


In [6]:
train_cat = train[['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']]

In [7]:
train_bin = pd.get_dummies(train_cat)
train_bin = train_bin.drop(['rbc_?', 'pc_?', 'htn_?', 'dm_?', 'cad_?', 
                            'appet_?', 'pe_?', 'ane_?'], axis=1)
train_bin = train_bin.apply(pd.to_numeric, errors='coerce')
train_bin.head()

Unnamed: 0,rbc_abnormal,rbc_normal,pc_abnormal,pc_normal,pcc_notpresent,pcc_present,ba_notpresent,ba_present,htn_no,htn_yes,dm_no,dm_yes,cad_no,cad_yes,appet_good,appet_no,appet_poor,pe_good,pe_no,pe_yes,ane_no,ane_yes
0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0
1,1,0,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0
2,0,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0
3,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0
4,0,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,1,0,1,0,1,0


In [8]:
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, LabelEncoder
ohe = OneHotEncoder()
lb = LabelBinarizer()
le = LabelEncoder()

In [9]:
train_label = pd.DataFrame(le.fit_transform(train['class']), columns=['class'])
train_label.head()

Unnamed: 0,class
0,1
1,0
2,0
3,1
4,0


In [10]:
newtrain = pd.merge(
                pd.merge(train_fillna, train_bin, left_index=True, right_index=True), 
                train_label, left_index=True, right_index=True)
newtrain.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,rbc_abnormal,rbc_normal,pc_abnormal,pc_normal,pcc_notpresent,pcc_present,ba_notpresent,ba_present,htn_no,htn_yes,dm_no,dm_yes,cad_no,cad_yes,appet_good,appet_no,appet_poor,pe_good,pe_no,pe_yes,ane_no,ane_yes,class
0,20.0,60.0,1.025,0.0,0.0,143.243119,61.746218,3.044561,137.0,4.7,14.0,41.0,4500.0,5.5,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0,1
1,62.0,76.182573,1.015,3.0,0.0,143.243119,61.746218,3.044561,137.681818,4.605051,14.3,42.0,10200.0,4.8,1,0,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0
2,66.0,70.0,1.015,2.0,5.0,447.0,41.0,1.7,131.0,3.9,12.5,33.0,9600.0,4.4,0,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0
3,34.0,70.0,1.025,0.0,0.0,87.0,38.0,0.5,144.0,4.8,17.1,47.0,7400.0,6.1,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0,1
4,72.0,90.0,1.017292,1.023041,0.369159,308.0,36.0,2.5,131.0,4.3,12.388372,38.895522,8279.213483,4.713497,0,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,1,0,1,0,1,0,0


In [11]:
trainX = newtrain.iloc[:,:-1].as_matrix()
trainy = newtrain.iloc[:,-1].as_matrix()

### Testing data

In [6]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,55,80,1.02,0,0,normal,normal,notpresent,notpresent,140,49,0.5,150,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
1,60,100,1.025,0,3,?,normal,notpresent,notpresent,263,27,1.3,135,4.3,12.7,37,11400,4.3,yes,yes,yes,good,no,no,ckd
2,45,70,1.025,2,0,normal,abnormal,present,notpresent,117,52,2.2,136,3.8,10.0,30,19100,3.7,no,no,no,good,no,no,ckd
3,65,70,1.01,2,0,?,normal,present,notpresent,112,73,3.3,?,?,10.9,37,?,?,no,no,no,good,no,no,ckd
4,25,70,1.02,0,0,normal,normal,notpresent,notpresent,88,42,0.5,136,3.5,13.3,48,7000,4.9,no,no,no,good,no,no,notckd


In [7]:
test.describe()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
count,150,150,150.0,150,150,150,150,150,150,150,150,150.0,150,150,150,150,150,150,150,150,150,150,150,150,150
unique,57,10,6.0,6,7,3,3,3,3,93,66,49.0,26,31,78,37,66,39,3,4,4,2,2,2,2
top,60,70,1.02,0,0,normal,normal,notpresent,notpresent,?,25,0.7,?,?,?,?,?,?,no,no,no,good,no,no,ckd
freq,7,50,45.0,81,106,79,105,128,140,12,7,12.0,35,36,17,21,34,44,96,93,130,123,122,134,92


In [13]:
test_num = test[['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 
                   'hemo', 'pcv', 'wbcc', 'rbcc']].apply(pd.to_numeric, errors='coerce')

In [14]:
test_fillna = pd.DataFrame(imputer.fit_transform(test_num), columns=test_num.columns)
test_fillna.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,55.0,80.0,1.02,0.0,0.0,140.0,49.0,0.5,150.0,4.9,15.7,47.0,6700.0,4.9
1,60.0,100.0,1.025,0.0,3.0,263.0,27.0,1.3,135.0,4.3,12.7,37.0,11400.0,4.3
2,45.0,70.0,1.025,2.0,0.0,117.0,52.0,2.2,136.0,3.8,10.0,30.0,19100.0,3.7
3,65.0,70.0,1.01,2.0,0.0,112.0,73.0,3.3,137.265217,4.665789,10.9,37.0,8600.862069,4.698113
4,25.0,70.0,1.02,0.0,0.0,88.0,42.0,0.5,136.0,3.5,13.3,48.0,7000.0,4.9


In [15]:
test_cat = test[['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']]

In [16]:
test_bin = pd.get_dummies(test_cat)
test_bin = test_bin.drop(['rbc_?', 'pc_?', 'pcc_?', 'ba_?', 'htn_?', 
                          'dm_?', 'cad_?'], axis=1)
test_bin = test_bin.apply(pd.to_numeric, errors='coerce')
test_bin.head()

Unnamed: 0,rbc_abnormal,rbc_normal,pc_abnormal,pc_normal,pcc_notpresent,pcc_present,ba_notpresent,ba_present,htn_no,htn_yes,dm_	yes,dm_no,dm_yes,cad_	no,cad_no,cad_yes,appet_good,appet_poor,pe_no,pe_yes,ane_no,ane_yes
0,0,1,0,1,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0
1,0,0,0,1,1,0,1,0,0,1,0,0,1,0,0,1,1,0,1,0,1,0
2,0,1,1,0,0,1,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0
3,0,0,0,1,0,1,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0
4,0,1,0,1,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0


In [17]:
test_label = pd.DataFrame(le.fit_transform(test['class']), columns=['class'])
test_label.head()

Unnamed: 0,class
0,1
1,0
2,0
3,0
4,1


In [18]:
newtest = pd.merge(
                pd.merge(test_fillna, test_bin, left_index=True, right_index=True), 
                test_label, left_index=True, right_index=True)
newtest.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,rbc_abnormal,rbc_normal,pc_abnormal,pc_normal,pcc_notpresent,pcc_present,ba_notpresent,ba_present,htn_no,htn_yes,dm_	yes,dm_no,dm_yes,cad_	no,cad_no,cad_yes,appet_good,appet_poor,pe_no,pe_yes,ane_no,ane_yes,class
0,55.0,80.0,1.02,0.0,0.0,140.0,49.0,0.5,150.0,4.9,15.7,47.0,6700.0,4.9,0,1,0,1,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0,1
1,60.0,100.0,1.025,0.0,3.0,263.0,27.0,1.3,135.0,4.3,12.7,37.0,11400.0,4.3,0,0,0,1,1,0,1,0,0,1,0,0,1,0,0,1,1,0,1,0,1,0,0
2,45.0,70.0,1.025,2.0,0.0,117.0,52.0,2.2,136.0,3.8,10.0,30.0,19100.0,3.7,0,1,1,0,0,1,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0,0
3,65.0,70.0,1.01,2.0,0.0,112.0,73.0,3.3,137.265217,4.665789,10.9,37.0,8600.862069,4.698113,0,0,0,1,0,1,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0,0
4,25.0,70.0,1.02,0.0,0.0,88.0,42.0,0.5,136.0,3.5,13.3,48.0,7000.0,4.9,0,1,0,1,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0,1


In [19]:
testX = newtest.iloc[:,:-1].as_matrix()
testy = newtest.iloc[:,-1].as_matrix()

### Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [21]:
dtc.fit(trainX, trainy)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
predy_dtc = dtc.predict(testX)

In [23]:
print('Decision Tree')
print('Accuracy: {0:.5f}'.format((predy_dtc==testy).sum() / testy.size))

Decision Tree
Accuracy: 0.97333


### Linear Classifier with SGD 

In [24]:
from sklearn.linear_model import SGDClassifier
sgdc = SGDClassifier()

In [1]:
sgdc.fit(trainX, trainy)

NameError: name 'sgdc' is not defined

In [2]:
predy_sgdc = sgdc.predict(testX)

NameError: name 'sgdc' is not defined

In [27]:
print('Linear Classifier with SGD')
print('Accuracy: {0:.5f}'.format((predy_sgdc==testy).sum() / testy.size))

Linear Classifier with SGD
Accuracy: 0.38667


### Linear Classifier with Perceptron 

In [28]:
from sklearn.linear_model import Perceptron
perc = Perceptron()

In [29]:
perc.fit(trainX, trainy)



Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      max_iter=None, n_iter=None, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)

In [30]:
predy_perc = sgdc.predict(testX)

In [31]:
print('Linear Classifier with Perceptron')
print('Accuracy: {0:.5f}'.format((predy_perc==testy).sum() / testy.size))

Linear Classifier with Perceptron
Accuracy: 0.38667
