In [1]:
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt

# Show every column and every row when a frame is displayed.
# The fully qualified 'display.*' keys are used on purpose: the bare
# 'max_columns' / 'max_rows' patterns also match Styler options in newer
# pandas releases and then raise OptionError ("matched multiple keys").
pd.set_option('display.max_columns', None, 'display.max_rows', None)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the imputed dataset from the working directory, then preview the
# first five rows.
raw = pd.read_csv(filepath_or_buffer='ckd_imputed_9.csv')
raw.head(n=5)

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,rbc_unknown,rbc_abnormal,rbc_normal,pc_unknown,pc_abnormal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_good,pe_yes,ane_yes,ckd
0,49.0,100.0,1.01,3.0,0.0,129.0,158.0,11.8,122.0,3.2,8.1,24.0,9600.0,3.5,0,1,0,0,1,0,0,0,1,1,0,0,1,1,1
1,51.0,80.0,1.02,0.0,0.0,94.0,15.0,1.2,144.0,3.7,15.5,46.0,9500.0,6.4,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0
2,20.0,70.0,1.02,0.0,0.0,123.0,44.0,1.0,135.0,3.8,14.6,44.0,5500.0,4.8,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0
3,29.0,70.0,1.02,0.0,0.0,127.0,44.0,1.2,145.0,5.0,14.8,48.0,7705.594406,5.379021,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0
4,65.0,80.0,1.015,2.0,1.0,215.0,133.0,2.5,133.901786,4.878443,13.2,41.0,9069.536424,3.945238,0,0,1,0,0,1,1,0,0,1,0,1,0,0,1


In [4]:
# Every column except the trailing 'ckd' label is a candidate predictor.
# The CSV's saved row index is loaded as 'Unnamed: 0'; it is a row id rather
# than a measurement, so it is dropped here (errors='ignore' keeps this safe
# when the column is absent).
feature = raw.iloc[:, :-1].drop(columns='Unnamed: 0', errors='ignore')
# DataFrame.as_matrix() was removed in pandas 1.0; .values is its replacement.
x = feature.values

In [5]:
# The last column of the frame ('ckd') is the classification target.
label = raw.iloc[:, -1]
# Series.as_matrix() was removed in pandas 1.0; .values is its replacement.
y = label.values

### Expert

In [6]:
# Hand-picked ("expert") subset: 15 named columns taken from the feature frame.
feature_expertise = feature[['age', 'bp', 'al', 'su', 'bu', 'sc', 'hemo', 'rbcc', 'rbc_unknown', 
                             'rbc_abnormal', 'rbc_normal', 'htn_yes', 'dm_yes', 'pe_yes', 'ane_yes']]
# DataFrame.as_matrix() was removed in pandas 1.0; .values is its replacement.
x_expertise = feature_expertise.values

In [7]:
# Sanity check: the expert subset should have 15 columns.
x_expertise.shape

(400, 15)

### Statistician

In [8]:
from sklearn.feature_selection import SelectKBest, SelectFpr
from sklearn.feature_selection import f_classif, chi2

In [9]:
# Univariate selection: score each feature with the ANOVA F-test against the
# label and keep the 15 highest-scoring ones.
skb = SelectKBest(score_func=f_classif, k=15)
skb.fit(x, y)
x_statstician = skb.transform(x)

In [10]:
# Sanity check: SelectKBest was asked for k=15 columns.
x_statstician.shape

(400, 15)

### Data Scientist

In [11]:
from sklearn.feature_selection import RFE

In [12]:
# L1-penalised logistic regression used as the ranking estimator for RFE.
# The solver is named explicitly: newer scikit-learn defaults to 'lbfgs',
# which rejects penalty='l1'; 'liblinear' supports it and was the old default.
log = LogisticRegression(C=7, penalty='l1', solver='liblinear')

In [13]:
# Recursive feature elimination around the logistic model: remove up to 10
# features per round until 15 remain.
rfe_log = RFE(
    estimator=log,
    n_features_to_select=15,
    step=10,
)

In [14]:
# Fit the selector on the full data, then keep only the retained columns.
rfe_log.fit(x, y)
x_datascientist_log = rfe_log.transform(x)

In [15]:
# Sanity check: RFE was configured to keep 15 columns.
x_datascientist_log.shape

(400, 15)

In [16]:
# Recursive feature elimination ranked by a shallow decision tree: remove up
# to 10 features per round until 15 remain.
# random_state is fixed so the selected feature set is the same on every run
# (the tree breaks ties between equally good splits at random).
rfe_dt = RFE(
    estimator=DecisionTreeClassifier(max_depth=3, random_state=0),
    n_features_to_select=15,
    step=10,
)

In [17]:
# Fit the tree-based selector on the full data, then keep the retained columns.
rfe_dt.fit(x, y)
x_datascientist_dt = rfe_dt.transform(x)

In [18]:
# Sanity check: RFE was configured to keep 15 columns.
x_datascientist_dt.shape

(400, 15)

In [19]:
# Recursive feature elimination ranked by a 50-tree random forest: remove up
# to 10 features per round until 15 remain.
# random_state is fixed so the forest, and therefore the selected feature
# set, is reproducible across runs.
rfe_rf = RFE(
    estimator=RandomForestClassifier(max_depth=None, n_estimators=50, random_state=0),
    n_features_to_select=15,
    step=10,
)

In [20]:
# Fit the forest-based selector on the full data, then keep the retained columns.
rfe_rf.fit(x, y)
x_datascientist_rf = rfe_rf.transform(x)

In [21]:
# Sanity check: RFE was configured to keep 15 columns.
x_datascientist_rf.shape

(400, 15)

### Cross Validation

In [22]:
from sklearn.model_selection import cross_validate

In [23]:
# Metrics collected by cross_validate; each result key is the scorer's own name.
scoring = {metric: metric for metric in ('accuracy', 'f1', 'precision', 'recall')}

### Logistic Regression

In [24]:
# L1-penalised logistic regression evaluated under cross-validation.
# The solver is named explicitly: newer scikit-learn defaults to 'lbfgs',
# which rejects penalty='l1'; 'liblinear' supports it and was the old default.
logistic = LogisticRegression(C=7, penalty='l1', solver='liblinear')

In [25]:
# Feature sets compared, in order: all features, expert-chosen, univariate
# (SelectKBest) and RFE driven by logistic regression.
# NOTE(review): the selectors were fitted on the full data before this
# cross-validation, so test scores for the selected sets may be optimistic.
xs_logistic = [x, x_expertise, x_statstician, x_datascientist_log]
scores_logistic = []
for xx in xs_logistic:
    # return_train_score=True is required: the report reads 'train_accuracy'
    # and 'train_f1', which newer scikit-learn no longer returns by default.
    score_logistic = cross_validate(logistic, xx, y, cv=5, scoring=scoring,
                                    return_train_score=True)
    scores_logistic.append(score_logistic)

In [26]:
# One CSV-style line per feature set, each value averaged over the folds:
# fit_time, score_time, test_accuracy, test_f1, train_accuracy, train_f1
report_keys = ('fit_time', 'score_time', 'test_accuracy',
               'test_f1', 'train_accuracy', 'train_f1')
for score in scores_logistic:
    fold_means = [score[key].mean() for key in report_keys]
    print(','.join('{:.3f}'.format(value) for value in fold_means))

0.004,0.021,0.990,0.992,1.000,1.000
0.005,0.001,0.988,0.990,1.000,1.000
0.008,0.001,0.990,0.992,1.000,1.000
0.007,0.001,0.988,0.990,1.000,1.000




### Decision Tree

In [28]:
# Shallow decision tree evaluated under cross-validation.
# random_state is fixed so repeated runs give the same scores (the tree
# breaks ties between equally good splits at random).
dt = DecisionTreeClassifier(max_depth=3, random_state=0)

In [29]:
# Feature sets compared, in order: all features, expert-chosen, univariate
# (SelectKBest) and RFE driven by a decision tree.
# NOTE(review): the selectors were fitted on the full data before this
# cross-validation, so test scores for the selected sets may be optimistic.
xs_dt = [x, x_expertise, x_statstician, x_datascientist_dt]
scores_dt = []
for xx in xs_dt:
    # return_train_score=True is required: the report reads 'train_accuracy'
    # and 'train_f1', which newer scikit-learn no longer returns by default.
    score_dt = cross_validate(dt, xx, y, cv=5, scoring=scoring,
                              return_train_score=True)
    scores_dt.append(score_dt)

In [30]:
# One CSV-style line per feature set, each value averaged over the folds:
# fit_time, score_time, test_accuracy, test_f1, train_accuracy, train_f1
report_keys = ('fit_time', 'score_time', 'test_accuracy',
               'test_f1', 'train_accuracy', 'train_f1')
for score in scores_dt:
    fold_means = [score[key].mean() for key in report_keys]
    print(','.join('{:.3f}'.format(value) for value in fold_means))

0.001,0.002,0.997,0.998,0.999,0.999
0.000,0.001,0.977,0.982,0.989,0.991
0.001,0.003,0.995,0.996,0.999,0.999
0.001,0.001,0.995,0.996,0.999,0.999




### Random Forest

In [31]:
# 50-tree random forest with unlimited depth, evaluated under cross-validation.
# random_state is fixed so repeated runs give the same scores.
rf = RandomForestClassifier(max_depth=None, n_estimators=50, random_state=0)

In [32]:
# Feature sets compared, in order: all features, expert-chosen, univariate
# (SelectKBest) and RFE driven by a random forest.
# NOTE(review): the selectors were fitted on the full data before this
# cross-validation, so test scores for the selected sets may be optimistic.
xs_rf = [x, x_expertise, x_statstician, x_datascientist_rf]
scores_rf = []
for xx in xs_rf:
    # return_train_score=True is required: the report reads 'train_accuracy'
    # and 'train_f1', which newer scikit-learn no longer returns by default.
    score_rf = cross_validate(rf, xx, y, cv=5, scoring=scoring,
                              return_train_score=True)
    scores_rf.append(score_rf)

In [33]:
# One CSV-style line per feature set, each value averaged over the folds:
# fit_time, score_time, test_accuracy, test_f1, train_accuracy, train_f1
report_keys = ('fit_time', 'score_time', 'test_accuracy',
               'test_f1', 'train_accuracy', 'train_f1')
for score in scores_rf:
    fold_means = [score[key].mean() for key in report_keys]
    print(','.join('{:.3f}'.format(value) for value in fold_means))

0.057,0.013,1.000,1.000,1.000,1.000
0.048,0.013,0.997,0.998,1.000,1.000
0.046,0.011,1.000,1.000,1.000,1.000
0.042,0.010,1.000,1.000,1.000,1.000


