In [99]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import make_scorer,accuracy_score,f1_score
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import confusion_matrix,roc_auc_score,recall_score,precision_score

In [100]:
col_names = ['pregnant','glucose','bp','skin','insulin','bmi','pedigree','age','label']
pima = pd.read_csv('diabetes.csv',names=col_names)
pima = pima.iloc[1:,:]


In [101]:
feature_cols = ['pregnant','insulin','bmi','age','glucose','bp','pedigree']
X = pima[feature_cols]
y = pima['label']

In [102]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [103]:
y = y.values.reshape(-1,1)

In [104]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)
X_test = X_test.astype('float')
y_test = y_test.astype('float')

In [105]:
dt = DecisionTreeClassifier(criterion = 'entropy')
dt.fit(X_train,y_train)
pred = dt.predict(X_test).astype('float')

In [106]:
print('accuracy score : ',accuracy_score(y_test,pred))
print('area under the curve:',roc_auc_score(y_test,pred))
print('f1 score:',f1_score(y_test,pred))
print('precision score:',precision_score(y_test,pred))
print('confusion matrix:',confusion_matrix(y_test,pred))



accuracy score :  0.7445887445887446
area under the curve: 0.7217566478646253
f1 score: 0.6467065868263472
precision score: 0.6585365853658537
confusion matrix: [[118  28]
 [ 31  54]]


In [107]:
dt = DecisionTreeClassifier(criterion = 'gini',max_depth = 5)
dt.fit(X_train,y_train)
pred = dt.predict(X_test).astype('float')

In [108]:
print('accuracy score : ',accuracy_score(y_test,pred))
print('area under the curve:',roc_auc_score(y_test,pred))
print('f1 score:',f1_score(y_test,pred))
print('precision score:',precision_score(y_test,pred))
print('confusion matrix:',confusion_matrix(y_test,pred))



accuracy score :  0.7662337662337663
area under the curve: 0.741337630942788
f1 score: 0.6707317073170732
precision score: 0.6962025316455697
confusion matrix: [[122  24]
 [ 30  55]]


lets use cross validation

In [109]:
from sklearn.model_selection import cross_validate
dt = DecisionTreeClassifier(criterion = 'entropy',max_depth = 5)
scores = cross_validate(dt,X,y,cv=10,scoring='accuracy',return_train_score=False)
scores_df = pd.DataFrame(scores)
scores_df.mean()

fit_time      0.004505
score_time    0.000575
test_score    0.730519
dtype: float64

In [110]:
criterion = ['gini', 'entropy']
max_depth = [4,6,8,12]

In [114]:
tuned_parameters = {'criterion':['gini', 'entropy'],'max_depth':np.arange(2,26,1)}
dt = DecisionTreeClassifier()

In [115]:
from sklearn.model_selection import GridSearchCV
best_parameters = GridSearchCV(dt,tuned_parameters,cv=10,scoring='accuracy')

In [116]:
best_parameters.fit(X_train,y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [117]:
best_parameters.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [118]:
best_estimator = best_parameters.best_estimator_

In [120]:
pred = best_estimator.predict(X_test).astype('float')

In [121]:
print('accuracy score : ',accuracy_score(y_test,pred))
print('area under the curve:',roc_auc_score(y_test,pred))
print('f1 score:',f1_score(y_test,pred))
print('precision score:',precision_score(y_test,pred))
print('confusion matrix:',confusion_matrix(y_test,pred))



accuracy score :  0.7662337662337663
area under the curve: 0.7437953263497179
f1 score: 0.674698795180723
precision score: 0.691358024691358
confusion matrix: [[121  25]
 [ 29  56]]
