In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pylab
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor

In [18]:
data = pd.read_csv("diabetes.csv")

feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age','Glucose','BloodPressure','DiabetesPedigreeFunction']
x = data[feature_cols] # Features
y = data.Outcome # Target variable

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
print(x.head())

   Pregnancies  Insulin   BMI  Age  Glucose  BloodPressure  \
0            6        0  33.6   50      148             72   
1            1        0  26.6   31       85             66   
2            8        0  23.3   32      183             64   
3            1       94  28.1   21       89             66   
4            0      168  43.1   33      137             40   

   DiabetesPedigreeFunction  
0                     0.627  
1                     0.351  
2                     0.672  
3                     0.167  
4                     2.288  


In [19]:
# Create Decision Tree Classifier
clsfr = DecisionTreeClassifier()
clsfr = clsfr.fit(x_train, y_train)

pred = clsfr.predict(x_test)
pred_prob = clsfr.predict_proba(x_test)[:,1]

print(confusion_matrix(y_test, pred))
print("Accuracy", accuracy_score(y_test, pred))
print("Auc Score Prob", roc_auc_score(y_test, pred_prob))
print("Recall", recall_score(y_test, pred))
print("Precission", precision_score(y_test, pred))
print("F1 Score", f1_score(y_test, pred))

feature_imp = pd.DataFrame(clsfr.feature_importances_, index=x_train.columns, columns=["Importance"]).sort_values(
                                "Importance", ascending=False)
print(feature_imp)

[[114  32]
 [ 46  39]]
Accuracy 0.6623376623376623
Auc Score Prob 0.6198227236099919
Recall 0.4588235294117647
Precission 0.5492957746478874
F1 Score 0.5
                          Importance
Glucose                     0.319714
BMI                         0.213853
DiabetesPedigreeFunction    0.131080
BloodPressure               0.128553
Age                         0.113079
Insulin                     0.048055
Pregnancies                 0.045666


In [20]:
clsfr2 = DecisionTreeClassifier(criterion="entropy", max_depth=3)
clsfr2 = clsfr2.fit(x_train, y_train)

pred2 = clsfr2.predict(x_test)
pred2_prob = clsfr2.predict_proba(x_test)[:,1]

print(confusion_matrix(y_test, pred2))
print("Accuracy", accuracy_score(y_test, pred2))
print("Auc Score Prob", roc_auc_score(y_test, pred2_prob))
print("Recall", recall_score(y_test, pred2))
print("Precission", precision_score(y_test, pred2))
print("F1 Score", f1_score(y_test, pred2))

feature_imp2 = pd.DataFrame(clsfr2.feature_importances_, index=x_train.columns, columns=["Importance"]).sort_values(
                                "Importance", ascending=False)
print(feature_imp2)

[[124  22]
 [ 31  54]]
Accuracy 0.7705627705627706
Auc Score Prob 0.84709911361805
Recall 0.6352941176470588
Precission 0.7105263157894737
F1 Score 0.6708074534161491
                          Importance
Glucose                     0.562837
BMI                         0.326594
Age                         0.110569
Pregnancies                 0.000000
Insulin                     0.000000
BloodPressure               0.000000
DiabetesPedigreeFunction    0.000000


In [22]:
# Bagging
bag = BaggingClassifier(base_estimator=clsfr, n_estimators=100, random_state=42)
bag = bag.fit(x_train, y_train)

pred3 = bag.predict(x_test)
pred3_prob = bag.predict_proba(x_test)[:,1]

print(confusion_matrix(y_test, pred3))
print("Accuracy", accuracy_score(y_test, pred3))
print("Auc Score Prob", roc_auc_score(y_test, pred3_prob))
print("Recall", recall_score(y_test, pred3))
print("Precission", precision_score(y_test, pred3))
print("F1 Score", f1_score(y_test, pred3))


featimp = np.mean([
    tree.feature_importances_ for tree in bag.estimators_
], axis=0)

feature_importances = pd.DataFrame(featimp,
                                   index = x_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
print(featimp)

[[127  19]
 [ 30  55]]
Accuracy 0.7878787878787878
Auc Score Prob 0.8624093473005641
Recall 0.6470588235294118
Precission 0.7432432432432432
F1 Score 0.6918238993710691
[0.07159728 0.05451045 0.20221884 0.13038158 0.30864438 0.09496335
 0.13768413]


In [23]:
# Bagging2
bag = BaggingClassifier(base_estimator=clsfr2, n_estimators=100, random_state=42)
bag = bag.fit(x_train, y_train)

pred4 = bag.predict(x_test)
pred4_prob = bag.predict_proba(x_test)[:,1]

print(confusion_matrix(y_test, pred4))
print("Accuracy", accuracy_score(y_test, pred4))
print("Auc Score Prob", roc_auc_score(y_test, pred4_prob))
print("Recall", recall_score(y_test, pred4))
print("Precission", precision_score(y_test, pred4))
print("F1 Score", f1_score(y_test, pred4))


featimport = np.mean([
    tree.feature_importances_ for tree in bag.estimators_
], axis=0)

feature_importances = pd.DataFrame(featimport,
                                   index = x_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
print(featimport)

[[131  15]
 [ 36  49]]
Accuracy 0.7792207792207793
Auc Score Prob 0.8764705882352941
Recall 0.5764705882352941
Precission 0.765625
F1 Score 0.6577181208053692
[0.02651369 0.00973136 0.26448318 0.10728876 0.52629193 0.01647792
 0.04921316]


In [26]:
rfc = RandomForestClassifier(n_estimators=100, max_features=4)

rfc = rfc.fit(x_train,y_train)

#Predict the response for test dataset
y_pred = rfc.predict(x_test)
y_pred_prob = rfc.predict_proba(x_test)[:,1]

print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print ("AUC Score:", roc_auc_score(y_test, y_pred))
print ("AUC Score prob:", roc_auc_score(y_test, y_pred_prob))
print ("Precision:", precision_score(y_test, y_pred))
print ("Recall:", recall_score(y_test, y_pred))
print ("F1 Score:", f1_score(y_test, y_pred))

feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = x_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances)

[[132  14]
 [ 32  53]]
Accuracy: 0.8008658008658008
AUC Score: 0.7638195004029009
AUC Score prob: 0.8596696212731667
Precision: 0.7910447761194029
Recall: 0.6235294117647059
F1 Score: 0.6973684210526316
                          importance
Glucose                     0.287053
BMI                         0.191048
Age                         0.145789
DiabetesPedigreeFunction    0.144158
BloodPressure               0.092014
Pregnancies                 0.079983
Insulin                     0.059956


In [30]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(n_estimators=100, random_state=42)
abc = abc.fit(x_train, y_train)

pred = abc.predict(x_test)
pred_prob = abc.predict_proba(x_test)[:,1]

print(confusion_matrix(y_test, pred))
print("Accuracy:", accuracy_score(y_test, pred))
print ("AUC Score:", roc_auc_score(y_test, pred))
print ("AUC Score prob:", roc_auc_score(y_test, pred_prob))
print ("Precision:", precision_score(y_test, pred))
print ("Recall:", recall_score(y_test, pred))
print ("F1 Score:", f1_score(y_test, pred))

feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = x_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances)

[[128  18]
 [ 30  55]]
Accuracy: 0.7922077922077922
AUC Score: 0.7618855761482675
AUC Score prob: 0.8287671232876712
Precision: 0.7534246575342466
Recall: 0.6470588235294118
F1 Score: 0.6962025316455697
                          importance
Glucose                     0.287053
BMI                         0.191048
Age                         0.145789
DiabetesPedigreeFunction    0.144158
BloodPressure               0.092014
Pregnancies                 0.079983
Insulin                     0.059956


In [None]:
# Boosting will not perform well enough if the Data is small and the number of rows is low