In [26]:
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import BaggingClassifier
from sklearn import tree

In [27]:
# Read the diabetes CSV (path is relative to the working directory) into a
# DataFrame, then show its column labels as the cell output.
data = pd.read_csv(filepath_or_buffer="diabetes.csv")
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [28]:
# Columns used as model inputs; every remaining cell indexes features by
# these names, in this order.
feature_cols = [
    'Pregnancies',
    'Insulin',
    'BMI',
    'Age',
    'Glucose',
    'BloodPressure',
    'DiabetesPedigreeFunction',
]

# x: predictor frame restricted to feature_cols; y: the target column.
x = data[feature_cols]
y = data['Outcome']

In [29]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the
# partition identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

# Show the first few training rows as the cell output.
x_train.head()

Unnamed: 0,Pregnancies,Insulin,BMI,Age,Glucose,BloodPressure,DiabetesPedigreeFunction
88,15,110,37.1,43,136,70,0.153
467,0,100,36.8,25,97,64,0.6
550,1,0,27.4,21,116,70,0.204
147,2,119,30.5,34,106,64,1.4
481,0,0,35.2,29,123,88,0.197


In [30]:
# Baseline model: a single decision tree with default hyper-parameters.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; without a seed, ties between equally good splits
# are broken differently on each run and the metrics below are not repeatable.
clsfr = DecisionTreeClassifier(random_state=42)
clsfr.fit(x_train, y_train)

# Predict
pred = clsfr.predict(x_test)                    # hard class labels
pred_prob = clsfr.predict_proba(x_test)[:, 1]   # score for the second class in clsfr.classes_

# Metrics (AUC is computed from the scores, everything else from the labels)
print(confusion_matrix(y_test, pred))
print("Accuracy:", accuracy_score(y_test, pred))
print("Auc Score Prob", roc_auc_score(y_test, pred_prob))
print("Recall:", recall_score(y_test, pred))
print("Precission:", precision_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))

# Important Features: one row per training column, most important first
features_imp = pd.DataFrame(
    clsfr.feature_importances_,
    index=x_train.columns,
    columns=["Importance"],
).sort_values("Importance", ascending=False)
print(features_imp)

[[112  34]
 [ 40  45]]
Accuracy: 0.6796536796536796
Auc Score Prob 0.6482675261885575
Recall: 0.5294117647058824
Precission: 0.569620253164557
F1 Score: 0.5487804878048781
                          Importance
Glucose                     0.321662
BMI                         0.209405
DiabetesPedigreeFunction    0.135865
BloodPressure               0.133061
Age                         0.101435
Pregnancies                 0.052589
Insulin                     0.045983


In [31]:
# Model: bag 100 bootstrap-trained copies of the baseline tree `clsfr`.
# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` (deprecated in scikit-learn 1.2, removed
# in 1.4), while the first positional slot is the same on both sides of the
# rename, so this call works on old and new releases alike.
bag = BaggingClassifier(clsfr, n_estimators=100, random_state=42)
bag = bag.fit(x_train, y_train)

# Predict
pred = bag.predict(x_test)                    # hard class labels
pred_prob = bag.predict_proba(x_test)[:, 1]   # score for the second class in bag.classes_

# Metrics (AUC is computed from the scores, everything else from the labels)
print(confusion_matrix(y_test, pred))
print("Accuracy:", accuracy_score(y_test, pred))
print("Auc Score Prob", roc_auc_score(y_test, pred_prob))
print("Recall:", recall_score(y_test, pred))
print("Precission:", precision_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))

# Important Features: average the per-tree importances over the ensemble.
# The loop variable is named `est` so it does not shadow the imported
# `sklearn.tree` module.
# NOTE(review): the column-wise mean assumes every fitted tree saw all columns
# in x_train order (no feature sub-sampling is configured here) - confirm if
# max_features / bootstrap_features are ever changed.
featimp = np.mean([est.feature_importances_ for est in bag.estimators_], axis=0)
features_imp = pd.DataFrame(
    featimp,
    index=x_train.columns,
    columns=["Importance"],
).sort_values("Importance", ascending=False)

print(features_imp)

[[127  19]
 [ 30  55]]
Accuracy: 0.7878787878787878
Auc Score Prob 0.8624093473005641
Recall: 0.6470588235294118
Precission: 0.7432432432432432
F1 Score: 0.6918238993710691
                          Importance
Glucose                     0.308644
BMI                         0.202219
DiabetesPedigreeFunction    0.137684
Age                         0.130382
BloodPressure               0.094963
Pregnancies                 0.071597
Insulin                     0.054510


In [34]:
# Regularised tree: entropy split criterion, depth capped at 3.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; without a seed, ties between equally good splits
# can be broken differently on each run and change the reported numbers.
clsfr2 = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)
clsfr2.fit(x_train, y_train)

# Predict
pred = clsfr2.predict(x_test)                    # hard class labels
pred_prob = clsfr2.predict_proba(x_test)[:, 1]   # score for the second class in clsfr2.classes_

# Metrics (AUC is computed from the scores, everything else from the labels)
print(confusion_matrix(y_test, pred))
print("Accuracy:", accuracy_score(y_test, pred))
print("Auc Score Prob", roc_auc_score(y_test, pred_prob))
print("Recall:", recall_score(y_test, pred))
print("Precission:", precision_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))

# Important Features: one row per training column, most important first
features_imp = pd.DataFrame(
    clsfr2.feature_importances_,
    index=x_train.columns,
    columns=["Importance"],
).sort_values("Importance", ascending=False)
print(features_imp)

[[124  22]
 [ 31  54]]
Accuracy: 0.7705627705627706
Auc Score Prob 0.84709911361805
Recall: 0.6352941176470588
Precission: 0.7105263157894737
F1 Score: 0.6708074534161491
                          Importance
Glucose                     0.562837
BMI                         0.326594
Age                         0.110569
Pregnancies                 0.000000
Insulin                     0.000000
BloodPressure               0.000000
DiabetesPedigreeFunction    0.000000


In [35]:
# Model: bag 100 bootstrap-trained copies of the depth-limited tree `clsfr2`.
# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` (deprecated in scikit-learn 1.2, removed
# in 1.4), while the first positional slot is the same on both sides of the
# rename, so this call works on old and new releases alike.
bag = BaggingClassifier(clsfr2, n_estimators=100, random_state=42)
bag = bag.fit(x_train, y_train)

# Predict
pred = bag.predict(x_test)                    # hard class labels
pred_prob = bag.predict_proba(x_test)[:, 1]   # score for the second class in bag.classes_

# Metrics (AUC is computed from the scores, everything else from the labels)
print(confusion_matrix(y_test, pred))
print("Accuracy:", accuracy_score(y_test, pred))
print("Auc Score Prob", roc_auc_score(y_test, pred_prob))
print("Recall:", recall_score(y_test, pred))
print("Precission:", precision_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))

# Important Features: average the per-tree importances over the ensemble.
# The loop variable is named `est` so it does not shadow the imported
# `sklearn.tree` module.
# NOTE(review): the column-wise mean assumes every fitted tree saw all columns
# in x_train order (no feature sub-sampling is configured here) - confirm if
# max_features / bootstrap_features are ever changed.
featimp = np.mean([est.feature_importances_ for est in bag.estimators_], axis=0)
features_imp = pd.DataFrame(
    featimp,
    index=x_train.columns,
    columns=["Importance"],
).sort_values("Importance", ascending=False)

print(features_imp)

[[131  15]
 [ 36  49]]
Accuracy: 0.7792207792207793
Auc Score Prob 0.8764705882352941
Recall: 0.5764705882352941
Precission: 0.765625
F1 Score: 0.6577181208053692
                          Importance
Glucose                     0.526292
BMI                         0.264483
Age                         0.107289
DiabetesPedigreeFunction    0.049213
Pregnancies                 0.026514
BloodPressure               0.016478
Insulin                     0.009731
