In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library deprecation/data warnings
# raised by later cells are hidden as well — consider narrowing it to a category.
warnings.filterwarnings("ignore")
import random
import time
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

# Seed NumPy's legacy global random generator once, at notebook start.
# NOTE(review): this only fixes the starting state of the global stream; any later
# cell that draws from it and is executed more than once will see different numbers
# on each run, so results are reproducible only under Restart & Run All.
np.random.seed(123)

In [13]:
# Read the diabetes dataset from the working directory and preview its first
# five rows (the default for DataFrame.head) as the cell output.
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
# Split the loaded frame into the label array (`target`) and the feature frame (`df`).
#
# The cell rebinds `df` to the feature-only frame, so a second execution used to
# fail with KeyError: 'Outcome'. Guarding on the column makes the cell safe to
# re-run: once the label has been split off, `df` and `target` are left as they are.
if "Outcome" in df.columns:
    df = df.dropna()                    # discard rows that contain any missing value
    target = df["Outcome"].to_numpy()   # 1-D label array, row-aligned with `df`
    df = df.drop(columns="Outcome")     # keep only the predictor columns

# Names of the predictor columns, shown as the cell output.
df_list = list(df.columns)
df_list

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
train_x, test_x, train_y, test_y = train_test_split(
    df,
    target,
    test_size=0.2,
    random_state=42,
)

# Display (n_rows, n_features) of the full feature frame as the cell output.
df.shape

(768, 8)

In [16]:
# Old Performances
#
# Metrics pasted from earlier experiments, kept as bare string literals purely for
# reference — nothing in this cell is computed. Each block holds the confusion
# matrix followed by the scalar metrics. Because the last literal is the final
# expression of the cell, the notebook echoes it as the cell output.
#
# NOTE(review): the Bagging and RandomForest confusion matrices each sum to 231
# samples, while the XgBoost matrix sums to 154, which is the size of the 20%
# hold-out used in this notebook (768 rows). The first two were presumably scored
# on a different, larger test split, so they are not directly comparable with the
# XgBoost numbers or with the results produced below — confirm before drawing
# conclusions.

# Bagging Performance
"""[[131  15]
 [ 36  49]]
Accuracy: 0.7792207792207793
Auc Score Prob 0.8764705882352941
Recall: 0.5764705882352941
Precission: 0.765625
F1 Score: 0.6577181208053692
"""


# RandomForest Performance
"""[[130  16]
 [ 34  51]]
Accuracy: 0.7835497835497836
Auc Score: 0.7452054794520548
Auc Score Prob: 0.8638195004029009
Recall 0.6
Precission: 0.7611940298507462
F1 Score: 0.6710526315789473
"""


# XgBoost Performance
"""
[[70 29]
 [19 36]]
Accuracy: 0.6883116883116883
Auc Score: 0.6808080808080808
Recall: 0.6545454545454545
Precission: 0.5538461538461539
F1 Score: 0.6000000000000001
"""

'\n[[70 29]\n [19 36]]\nAccuracy: 0.6883116883116883\nAuc Score: 0.6808080808080808\nRecall: 0.6545454545454545\nPrecission: 0.5538461538461539\nF1 Score: 0.6000000000000001\n'

In [19]:
# Fit an Extra-Trees ensemble on the training split and report hold-out metrics.
#
# random_state is pinned on the estimator itself. Without it the forest draws from
# the global NumPy stream, so every re-execution of this cell produced a different
# model and different metrics (the numbers depended on how many times the cell had
# been run since the seed was set). With the seed fixed here the cell is
# reproducible regardless of execution order.
ext = ExtraTreesClassifier(n_estimators=100, max_features=5, random_state=123)
ext = ext.fit(train_x, train_y)

pred = ext.predict(test_x)                    # hard class labels
pred_prob = ext.predict_proba(test_x)[:, 1]   # score for the second class (label 1)

print(confusion_matrix(test_y, pred))
print("Accuracy:", accuracy_score(test_y, pred))
# "Auc Score" is computed from the hard labels, i.e. from a single operating point;
# "Auc Score Prob" uses the predicted probabilities and is the ranking-based ROC AUC.
print("Auc Score:", roc_auc_score(test_y, pred))
print("Auc Score Prob:", roc_auc_score(test_y, pred_prob))
print("Recall:", recall_score(test_y, pred))
print("Precission:", precision_score(test_y, pred))
print("F1 Score:", f1_score(test_y, pred))

# Impurity-based feature importances of the fitted forest, most important first.
feature_imp = pd.DataFrame(ext.feature_importances_, index=train_x.columns,
                           columns=["importance"]).sort_values("importance", ascending=False)
print(feature_imp)

[[82 17]
 [19 36]]
Accuracy: 0.7662337662337663
Auc Score: 0.7414141414141414
Auc Score Prob: 0.8103764921946741
Recall: 0.6545454545454545
Precission: 0.6792452830188679
F1 Score: 0.6666666666666666
                          importance
Glucose                     0.271036
Age                         0.153929
BMI                         0.138175
DiabetesPedigreeFunction    0.102246
Pregnancies                 0.096499
BloodPressure               0.093623
Insulin                     0.072704
SkinThickness               0.071787


In [None]:
# Comments
# I expected Extra Trees to perform poorly because its splits are chosen at random, but that did not happen.
# The individual trees make many mistakes, but those mistakes largely cancel each other out across the ensemble, so the overall result is good.