In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [7]:
# Read the diabetes dataset; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv(filepath_or_buffer="diabetes.csv")
# Keep the column index as the cell's last expression so it is displayed.
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [8]:
# Predictor columns handed to the models.
# NOTE(review): 'SkinThickness' is present in the CSV's columns but is not
# listed here — confirm the omission is intentional.
feature_cols = [
    'Pregnancies',
    'Insulin',
    'BMI',
    'Age',
    'Glucose',
    'BloodPressure',
    'DiabetesPedigreeFunction',
]
# Feature matrix (selected columns, in the order above) and target vector.
x = data.loc[:, feature_cols]
y = data['Outcome']

In [9]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [10]:
# Best Score (Bagging)
# Reference results kept as a bare string literal: nothing in this cell is
# computed, the text is only echoed back as the cell's output.
# NOTE(review): presumably pasted from a separate bagging experiment —
# confirm where these numbers came from.
"""[[131  15]
 [ 36  49]]
Accuracy: 0.7792207792207793
Auc Score Prob 0.8764705882352941
Recall: 0.5764705882352941
Precission: 0.765625
F1 Score: 0.6577181208053692
                          Importance
Glucose                     0.526292
BMI                         0.264483
Age                         0.107289
DiabetesPedigreeFunction    0.049213
Pregnancies                 0.026514
BloodPressure               0.016478
Insulin                     0.009731
"""

'[[131  15]\n [ 36  49]]\nAccuracy: 0.7792207792207793\nAuc Score Prob 0.8764705882352941\nRecall: 0.5764705882352941\nPrecission: 0.765625\nF1 Score: 0.6577181208053692\n                          Importance\nGlucose                     0.526292\nBMI                         0.264483\nAge                         0.107289\nDiabetesPedigreeFunction    0.049213\nPregnancies                 0.026514\nBloodPressure               0.016478\nInsulin                     0.009731\n'

In [13]:
# Random forest that considers 4 candidate features at each split.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature draws) so the metrics below are reproducible on rerun and
# comparable across max_features settings; the seed value matches the one
# used for the train/test split.
rfc = RandomForestClassifier(n_estimators=100, max_features=4, random_state=1)
rfc = rfc.fit(x_train, y_train)
pred = rfc.predict(x_test)  # hard class labels
# Column 1 is the second entry of rfc.classes_ (presumably Outcome == 1).
pred_porb = rfc.predict_proba(x_test)[:, 1]

print(confusion_matrix(y_test, pred))
print("Accuracy:", accuracy_score(y_test, pred))
# AUC computed from hard labels collapses the ROC curve to a single operating
# point, so this value equals (sensitivity + specificity) / 2. The
# probability-based AUC on the following line is the threshold-free metric.
print("Auc Score:", roc_auc_score(y_test, pred))
print("Auc Score Prob:", roc_auc_score(y_test, pred_porb))
print("Recall", recall_score(y_test, pred))
print("Precission:", precision_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))

# Feature importances of the fitted forest, largest first.
feature_imp = pd.DataFrame(
    rfc.feature_importances_, index=x_train.columns, columns=["Importance"]
).sort_values("Importance", ascending=False)

print(feature_imp)

[[131  15]
 [ 33  52]]
Accuracy: 0.7922077922077922
Auc Score: 0.7545124899274779
Auc Score Prob: 0.8564464141821112
Recall 0.611764705882353
Precission: 0.7761194029850746
F1 Score: 0.6842105263157895
                          Importance
Glucose                     0.282551
BMI                         0.202243
DiabetesPedigreeFunction    0.147016
Age                         0.136904
BloodPressure               0.098054
Pregnancies                 0.076166
Insulin                     0.057066


In [14]:
# Random forest that considers 3 candidate features at each split.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature draws) so the metrics below are reproducible on rerun and
# comparable across max_features settings; the seed value matches the one
# used for the train/test split.
rfc = RandomForestClassifier(n_estimators=100, max_features=3, random_state=1)
rfc = rfc.fit(x_train, y_train)
pred = rfc.predict(x_test)  # hard class labels
# Column 1 is the second entry of rfc.classes_ (presumably Outcome == 1).
pred_porb = rfc.predict_proba(x_test)[:, 1]

print(confusion_matrix(y_test, pred))
print("Accuracy:", accuracy_score(y_test, pred))
# AUC computed from hard labels collapses the ROC curve to a single operating
# point, so this value equals (sensitivity + specificity) / 2. The
# probability-based AUC on the following line is the threshold-free metric.
print("Auc Score:", roc_auc_score(y_test, pred))
print("Auc Score Prob:", roc_auc_score(y_test, pred_porb))
print("Recall", recall_score(y_test, pred))
print("Precission:", precision_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))

# Feature importances of the fitted forest, largest first.
feature_imp = pd.DataFrame(
    rfc.feature_importances_, index=x_train.columns, columns=["Importance"]
).sort_values("Importance", ascending=False)

print(feature_imp)

[[128  18]
 [ 32  53]]
Accuracy: 0.7835497835497836
Auc Score: 0.7501208702659146
Auc Score Prob: 0.8575745366639806
Recall 0.6235294117647059
Precission: 0.7464788732394366
F1 Score: 0.6794871794871796
                          Importance
Glucose                     0.264728
BMI                         0.203806
DiabetesPedigreeFunction    0.142245
Age                         0.137626
BloodPressure               0.101477
Pregnancies                 0.086405
Insulin                     0.063713


In [15]:
# Random forest that considers 2 candidate features at each split.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature draws) so the metrics below are reproducible on rerun and
# comparable across max_features settings; the seed value matches the one
# used for the train/test split.
rfc = RandomForestClassifier(n_estimators=100, max_features=2, random_state=1)
rfc = rfc.fit(x_train, y_train)
pred = rfc.predict(x_test)  # hard class labels
# Column 1 is the second entry of rfc.classes_ (presumably Outcome == 1).
pred_porb = rfc.predict_proba(x_test)[:, 1]

print(confusion_matrix(y_test, pred))
print("Accuracy:", accuracy_score(y_test, pred))
# AUC computed from hard labels collapses the ROC curve to a single operating
# point, so this value equals (sensitivity + specificity) / 2. The
# probability-based AUC on the following line is the threshold-free metric.
print("Auc Score:", roc_auc_score(y_test, pred))
print("Auc Score Prob:", roc_auc_score(y_test, pred_porb))
print("Recall", recall_score(y_test, pred))
print("Precission:", precision_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))

# Feature importances of the fitted forest, largest first.
feature_imp = pd.DataFrame(
    rfc.feature_importances_, index=x_train.columns, columns=["Importance"]
).sort_values("Importance", ascending=False)

print(feature_imp)

[[130  16]
 [ 34  51]]
Accuracy: 0.7835497835497836
Auc Score: 0.7452054794520548
Auc Score Prob: 0.8638195004029009
Recall 0.6
Precission: 0.7611940298507462
F1 Score: 0.6710526315789473
                          Importance
Glucose                     0.265864
BMI                         0.191974
DiabetesPedigreeFunction    0.140676
Age                         0.138379
BloodPressure               0.098366
Pregnancies                 0.087953
Insulin                     0.076788
