In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [11]:
np.random.seed(1001)
df  = pd.read_csv("diabetes.csv")
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [12]:
feat_cols = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI",
            "DiabetesPedigreeFunction", "Age"]
X = df[feat_cols]
Y = df.Outcome
df.shape

(768, 9)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=1)

In [22]:
# Decision Tree
dt = DecisionTreeClassifier(criterion="entropy", max_depth=4) # object
dt = dt.fit(x_train, y_train)

pred = dt.predict(x_test)
pred_prob = dt.predict_proba(x_test)[:,1]

print("Auc Score Prob:", roc_auc_score(y_test, pred_prob))

Auc Score Prob: 0.8390804597701149


In [42]:
# Bagging
bag = BaggingClassifier(base_estimator=dt, n_estimators=100, random_state=42)
bag = bag.fit(x_train, y_train)

pred = bag.predict(x_test)
pred_prob = bag.predict_proba(x_test)[:,1]

print("Auc Score Prob:", roc_auc_score(y_test, pred_prob))

Auc Score Prob: 0.8900862068965517


In [43]:
# Random Forest
rfc = RandomForestClassifier(n_estimators=100, max_features=4)
rfc = rfc.fit(x_train, y_train)

pred = rfc.predict(x_test)
pred_porb =  rfc.predict_proba(x_test)[:,1]

print("Auc Score Prob:", roc_auc_score(y_test, pred_prob))

Auc Score Prob: 0.8900862068965517


In [25]:
# Adaptive Boosting
abc = AdaBoostClassifier(n_estimators=100, random_state=42)
abc = abc.fit(x_train, y_train)

pred = abc.predict(x_test)
pred_prob = abc.predict_proba(x_test)[:,1]

print("Auc Score Prob:", roc_auc_score(y_test, pred_prob))

Auc Score Prob: 0.8692528735632183


In [27]:
# Gradient Boosting
grd = GradientBoostingClassifier(n_estimators=100, random_state=42)
grd = grd.fit(x_train, y_train)

pred = grd.predict(x_test)
pred_prob = grd.predict_proba(x_test)[:,1]

print("Auc Score Prob:", roc_auc_score(y_test, pred_prob))

Auc Score Prob: 0.8706896551724137


In [31]:
dt_cv_score = cross_val_score(dt, X, Y, cv=10, scoring="roc_auc")

print(dt_cv_score)
print("Mean Auc Score(DecisionTree):", dt_cv_score.mean())

[0.78       0.79888889 0.73074074 0.65888889 0.70777778 0.8737037
 0.79037037 0.87333333 0.77461538 0.75384615]
Mean Auc Score(DecisionTree): 0.7742165242165242


In [33]:
bag_cv_score = cross_val_score(bag, X, Y, cv=10, scoring="roc_auc")

print(bag_cv_score)
print("Mean Auc Score(Bagging):", bag_cv_score.mean())

[0.80518519 0.82296296 0.84666667 0.68962963 0.81259259 0.89111111
 0.86074074 0.90518519 0.83538462 0.84      ]
Mean Auc Score(Bagging): 0.8309458689458691


In [34]:
frst_cv_score = cross_val_score(rfc, X, Y, cv=10, scoring="roc_auc")

print(frst_cv_score)
print("Mean Auc Score(RandomForest):", frst_cv_score.mean())

[0.75777778 0.82407407 0.82296296 0.73740741 0.8037037  0.85333333
 0.86666667 0.89592593 0.81730769 0.855     ]
Mean Auc Score(RandomForest): 0.8234159544159544


In [35]:
frst_pred = cross_val_predict(rfc, X, Y, cv=10)
frst_pred

array([1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,

In [36]:
adap_cv_score = cross_val_score(abc, X, Y, cv=10, scoring="roc_auc")

print(adap_cv_score)
print("Mean Auc Score(AdaptiveBoost):", adap_cv_score.mean())

[0.77777778 0.78740741 0.78       0.73481481 0.79925926 0.7562963
 0.81777778 0.82666667 0.82538462 0.86384615]
Mean Auc Score(AdaptiveBoost): 0.796923076923077


In [37]:
grd_cv_score = cross_val_score(grd, X, Y, cv=10, scoring="roc_auc")

print(grd_cv_score)
print("Mean Auc Score(GradientBoosting):", grd_cv_score.mean())

[0.78518519 0.84518519 0.83925926 0.72888889 0.82592593 0.85333333
 0.84814815 0.89037037 0.82384615 0.86      ]
Mean Auc Score(GradientBoosting): 0.830014245014245


In [None]:
# COMMENTS

# As a result of the Test Train split, 
# I would see the RandomForest and Bagging models show the same performance and choose one.

# I see the performances much better with Cross Validation. It makes more sense to choose the model accordingly.