# Ensemble Learning: Bagging Tutorial

We will use pima indian diabetes dataset to predict if a person has a diabetes or not based on certain features such as blood pressure, skin thickness, age etc. We will train a standalone model first and then use bagging ensemble technique to check how it can improve the performance of the model

# dataset credit: https://www.kaggle.com/gargmanas/pima-indians-diabetes

In [12]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [13]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
df.shape

(768, 9)

In [15]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [16]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [17]:
df.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [18]:
268/500

0.536

In [19]:
x = df.drop("Outcome", axis="columns")
y = df.Outcome

# Scaling the Data

In [116]:
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()

x_scalar = scalar.fit_transform(x)
x_scalar[:4]

array([[ 0.63994726,  0.84832379,  0.14964075,  0.90726993, -0.69289057,
         0.20401277,  0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575,  0.53090156, -0.69289057,
        -0.68442195, -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, -1.28821221, -0.69289057,
        -1.10325546,  0.60439732, -0.10558415],
       [-0.84488505, -0.99820778, -0.16054575,  0.15453319,  0.12330164,
        -0.49404308, -0.92076261, -1.04154944]])

In [24]:
from sklearn.model_selection import  train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_scalar,y, stratify=y, random_state=10)

In [25]:
x_train.shape

(576, 8)

In [26]:
x_test.shape

(192, 8)

In [27]:
y_train.value_counts()

0    375
1    201
Name: Outcome, dtype: int64

In [28]:
201/375

0.536

# Train data using stand alone model with DicisionTree 

In [38]:
from sklearn.tree import  DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
s = cross_val_score(DecisionTreeClassifier(), x, y, cv=5)
s

array([0.68181818, 0.68831169, 0.66233766, 0.79738562, 0.73856209])

In [44]:
s.mean()

0.7136830489771666

# Train using  Bagging

In [122]:
from sklearn.ensemble import BaggingClassifier

bag_model =BaggingClassifier(
        base_estimator=None, 
        n_estimators=100,
        max_samples=0.8, 
        max_features=1.0, 
        bootstrap=True, 
        bootstrap_features=False, 
        oob_score=True, 
        warm_start=False, 
        random_state=0, 
        verbose=100)
bag_model.fit(x_train, y_train)
bag_model.oob_score_

Building estimator 1 of 100 for this parallel run (total 100)...
Building estimator 2 of 100 for this parallel run (total 100)...
Building estimator 3 of 100 for this parallel run (total 100)...
Building estimator 4 of 100 for this parallel run (total 100)...
Building estimator 5 of 100 for this parallel run (total 100)...
Building estimator 6 of 100 for this parallel run (total 100)...
Building estimator 7 of 100 for this parallel run (total 100)...
Building estimator 8 of 100 for this parallel run (total 100)...
Building estimator 9 of 100 for this parallel run (total 100)...
Building estimator 10 of 100 for this parallel run (total 100)...
Building estimator 11 of 100 for this parallel run (total 100)...
Building estimator 12 of 100 for this parallel run (total 100)...
Building estimator 13 of 100 for this parallel run (total 100)...
Building estimator 14 of 100 for this parallel run (total 100)...
Building estimator 15 of 100 for this parallel run (total 100)...
Building estimator 

0.7586805555555556

In [123]:
bag_model.score(x_test, y_test)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


0.7864583333333334

In [124]:
bag_model = BaggingClassifier(
    base_estimator = None,
    n_estimators = 100,
    max_samples = .8,
    oob_score = True,
    random_state = 0
)
s = cross_val_score(bag_model, x, y, cv=5)
print(f"score values:{s}")
s.mean()

score values:[0.74025974 0.74025974 0.74675325 0.81699346 0.76470588]


0.7617944147355912

# We can see some improvement in test score with bagging classifier as compared to a standalone classifier

# RandomForestClassifier

In [125]:
from sklearn.ensemble import  RandomForestClassifier
s = cross_val_score(RandomForestClassifier(n_estimators=100), x,y, cv=5)
s.mean()

0.7604957134368899

# Support Vector Machine

In [126]:
from sklearn.svm import SVC
s = cross_val_score(SVC(), x,y, cv=5)
s.mean()

0.6510482981071216