In [21]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [22]:
X, y = make_classification(n_samples=10000, n_features=10, n_informative=3)

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [24]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Decision Tree accuracy",accuracy_score(y_test,y_pred))

Decision Tree accuracy 0.7795


# 1. Without replacement

#### a. Bagging using DT

In [25]:
bag_dt = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.5,    # row sampling
    bootstrap=True,
    random_state=42,
    verbose = 1,
    n_jobs=-1
)
bag_dt.fit(X_train,y_train)
y_pred = bag_dt.predict(X_test)
accuracy_score(y_test,y_pred)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    2.2s remaining:   11.6s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    2.4s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished


0.841

In [26]:
bag_dt.estimators_samples_[0].shape

(4000,)

In [27]:
bag_dt.estimators_features_[0].shape

(10,)

#### b. Bagging using SVM

In [28]:
bag_svc = BaggingClassifier(
    base_estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,     # row sampling
    bootstrap=True,
    random_state=42,
    verbose = 1,
    n_jobs=-1
)
bag_svc.fit(X_train,y_train)
y_pred = bag_svc.predict(X_test)
accuracy_score(y_test,y_pred)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    4.8s remaining:   24.5s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    5.0s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:   10.4s remaining:   52.3s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:   10.7s finished


0.816

In [29]:
bag_svc.estimators_samples_[0].shape

(2000,)

In [30]:
bag_svc.estimators_features_[0].shape

(10,)

# 2. Pasting

#### a. Bagging using DT

In [31]:
bag_dt = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.5,   # row sampling
    bootstrap=False,  # no replacement
    random_state=42,
    verbose = 1,
    n_jobs=-1
)
bag_dt.fit(X_train, y_train)
y_pred = bag_dt.predict(X_test)
accuracy_score(y_test, y_pred)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    2.8s remaining:   14.5s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    3.0s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.1s finished


0.8405

In [32]:
bag_dt.estimators_samples_[0].shape

(4000,)

In [33]:
bag_dt.estimators_features_[0].shape

(10,)

#### b. Bagging using SVM

In [34]:
bag_svc = BaggingClassifier(
    base_estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,   # row sampling
    bootstrap=False,    # no replacement
    random_state=42,
    verbose = 1,
    n_jobs=-1
)
bag_svc.fit(X_train, y_train)
y_pred = bag_svc.predict(X_test)
accuracy_score(y_test, y_pred)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    6.1s remaining:   30.9s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    6.5s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:   11.7s remaining:   58.8s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:   12.2s finished


0.8165

In [35]:
bag_svc.estimators_samples_[0].shape

(2000,)

In [36]:
bag_svc.estimators_features_[0].shape

(10,)

# 3. Random Subspace

#### a. Bagging using DT

In [43]:
bag_dt = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,   # no row sampling
    bootstrap=False,  # (not required)
    max_features=0.5, # cols sampling
    random_state=42,
    verbose=1,
    n_jobs=-1
)
bag_dt.fit(X_train, y_train)
y_pred = bag_dt.predict(X_test)
accuracy_score(y_test, y_pred)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    3.5s remaining:   17.8s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    3.9s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.4s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.3s finished


0.8325

In [44]:
bag_dt.estimators_samples_[0].shape

(8000,)

In [45]:
bag_dt.estimators_features_[0].shape

(5,)

#### b. Bagging using SVM

In [47]:
bag_svc = BaggingClassifier(
    base_estimator=SVC(),
    n_estimators=500,
    max_samples=1.0,   # no row sampling
    bootstrap=False,  # (not required)
    max_features=0.5, # cols sampling
    random_state=42,
    verbose=1,
    n_jobs=-1
)
bag_svc.fit(X_train, y_train)
y_pred = bag_svc.predict(X_test)
accuracy_score(y_test, y_pred)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.


KeyboardInterrupt: 

In [None]:
bag_svc.estimators_samples_[0].shape

In [None]:
bag_svc.estimators_features_[0].shape

# 4. Random Patches

In [48]:
bag_dt = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,  # row sampling
    bootstrap=True,  # (not required)
    max_features=0.5,  # cols sampling
    random_state=42,
    verbose=1,
    n_jobs=-1
)
bag_dt.fit(X_train, y_train)
y_pred = bag_dt.predict(X_test)
accuracy_score(y_test, y_pred)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    2.3s remaining:   12.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    2.6s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished


0.84

In [49]:
bag_dt.estimators_samples_[0].shape

(2000,)

In [50]:
bag_dt.estimators_features_[0].shape

(5,)

# OOB Score

In [51]:
bag_dt = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,  
    bootstrap=True,  
    oob_score=True,   # OOB Score
    random_state=42,
    verbose=1,
    n_jobs=-1
)
bag_dt.fit(X_train, y_train)
y_pred = bag_dt.predict(X_test)
accuracy_score(y_test, y_pred)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    1.1s remaining:    5.9s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    1.4s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished


0.841

In [52]:
bag_dt.oob_score_

0.838125

In [53]:
y_pred = bag_dt.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))

Accuracy 0.841


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished


# Bagging Tips
* Bagging generally gives better results than Pasting
* Good results come around the 25% to 50% row sampling mark
* Random patches and subspaces should be used while dealing with high dimensional data
* To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV

# All possible case : Grid Search CV

In [60]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'base_estimator':[None,DecisionTreeClassifier(),SVC()],
    'n_estimators': [50,100,500],
    'max_samples': [0.1,0.4,0.7,1.0],
    'bootstrap' : [True,False],
    'max_features' : [0.1,0.4,0.7,1.0],
}

In [61]:
search = GridSearchCV(
    BaggingClassifier(n_jobs=-1),
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1
)

In [62]:
search.fit(X_train,y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


KeyboardInterrupt: 

In [None]:
search.best_params_

In [59]:
search.best_score_

0.840625