In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [4]:
X,y = make_classification(n_samples=10000, n_features = 10, n_informative=3)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2 , random_state=42)

In [7]:
X_train.shape

(8000, 10)

In [10]:
#model train
dt = DecisionTreeClassifier()

In [11]:
dt.fit(X_train, y_train)

In [12]:
y_pred = dt.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score


In [14]:
print(f"Decision Tree Accuracy Score : {accuracy_score(y_test,y_pred)}")

Decision Tree Accuracy Score : 0.9165


## Bagging Classifier

In [18]:
bag = BaggingClassifier(
    estimator = DecisionTreeClassifier(),  #all same model , base model selection
    n_estimators= 500, # total number of base model
    max_samples=0.25, #this is in percentage in 100 , 0.25 means 25
    bootstrap=True,  # row sample with replacement
    random_state=42
)

In [19]:
bag.fit(X_train,y_train)

In [20]:
y_pred = bag.predict(X_test)

In [21]:
print(f"Bagging Classification Accuracy Score : {accuracy_score(y_test,y_pred)}")


Bagging Classification Accuracy Score : 0.9355


In [23]:
#for viewing how may rows(sample) does each base model use
bag.estimators_samples_[0].shape   #shape of first base model

#the ans will be 2000 because there was total 8000 on training and on sample 0.25 which is 2000

(2000,)

In [24]:
#for viewing how may columns(features) does each base model use
bag.estimators_features_[0].shape   #shape of first base model

(10,)

## Pasting

In [29]:
pasting = BaggingClassifier(
    estimator = DecisionTreeClassifier(),  #all same model , base model selection
    n_estimators= 500, # total number of base model
    max_samples=0.25, #this is in percentage in 100 , 0.25 means 25
    bootstrap=False,  # row sample without replacement
    random_state=42,
    verbose =1, # to see the realtime print of working
    n_jobs = -1 # to distribute task on all cpu cores
)

In [32]:
pasting.fit(X_train,y_train)
y_pred = pasting.predict(X_test)
print(f"Bagging (Pasting) Classification Accuracy Score : {accuracy_score(y_test,y_pred)}")


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    2.1s remaining:   15.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    2.3s finished
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.


Bagging (Pasting) Classification Accuracy Score : 0.941


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished


## Random Subspaces

In [33]:
rs = BaggingClassifier(
    estimator = DecisionTreeClassifier(),  #all same model , base model selection
    n_estimators= 500, # total number of base model
    max_samples=0.25, #this is in percentage in 100 , 0.25 means 25
    bootstrap=False,  # row sample without replacement
    random_state=42,
    max_features=0.5, #number of features(columns) 0.5 means 50% of total columns
    bootstrap_features=True #with column replacement
    
)

In [34]:
rs.fit(X_train,y_train)
y_pred = rs.predict(X_test)
print(f"Bagging (Random Subspace) Classification Accuracy Score : {accuracy_score(y_test,y_pred)}")


Bagging (Random Subspace) Classification Accuracy Score : 0.9215


In [37]:
#for viewing how may columns(features) does each base model use
rs.estimators_features_[0].shape   #shape of first base model

# 0.5 means 5 out of 10

(5,)

## Random Patches

In [38]:
rp = BaggingClassifier(
    estimator = DecisionTreeClassifier(),  #all same model , base model selection
    n_estimators= 500, # total number of base model
    max_samples=0.25, #this is in percentage in 100 , 0.25 means 25
    bootstrap=True,  # row sample with replacement
    random_state=42,
    max_features=0.5, #number of features(columns) 0.5 means 50% of total columns
    bootstrap_features=True #with column replacement
    
)

In [39]:
rp.fit(X_train,y_train)
y_pred = rp.predict(X_test)
print(f"Bagging (Random Patches) Classification Accuracy Score : {accuracy_score(y_test,y_pred)}")


Bagging (Random Patches) Classification Accuracy Score : 0.9205


## OOB Score (Out Of Bag  - samples)

In [41]:
oob = BaggingClassifier(
    estimator = DecisionTreeClassifier(), 
    n_estimators= 500, 
    max_samples=0.25, 
    bootstrap=True, 
    oob_score=True, #only those samples(rows) that doesnot appears on any base model (decision trees) - default false
    random_state=42,
)

In [42]:
oob.fit(X_train,y_train)

In [44]:
oob.oob_score_  #rought estimation of accuracy of model

0.93375

In [46]:
y_pred = rp.predict(X_test)
print(f"Bagging (OOB score enable)Classification Accuracy Score : {accuracy_score(y_test,y_pred)}")


Bagging (OOB score enable)Classification Accuracy Score : 0.9205


## Applying GridSearchCV

In [47]:
from sklearn.model_selection import GridSearchCV

In [66]:
parameters = {
    'n_estimators': [50, 100, 500],  # This is the number of estimators
    'max_samples': [0.3, 0.5, 0.8],  # The proportion of samples to draw
    'bootstrap': [True, False],      # Whether samples are drawn with replacement
    'max_features': [0.3, 0.5, 0.8]  # The proportion of features to draw
}

In [67]:
search = GridSearchCV(BaggingClassifier(), parameters, cv=5)

In [None]:
search.fit(X_train,y_train)

In [None]:
search.best_score_

In [None]:
search.best_params_