In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [4]:
X,y = make_classification(n_samples=10000, n_features=10, n_informative=3)

In [5]:
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [6]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)

print("Decision Tree accuracy", accuracy_score(y_test,y_pred))

Decision Tree accuracy 0.9015


In [7]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Decision Tree accuracy", accuracy_score(y_test,y_pred))

Decision Tree accuracy 0.9015


# Bagging 

In [11]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42
)

In [12]:
bag.fit(X_train,y_train)

0,1,2
,estimator,DecisionTreeClassifier()
,n_estimators,500
,max_samples,0.25
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,42

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [14]:
y_pred = bag.predict(X_test)

In [16]:
accuracy_score(y_test,y_pred) * 100

93.4

# Bagging using SVM

In [18]:
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

bag = BaggingClassifier(
    estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42
)


In [19]:
bag.fit(X_train,y_train)

0,1,2
,estimator,SVC()
,n_estimators,500
,max_samples,0.25
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,42

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [25]:
y_pred = bag.predict(X_test)
print("Bagging using SVM", accuracy_score(y_test,y_pred))

Bagging using SVM 0.9215


In [24]:
bag.estimators_samples_[0].shape

(2000,)

# Pasting

In [27]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,
    random_state=42,
    verbose=1,
    n_jobs=1
)

In [29]:
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print("Bagging classifier accuracy:", accuracy_score(y_test, y_pred))

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.2s finished


Bagging classifier accuracy: 0.934


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


# Random Subspaces

In [30]:
rs = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=200,
    max_features=0.5,    
    max_samples=1.0,   
    bootstrap=False,    
    random_state=42,
    n_jobs=-1
)

In [41]:
# train
rs.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,n_estimators,200
,max_samples,1.0
,max_features,0.5
,bootstrap,False
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,-1
,random_state,42

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [31]:
# predict
y_pred = rs.predict(X_test)

# accuracy
print("Random Subspaces accuracy:", accuracy_score(y_test, y_pred))


Random Subspaces accuracy: 0.9275


# Random Patches

In [32]:
rp = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=200,
    max_samples=0.5,    
    max_features=0.5,   
    bootstrap=True,
    bootstrap_features=True,
    random_state=42,
    n_jobs=-1
)

In [37]:
# train
rp.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,n_estimators,200
,max_samples,0.5
,max_features,0.5
,bootstrap,True
,bootstrap_features,True
,oob_score,False
,warm_start,False
,n_jobs,-1
,random_state,42

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [33]:
# predict
y_pred = rp.predict(X_test)

# accuracy
print("Random Patches accuracy:", accuracy_score(y_test, y_pred))

Random Patches accuracy: 0.927


In [35]:
bag.estimators_features_[0].shape

(10,)

# OBB Score

In [38]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=300,
    max_samples=0.5,
    bootstrap=True,         
    oob_score=True,          
    random_state=42,
    n_jobs=-1
)

In [39]:
bag.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,n_estimators,300
,max_samples,0.5
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,True
,warm_start,False
,n_jobs,-1
,random_state,42

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [40]:
print("OOB score:", bag.oob_score_)

OOB score: 0.92275


In [42]:
y_pred = bag.predict(X_test)
print("Accuracy", accuracy_score(y_test,y_pred))

Accuracy 0.9335


# 🧠 **Bagging Tips**

**Bagging (Bootstrap Aggregating)** is an ensemble technique that helps reduce variance and improve model stability.

---

### ✅ **Why Bagging Works Well**
- **Reduces overfitting** by averaging multiple models  
- **Works best with high-variance models** (like Decision Trees)  
- **Improves generalization** on unseen data  

---

### 📌 **Bagging vs Pasting**
- **Bagging** → sampling **with replacement**  
- **Pasting** → sampling **without replacement**  
- 👉 *Bagging generally gives better results than Pasting* because:
  - it introduces **more randomness**
  - models become **less correlated**

---

### ⚙️ **Best Practices**
- Use **many weak learners** instead of one strong learner  
- Keep base models **simple** (shallow trees work great)  
- Enable **OOB score** for fast validation  
- Increase `n_estimators` for more stable performance  

---

### ✨ **Quick Tip**
> Bagging shines when your model **overfits easily** but performs well on training data.

---

<small>*Bagging is the foundation of Random Forests and many modern ensemble methods.*</small>


### **Bagging Tips**

- **Bagging (Bootstrap Aggregating)** reduces variance and overfitting  
- Works best with **high-variance models** like Decision Trees  
- Uses **sampling with replacement**  
- **Bagging generally performs better than Pasting** due to higher randomness  
- Improves **model stability and accuracy**  

<small>*Bagging is the base idea behind Random Forests.*</small>


# Applying GridSearchCv

In [44]:
from sklearn.model_selection import GridSearchCV

In [45]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_samples': [0.5, 0.7, 1.0],
    'bootstrap': [True, False],
    'max_features' : [0.1,0.4,0.7,1.0]
}

In [46]:
search = GridSearchCV(BaggingClassifier(), param_grid, cv=5)

In [None]:
search.fit(X_train,y_train)

In [None]:
search.best_paras_
search.best_score_

In [None]:
search.best_params_