In [55]:
import sklearn

In [56]:
import sklearn.datasets


X,y=sklearn.datasets.make_classification(n_samples=10000,n_features=10,n_informative=3)


In [57]:
#splitting into training and test set 
import sklearn.model_selection


X_train,X_test,y_train,y_test=sklearn.model_selection.train_test_split(X,y,test_size=0.2,random_state=42)


In [58]:
dt=sklearn.tree.DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred=dt.predict(X_test)

In [59]:
import sklearn.metrics

print('Decision tree accuracy',sklearn.metrics.accuracy_score(y_test,y_pred))

Decision tree accuracy 0.8335


In [60]:
#bagging
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


bag=BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,#for sampling with replacememt,
    random_state=2
)

In [61]:
bag.fit(X_train,y_train)

In [62]:
y_pred=bag.predict(X_test)


In [63]:
print(sklearn.metrics.accuracy_score(y_test,y_pred))

0.88


In [64]:
bag.estimators_samples_[0].shape

(2000,)

### **BAGGING USING SVM**

In [65]:
from sklearn.svm import SVC
bag=BaggingClassifier(
    estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,#for sampling with replacement
    random_state=42
)

In [66]:
X_train.shape

(8000, 10)

In [67]:
bag.fit(X_train,y_train)

In [68]:
y_pred=bag.predict(X_test)

In [69]:
print('Bagging using svm',sklearn.metrics.accuracy_score(y_test,y_pred))

Bagging using svm 0.8725


### **pasting**

In [70]:
bag=BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,#for sampling without replaement
    random_state=42,
    verbose=1,
    n_jobs=1
)

In [71]:
bag.fit(X_train,y_train)

In [72]:
y_pred=bag.predict(X_test)
print('Bagging using pasting',sklearn.metrics.accuracy_score(y_test,y_pred))

Bagging using pasting 0.8845


### **RANDOM SUBSPACES**

In [73]:
bag=BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=False,#for sampling without replaement
    random_state=42,
    max_features=0.5,#taking half features at once
    bootstrap_features=True,#smapling columns with replacement
)

In [74]:
bag.fit(X_train,y_train)

In [75]:
y_pred=bag.predict(X_test)
print('Bagging using random subspaces',sklearn.metrics.accuracy_score(y_test,y_pred))

Bagging using random subspaces 0.8715


In [76]:
bag.estimators_samples_[0].shape

(8000,)

In [77]:
bag.estimators_features_[0].shape

(5,)

In [78]:
#random patches
bag=BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,#for sampling without replaement
    random_state=42,
    max_features=0.5,
    bootstrap_features=True
)

In [79]:
bag.fit(X_train,y_train)

In [80]:
y_pred=bag.predict(X_test)
print('Bagging using Radnom patches',sklearn.metrics.accuracy_score(y_test,y_pred))

Bagging using Radnom patches 0.863


### **OOB SCORE**

When random sampling around 37% of the samples remains in the bag. Here we can see the accurcay score with the unseen features

In [81]:
bag=BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,#for sampling without replaement
    random_state=42,
    oob_score=True
)

In [83]:
bag.fit(X_train,y_train)

In [85]:
bag.oob_score_

0.888

In [86]:
y_pred=bag.predict(X_test)
print('Bagging using pasting',sklearn.metrics.accuracy_score(y_test,y_pred))

Bagging using pasting 0.879


### BAGGING TIPS
* Bagging generally gives better results than Pasting.
* Good results come around the 25% to 50% row sampling mark.
* Random patches and subspaces should be used while dealing with high dimesional data.
* To find the correct hyperparamter values we can do Grid search cv

In [87]:
from sklearn.model_selection import GridSearchCV
parameters={
    'n_estimators':[50,100,500],
    'max_samples':[0.1,0.4,0.7,1.0],
    'bootstrap':[True,False],
    'max_features':[0.1,0.4,0.7,1.0]
}
search=GridSearchCV(BaggingClassifier(),parameters,cv=5)

In [88]:
search.fit(X_train,y_train)

In [91]:
search.best_params_

{'bootstrap': True,
 'max_features': 0.7,
 'max_samples': 1.0,
 'n_estimators': 100}

In [92]:
search.best_score_

np.float64(0.890125)

In [93]:
#score is not perfect beacause we have taken only some and avoided some in pareameteres