In [1]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [2]:
X,y = make_classification(n_samples=10000,n_features=10,n_informative=3)

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [4]:
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print('accuracy :',accuracy_score(y_pred,y_test))

accuracy : 0.8893939393939394


In [5]:
## Bagging

bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                       n_estimators=500,
                       max_samples=0.35,
                       bootstrap=True,
                       random_state=42,n_jobs=-1)

In [6]:
bag.fit(X_train,y_train)


BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.35,
                  n_estimators=500, n_jobs=-1, random_state=42)

In [7]:
y_pred = bag.predict(X_test)

print('accuracy :',accuracy_score(y_test,y_pred))

accuracy : 0.9327272727272727


In [8]:

bag.estimators_samples_[0].shape

(2345,)

In [9]:
## SVM 

bag = BaggingClassifier(base_estimator=SVC(),
                       n_estimators=500,
                       max_samples=0.35,
                       bootstrap=True,
                       random_state=42,n_jobs=-1)

In [10]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)

print('accuracy :',accuracy_score(y_test,y_pred))

accuracy : 0.9121212121212121


In [11]:
### PASTING
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                       n_estimators=500,
                       max_samples=0.25,
                       bootstrap=False,
                       random_state=42,
                        verbose=1,
                       n_jobs=-1)


In [12]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)

print('accuracy :',accuracy_score(y_test,y_pred))

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    1.3s remaining:    4.2s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.4s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.2s


accuracy : 0.9318181818181818


[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.1s finished


In [13]:
##Random subspace
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                       n_estimators=500,
                       max_samples=1.0,
                       bootstrap=False,
                       random_state=42,
                        max_features=0.5,
                        bootstrap_features=True
                        )

In [14]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)

print('accuracy :',accuracy_score(y_test,y_pred))

accuracy : 0.9227272727272727


In [15]:
bag.estimators_features_[0].shape

(5,)

In [16]:
bag.estimators_samples_[0].shape

(6700,)

In [17]:
## random Patch

bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                       n_estimators=500,
                       max_samples=0.25,
                       bootstrap=True,
                       random_state=42,
                        max_features=0.5,
                        bootstrap_features=True
                        )

In [18]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)

print('accuracy :',accuracy_score(y_test,y_pred))

accuracy : 0.9233333333333333


In [19]:
## OOB Score # out of bag sample - 37% rows never got in training

bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                       n_estimators=500,
                       max_samples=0.25,
                       bootstrap=True,
                       random_state=42,                       
                        oob_score=True
                        )


bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)

print('accuracy :',accuracy_score(y_test,y_pred))

accuracy : 0.9306060606060607


In [20]:
bag.oob_score_

0.9335820895522388

In [21]:
## Bagging gives better result than Paste but still always try
## Good results come around 25% to 50% rows sampling mark
## Random Patches are useful with n-dim features
## To find better results use hyperparameters - GridSearchCV/RandomSeachCV