# Ensembling on Digits dataset

In [1]:
#import libraries
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import time

In [2]:
digits = load_digits()
print(digits.data.shape)

(1797, 64)


In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digits['data'], digits['target'], test_size = 0.25, random_state = 11)

In [4]:
#data details
print("X_train :", len(X_train))
print("X_test:", len(X_test))
print("y_train :", len(y_train))
print("y_train :", len(y_test))

X_train : 1347
X_test: 450
y_train : 1347
y_train : 450


In [5]:
#create model objects
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

In [6]:
#create an ensembled classifier
voting_clf = VotingClassifier(
estimators=[('lr', log_clf), ('rf', rnd_clf)],
voting='soft'
)

In [7]:
#Train the model
start = time.time()
voting_clf.fit(X_train, y_train)
print("Training time : ", time.time() - start)

Training time :  0.1565854549407959


In [8]:
#divide training data
data_size = len(X_train)

In [9]:
from sklearn.metrics import accuracy_score
clf_list = [log_clf, rnd_clf, voting_clf]
for clf in clf_list:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    print(clf.__class__.__name__ + " accuracy:", accuracy)

LogisticRegression accuracy: 0.957777777778
RandomForestClassifier accuracy: 0.935555555556
VotingClassifier accuracy: 0.971111111111


# Bagging on Digits dataset

In [23]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [26]:
#Set bootstrap = True for Bagging and false for Pasting
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=1000, bootstrap=True, n_jobs=-1,random_state=11, verbose = 1)

In [27]:
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
accuracy_bagging = accuracy_score(y_pred, y_test)
print("Accuracy of bagging :", accuracy_bagging)

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    2.4s remaining:    2.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.6s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.9s remaining:    0.9s


Accuracy of bagging : 0.951111111111


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.1s finished


# Random Patches and Random Subspaces for Bagging

In [55]:
bag_clf_patch = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=1000, bootstrap=True, n_jobs=-1,random_state=11, verbose = 1, max_features=0.6, bootstrap_features=True)

In [56]:
bag_clf_patch.fit(X_train, y_train)
y_pred = bag_clf_patch.predict(X_test)
accuracy_bagging = accuracy_score(y_pred, y_test)
print("Accuracy of bagging with random patch:", accuracy_bagging)

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    2.0s remaining:    2.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.2s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.0s remaining:    1.0s


Accuracy of bagging with random patch: 0.973333333333


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.2s finished


In [13]:
#Set bootstrap = True for Bagging and false for Pasting
pas_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=1000, bootstrap=False, n_jobs=-1,random_state=11, verbose = 1)

In [14]:
pas_clf.fit(X_train, y_train)
y_pred = pas_clf.predict(X_test)
accuracy_bagging = accuracy_score(y_pred, y_test)
print("Accuracy of pasting :", accuracy_bagging)

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    3.0s remaining:    3.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.3s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.0s remaining:    1.0s


Accuracy of pasting : 0.931111111111


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.2s finished


# Ensemble of Bagging and Pasting

In [15]:
#create an ensembled classifier
voting_clf_bagging = VotingClassifier(
estimators=[('br', bag_clf), ('pr', pas_clf)],
voting='soft'
)

In [16]:
#Train the model
start = time.time()
voting_clf_bagging.fit(X_train, y_train)
print("Training time : ", time.time() - start)

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    2.5s remaining:    2.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.7s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    3.0s remaining:    3.0s


Training time :  6.483613967895508


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.3s finished


In [17]:
y_pred = voting_clf_bagging.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
print(voting_clf_bagging.__class__.__name__ + " accuracy:", accuracy)

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.0s remaining:    1.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.2s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.0s remaining:    1.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.2s finished


VotingClassifier accuracy: 0.942222222222


# Using GridSearch

In [18]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# GridSearch on Bagging 

In [57]:
parameters = {
    'n_estimators' : np.arange(500, 1000, 100).tolist(),
    'max_samples': np.arange(100, 1000, 200).tolist(),
    'bootstrap' : [True, False],
    'max_features': np.arange(0.2, 1.0, 0.2),
    'bootstrap_features': [True, False]
}
grid_voting_clf = GridSearchCV(bag_clf, param_grid=parameters, n_jobs=-1,verbose=10, cv=5)

In [58]:
grid_voting_clf.fit(X_train, y_train)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   40.6s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   48.4s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   58.0s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.9min
[Paralle

GridSearchCV(cv=5, error_score='raise',
       estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
        ...n_estimators=500, n_jobs=-1, oob_score=False,
         random_state=11, verbose=1, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [500, 600, 700, 800, 900], 'max_samples': [100, 300, 500, 700, 900], 'bootstrap': [True, False], 'max_features': array([ 0.2,  0.4,  0.6,  0.8]), 'bootstrap_features': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [60]:
#best parameters
print (grid_voting_clf.best_params_)

{'bootstrap': False, 'bootstrap_features': True, 'max_features': 0.60000000000000009, 'max_samples': 900, 'n_estimators': 900}


In [61]:
predicted_grid_voting_clf = grid_voting_clf.predict(X_test)
grid_accuracy = accuracy_score(predicted_grid_voting_clf, y_test)
print("Accuracy for grid serach is :", grid_accuracy)

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.2s remaining:    1.2s


Accuracy for grid serach is : 0.973333333333


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.4s finished


# Conclusion

In [None]:
# Accuracy using ensemble of Logistic Regression and Random Forest =  0.9711
# Bagging Accuracy = 0.9511
# Bagging with random patches = 0.9733
# Pasting Accuracy = 0.9311
# Ensemble of Bagging and Pasting = 0.9422
# Grid Search on Bagging with random patches = 0.9733