In [1]:
from sklearn.datasets import fetch_openml
import numpy as np

import warnings
warnings.filterwarnings(action='ignore', message='^internal gelsd')

In [2]:
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [3]:
X, y = mnist['data'], mnist['target']

In [4]:
X_train, X_validation, X_test = X[:50000], X[50000:60000], X[60000:]
y_train, y_validation, y_test = y[:50000], y[50000:60000], y[60000:]

In [5]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC

In [9]:
rnd_clf = RandomForestClassifier()
ext_clf = ExtraTreesClassifier()
svm_clf = LinearSVC(loss='hinge', random_state=42)

In [10]:
rnd_clf.fit(X_train, y_train)
ext_clf.fit(X_train, y_train)




ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [11]:
svm_clf.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=42, tol=0.0001, verbose=0)

In [25]:
for clf in rnd_clf, ext_clf, svm_clf:
    print(f"{clf.score(X_validation, y_validation)}")

0.9481
0.9513
0.8709


In [22]:
voting_clf = VotingClassifier(
   estimators=[('rf', rnd_clf), ('xt', ext_clf), ('svc', svm_clf)], voting='hard'
)

In [23]:
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...nge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=42, tol=0.0001, verbose=0))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [24]:
voting_clf.score(X_validation, y_validation)

0.9559

In [26]:
voting_clf.estimators

[('rf',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False)),
 ('xt',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
             oob_score=False, random_state=None, verbose=0, warm_start=False)),
 ('svc',
  LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
       intercept_scaling=1, loss='hinge', max_

In [27]:
# Remove svc because it has the lowest performance
voting_clf.set_params(svc=None)
voting_clf.estimators

[('rf',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False)),
 ('xt',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
             oob_score=False, random_state=None, verbose=0, warm_start=False)),
 ('svc', None)]

In [28]:
# svc is still there in the trained estimators
voting_clf.estimators_

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
      penalt

In [29]:
# ... so, retrain the ensemble
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...s=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)), ('svc', None)],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [30]:
voting_clf.estimators_

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False)]

In [31]:
voting_clf.score(X_validation, y_validation)

0.9479

In [32]:
# Try also soft voting. No need to retrain the ensemble
voting_clf.voting = "soft"
voting_clf.score(X_validation, y_validation)

0.9661

In [33]:
voting_clf.score(X_test, y_test)

0.9591

In [36]:
# The ensemble slightly outperforms the individual classifiers
for clf in [rnd_clf, ext_clf]:
    print(clf.score(X_test, y_test))

0.9459
0.9517


## Part 2 - Stacking ensemble

In [46]:
rnd_clf_predictions = rnd_clf.predict(X_validation)
ext_clf_predicitions = ext_clf.predict(X_validation)
X_stack = np.array([[a, b] for a, b in zip(rnd_clf_predictions, ext_clf_predicitions)])
y_stack = y_validation

In [58]:
rnd_clf_new = RandomForestClassifier(n_estimators=100, oob_score=True)
rnd_clf_new.fit(X_stack, y_stack)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [55]:
rnd_clf_new.oob_score_

AttributeError: 'RandomForestClassifier' object has no attribute 'oob_score_'

In [49]:
# Prepare a "test" set for the blender
rnd_clf_test_predictions = rnd_clf.predict(X_test)
ext_clf_test_predicitions = ext_clf.predict(X_test)
X_stack_test = np.array([[a, b] for a, b in zip(rnd_clf_test_predictions, ext_clf_test_predicitions)])
y_stack_test = y_test

In [50]:
# Evaluate the blender on the test set
rnd_clf_new.score(X_stack_test, y_stack_test)

0.9488