# Voting Classifier with MNIST dataset

Load the MNIST data and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing).

In [1]:
from sklearn.datasets import fetch_mldata

mnist = fetch_mldata('MNIST original')



In [2]:
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(mnist.data, mnist.target, random_state=42, test_size=10000)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=42, test_size=10000)

Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM.

In [3]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC

rnd_clf = RandomForestClassifier(random_state=42, n_estimators=10)
extra_tree_clf = ExtraTreesClassifier(random_state=42, n_estimators=10)
svm_clf = LinearSVC(random_state=42)

In [4]:
for clf in [rnd_clf, extra_tree_clf, svm_clf]:
    print('Training the', clf)
    clf.fit(X_train, y_train)

Training the RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
Training the ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)
Training the LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class=



In [5]:
[clf.score(X_val, y_val) for clf in [rnd_clf, extra_tree_clf, svm_clf]]

[0.9467, 0.9512, 0.8547]

Next, try to combine them into an ensemble that outperforms them all on the validation set, using a soft or hard voting classifier.

In [6]:
from sklearn.ensemble import VotingClassifier

voting_hard_clf = VotingClassifier(
    estimators=[('rf', rnd_clf), ('extra_tree', extra_tree_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_hard_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [7]:
voting_hard_clf.score(X_val, y_val)

0.9514

In [8]:
[clf.score(X_val, y_val) for clf in voting_hard_clf.estimators_]

[0.9467, 0.9512, 0.8547]

Following part of hard voting is inspired by https://github.com/ageron/handson-ml/blob/master/07_ensemble_learning_and_random_forests.ipynb

Let's remove the SVM to see if performance improves. It is possible to remove an estimator by setting it to None using `set_params()` like this:

In [9]:
voting_hard_clf.set_params(svc=None)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...obs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)), ('svc', None)],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

This updated the list of estimators:

In [10]:
voting_hard_clf.estimators

[('rf',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
              oob_score=False, random_state=42, verbose=0, warm_start=False)),
 ('extra_tree',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
             oob_score=False, random_state=42, verbose=0, warm_start=False)),
 ('svc', None)]

However, it did not update the list of _trained estimators_:

In [11]:
voting_hard_clf.estimators_

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
             oob_score=False, random_state=42, verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
      multi_class='ovr', penalty='l2', r

So we can either fit the `VotingClassifier` again, or just remove the SVM from the list of trained estimators:

In [12]:
del voting_hard_clf.estimators_[2]

In [13]:
voting_hard_clf.estimators_

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
             oob_score=False, random_state=42, verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)]

In [14]:
voting_hard_clf.score(X_val, y_val)

0.9442

Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set voting to "soft":

In [15]:
voting_soft_clf = VotingClassifier(
    estimators=[('rf', rnd_clf), ('extra_tree', extra_tree_clf)],
    voting='soft'
)
voting_soft_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...imators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

In [16]:
voting_soft_clf.score(X_val, y_val)

0.9625

In [17]:
voting_soft_clf.score(X_test, y_test)

0.9582

In [18]:
[clf.score(X_test, y_test) for clf in [rnd_clf, extra_tree_clf]]

[0.9434, 0.9444]