In [1]:
# Ensemble classifier using hard/soft voting composing of a
# multilayer perceptron, forests, and a linear support vectors.
# Create a stacking ensemble blending the ensemble predictions

# Common Imports
import numpy as np

# ML Imports

# Data Imports
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
# Classifier Imports
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
# Metric Imports
from sklearn.metrics import accuracy_score


# Declare Functions
def display_estimators(clf_list, score_list):
    for i, j, in zip(clf_list, score_list):
        print("Estimator:", i)
        print("Estimator:", j)

  from numpy.core.umath_tests import inner1d


In [3]:
# Create training, test, and validation sets
mnist = fetch_mldata("MNIST original")

X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

In [4]:
# Instantiate models
random_forest_clf = RandomForestClassifier(random_state=42)
extra_tree_clf = ExtraTreesClassifier(random_state=42)
svm_clf = LinearSVC(random_state=42)
mlp_clf = MLPClassifier(random_state=42)

named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_tree_clf", extra_tree_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

voting_clf = VotingClassifier(named_estimators)

In [5]:
# Fit the Voting classifier
voting_clf.fit(X_train, y_train)

# Train models
estimators = [random_forest_clf, extra_tree_clf, svm_clf, mlp_clf]
for estimator in estimators:
    estimator.fit(X_train, y_train)

scores = [estimator.score(X_val, y_val) for estimator in estimators]
display_estimators(estimators, scores)

Estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
Estimator: 0.9467
Estimator: ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)
Estimator: 0.9512
Estimator: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter

In [6]:
# Remove SVC
print("Without the LinearSVC")
voting_clf.set_params(svm_clf=None)

scores = [estimator.score(X_val, y_val) for estimator in estimators]
display_estimators(estimators, scores)

del voting_clf.estimators_[2]

Without the LinearSVC
Estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
Estimator: 0.9467
Estimator: ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)
Estimator: 0.9512
Estimator: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='sq

In [7]:
# Set show soft v hard voting classification
hard = voting_clf.score(X_val, y_val)
voting_clf.voting = "soft"
soft = voting_clf.score(X_val, y_val)

print("Hard voting score compared to Softvoting score")
print("Hard: {}, Soft: {}".format(hard, soft))

  if diff:


Hard voting score compared to Softvoting score
Hard: 0.9643, Soft: 0.9671


  if diff:


In [8]:
# Generate predictions using the estimators for a stacking ensemble
X_val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_val_predictions[:, index] = estimator.predict(X_val)

rnd_forest_blender = RandomForestClassifier(
    n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.fit(X_val_predictions, y_val)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [9]:
# Evaluate on the test set
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

y_pred = rnd_forest_blender.predict(X_test_predictions)

# Stacking prediction ensembles
print("Stacking ensemble predictions")
print("OOB CV Training Accuracy score:{}".format(
    rnd_forest_blender.oob_score_))
print("Test accuracy score:{}".format(accuracy_score(y_test, y_pred)))

Stacking ensemble predictions
OOB CV Training Accuracy score:0.9615
Test accuracy score:0.9563
