In [1]:
# train mnist with voting classifier
from sklearn.datasets import fetch_openml
import numpy as np
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.target = mnist.target.astype(np.uint8)

In [3]:
from sklearn.model_selection import train_test_split
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42)
# 10000 for test
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)
# further split the rest for train and validation
# 10000 for validation

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import ExtraTreesClassifier


In [5]:
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
ext_clf = ExtraTreesClassifier(n_estimators = 100, random_state=42)
svm_clf = LinearSVC(max_iter=100, tol=20, random_state=42)


In [6]:
# train each estimator
rnd_clf.fit(X_train, y_train)
ext_clf.fit(X_train, y_train)
svm_clf.fit(X_train, y_train)

LinearSVC(max_iter=100, random_state=42, tol=20)

In [7]:
vt_clf = VotingClassifier(
estimators=[('rnd', rnd_clf), ('extra', ext_clf), ('svm', svm_clf)],
    voting='hard'
)
vt_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rnd', RandomForestClassifier(random_state=42)),
                             ('extra', ExtraTreesClassifier(random_state=42)),
                             ('svm',
                              LinearSVC(max_iter=100, random_state=42,
                                        tol=20))])

In [8]:
from sklearn.metrics import accuracy_score
the_estimators = [rnd_clf, ext_clf, svm_clf, vt_clf]
for i in the_estimators:
    i.fit(X_train, y_train)
    y_pred = i.predict(X_val)
    print(accuracy_score(y_pred, y_val))
# votingclassifier is slightly better    
# extratree perform the best?

0.9692
0.9715
0.859
0.9693


In [9]:
# try all on the test set
from sklearn.metrics import accuracy_score
the_estimators = [rnd_clf, ext_clf, svm_clf, vt_clf]
for i in the_estimators:
    i.fit(X_train, y_train)
    y_pred = i.predict(X_test)
    print(accuracy_score(y_pred, y_test))
    

0.9645
0.9691
0.8566
0.965


In [10]:
# try to remove svm
vt_clf.set_params(svm=None)

VotingClassifier(estimators=[('rnd', RandomForestClassifier(random_state=42)),
                             ('extra', ExtraTreesClassifier(random_state=42)),
                             ('svm', None)])

In [11]:
y_pred = vt_clf.predict(X_val)
print(accuracy_score(y_pred, y_val))
# it was 0.9693 before we delete svm

0.9693


In [12]:
#vt_clf.voting="soft"
#y_pred = vt_clf.predict(X_val)
#print(accuracy_score(y_pred, y_val))
# soft voting is slightly better

In [13]:
#vt_clf.voting="hard"
# the test score was 0.965 before remove svm
#y_pred = vt_clf.predict(X_test)
#print(accuracy_score(y_pred, y_test))

In [14]:
y_pred = vt_clf.predict(X_test)
print(accuracy_score(y_pred, y_test))
#　worse than randomforest?

0.965


In [15]:
# stacking ensemble

In [16]:
estimators = [rnd_clf, ext_clf, svm_clf]


In [17]:
X_val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32)
# enumerate help with index

for index, estimator in enumerate(estimators):
    X_val_predictions[:, index] = estimator.predict(X_val)

In [18]:
X_val_predictions


array([[5., 5., 5.],
       [8., 8., 8.],
       [2., 2., 3.],
       ...,
       [7., 7., 7.],
       [6., 6., 6.],
       [7., 7., 7.]], dtype=float32)

In [20]:
list(enumerate(estimators))

[(0, RandomForestClassifier(random_state=42)),
 (1, ExtraTreesClassifier(random_state=42)),
 (2, LinearSVC(max_iter=100, random_state=42, tol=20))]

In [21]:
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.fit(X_val_predictions, y_val)
# successfully train a blender

RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)

In [22]:
rnd_forest_blender.oob_score_

0.9703

In [23]:
# evaluate the test set
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

In [25]:
y_pred = rnd_forest_blender.predict(X_test_predictions)

In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)

0.9661