In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1)
X, y = mnist["data"], mnist["target"]
y = y.astype(np.uint8)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10000, random_state=0)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=10000, random_state=0)

In [5]:
print(X_train.shape), print(y_train.shape)
print(X_val.shape), print(y_val.shape)
print(X_test.shape), print(y_test.shape)

(50000, 784)
(50000,)
(10000, 784)
(10000,)
(10000, 784)
(10000,)


(None, None)

### Random forest classifier

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [18]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)

rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

### ExtraTree classifier

In [19]:
from sklearn.ensemble import ExtraTreesClassifier

extr_clf = ExtraTreesClassifier(n_estimators=10, n_jobs=-1)

extr_clf.fit(X_train, y_train)

y_pred_extr = extr_clf.predict(X_test)

### Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression 
## SoftMax
## consider scaling the features
softmax_clf = LogisticRegression(multi_class="multinomial",solver="lbfgs", C=10)
softmax_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=10, multi_class='multinomial')

In [21]:
estimators = [rnd_clf, extr_clf, softmax_clf]
for estimator in estimators:
  print(estimator.score(X_val, y_val))

0.9463
0.9474
0.9178


### voting classifier

In [22]:
from sklearn.ensemble import VotingClassifier

named_estimators = [
    ("random_forest_clf", rnd_clf),
    ("extra_trees_clf", extr_clf),
    ("softmax_clf", softmax_clf)
]

In [23]:
voting_clf = VotingClassifier(named_estimators)

In [24]:
voting_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(n_estimators=10,
                                                     n_jobs=-1)),
                             ('extra_trees_clf',
                              ExtraTreesClassifier(n_estimators=10, n_jobs=-1)),
                             ('softmax_clf',
                              LogisticRegression(C=10,
                                                 multi_class='multinomial'))])

In [25]:
voting_clf.score(X_val, y_val)

0.9563

the hard voting classifier is performing better than the single classifiers before

In [26]:
voting_clf.voting = "soft"
voting_clf.score(X_val, y_val)

0.9565

by doing soft voting classifier does not improve drastically the score

### comparing to the individual classifiers

In [27]:
voting_clf.score(X_test, y_test)

0.9511

In [28]:
for estimator in voting_clf.estimators_:
  print(estimator.score(X_test, y_test))

0.9467
0.9487
0.9152
