<a href="https://colab.research.google.com/github/hoelzl/ML-Course/blob/master/notebooks/nb020_welcome_mnist_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Download (or load from the local cache) the MNIST dataset from OpenML.
# as_frame=False requests plain NumPy arrays; newer scikit-learn releases
# otherwise return pandas objects, which changes how the cells below display.
mnist = fetch_openml("mnist_784", version=1, as_frame=False)

In [None]:
# Unpack the fetched dataset into the feature matrix X and the label vector y.
X, y = mnist.data, mnist.target

In [None]:
# Sanity check: show the dimensions of the features and of the labels.
X.shape, y.shape

((70000, 784), (70000,))

In [None]:
# Training split: the first 60,000 rows; labels are cast to int32.
X_train, y_train = X[:60_000], y[:60_000].astype(np.int32)
# Test split: every remaining row, with the same label cast.
X_test, y_test = X[60_000:], y[60_000:].astype(np.int32)

In [None]:
# Confirm the sizes of the training and test feature matrices.
X_train.shape, X_test.shape

((60000, 784), (10000, 784))

In [None]:
# Confirm the sizes of the training and test label vectors.
y_train.shape, y_test.shape

((60000,), (10000,))

In [None]:
# Linear classifier trained with stochastic gradient descent.
# A fixed seed makes the shuffling, and therefore the scores, repeatable.
sgd_clf = SGDClassifier(random_state=42)

In [None]:
# Peek at the first ten training labels.
y_train[:10]

array([5, 0, 4, 1, 9, 2, 1, 3, 1, 4], dtype=int32)

In [None]:
# Fit the SGD classifier on the training split.
sgd_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
# Predict labels for the held-out test split.
pred_sgd = sgd_clf.predict(X_test)

In [None]:
# First ten true test labels, for a visual comparison with the predictions.
y_test[:10]

array([7, 2, 1, 0, 4, 1, 4, 9, 5, 9], dtype=int32)

In [None]:
# First ten labels predicted by the SGD classifier.
pred_sgd[:10]

array([7, 2, 1, 0, 4, 1, 4, 9, 6, 7], dtype=int32)

In [None]:
def print_scores(predictions, y_true=None):
    """Print accuracy and balanced accuracy of ``predictions`` as percentages.

    Parameters
    ----------
    predictions : array-like
        Predicted labels, one per sample in ``y_true``.
    y_true : array-like, optional
        Ground-truth labels to score against. When omitted, the notebook-level
        ``y_test`` is used, so existing ``print_scores(pred)`` calls behave
        exactly as before.
    """
    # Compare against None explicitly: array-likes have no unambiguous truth value.
    if y_true is None:
        y_true = y_test
    accuracy = accuracy_score(y_true, predictions)
    balanced = balanced_accuracy_score(y_true, predictions)
    print(f"Accuracy:          {accuracy * 100:.1f}%")
    print(f"Balanced Accuracy: {balanced * 100:.1f}%")

In [None]:
# Score the SGD classifier's predictions on the test split.
print_scores(pred_sgd)

Accuracy:          88.6%
Balanced Accuracy: 88.2%


In [None]:
# Baseline for comparison: predict class 0 for every test sample.
print_scores(np.zeros(len(X_test)))

Accuracy:          9.8%
Balanced Accuracy: 10.0%


In [None]:
# Single decision tree with default hyperparameters.
# A fixed seed makes the fitted tree, and therefore the scores, repeatable.
dt_clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on the training split.
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Predict test-split labels with the decision tree.
pred_dt = dt_clf.predict(X_test)

In [None]:
# Score the decision tree's predictions on the test split.
print_scores(pred_dt)

Accuracy:          87.7%
Balanced Accuracy: 87.6%


In [None]:
# Random forest with default hyperparameters.
# A fixed seed makes the fitted ensemble, and therefore the scores, repeatable.
rf_clf = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training split.
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Predict test-split labels with the random forest.
pred_rf = rf_clf.predict(X_test)

In [None]:
# Score the random forest's predictions on the test split.
print_scores(pred_rf)

Accuracy:          97.0%
Balanced Accuracy: 97.0%


In [None]:
# Gradient-boosted trees with default hyperparameters.
# A fixed seed makes the fitted ensemble, and therefore the scores, repeatable.
gbt_clf = GradientBoostingClassifier(random_state=42)

In [None]:
# Fit the gradient-boosting classifier on the training split.
gbt_clf.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
# Predict test-split labels with the gradient-boosting classifier.
pred_gbt = gbt_clf.predict(X_test)

In [None]:
# Score the gradient-boosting classifier's predictions on the test split.
print_scores(pred_gbt)

Accuracy:          94.6%
Balanced Accuracy: 94.5%
