In [None]:
import matplotlib.pyplot as plt
from nbex.interactive import session
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
)
from sklearn.model_selection import train_test_split

In [None]:
# Reuse a truthy `mnist` object already bound in the notebook namespace (e.g.
# from an earlier run of this cell); otherwise fetch "mnist_784" from OpenML.
mnist = globals().get("mnist") or fetch_openml("mnist_784", version=1)

In [None]:
# Peek at the shape of the raw feature table once converted to a NumPy array.
mnist.data.to_numpy().shape

In [None]:
# View every row of the feature table as a 28x28 image and cast both pixels
# and labels to int32.
# NOTE(review): int32 is presumably chosen so that later pixel differences can
# go negative without wrap-around — confirm.
x = mnist.data.to_numpy().reshape(-1, 28, 28).astype(np.int32)
y = mnist.target.to_numpy().astype(np.int32)

In [None]:
def show_labeled_image(index):
    """Display image ``x[index]`` together with its label ``y[index]``.

    The label is written into the plot title and also returned, so it shows
    up whether the function is the last expression of a cell or is called
    from inside a block (where a bare expression would be silently dropped).

    Args:
        index: Position of the sample in the notebook-level ``x`` / ``y``.

    Returns:
        The label stored at ``y[index]``.
    """
    label = y[index]
    plt.imshow(x[index], cmap="binary")
    plt.title(f"Label: {label}")
    plt.show()
    return label

In [None]:
# Only render the sample image when nbex reports an interactive session
# (presumably false for batch/scripted runs — confirm against nbex).
if session.is_interactive:
    show_labeled_image(5)

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Check the element dtype of the training images.
x_train.dtype

In [None]:
# Compare the shapes of the training and test image arrays.
x_train.shape, x_test.shape

In [None]:
# Compare the shapes of the training and test label arrays.
y_train.shape, y_test.shape

In [None]:
# Pixel-wise mean over all training images (axis 0 is the sample axis).
x_mean = np.mean(x_train, axis=0)

In [None]:
# Shape of the mean image: the sample axis has been reduced away.
x_mean.shape

In [None]:
# Plot the mean training image, but only in an interactive session.
if session.is_interactive:
    plt.imshow(x_mean, cmap="binary")

In [None]:
# One 28x28 template slot per digit class 0-9; the slots are filled with
# per-class mean images further down.
# NOTE(review): because of the int32 dtype, float means assigned into this
# array are cast to integers — confirm the precision loss is intended.
ideal_digits = np.zeros((10, 28, 28), dtype=np.int32)

In [None]:
# For each digit, average all training images carrying that label and store
# the result as that digit's template.
for i in range(10):
    x_i = x_train[y_train == i]  # boolean mask keeps only the images labeled i
    ideal_digits[i] = x_i.mean(axis=0)

In [None]:
# Show the template for digit 0.
# NOTE(review): unlike the other plotting cells, this one is not guarded by
# `session.is_interactive` — confirm whether that is intentional.
plt.imshow(ideal_digits[0], cmap="binary")

In [None]:
# Show the ten per-digit templates side by side on a 2x5 grid of axes.
if session.is_interactive:
    fig, ax = plt.subplots(2, 5, figsize=(15, 5))
    for i in range(10):
        # ax.flat walks the 2x5 axes array in row-major order.
        ax.flat[i].imshow(ideal_digits[i], cmap="binary")

In [None]:
# Plot the first test image, but only in an interactive session.
if session.is_interactive:
    plt.imshow(x_test[0], cmap="binary")

In [None]:
# Limit floating-point values in printed arrays to two decimal places.
np.set_printoptions(precision=2)

In [None]:
# Print a 9x9 window (rows and columns 3-11) of the first test image.
x_test[0, 3:12, 3:12]

In [None]:
# Pixel-wise difference between the digit-8 template and the first test
# image, previewed on the same 9x9 window.
diff = ideal_digits[8] - x_test[0]
diff[3:12, 3:12]

In [None]:
# Visualize the difference image.
# NOTE(review): this plot is not guarded by `session.is_interactive`, unlike
# most other plotting cells — confirm whether that is intentional.
plt.imshow(diff)

In [None]:
# Element-wise squared differences on the same 9x9 window.
(diff * diff)[3:12, 3:12]

In [None]:
# Sum of squared differences: one number measuring how far the first test
# image is from the digit-8 template.
(diff * diff).sum()

In [None]:
# Broadcasting subtracts the single test image from all ten templates at once.
diffs = ideal_digits - x_test[0]
diffs.shape

In [None]:
# Sum of squared differences per template (both pixel axes are reduced); the
# index of the smallest error is the predicted digit.
errors = (diffs * diffs).sum(axis=(1, 2))
print(errors.shape)
errors.argmin()

In [None]:
def compute_single_numpy_prediction(img):
    """Classify one image by its nearest per-digit template.

    Args:
        img: A single image that broadcasts against the notebook-level
            ``ideal_digits`` stack (one template per digit).

    Returns:
        Index of the template with the smallest sum of squared pixel
        differences to ``img``.
    """
    residuals = ideal_digits - img
    squared_errors = np.square(residuals).sum(axis=(1, 2))
    return squared_errors.argmin()

In [None]:
# Try the single-image classifier on the first test image.
compute_single_numpy_prediction(x_test[0])

In [None]:
# Add a leading axis of length 1 to the template stack and inspect the
# resulting shape.
batched_ideal_digits = np.expand_dims(ideal_digits, axis=0)
batched_ideal_digits.shape

In [None]:
# Insert a new axis at position 1 of the test images; shape check only, the
# expanded array is not stored.
np.expand_dims(x_test, axis=1).shape

In [None]:
def compute_numpy_predictions(imgs, batch_size=1024):
    """Classify a batch of images by their nearest per-digit template.

    For every image the sum of squared pixel differences to each template in
    the notebook-level ``ideal_digits`` stack is computed, and the index of
    the closest template is returned.

    The work is done in slices of ``batch_size`` images so that the
    intermediate difference tensor (one entry per image, template and pixel)
    is only ever materialised for one slice instead of for the whole input.
    The predictions are identical to an unbatched computation.

    Args:
        imgs: Array of images, first axis indexing the samples
            (assumed ``(N, 28, 28)`` to match ``ideal_digits`` — TODO confirm
            for other callers).
        batch_size: Number of images processed per slice; must be >= 1.

    Returns:
        1-D integer array with one predicted template index per image.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    imgs = np.asarray(imgs)
    n_images = len(imgs)
    # argmin yields platform integers (intp); allocate the output accordingly.
    predictions = np.empty(n_images, dtype=np.intp)
    for start in range(0, n_images, batch_size):
        stop = start + batch_size
        # Extra axis lets each image broadcast against all templates.
        diffs = ideal_digits - np.expand_dims(imgs[start:stop], axis=1)
        predictions[start:stop] = (diffs * diffs).sum(axis=(2, 3)).argmin(axis=1)
    return predictions

In [None]:
# Classify the whole test set with the nearest-template rule.
pred_numpy = compute_numpy_predictions(x_test)

In [None]:
# Shape of the prediction array.
pred_numpy.shape

In [None]:
def print_scores(predictions, y_true=None):
    """Print overall, balanced and per-class accuracy in percent.

    Args:
        predictions: Predicted labels, one per sample, aligned with
            ``y_true``.
        y_true: Ground-truth labels. Defaults to the notebook-level
            ``y_test`` so existing ``print_scores(pred)`` calls keep working.
    """
    if y_true is None:
        y_true = y_test
    # Boolean-mask indexing below needs ndarrays, not plain sequences.
    y_true = np.asarray(y_true)
    predictions = np.asarray(predictions)

    accuracy = accuracy_score(y_true, predictions) * 100
    balanced_accuracy = balanced_accuracy_score(y_true, predictions) * 100
    print(f"Accuracy:          {accuracy:.1f}%")
    print(f"Balanced Accuracy: {balanced_accuracy:.1f}%")
    print()
    # Report the classes that actually occur in the labels instead of
    # assuming exactly the digits 0-9.
    for i in np.unique(y_true):
        idx = y_true == i
        accuracy_i = accuracy_score(y_true[idx], predictions[idx]) * 100
        print(f"Accuracy for {i}:    {accuracy_i:.1f}%")
    print()

In [None]:
# Report overall and per-digit accuracy of the nearest-template classifier.
print_scores(pred_numpy)

In [None]:
# Plot the mean training image again (interactive sessions only).
# NOTE(review): this repeats an earlier cell verbatim — confirm it is still
# wanted at this point in the notebook.
if session.is_interactive:
    plt.imshow(x_mean, cmap="binary")

In [None]:
# Seed the forest so that its randomized training gives the same model on
# every run, matching the seeded train/test split earlier in the notebook.
rf_clf = RandomForestClassifier(random_state=42)

In [None]:
# Re-check the training image shape before flattening for the random forest.
x_train.shape

In [None]:
# Flatten every 28x28 training image into one feature row per sample before
# fitting the forest.
rf_clf.fit(x_train.reshape(len(x_train), -1), y_train)

In [None]:
# Flatten the test images the same way and predict their labels.
pred_rf = rf_clf.predict(x_test.reshape(len(x_test), -1))

In [None]:
# Report the same metrics for the random forest predictions.
print_scores(pred_rf)