# MNIST from Scratch

Can I train a model to recognize handwritten digits using only NumPy?

1. Load the MNIST dataset from the web and store as NumPy arrays
2. Train a simple model to solve MNIST using PyTorch
3. Do the same with NumPy by implementing various ML algorithms

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch import nn, Tensor, optim
from tqdm import trange

In [2]:
# Seed every RNG the notebook draws from so Restart & Run All gives the same numbers:
# PyTorch initialises the nn.Linear weights, NumPy drives mini-batch sampling and
# the from-scratch weight initialisation.
SEED = 1337
torch.manual_seed(SEED)
np.random.seed(SEED)

Load data from CSV files

In [3]:
def load_dataset(data_dir: Path, file_name: str) -> tuple[np.ndarray, np.ndarray]:
    """Read one CSV file and split it into scaled features and labels.

    Args:
        data_dir: Directory that contains the CSV file.
        file_name: Name of the CSV file. It must have a "label" column; every
            other column is treated as a feature value.

    Returns:
        (x, y): x holds all non-label columns divided by 255.0, y holds the
        "label" column.
    """
    # can be done with numpy.genfromtxt(), but is much slower
    dataset = pd.read_csv(data_dir / file_name)
    x = dataset.drop(columns="label").to_numpy() / 255.0
    y = dataset["label"].to_numpy()
    return x, y

In [4]:
# The CSV files are expected in ../data/mnist relative to the working directory;
# strict=True makes resolve() raise if the working directory cannot be resolved.
mnist_path = Path(".").resolve(strict=True).parent.joinpath("data", "mnist")
(x_train, y_train), (x_test, y_test) = (
    load_dataset(mnist_path, csv_name)
    for csv_name in ("mnist_train.csv", "mnist_test.csv")
)

Solve with PyTorch

In [5]:
class NeuralNet(nn.Module):
    """Bias-free fully connected network: 784 -> 256 -> 64 -> 10 with ReLU in between."""

    def __init__(self):
        super().__init__()
        # Layers are created input-to-output; none of them has a bias term.
        self.layer1 = nn.Linear(in_features=784, out_features=256, bias=False)
        self.layer2 = nn.Linear(in_features=256, out_features=64, bias=False)
        self.layer3 = nn.Linear(in_features=64, out_features=10, bias=False)
        self.act = nn.ReLU()

    def forward(self, x: Tensor) -> Tensor:
        """Flatten the input into rows of 784 values and return 10 raw scores per row."""
        flat = x.view(-1, 784)
        hidden1 = self.act(self.layer1(flat))
        hidden2 = self.act(self.layer2(hidden1))
        # No activation on the output layer: the raw scores are returned as-is.
        return self.layer3(hidden2)

In [6]:
# Hyperparameters for the PyTorch run (batch_size is reused by the NumPy training loop).
iterations, batch_size = 500, 64
learning_rate = 1e-3

# Model, objective and optimizer.
model = NeuralNet()
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [7]:
# training loop: one randomly drawn mini-batch (indices may repeat) per iteration
t = trange(iterations)
for i in t:
    batch_idx = np.random.randint(0, len(x_train), size=batch_size)
    inputs = Tensor(x_train[batch_idx])
    targets = Tensor(y_train[batch_idx]).long()
    loss = loss_func(model(inputs), targets)
    loss.backward()
    optimizer.step()
    # Clear the gradients so they do not accumulate into the next iteration.
    model.zero_grad()
    t.set_description(f"Iteration {i} loss {loss.item():.2f}")

Iteration 499 loss 0.16: 100%|██████████| 500/500 [00:02<00:00, 189.71it/s]


In [8]:
# test accuracy: share of test rows whose highest-scoring class equals the label
test_scores = model(Tensor(x_test))
pred = test_scores.argmax(dim=1)
accuracy = pred.eq(Tensor(y_test)).float().mean().item()
accuracy

0.9373999834060669

Sanity check with NumPy

In [9]:
# Copy the trained PyTorch weights out as float64 NumPy arrays, in layer order.
w1, w2, w3 = (
    layer.weight.detach().numpy().astype(np.float64)
    for layer in (model.layer1, model.layer2, model.layer3)
)

In [10]:
def forward(x: np.ndarray) -> np.ndarray:
    """Run the bias-free three-layer network on x with NumPy.

    The weights are the module-level w1, w2 and w3, looked up at call time, so
    the result always reflects whatever those names currently hold.
    """
    hidden = x
    # Two hidden layers: matrix product followed by ReLU.
    for weight in (w1, w2):
        hidden = np.maximum(hidden @ weight.T, 0)
    # The output layer has no activation.
    return hidden @ w3.T

In [11]:
# Accuracy of the NumPy forward pass using the weights copied from the PyTorch model.
pred = np.argmax(forward(x_test), axis=1)
accuracy = np.mean(pred == y_test)
accuracy

0.9374

Now solve with NumPy

In [12]:
def layer_init(m, h):
    """Return an (m, h) array drawn uniformly from [-1, 1) and divided by sqrt(m * h).

    Values come from NumPy's global random state, so they depend on np.random.seed.
    """
    shape = (m, h)
    draws = np.random.uniform(-1.0, 1.0, shape)
    return draws / np.sqrt(m * h)

In [13]:
# Fresh random weights, one (outputs, inputs) matrix per layer, drawn in layer order.
w1, w2, w3 = [
    layer_init(n_out, n_in)
    for n_out, n_in in ((256, 784), (64, 256), (10, 64))
]

In [14]:
def saved_forward(x0: np.ndarray) -> tuple[np.ndarray, ...]:
    """Forward pass that keeps every intermediate value for backpropagation.

    Reads the module-level weights w1, w2 and w3 at call time.

    Args:
        x0: Input batch, one sample per row. The shape comments below assume a
            batch of 64 rows with 784 features each.

    Returns:
        (x5, x4, x3, x2, x1, x0): the network output first, followed by each
        earlier value back to the unchanged input.
    """
    x1 = x0 @ w1.T  # 64 x 256, first layer before activation
    x2 = np.maximum(x1, 0)  # 64 x 256, ReLU
    x3 = x2 @ w2.T  # 64 x 64, second layer before activation
    x4 = np.maximum(x3, 0)  # 64 x 64, ReLU
    x5 = x4 @ w3.T  # 64 x 10, output scores (no activation)
    return x5, x4, x3, x2, x1, x0

In [15]:
def mse(pred: np.ndarray, labels: np.ndarray) -> tuple[float, np.ndarray]:
    """Total squared error between predictions and one-hot encoded labels.

    Despite the name, the loss is the sum of squared errors, not the mean.

    Args:
        pred: Predicted scores, shape (n_samples, n_classes).
        labels: Integer class indices, one per sample; used to index columns
            of the one-hot matrix.

    Returns:
        (loss, error): loss is the summed squared error as a NumPy scalar,
        error is pred minus the one-hot targets (same shape as pred).
    """
    # The number of classes follows the prediction width instead of a fixed 10.
    n_classes = pred.shape[1]
    # actual is the one-hot encoding of labels
    actual = np.zeros((labels.size, n_classes))
    actual[np.arange(labels.size), labels] = 1
    # do total squared error for now
    error = pred - actual
    loss = np.sum(error ** 2)
    return loss, error

In [16]:
def backward(error: np.ndarray, xs: tuple[np.ndarray, ...]) -> tuple[np.ndarray, ...]:
    """Backpropagate the summed-squared-error loss through the three-layer network.

    Reads the module-level weights w1, w2 and w3 at call time.

    Args:
        error: Prediction minus target, same shape as the network output.
        xs: The six arrays (x5, x4, x3, x2, x1, x0) returned by saved_forward.
            Only x4, x2 and x0 are needed here.

    Returns:
        (dw3, dw2, dw1): gradients of the loss with respect to w3, w2 and w1,
        each with the same shape as its weight matrix.

    Raises:
        AssertionError: If a gradient's shape differs from its weight's shape.
    """
    _, x4, _, x2, _, x0 = xs  # 64 x 64, 64 x 256, 64 x 784
    dx5 = error * 2  # 64 x 10, derivative of sum(error ** 2)
    dw3 = (x4.T @ dx5).T  # 10 x 64
    dx4 = dx5 @ w3  # 64 x 64
    # ReLU passes gradient only where its output was positive; x4 > 0 exactly
    # where the pre-activation x3 > 0.
    dx3 = (x4 > 0).astype(np.float64) * dx4  # 64 x 64
    dw2 = (x2.T @ dx3).T  # 64 x 256
    dx2 = dx3 @ w2  # 64 x 256
    dx1 = (x2 > 0).astype(np.float64) * dx2  # 64 x 256
    dw1 = (x0.T @ dx1).T  # 256 x 784
    assert dw3.shape == w3.shape
    assert dw2.shape == w2.shape
    assert dw1.shape == w1.shape
    return dw3, dw2, dw1

In [17]:
def update_weights(ws: tuple[np.ndarray, ...], dws: tuple[np.ndarray, ...], lr: float = 0.001) -> tuple[np.ndarray, ...]:
    """Apply one gradient-descent step to each weight array in place.

    Args:
        ws: Weight arrays to update; each array is modified in place.
        dws: Gradients, in the same order as ws.
        lr: Step size multiplied with each gradient.

    Returns:
        The ws object that was passed in (its arrays now hold the new values).

    Raises:
        ValueError: If ws and dws contain a different number of arrays. The
            check runs before any array is modified.
    """
    weights, grads = tuple(ws), tuple(dws)
    # zip() would silently ignore unmatched trailing arrays, so check up front.
    if len(weights) != len(grads):
        raise ValueError(
            f"got {len(weights)} weight arrays but {len(grads)} gradient arrays"
        )
    for wi, dwi in zip(weights, grads):
        wi -= lr * dwi
    return ws

In [18]:
# NumPy training loop: sample a mini-batch, run forward and backward, update weights in place
t = trange(2000)
for i in t:
    batch_idx = np.random.randint(0, len(x_train), size=batch_size)
    xs = saved_forward(x_train[batch_idx])
    labels = y_train[batch_idx].astype(int)
    loss, error = mse(xs[0], labels)
    dws = backward(error, xs)
    # Weights are passed in the same order as the gradients returned by backward.
    update_weights((w3, w2, w1), dws)
    t.set_description(f"Iteration {i} loss {loss.item():.2f}")

Iteration 1999 loss 13.45: 100%|██████████| 2000/2000 [00:10<00:00, 184.46it/s]


In [19]:
# forward() reads the module-level w1, w2 and w3, which now hold the NumPy-trained weights.
pred = np.argmax(forward(x_test), axis=1)
accuracy = np.mean(pred == y_test)
accuracy

0.942