In [None]:
%pip install pyarrow

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import seaborn as sns
from PIL import Image
from io import BytesIO

In [None]:
class MnistDataset(Dataset):
    """Map-style dataset backed by a parquet file of encoded images.

    Rows are accessed positionally: element 0 is a mapping whose ``"bytes"``
    entry holds an encoded image, element 1 is used as an index into a
    length-10 one-hot label vector (presumably a digit class in 0..9 --
    TODO confirm against the parquet schema).
    """

    def __init__(self, file_path):
        """Read the parquet file at ``file_path`` fully into a NumPy array of rows."""
        frame = pd.read_parquet(file_path)
        self.data = frame.to_numpy()

    def __len__(self):
        """Return the number of rows that were loaded."""
        return len(self.data)

    def __getitem__(self, index):
        """Decode row ``index`` and return ``(pixels, one_hot_label)``.

        ``pixels`` is a flat float32 NumPy array of 8-bit grayscale values
        divided by 255; ``one_hot_label`` is a length-10 float tensor with a
        single 1 at the row's label position.
        """
        row = self.data[index]

        # Decode inside the context manager; np.array copies the pixel data,
        # so the remaining arithmetic can happen after the image is closed.
        with Image.open(BytesIO(row[0]["bytes"])) as picture:
            grayscale = np.array(picture.convert("L"))

        pixels = (grayscale.flatten() / 255.0).astype("float32")

        one_hot = torch.zeros(10)
        one_hot[row[1]] = 1

        return pixels, one_hot


# Number of samples per training mini-batch; tune it here, the loader below reads it.
BATCH_SIZE = 32
# Batches are served in file order (no shuffling is requested).
# NOTE(review): consider shuffle=True for SGD training -- confirm whether the
# parquet rows are already in random order.
dataloader = DataLoader(dataset=MnistDataset("train.parquet"), batch_size=BATCH_SIZE)

In [None]:
class MultiLayerNN(nn.Module):
    """Fully connected feed-forward network with a configurable number of hidden layers.

    With ``hidden_layer_count == 0`` the model is a single linear map from
    ``input_size`` to ``output_size``. Otherwise ``input_layer`` maps the input
    to ``hidden_size`` and ``layers`` holds ``hidden_layer_count`` further
    linear layers, the last of which projects to ``output_size``. ReLU is
    applied after every layer except the final one; the returned values are
    the raw outputs of the final linear layer.

    Args:
        input_size: Number of features per input sample.
        hidden_layer_count: Number of hidden layers; must be >= 0.
        hidden_size: Width of each hidden layer (unused when there are none).
        output_size: Number of output units.

    Raises:
        ValueError: If ``hidden_layer_count`` is negative.
    """

    def __init__(self, input_size, hidden_layer_count, hidden_size, output_size):
        super().__init__()

        if hidden_layer_count < 0:
            raise ValueError(
                f"hidden_layer_count must be >= 0, got {hidden_layer_count}"
            )

        # nn.ModuleList (not a plain Python list) registers the layers as
        # submodules, so model.parameters(), state_dict(), .to(device) etc.
        # include them. With a plain list the optimizer never sees these weights.
        self.layers = nn.ModuleList()

        if hidden_layer_count == 0:
            self.input_layer = nn.Linear(input_size, output_size)
        else:
            self.input_layer = nn.Linear(input_size, hidden_size)
            for i in range(hidden_layer_count):
                is_output_layer = i == hidden_layer_count - 1
                self.layers.append(
                    nn.Linear(
                        hidden_size,
                        output_size if is_output_layer else hidden_size,
                    )
                )

    def forward(self, x):
        """Run ``x`` through the network and return the final layer's raw output."""
        interm = self.input_layer(x)

        for layer in self.layers:
            # Activate the previous layer's output before the next linear map.
            # This puts a ReLU after input_layer and after every layer except
            # the last; without it consecutive linear layers collapse into one.
            interm = layer(nn.functional.relu(interm))

        return interm

In [None]:
# Network geometry: 28 * 28 input features and 10 output units.
INPUT_DIMENSION = 28 * 28
OUTPUT_DIMENSION = 10

# Depth and width of the hidden part of the network.
HIDDEN_LAYER_COUNT = 2
HIDDEN_LAYER_SIZE = 30

model = MultiLayerNN(
    input_size=INPUT_DIMENSION,
    hidden_layer_count=HIDDEN_LAYER_COUNT,
    hidden_size=HIDDEN_LAYER_SIZE,
    output_size=OUTPUT_DIMENSION,
)
# Put the model in training mode; as the cell's last expression this also
# displays the module summary.
model.train()

In [None]:
# Mean squared error between the network output and the target tensor,
# averaged over all elements (the default reduction, spelled out).
loss_fun = nn.MSELoss(reduction="mean")

In [None]:
# Step size used by the SGD optimizer below.
LEARNING_RATE = 0.2

# Plain stochastic gradient descent over whatever model.parameters() yields.
optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
NUM_EPOCHS = 20

# One entry per epoch: the sample-weighted mean training loss of that epoch.
losses = []

# Ensure training mode even when this cell is re-run after the evaluation cell.
model.train()

for epoch in range(NUM_EPOCHS):
    epoch_loss_sum = 0.0
    epoch_sample_count = 0

    for inputs, targets in dataloader:
        optimizer.zero_grad()

        pred = model(inputs)

        # compute loss
        loss = loss_fun(pred, targets)

        # backprop
        loss.backward()

        # update weights
        optimizer.step()

        # The loss is a per-batch mean, so weight it by the batch size; the
        # final batch of an epoch may be smaller than the others.
        n_samples = targets.size(0)
        epoch_loss_sum += loss.item() * n_samples
        epoch_sample_count += n_samples

    if epoch_sample_count == 0:
        raise ValueError("dataloader yielded no batches; nothing to train on.")

    # Record the epoch average rather than only the last batch's loss, which
    # is noisy and not representative of the whole epoch.
    losses.append(epoch_loss_sum / epoch_sample_count)

In [None]:
# Plot loss over time (one point per recorded entry in `losses`).
ax = sns.lineplot(x=range(len(losses)), y=losses)
# Label the figure so it can be read on its own; the trailing semicolon
# suppresses the repr of the returned Text objects.
ax.set(xlabel="Epoch", ylabel="Training loss", title="Training loss per epoch");

In [None]:
# Batch size for inference only; it does not affect the result, just throughput.
EVAL_BATCH_SIZE = 256

eval_set = MnistDataset("test.parquet")
eval_loader = DataLoader(dataset=eval_set, batch_size=EVAL_BATCH_SIZE)

model.eval()

correct = 0
total = 0

# No gradients are needed for evaluation; disabling autograd avoids building
# a computation graph for every forward pass.
with torch.no_grad():
    for inputs, targets in eval_loader:
        # Predicted class = index of the largest output; true class = index
        # of the 1 in the one-hot target.
        predicted = model(inputs).argmax(dim=1)
        expected = targets.argmax(dim=1)
        correct += (predicted == expected).sum().item()
        total += targets.size(0)

if total == 0:
    raise ValueError("Evaluation set is empty; cannot compute an error rate.")

error_rate = 1 - correct / total
print(f"Error rate in percent: {error_rate * 100}%")