In [13]:
from torch import torch
from torchvision import datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
from torchvision import transforms
import matplotlib.pyplot as plt


In [None]:
# Hyperparameters shared by the cells below.
epochs = 30  # number of full passes over the training split in fit()
bs = 64      # training mini-batch size (the validation/test loaders use bs*2)
lr = 0.2     # learning rate handed to optim.SGD in get_model()

Load Data

In [None]:
from pathlib import Path

# Directories searched (in order) for an already-downloaded "MNIST" folder.
data_dirs = [Path("./"), Path("../")]

tf = transforms.Compose([
    # 0.1307 is the mean of the MNIST dataset, 0.3081 is the standard deviation
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
    # flatten(0) collapses every dimension, turning a (1, 28, 28) image into a
    # 784-element vector (flatten(1) would keep the leading axis: (1, 784)).
    transforms.Lambda(lambda x: x.flatten(0))
])

# Reuse a local copy when one exists; the `else` branch of the loop only runs
# when no candidate directory matched (no `break`), and downloads into "./".
for data_dir in data_dirs:
    if (data_dir / "MNIST").exists():
        full_train_data = datasets.MNIST(data_dir, train=True, transform=tf)
        test_data = datasets.MNIST(data_dir, train=False, transform=tf)
        break
else:
    full_train_data = datasets.MNIST("./", train=True, download=True, transform=tf)
    test_data = datasets.MNIST("./", train=False, download=True, transform=tf)

# Dedicated, seeded generator so the train/validation split is the same on
# every run, independent of the global RNG state.
g = torch.Generator().manual_seed(42)

# Keep the unsplit dataset under its own name so `train_data` always means
# "the 50k training subset" rather than changing meaning mid-cell.
train_data, val_data = random_split(full_train_data, [50000, 10000], generator=g)

train_loader = DataLoader(train_data, batch_size=bs, shuffle=True)
# Evaluation loaders are not shuffled: sample order has no benefit for metrics
# and a fixed order keeps the batch composition identical between runs.
val_loader = DataLoader(val_data, batch_size=bs*2, shuffle=False)
test_loader = DataLoader(test_data, batch_size=bs*2)

Model


In [None]:
import numpy as np
from torch import Tensor

# Criterion shared by training (fit) and evaluation (MLP.evaluate).
loss_func = F.cross_entropy

def accuracy(input:Tensor, target:Tensor):
    """Return the fraction of rows whose highest-scoring column matches ``target``.

    input:  per-class scores; the predicted class is the arg-max along dim 1.
    target: class indices compared element-wise against the predictions.
    return: scalar float tensor holding the mean of the 0/1 match indicators.
    """
    hits = input.argmax(dim=1).eq(target)
    return hits.float().mean()

In [None]:
class MLP(nn.Module):
    """Multi-layer perceptron with one hidden layer: 784 -> 128 (ReLU) -> 10."""

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(784, 128)
        self.layer2 = nn.Linear(128, 10)

    def forward(self, x):
        """Apply layer1 + ReLU, then layer2, and return layer2's raw output.

        No further activation is applied to the 10 output values.
        """
        x = F.relu(self.layer1(x))
        x = self.layer2(x)
        return x

    def evaluate(self, loader:DataLoader):
        """
        Evaluate the model on the given data
        return: (loss, accuracy), each averaged over all *samples* in ``loader``

        The per-batch values from ``loss_func`` and ``accuracy`` are batch means,
        so they are weighted by the batch size before being summed. Averaging
        the batch means directly would over-weight a smaller final batch.

        The model is switched to eval mode and left in it; gradients are not
        tracked. An empty loader raises ZeroDivisionError.
        """
        self.eval()
        total_loss = 0
        total_acc = 0
        n_samples = 0
        with torch.no_grad():
            for x_batch, y_batch in loader:
                batch_size = len(y_batch)
                # Call the module (not .forward) so nn.Module hooks still run.
                preds = self(x_batch)
                total_acc += accuracy(preds, y_batch) * batch_size
                total_loss += loss_func(preds, y_batch) * batch_size
                n_samples += batch_size
        return total_loss / n_samples, total_acc / n_samples


In [121]:
def get_model():
    """Create a new MLP and an SGD optimizer over its parameters.

    The optimizer uses the notebook-level learning rate ``lr``.
    return: (model, optimizer)
    """
    net = MLP()
    sgd = optim.SGD(net.parameters(), lr=lr)
    return net, sgd


Train

In [None]:
def fit(model, optimizer, train_loader, val_loader, epochs):
    """Train ``model`` and print validation loss/accuracy after every epoch.

    model:        module to optimise; must also provide ``evaluate(loader)``
                  returning ``(loss, accuracy)``.
    optimizer:    optimizer already bound to ``model``'s parameters.
    train_loader: iterable of ``(x_batch, y_batch)`` used for the updates.
    val_loader:   loader passed to ``model.evaluate`` at the end of each epoch.
    epochs:       number of passes over ``train_loader``.
    """
    for epoch in range(epochs):
        # MLP.evaluate() leaves the model in eval mode, so training mode has
        # to be re-enabled at the start of every epoch.
        model.train()
        for x_batch, y_batch in train_loader:
            # Clear gradients *before* the backward pass so anything left on
            # the parameters (e.g. from an interrupted earlier run of this
            # cell) cannot leak into this update.
            optimizer.zero_grad()
            # Call the module (not .forward) so nn.Module hooks still run.
            preds = model(x_batch)
            loss = loss_func(preds, y_batch)
            loss.backward()
            optimizer.step()

        val_loss, val_acc = model.evaluate(val_loader)
        print(f"epoch {epoch+1} loss: {val_loss:.2f}, accuracy: {val_acc:.2f}")

# Build a fresh network/optimizer pair and run the full training schedule.
model, optimizer = get_model()
fit(
    model=model,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=epochs,
)

Test

In [123]:
# Final check on the test split, which is used neither for the parameter
# updates nor for the per-epoch validation report in fit().
loss, acc = model.evaluate(loader=test_loader)
print(f"loss: {loss:.2f}, accuracy: {acc:.4f}")

loss: 0.07, accuracy: 0.9774


|No.|Hidden Layers|Activation Function|Batch Size|Optimizer|Learning Rate|Accuracy|
|-|-|-|-|-|-|-|
| 0 (Logistic) | [] | None | 64 | SGD | 0.2 | 89% |
| 1 | [128] | ReLU | 64 | SGD | 0.2 | 98.19% |
| 2 | [128] | ReLU | 64 | Adam | 0.001 | 97.93% |
| 3 | [128] | Sigmoid | 64 | SGD | 0.2 | 97.74% |