### Download data

In [1]:
import torch
from torchvision import datasets, transforms
# Convert each image to a tensor, then normalise it with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# MNIST training split (60K images), downloaded on first use.
train_all = datasets.MNIST('../ data', train=True, download=True, transform=transform)

# Seeded generator so the 50K / 10K train / validation split is repeatable.
train_data, val_data = torch.utils.data.random_split(
    train_all,
    [50000, 10000],
    generator=torch.Generator().manual_seed(0),
)

# MNIST test split (10K images).
test_data = datasets.MNIST('../ data', train=False, download=True, transform=transform)

### Load data

In [2]:
from torch.utils.data import DataLoader, random_split
# One loader per split, all with batch size 64; only the training loader shuffles.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=64, shuffle=reshuffle)
    for split, reshuffle in ((train_data, True), (val_data, False), (test_data, False))
)

### Part 1: 10-class softmax classifier

In [3]:
import torch.nn as nn
class SoftmaxClassifier(nn.Module):
    """Single linear layer mapping a flattened 28x28 input to 10 class logits.

    ``forward`` returns the raw logits; no softmax is applied inside the module.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(28 * 28, 10)

    def forward(self, x):
        """Flatten each sample of ``x`` and return the linear layer's 10 outputs."""
        return self.linear(self.flatten(x))


# Untrained instance used throughout Part 1.
model = SoftmaxClassifier()

#### Train classifier via gradient descent

In [4]:
import torch.optim as optim
# Cross-entropy loss on the classifier outputs, minimised with plain SGD (lr 0.01).
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

# One-epoch training routine
def train(dataloader, model, loss_fn, optimizer):
    """Run one pass over ``dataloader``, taking one optimizer step per batch.

    Args:
        dataloader: iterable yielding ``(inputs, labels)`` pairs.
        model: module to optimise; it is switched to training mode first.
        loss_fn: callable invoked as ``loss_fn(model(inputs), labels)``.
        optimizer: optimizer holding the parameters of ``model``.

    Returns:
        None. ``model`` is updated in place.
    """
    model.train()
    for inputs, labels in dataloader:
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        batch_loss = loss_fn(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()


In [5]:
# Ten passes over the training split for the Part 1 classifier.
for epoch in range(10):
    train(dataloader=train_loader, model=model, loss_fn=criterion, optimizer=optimizer)


#### Report accuracy

In [6]:
import torchmetrics
def test(dataloader, model):
    """Print the macro-averaged 10-class accuracy of ``model`` over ``dataloader``.

    The model is put in evaluation mode and gradients are disabled while the
    metric is accumulated batch by batch. Nothing is returned; the result is
    printed as a percentage with the fixed label "Test Accuracy".

    Args:
        dataloader: iterable yielding ``(inputs, labels)`` pairs.
        model: module whose outputs are passed to the metric as predictions.
    """
    model.eval()
    metric = torchmetrics.Accuracy(task='multiclass', num_classes=10, average='macro')
    with torch.no_grad():
        for inputs, labels in dataloader:
            metric.update(model(inputs), labels)
    percent = 100 * metric.compute()
    print(f"Test Accuracy: {percent: }%")

# Score the Part 1 classifier on the test split.
test(test_loader, model)

Test Accuracy:  91.21615600585938%


### Part 2: Hidden units, using ReLU as the activation function

In [7]:
class NeuralNetwork(nn.Module):
    """MLP with one 1024-unit ReLU hidden layer over flattened 28x28 inputs.

    ``forward`` returns 10 raw logits per sample.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        hidden = nn.Linear(28 * 28, 1024)
        head = nn.Linear(1024, 10)
        self.linear_relu_stack = nn.Sequential(hidden, nn.ReLU(), head)

    def forward(self, x):
        """Flatten each sample of ``x`` and pass it through the layer stack."""
        return self.linear_relu_stack(self.flatten(x))


# Untrained instance used for Part 2.
new_model = NeuralNetwork()

#### Train new model

In [8]:
# Rebuild the three loaders, then train the hidden-layer model for ten epochs
# with its own SGD optimizer.
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=64, shuffle=False)
test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)
new_optimizer = optim.SGD(params=new_model.parameters(), lr=0.01)
for epoch in range(10):
    train(dataloader=train_loader, model=new_model, loss_fn=criterion, optimizer=new_optimizer)

#### Report accuracy

In [9]:
# Score the hidden-layer model on the test split.
test(dataloader=test_loader, model=new_model)

Test Accuracy:  94.13797760009766%


Notice that the model with a hidden layer achieves a higher test accuracy (94.14%) than the softmax classifier from Part 1 (91.22%).

In [10]:
def build_model(L, input_dim=28*28, hidden_dim=1024, num_classes=10):
    """Build a fully connected ReLU network with ``L`` hidden layers.

    The network is ``Flatten -> [Linear -> ReLU] * L -> Linear`` and returns
    raw logits (no softmax).

    Args:
        L: number of hidden layers; must be at least 1.
        input_dim: number of features per sample after flattening.
        hidden_dim: width of every hidden layer.
        num_classes: number of output logits.

    Returns:
        An ``nn.Sequential`` containing the layers in order.

    Raises:
        ValueError: if ``L`` is smaller than 1. Without this check such a
            value would silently yield a network with one hidden layer.
    """
    if L < 1:
        raise ValueError(f"L must be at least 1, got {L}")

    # First hidden layer maps the flattened input to the hidden width.
    layers = [nn.Flatten(), nn.Linear(input_dim, hidden_dim), nn.ReLU()]
    # Remaining L - 1 hidden layers keep the hidden width.
    for _ in range(L - 1):
        layers += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU()]
    layers.append(nn.Linear(hidden_dim, num_classes))  # Output layer
    return nn.Sequential(*layers)

In [11]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` for ``num_epochs`` epochs and report the mean loss of each.

    Args:
        model: module to optimise; set to training mode at the start of each epoch.
        train_loader: iterable yielding ``(data, target)`` batches. It does not
            need to support ``len()``; batches are counted while iterating.
        criterion: callable invoked as ``criterion(model(data), target)``.
        optimizer: optimizer holding the parameters of ``model``.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        A list with one entry per epoch: the mean per-batch loss for that
        epoch, or ``nan`` if the loader produced no batches.
    """
    history = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for data, target in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(data), target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Avoid ZeroDivisionError when the loader yields nothing.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        history.append(mean_loss)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss}")
    return history


In [12]:
def evaluate_model(model, test_loader):
    """Return the fraction of samples in ``test_loader`` that ``model`` labels correctly.

    The model is put in evaluation mode and run without gradient tracking. A
    sample counts as correct when the index of its largest output along
    dimension 1 equals its target.

    Args:
        model: module producing one score per class for each sample.
        test_loader: iterable yielding ``(data, target)`` batches.

    Returns:
        ``correct / total`` as a Python float in ``[0, 1]``.
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            guesses = model(inputs).argmax(dim=1)
            hits += (guesses == labels).sum().item()
            seen += labels.size(0)
    return hits / seen

In [15]:
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Train and evaluate a freshly initialised MLP for each number of hidden layers.
num_hidden_layers = [2, 3, 4, 5, 6, 7, 8]
for L in num_hidden_layers:
    print(f"Training model with {L} hidden layers")
    MLP_model = build_model(L)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(MLP_model.parameters(), lr=0.01)
    train_model(MLP_model, train_loader, criterion, optimizer)
    # Score the network trained in this iteration, not `model` (the Part 1
    # softmax classifier), so the reported accuracy belongs to this depth.
    accuracy = evaluate_model(MLP_model, test_loader)
    print(f"Test Accuracy for L={L}: {accuracy * 100:.2f}%\n")

Training model with 2 hidden layers
Epoch 1/10, Loss: 0.894119747154548
Epoch 2/10, Loss: 0.36875004926339133
Epoch 3/10, Loss: 0.31389277820925576
Epoch 4/10, Loss: 0.2830209005195314
Epoch 5/10, Loss: 0.25818145334187065
Epoch 6/10, Loss: 0.23668443232946232
Epoch 7/10, Loss: 0.21641821893946747
Epoch 8/10, Loss: 0.19861201206912926
Epoch 9/10, Loss: 0.18301942675133875
Epoch 10/10, Loss: 0.16896425511526025
Test Accuracy for L=2: 95.06%

Training model with 3 hidden layers
Epoch 1/10, Loss: 1.2372956513748754
Epoch 2/10, Loss: 0.3999201435490947
Epoch 3/10, Loss: 0.3240137318973346
Epoch 4/10, Loss: 0.2839157148204801
Epoch 5/10, Loss: 0.25462189168595445
Epoch 6/10, Loss: 0.22777592285495737
Epoch 7/10, Loss: 0.20523945231686164
Epoch 8/10, Loss: 0.18521637268497815
Epoch 9/10, Loss: 0.16842531019826526
Epoch 10/10, Loss: 0.15353488332122717
Test Accuracy for L=3: 95.47%

Training model with 4 hidden layers
Epoch 1/10, Loss: 1.846705102402231
Epoch 2/10, Loss: 0.540350135120437
Epo

KeyboardInterrupt: 