In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
from tqdm import tqdm
from pathlib import Path

In [35]:
# Making the runs deterministic
torch.manual_seed(42)
np.random.seed(42)

# Defining the Dataset

In [36]:
class MNISTDataset(Dataset):
    """MNIST Dataset"""
    def __init__(self, input_dir: Path):
        super().__init__()
        self.input_dir = input_dir
        self.num_samples = list(filter(lambda x: "image" in x.name, self.input_dir.iterdir()))
        self.sample_ids = list(map(lambda x:int(x.name.split('.')[0]),self.num_samples))
    
    def __len__(self):
        """Returns the size of the dataset"""
        return len(self.num_samples)

    def __getitem__(self, idx: int):
        """Returns a data point (image and label) given an index"""
        idx = self.sample_ids[idx]
        image_txt = self.input_dir.joinpath("{}.image".format(idx)).read_text()
        image = np.fromstring(image_txt, sep=" ").reshape((28, 28))
        image = torch.from_numpy(image).float() # network inputs need to be float

        label = int(self.input_dir.joinpath("{}.label".format(idx)).read_text())
        label = torch.tensor(label).long() # label is not a continuous value but class indices

        return image, label

# Defining the Model structure

In [None]:
class MNISTModel(nn.Module):
    """Simple Feedforward Neural Network for MNIST"""
    def __init__(self):
        super().__init__()
        
        self.image_size = 28 * 28
        self.hidden_1_size = 2048
        self.hidden_2_size = 256
        self.num_classes = 10

        # nn.Linear is a feedforward layer, i.e. that it captures weights and bias values
        self.fc1 = nn.Linear(self.image_size, self.hidden_1_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(self.hidden_1_size, self.hidden_2_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(self.hidden_2_size, self.num_classes)
        
        # weight initialisation
        torch.nn.init.xavier_uniform_(self.fc1.weight)
        torch.nn.init.xavier_uniform_(self.fc2.weight)
        torch.nn.init.xavier_uniform_(self.fc3.weight)

    def forward(self, x):
        # To use a fully connected network, we need a single vector, not a matrix
        x = x.reshape((-1, self.image_size))
        
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.fc3(x) # => logits
        
        # softmax is not used here as the predefined loss function automatically assigns it

        return x

In [None]:
class MNISTModelDropout(nn.Module):
    """Simple Feedforward Neural Network for MNIST"""
    def __init__(self):
        super().__init__()
        
        self.image_size = 28 * 28
        self.hidden_1_size = 2048
        self.hidden_2_size = 256
        self.num_classes = 10

        # nn.Linear is a feedforward layer, i.e. that it captures weights and bias values
        self.fc1 = nn.Linear(self.image_size, self.hidden_1_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(self.hidden_1_size, self.hidden_2_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(self.hidden_2_size, self.num_classes)
        
        # Dropout with 5% chance of dropping a neuron
        self.dropout = nn.Dropout(p=0.05)
        
        # weight initialisation
        torch.nn.init.xavier_uniform_(self.fc1.weight)
        torch.nn.init.xavier_uniform_(self.fc2.weight)
        torch.nn.init.xavier_uniform_(self.fc3.weight)

    def forward(self, x):
        # To use a fully connected network, we need a single vector, not a matrix
        x = x.reshape((-1, self.image_size))
        
        x = self.dropout(self.fc1(x))
        x = self.relu1(x)
        x = self.dropout(self.fc2(x))
        x = self.relu2(x)
        x = self.fc3(x) # => logits
        
        # softmax is not used here as the predefined loss function automatically assigns it

        return x

In [37]:
class MNISTModelLowLevel(nn.Module):
    """Simple Feedforward Neural Network for MNIST"""
    def __init__(self):
        super().__init__()
        
        self.image_size = 28 * 28
        self.hidden_1_size = 2048
        self.hidden_2_size = 256
        self.num_classes = 10

        self.W1 = nn.Parameter(torch.randn(self.image_size, self.hidden_1_size, requires_grad=True))
        self.b1 = nn.Parameter(torch.randn(1, self.hidden_1_size, requires_grad=True))
        self.relu1 = nn.ReLU()
        self.W2 = nn.Parameter(torch.randn(self.hidden_1_size, self.hidden_2_size, requires_grad=True))
        self.b2 = nn.Parameter(torch.randn(1, self.hidden_2_size, requires_grad=True))
        self.relu2 = nn.ReLU()
        self.W3 = nn.Parameter(torch.randn(self.hidden_2_size, self.num_classes, requires_grad=True))
        self.b3 = nn.Parameter(torch.randn(1, self.num_classes, requires_grad=True))
        
    def forward(self, x):
        # To use a fully connected network, we need a single vector, not a matrix
        x = x.reshape((-1, self.image_size))

        # first hidden layer
        x = x @ self.W1 + self.b1
        x = self.relu1(x)
        # second hidden layer
        x = x @ self.W2 + self.b2
        x = self.relu2(x)
        # output layer
        x = x @ self.W3 + self.b3 # => logits
                
        # softmax is not used here as the predefined loss function automatically assigns it

        return x

# Loading the train and test sets

In [None]:
path = Path("./data")

train_path = path.joinpath("train/train")
train_dataset = MNISTDataset(train_path)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=512, shuffle=True)

test_path = path.joinpath("test/test")
test_dataset = MNISTDataset(test_path)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512, shuffle=False)

# Initialising the model

In [38]:
#model = MNISTModel()
#model = MNISTModelDropout()
model = MNISTModelLowLevel()

# Defining the loss function and optimisation algorithm

In [39]:
# Loss function
criterion = nn.CrossEntropyLoss()
# Optimiser
optimiser = optim.Adam(model.parameters(), lr=0.001)
# optimiser = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.05)

# Metrics to check the performance

We want to check how the model performs on the train and test datasets while and after training.
Therefore we build a little helper that calculates the accuracy of the network's predictions.

We need to handle the batches that are used while training.

In [40]:
class Accuracy:
    """A class to keep track of the accuracy while training"""
    def __init__(self):
        self.correct = 0
        self.total = 0
        
    def reset(self):
        """Resets the internal state"""
        self.correct = 0
        self.total = 0
        
    def update(self, output, labels):
        """
        Updates the internal state to later compute the overall accuracy
        
        output: the output of the network for a batch
        labels: the target labels
        """
        _, predicted = torch.max(output.data, 1) # predicted now contains the predicted class index/label
        
        self.total += labels.size(0)
        self.correct += (predicted == labels).sum().item() # .item() gets the number, not the tensor

    def compute(self):
        return self.correct/self.total
    

accuracy = Accuracy()

# Training loop

We loop over the training dataset multiple times (every full iteration is called an *epoch*).
For every batch in the dataset, we calculate the loss of the network output, calculate the gradients by using Autograd's automatic gradient calculation, and update the network parameters using the Adam optimiser we initialised before.

In [41]:
model.train()

for epoch in range(10):  # loop over the dataset multiple times
    print("Starting epoch {}".format(epoch+1))
    
    total = 0
    running_loss = 0.0

    # to make a beautiful progress bar
    loader = tqdm(enumerate(train_loader), total=len(train_loader))
    for i, data in loader:
        # get the data points
        inputs, labels = data

        # zero the parameter gradients (else, they are accumulated)
        optimiser.zero_grad()

        # forward the data through the network
        outputs = model(inputs)
        # calculate the loss given the output of the network and the target labels
        loss = criterion(outputs, labels)
        # calculate the gradients of the network w.r.t. its parameters
        loss.backward()
        # Let the optimiser take an optimization step using the calculated gradients
        optimiser.step()
        
        running_loss += loss
        total += outputs.size(0)

        loader.set_description("loss: {:.5f}".format(running_loss/total))

print("Finished Training")

Starting epoch 1


loss: 1.92579: 100%|███████████████████████████████████████████████████| 58/58 [00:49<00:00,  1.16it/s]


Starting epoch 2


loss: 0.41025: 100%|███████████████████████████████████████████████████| 58/58 [00:50<00:00,  1.16it/s]


Starting epoch 3


loss: 0.26162: 100%|███████████████████████████████████████████████████| 58/58 [00:50<00:00,  1.16it/s]


Starting epoch 4


loss: 0.18726: 100%|███████████████████████████████████████████████████| 58/58 [00:49<00:00,  1.17it/s]


Starting epoch 5


loss: 0.13968: 100%|███████████████████████████████████████████████████| 58/58 [00:50<00:00,  1.15it/s]


Starting epoch 6


loss: 0.10731: 100%|███████████████████████████████████████████████████| 58/58 [00:50<00:00,  1.16it/s]


Starting epoch 7


loss: 0.08212: 100%|███████████████████████████████████████████████████| 58/58 [00:49<00:00,  1.16it/s]


Starting epoch 8


loss: 0.06302: 100%|███████████████████████████████████████████████████| 58/58 [00:49<00:00,  1.16it/s]


Starting epoch 9


loss: 0.04873: 100%|███████████████████████████████████████████████████| 58/58 [00:50<00:00,  1.15it/s]


Starting epoch 10


loss: 0.03806: 100%|███████████████████████████████████████████████████| 58/58 [00:50<00:00,  1.16it/s]


Finished Training


# Testing

We can now use the test set to run inference of our model.
We can output resulting predictions or use them for testing how well our model generalizes.

In [42]:
model.eval()

MNISTModelLowLevel(
  (relu1): ReLU()
  (relu2): ReLU()
)

## Training Accuracy

In [43]:
accuracy.reset()

# Gradients are calculated on the forward pass for every iteration.
# As we do not need gradients now, we can disable the calculation.
with torch.no_grad():
    for data in tqdm(train_loader):
        # get the data points
        inputs, labels = data

        # forward the data through the network
        outputs = model(inputs)
        
        accuracy.update(outputs, labels)
        
print("Accuracy: {:.2f}%".format(100 * accuracy.compute()))

100%|██████████████████████████████████████████████████████████████████| 58/58 [00:41<00:00,  1.41it/s]


Accuracy: 96.65%


## Test Accuracy

In [44]:
accuracy.reset()

# Gradients are calculated on the forward pass for every iteration.
# As we do not need gradients now, we can disable the calculation.
with torch.no_grad():
    for data in tqdm(test_loader): # now the test_loader
        # get the data points
        inputs, labels = data

        # forward the data through the network
        outputs = model(inputs)
        
        accuracy.update(outputs, labels)
        
print("Accuracy: {:.2f}%".format(100 * accuracy.compute()))

100%|██████████████████████████████████████████████████████████████████| 20/20 [00:13<00:00,  1.47it/s]


Accuracy: 91.64%
