## Prerequisites
Load the PyTorch libraries and download the MNIST data.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt

In [None]:
# Number of samples handed to the model per step (used by both loaders below).
batch_size = 100

# --- Datasets -------------------------------------------------------------
# Training split: stored under ./data and downloaded if requested files are
# not there yet.
train_dataset = datasets.MNIST(
    './data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# Test split: same folder and transform; no `download` argument is passed here.
test_dataset = datasets.MNIST(
    './data',
    train=False,
    transform=transforms.ToTensor(),
)

# --- Loaders --------------------------------------------------------------
# Training batches are drawn in shuffled order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test batches keep the dataset's order.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Cracking open the data
Let's see what's inside

In [None]:
# Pull a single batch from the training loader and report the tensor shapes.
# `images` and `labels` stay bound afterwards so later cells can reuse them.
images, labels = next(iter(train_loader))
print(images.shape)
print(labels.shape)

In [None]:
# prompt: generate a grid of images with the corresponding labels
def grid_show(img_list, labels, w=3, h=3):
    """Draw up to ``w * h`` images in a grid, titling each with its label.

    Args:
        img_list: Indexable collection of images. ``img_list[i][0]`` is what
            gets drawn (presumably the single channel of an MNIST image
            tensor -- confirm before using with other data).
        labels: Indexable collection of scalar tensors aligned with
            ``img_list``; ``labels[i].item()`` becomes the subplot title.
        w: Number of grid columns.
        h: Number of grid rows.

    If fewer than ``w * h`` images or labels are supplied, only the available
    ones are drawn instead of raising ``IndexError``.
    """
    plt.figure(figsize=(10, 10))
    # Never index past the end of either input.
    count = min(w * h, len(img_list), len(labels))

    for i in range(count):
        plt.subplot(h, w, i + 1)
        plt.imshow(img_list[i][0])
        plt.axis('off')
        value = labels[i].item()
        # Float labels (e.g. per-sample losses) are shortened to 4 significant
        # digits so the title stays readable; integer class labels are unchanged.
        plt.title(f'{value:.4g}' if isinstance(value, float) else value)

    plt.show()

# `images` and `labels` still hold the batch fetched from `train_loader` in the previous cell.
grid_show(images, labels)

## Constructing our model
**TODO #1**: identify what is missing in the forward pass (the `forward` method).

Evaluate the model before and after fixing this. How does this affect the performance?

In [None]:
class MLP(nn.Module):
    """Fully connected classifier made of three linear layers.

    This is the exercise version: `forward` chains the three layers directly,
    with nothing applied in between (that gap is the subject of TODO #1).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Remembered so `forward` can flatten its input to this width.
        self.input_dim = input_dim

        # input -> hidden -> hidden -> output (one value per MNIST class)
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Reshape to (N, input_dim): every image becomes one row.
        flat = x.view(-1, self.input_dim)
        first_hidden = self.layer1(flat)
        second_hidden = self.layer2(first_hidden)
        logits = self.layer3(second_hidden)
        # Wait, what are we missing here?
        return logits

## Training/evaluating the model
**TODO #2:** Fill in the correct hyperparameters for the model.

(One of these hyperparameters does not have a "correct" answer)

In [None]:
# Hyperparameters: model shape
input_dim = 28 * 28
hidden_dim = 256
output_dim = 10

# Hyperparameters: optimisation
num_epochs = 5
learning_rate = 0.001

# A brand-new model is built every time this cell runs.
model = MLP(input_dim, hidden_dim, output_dim)

# Cross-entropy on the model outputs, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one optimiser update per batch, progress line every 100 batches.
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        # Forward pass and loss
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 != 0:
            continue
        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')



In [None]:
# Testing the model
# A single pass over the test set yields both the accuracy and the per-sample
# losses, so every batch goes through the model only once.
test_loss = nn.CrossEntropyLoss(reduction='none')   # 1 loss per sample
model.eval()
test_samples, test_losses = [], []

with torch.no_grad():   # no gradients are needed for evaluation
    correct = 0
    total = 0
    for images, labels in test_loader:
        outputs = model(images)

        # Predicted class = index of the largest output along dim 1.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Keep the inputs and their individual losses for later inspection.
        test_samples.append(images.cpu())
        test_losses.append(test_loss(outputs, labels).cpu())

    print(f'Accuracy of the network on the 10000 test images: {100 * correct / total} %')

test_samples = torch.cat(test_samples)
test_losses  = torch.cat(test_losses)

# One shared ordering (highest loss first) keeps samples and losses aligned.
loss_order = test_losses.argsort(descending=True)
sorted_test_samples = test_samples[loss_order]
sorted_test_losses = test_losses[loss_order]

## Basking in success, meditating on failure
**TODO #3:** Use a previously defined function along with `sorted_test_samples` and `sorted_test_losses` to visualize the test samples on which our model performed *very poorly*, and test samples on which our model performed *very well*.

**HINT:** *look up PyTorch tensor indexing and slicing*

In [None]:
# Let's see some examples.

## Freestyle
Almost every update you've made to the code is just one of innumerable ways to configure a neural network. Go mess with the model, and see how your changes affect the model performance. A few lines of inquiry for inspiration:


*   How skinny (hidden layer dimension) can you make the network before it starts to fail?
*   What happens when you make it super wide?
*   What about super deep (more layers)?
*   Adjust the hyperparameters: batch size, learning rate, etc. How does this affect the training/testing?
*   Can you change the loss function, perhaps to MSELoss? What other changes need to be made for this to work?

Go nuts!


# Instructor's manual
Solutions to all TODOs may be found below.

### TODO #1

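The default model is completely linear, i.e. it has no activation function! Without a non-linearity between them, the three stacked linear layers collapse into a single linear transformation, so the extra layers add no expressive power. The model below applies a non-linear activation (ReLU) after each hidden layer.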

In [None]:

class MLP(nn.Module):
    """Fully connected classifier: three linear layers with ReLU after the first two.

    Solution to TODO #1 -- the ReLU calls are what the exercise version lacks.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Remembered so `forward` can flatten its input to this width.
        self.input_dim = input_dim

        # input -> hidden -> hidden -> output (one value per MNIST class)
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, output_dim)

        # Shared non-linearity applied after each hidden layer.
        self.act = nn.ReLU()

    def forward(self, x):
        # Reshape to (N, input_dim): every image becomes one row.
        flat = x.view(-1, self.input_dim)
        first_hidden = self.act(self.layer1(flat))
        second_hidden = self.act(self.layer2(first_hidden))
        # The output layer is left without an activation.
        return self.layer3(second_hidden)


### TODO #2

In [None]:
# Hyperparameters
input_dim = 28 ** 2  # Pixel count of one 28*28 MNIST image
output_dim = 10      # One output per class (label) in the MNIST dataset
hidden_dim = 256     # Free choice -- this is the one with no "correct" answer

### TODO #3

In [None]:
# Both tensors are ordered from highest (worst) to lowest (best) loss, so their
# leading entries are the samples the model handled worst. Passing them in
# unsliced therefore displays the worst-scoring examples, each titled with its loss.
grid_show(img_list=sorted_test_samples, labels=sorted_test_losses)

In [None]:
# Indexing recap: tensor[n] picks out item n, whereas tensor[n:] (note the colon)
# keeps everything from item n through the last one. Applying the same slice to
# both tensors keeps each image paired with its own loss.
idx = 500
from_idx = slice(idx, None)
grid_show(sorted_test_samples[from_idx], sorted_test_losses[from_idx])

In [None]:
# Using a negative index -n is like saying 'nth from last'
# So `sorted_test_samples[-16:]` gives us the last 16 (16 best scoring) items from our tensor.
# A 4x4 grid is requested so that all 16 are drawn; the default 3x3 grid would
# only show the first 9 of them.
grid_show(sorted_test_samples[-16:], sorted_test_losses[-16:], w=4, h=4)