# Exercise 9 (solution)

If you get an error when importing the libraries, you might have to uncomment and execute the following cell.

In [None]:
# !pip install pillow==6.2.1

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchvision
import matplotlib.pyplot as plt

## Loading Data

- Load the `training_data` of the MNIST dataset from torchvision
- Load the `test_data` of the MNIST dataset from torchvision
- Select the first x tensor from the training data

In both cases, make sure that the raw image files are transformed to a PyTorch tensor.

In [None]:
# Training split of MNIST, downloaded into ./data when it is not there yet.
# ToTensor turns every raw image into a PyTorch tensor.
training_data = torchvision.datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

In [None]:
# Test split of MNIST (train=False), with the same image-to-tensor transform
# as the training split.
test_data = torchvision.datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

In [None]:
# Show the dataset's summary representation as the cell output.
training_data

In [None]:
# Item 0 of the dataset, field 0: the image tensor of the first sample
# (it is plotted with imshow in the next cell).
x = training_data[0][0]
# Cell output: the shape of that tensor.
x.shape

In [None]:
# squeeze(dim=0) removes dimension 0 if it has size 1 (presumably the channel
# dimension - verify with x.shape above) so that imshow gets a 2-D image.
# NOTE(review): plt.imshow returns the image artist, not a Figure, despite the name.
fig = plt.imshow(x.squeeze(dim=0))

## Task 1: Specify a Model

1. Write a model class that implements the same model as last week. To make it flexible, the init function should take the dimensions `n_in`, `n_hidden`, and `n_out` as arguments. Do not add a softmax activation at the end, i.e. stop after the last linear layer.
2. Instantiate a model with the same dimensions as last week (`n_in` = 784, `n_hidden` = 16, `n_out` = 10)
3. Evaluate the model on the first x tensor of the training data.
4. Loop over the model parameters and print the shape of each parameter tensor. Make sure you understand why the shapes are what they are.

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected network with two hidden ReLU layers.

    The input is flattened, sent through the linear layers
    ``n_in -> n_hidden -> n_hidden -> n_out`` with a ReLU after each of the
    first two, and returned as raw logits (no softmax at the end).
    """

    def __init__(self, n_in, n_hidden, n_out):
        """Create the layers.

        Args:
            n_in: Number of input features per sample after flattening.
            n_hidden: Width of both hidden layers.
            n_out: Number of output logits.
        """
        super().__init__()
        self.flatten = nn.Flatten()

        # Chain the linear layers from consecutive widths, each followed by a
        # ReLU; the ReLU after the output layer is removed again below.
        widths = (n_in, n_hidden, n_hidden, n_out)
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()
        self.all_layers = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten ``x`` and return the logits of the layer stack."""
        return self.all_layers(self.flatten(x))

In [None]:
# Layer sizes: 28 x 28 input features, 16 hidden units, 10 output logits.
n_in, n_hidden, n_out = 28 * 28, 16, 10

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = NeuralNetwork(n_in, n_hidden, n_out)
model = model.to(device)
model

In [None]:
# The model was moved to `device`, so its input has to be moved there too;
# otherwise this call fails whenever `device` is a GPU.
model(training_data[0][0].to(device))

In [None]:
# One line per parameter tensor of the model, in registration order.
for param in model.parameters():
    print(param.shape)

## Task 2: Define DataLoaders

1. Define a `train_dataloader` and `test_dataloader`
2. Loop over batches and print the shape of every 100th batch
3. Calculate logits by evaluating the model on the X batch from the last iteration of the loop

Both dataloaders should have a batch size of 64; the `train_dataloader` should be shuffled.

In [None]:
batch_size = 64
# Training batches are reshuffled every epoch; the incomplete last batch is
# dropped so that every training batch has exactly `batch_size` samples.
train_dataloader = DataLoader(
    training_data, batch_size=batch_size, shuffle=True, drop_last=True
)
# Evaluation has to see every test sample, so the smaller last batch is kept
# (no drop_last here); dropping it would leave samples out of the accuracy.
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [None]:
# Print the shape of every 100th batch. X and y keep the values of the last
# batch after the loop and are reused in the following cells.
for batch_idx, (X, y) in enumerate(train_dataloader):
    if batch_idx % 100:
        continue
    print(X.shape)

In [None]:
# Move the batch to the model's device before the forward pass; without this
# the call fails when the model sits on a GPU.
logits = model(X.to(device))
logits.shape

## Task 3: Instantiate a loss function

1. Create an instance of PyTorch's cross-entropy loss (`nn.CrossEntropyLoss`)
2. Evaluate the loss function on the logits and labels (`y`) of the last batch from Task 2

In [None]:
# Cross-entropy loss; it is applied directly to the raw logits of the model,
# which is why the model has no softmax layer.
loss_func = nn.CrossEntropyLoss()

In [None]:
# The labels have to live on the same device as the logits, otherwise the loss
# cannot be evaluated when the model runs on a GPU.
loss_func(logits, y.to(logits.device))

## Task 4: Instantiate an optimizer

In [None]:
# Plain stochastic gradient descent over all parameters of the model.
learning_rate = 0.1
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

## Task 5: Write a function for the inner training loop

Write a function called `train_loop` that takes the following arguments:
- dataloader: The dataloader instance used in training
- model: A model instance
- loss_fn: A loss function
- optimizer: An instance of a pytorch optimizer

The function should then perform the following steps:
1. Set the model in training mode
2. Loop over batches in the dataset 
3. Evaluate the model and loss function
4. Backpropagate the gradients
5. Do an optimizer step
6. Zero the gradients
7. Print the loss every 200 batches

Evaluate the function once on the inputs you already defined in earlier tasks to make sure that it works.

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    For every batch the model and the loss are evaluated, the gradients are
    backpropagated, the optimizer takes one step and the gradients are reset.
    The loss of every 200th batch (starting with batch 0) is printed.

    Args:
        dataloader: Iterable that yields ``(X, y)`` batches of tensors.
        model: The ``nn.Module`` to train; it is switched to training mode.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.
        optimizer: Optimizer that updates the parameters of ``model``.
    """
    # Set the model to training mode (best practice)
    model.train()

    # The batches are not guaranteed to be on the device that holds the model,
    # so they are moved to the device of the model's parameters. A model
    # without parameters gives no device to move to; batches are then used as is.
    first_param = next(model.parameters(), None)
    device = None if first_param is None else first_param.device

    for i, (X, y) in enumerate(dataloader):
        if device is not None:
            X, y = X.to(device), y.to(device)

        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        # Reset the gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

        if i % 200 == 0:
            print(f"Loss after {i} batches: {loss.item()}")

In [None]:
# Smoke test: one pass over the training data with the objects defined above.
train_loop(train_dataloader, model, loss_func, optimizer)

## Task 6: Write a function for the inner test loop

Write a function called `test_loop` that takes the following arguments:
- dataloader: The dataloader for the testing data
- model: A model instance

The function should do the following steps:
- Set the model in eval mode
- Initialize `correct` to zero
- Start a `torch.no_grad` context
- loop over batches in the dataloader
- Evaluate the model, and add the number of correct examples in the batch to `correct`
- Calculate the accuracy by dividing `correct` by the length of the dataset

In [None]:
def test_loop(dataloader, model):
    # Set the model to evaluation mode (best practice)
    model.eval()

    correct = 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            correct += (pred.argmax(axis=1) == y).to(torch.float).sum().item()

    acc = correct / len(dataloader.dataset)
    print(f"Test Accuracy: {acc}")

In [None]:
# Smoke test: accuracy of the current model on the test data.
test_loop(test_dataloader, model)

## Task 7: Run the outer loop with different optimizers

At the beginning of the next cell we create new instances of the model, optimizer, etc., so that every run of the cell starts from a freshly initialized model. Otherwise, a second run would continue where the first one left off.

1. Experiment with tuning parameters to get a better accuracy
2. Switch out SGD by Adam or other optimizers and see if that speeds up the optimization or gives you better results

In [None]:
# training hyperparameters
n_epochs = 3
batch_size = 64
learning_rate = 0.01

# Seed PyTorch's random number generator so that re-running this cell starts
# from the same random state (weight initialization and shuffling).
torch.manual_seed(0)

# initialization: everything is created from scratch so that a re-run does not
# continue training the model of a previous run
model = NeuralNetwork(n_in, n_hidden, n_out).to(device)
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
)
train_dataloader = DataLoader(
    training_data,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
# The test loader keeps the incomplete last batch: the accuracy has to be
# computed over every test sample.
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
)

# training loop: one pass over the training data, then one evaluation per epoch
for t in range(n_epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_func, optimizer)
    test_loop(test_dataloader, model)
print("Done!")