# Lab 4 - Playing around

In [1]:
# copying code from nb 4.0
import torch
from torchvision import transforms, datasets
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim

train = datasets.MNIST(
    "", 
    train=True, 
    download=True, 
    transform=transforms.Compose([transforms.ToTensor()]))
test = datasets.MNIST("", train=False, download=True, transform=transforms.Compose([transforms.ToTensor()]))

trainset = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True)
testset = torch.utils.data.DataLoader(test, batch_size=10, shuffle=True)

### Modifying the layers

In [2]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 5, padding=2)
        self.conv2 = nn.Conv2d(32, 64, 5, padding=2)

        self.fc1 = nn.Linear(64*7*7, 128)
        self.fc2 = nn.Linear(128, 10)

    def convs(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))

        return x

    def forward(self, x):
        x = self.convs(x)
        x = x.view(-1, 64*7*7)

        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return F.softmax(x, dim=1)

net = Net()

optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(3):
    for data in trainset:
        X, y = data
        net.zero_grad()
        output = net.forward(X)
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()

In [3]:
correct = 0
total = 0

with torch.no_grad():
    for data in testset:
        X, y = data
        output = net.forward(X)
        for idx, i in enumerate(output):
            # argmax is finding the highest probability out of y (from SoftMax)
            # y is a label between 0 and 9
            print(i)
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1

print("Accuracy: ", round(correct/total, 3))

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])
tensor([0.0000e+00, 0.0000e+00, 4.0982e-02, 9.5902e-01, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.5891e-28, 0.0000e+00])
tensor([0.0000e+00, 1.0000e+00, 9.9919e-40, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([1.0000e+00, 0.0000e+00, 1.1778e-13, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 8.0284e-36, 0.0000e+00])
tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])
tensor([0.0000e+00, 0.0000e+00, 4.3621e-33, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00])
tensor([1.7894e-39, 7.2656e-16, 7.8871e-25, 0.0000e+00, 1.6922e-20, 0.0000e+00,
        0.0000e+00, 7.3135e-13, 0.0000e+00, 1.0000e+00])
tensor([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])
tensor([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])
tensor([0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])
tensor([0., 0., 0., 0

### Changing the number of neurons

...

_Does the performance of the network improve? Is it reasonable to expect to reach 100% classification accuracy?_

### Changing the optimizer and activation functions

Changing the optimizer to SGD is expected to reduce the accuracy.

Adam's essentially applies gradient descent for each parameter which is why it is faster.

It is expected that ReLu will perform better than sigmoid activation functions.

_Does the performance of the network improve? Is it reasonable to expect to reach 100% classification accuracy?_

### Modifying the pooling layer

We can use the average function rather than the max function. 

_Does the performance of the network improve? Is it reasonable to expect to reach 100% classification accuracy?_

### Modifying the dimensions of the convolutional kernel (and padding)

...

_Does the performance of the network improve? Is it reasonable to expect to reach 100% classification accuracy?_