# Lab 3 - playing around

Using the same code as in ```1.0-lab3.ipynb``` but with the following changes.

In [1]:
import torch
from torchvision import transforms, datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

train = datasets.MNIST("", train=True, download=True, transform=transforms.Compose([transforms.ToTensor()]))
test = datasets.MNIST("", train=False, download=True, transform=transforms.Compose([transforms.ToTensor()]))

trainset = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True)
testset = torch.utils.data.DataLoader(test, batch_size=10, shuffle=True)

### Change the sigmoid function with a ReLu function (F.relu instead of torch.sigmoid)
_Shown in the code below_</br></br>

This produces an accuracy of 0.141 which is higher than the sigmoid function but still very low. 

**The ReLu function**</br>
The rectified linear activation function or ReLU for short is a piecewise linear function that will output the input directly if it is positive, otherwise, it will output zero.

<img src=".\images/ReLu.png" width="400"> </br>

Source: https://machinelearningmastery.com/rectified-linear-activation-function-for-deep-learning-neural-networks/

In [2]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 10)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return F.softmax(x, dim=1)

net = Net()

optimizer = optim.SGD(net.parameters(), lr=0.001)

for epoch in range(3):
    for data in trainset:
        X, y = data
        net.zero_grad()
        output = net.forward(X.view(-1, 28*28))
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()

correct =0
total = 0

with torch.no_grad():
    for data in testset:
        X, y = data
        output = net.forward(X.view(-1, 28*28))
        for idx, i in enumerate(output):
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1

print("Accuracy: ", round(correct/total, 3))

Accuracy:  0.096


### Change the optimizer from stochastic gradient descent to Adam (optim.Adam instead of optim.SGD)
_Shown in the code below_</br></br>

- With a sigmoid function: 0.941
- With a ReLu function: 0.905

**Why does the sigmoid function produce a better accuracy here?** Not dataset dependent. Choice of algorithm guided by problem that you have. You get better at this by LOTS of practice.

**What is Adam gradient descent?** This avoids all the problems with gradient descent such as choosing the learning rate, functions being non-differentiable etc. 

In [7]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 10)

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        x = self.fc4(x)
        return F.softmax(x, dim=1)

net = Net()

optimizer = optim.SGD(net.parameters(), lr=0.001)

for epoch in range(3):
    for data in trainset:
        X, y = data
        net.zero_grad()
        output = net.forward(X.view(-1, 28*28))
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()

correct =0
total = 0

with torch.no_grad():
    for data in testset:
        X, y = data
        output = net.forward(X.view(-1, 28*28))
        for idx, i in enumerate(output):
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1

print("Accuracy: ", round(correct/total, 3))

Accuracy:  0.114


### Try and modify the number of layers and the number of neurons in the hidden layer (be careful that the numbers are consistent between contiguous layers)

1. 2 layers

Sigmoid, Adam gradient descent: 0.953 
_Does this high accuracy suggest that the third and fourth layers are unnecessary? Could suggest that these layers are leading to overfitting._

In [5]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 64)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

net = Net()

optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(3):
    for data in trainset:
        X, y = data
        net.zero_grad()
        output = net.forward(X.view(-1, 28*28))
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()

correct =0
total = 0

with torch.no_grad():
    for data in testset:
        X, y = data
        output = net.forward(X.view(-1, 28*28))
        for idx, i in enumerate(output):
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1

print("Accuracy: ", round(correct/total, 3))

Accuracy:  0.953


2. 10 layers

Sigmoid, Adam gradient descent: 0.114 </br>
ReLu, Adam gradient descent: </br></br>

Sigmoid, SGD gradient descent: 0.103</br>
ReLu, SGD gradient descent: 0.096</br>

In [13]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 64)
        self.fc5 = nn.Linear(64, 64)
        self.fc6 = nn.Linear(64, 64)
        self.fc7 = nn.Linear(64, 64)
        self.fc8 = nn.Linear(64, 64)
        self.fc9 = nn.Linear(64, 64)
        self.fc10 = nn.Linear(64, 10)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = F.relu(self.fc8(x))
        x = F.relu(self.fc9(x))
        x = self.fc10(x)
        return F.softmax(x, dim=1)

net = Net()

optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(3):
    for data in trainset:
        X, y = data
        net.zero_grad()
        output = net.forward(X.view(-1, 28*28))
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()

correct =0
total = 0

with torch.no_grad():
    for data in testset:
        X, y = data
        output = net.forward(X.view(-1, 28*28))
        for idx, i in enumerate(output):
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1

print("Accuracy: ", round(correct/total, 3))

Accuracy:  0.29


### Accessing information about the network

In [8]:
net.fc1.weight

Parameter containing:
tensor([[ 0.0050, -0.0250, -0.0321,  ..., -0.0256,  0.0284,  0.0227],
        [-0.0047, -0.0292, -0.0025,  ..., -0.0169,  0.0189,  0.0198],
        [ 0.0049,  0.0290,  0.0123,  ..., -0.0263, -0.0150,  0.0029],
        ...,
        [-0.0315, -0.0136, -0.0255,  ...,  0.0055, -0.0257,  0.0316],
        [ 0.0269, -0.0329,  0.0006,  ..., -0.0051, -0.0151, -0.0046],
        [ 0.0092,  0.0347, -0.0118,  ..., -0.0276, -0.0095, -0.0245]],
       requires_grad=True)

In [10]:
net.fc1.bias

Parameter containing:
tensor([ 0.0117,  0.0101, -0.0080,  0.0214, -0.0239, -0.0063, -0.0026, -0.0259,
         0.0222,  0.0244, -0.0140, -0.0179,  0.0065,  0.0297,  0.0209, -0.0109,
        -0.0024,  0.0180, -0.0183,  0.0140,  0.0172,  0.0290,  0.0352,  0.0226,
         0.0054, -0.0292, -0.0288, -0.0087, -0.0055, -0.0276,  0.0151,  0.0268,
         0.0008,  0.0322,  0.0252, -0.0049,  0.0039,  0.0320,  0.0124, -0.0052,
        -0.0004, -0.0345, -0.0249,  0.0339, -0.0193, -0.0200, -0.0204,  0.0137,
        -0.0191, -0.0247,  0.0320,  0.0250,  0.0280, -0.0126,  0.0002,  0.0195,
         0.0039,  0.0265, -0.0247,  0.0180,  0.0019, -0.0192,  0.0134, -0.0061],
       requires_grad=True)