# Imports

This imports all the dependencies


In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F

And then you define the network by creating a class and extending `nn.Module`.

In [25]:
class Net(nn.Module):
    """Small LeNet-style CNN: two conv + pool stages followed by three linear layers.

    The first linear layer expects 16 * 6 * 6 = 576 features per sample, which is
    what a single-channel 32x32 image produces after two 3x3 convolutions, each
    followed by 2x2 max pooling (32 -> 30 -> 15 -> 13 -> 6). The network returns
    10 raw scores per sample; no activation is applied to the last layer.
    """

    def __init__(self):
        super().__init__()

        # Convolutions: Conv2d(in_channels, out_channels, square kernel size).
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)

        # Affine layers (y = Wx + b). 16*6*6 is channels * height * width of the
        # feature map left after the second pooling step.
        self.fc1 = nn.Linear(16*6*6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run a batch of images through the network and return the raw scores."""
        # Stage 1: convolution, ReLU (negative activations become zero), then
        # max pooling over a 2x2 window.
        features = F.relu(self.conv1(x))
        features = F.max_pool2d(features, (2, 2))

        # Stage 2: same pattern. A single int is shorthand for a square pooling
        # window, so 2 means the same as (2, 2) above.
        features = F.relu(self.conv2(features))
        features = F.max_pool2d(features, 2)

        # Collapse everything except the batch dimension into one feature vector
        # per sample so it can be fed to the linear layers.
        flat = features.view(-1, self.num_flat_features(features))

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))

        # Final linear projection to 10 outputs, returned without an activation.
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return how many values each sample of ``x`` holds.

        ``x`` is a tensor whose first dimension is the batch dimension; the
        result is the product of all remaining dimensions (1 if there are none).
        """
        per_sample_shape = x.size()[1:]  # all dimensions except the batch dimension
        return per_sample_shape.numel()

Our network class is defined now.

 > Note: We have only defined the `forward` function; the `backward` function (where gradients are computed) is defined automatically by the `autograd` module.

In [26]:
# Instantiate the network; printing a module lists its registered sub-layers.
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


Now that the network is instantiated, we can see the learnable parameters of a model as follows:

In [27]:
# Materialise the parameter iterator so it can be counted and indexed.
params = [tensor for tensor in net.parameters()]

# First line: number of parameter tensors; second line: shape of conv1's weights.
print(len(params), params[0].size(), sep="\n")

10
torch.Size([6, 1, 3, 3])


Let’s try a random 32x32 input.
> Note: expected input size of this net (LeNet) is 32x32. To use this net on the MNIST dataset, please resize the images from the dataset to 32x32.

In [None]:
# Dummy batch holding one single-channel 32x32 image drawn from a standard normal.
input = torch.randn((1, 1, 32, 32))

# Calling the module runs its forward pass.
out = net(input)
print(out)

In [None]:
# net.zero_grad() # zeros the grad buffers of all params
# out.backward(torch.randn(1, 10))    # backprop with random grads



Now we can move onto computing the loss and updating the weights of the network.

In [28]:
# Dummy target for the example: 10 random values reshaped to (1, 10) so that it
# has the same shape as the network output.
target = torch.randn(10).view(1, -1)

# Mean-squared-error criterion comparing the output with the target.
criterion = nn.MSELoss()
loss = criterion(out, target)

print(loss)

tensor(0.6559, grad_fn=<MseLossBackward>)


Now all we have to do is backpropagate the error with `loss.backward()`.
> NOTE: You need to clear the existing grads, else the grads will accumulate to existing grads.

In [None]:
# Clear the gradient buffers of all parameters before backpropagating.
net.zero_grad()

# Show conv1's bias gradient on either side of the backward pass.
print("conv1.bias.grad before backward", net.conv1.bias.grad, sep="\n")

loss.backward()

print("conv1.bias.grad after backward", net.conv1.bias.grad, sep="\n")



The simplest update rule is stochastic gradient descent (SGD): `weight = weight - learning_rate * gradient`. It can be implemented with a few lines of plain Python:

In [None]:
# Manual SGD step: weight = weight - learning_rate * gradient.
learning_rate = 0.01
for param in net.parameters():
    # Update in place through .data so the step itself is not tracked by autograd.
    param.data.sub_(param.grad.data * learning_rate)

However, to use other update rules, PyTorch provides the `torch.optim` package, which implements them as ready-made optimizers:

In [32]:
import torch.optim as optim

# Create the optimizer over all of the network's parameters.
optimizer = optim.SGD(net.parameters(), lr=0.01)

# One training step.
# Gradients accumulate across backward() calls, so clear whatever earlier
# cells left behind before computing new ones.
optimizer.zero_grad()
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()  # this does the update