In this notebook, I explore PyTorch's autograd process and look at the differences between `torch.nn.functional` and `torch.nn`.

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms

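`torch.nn.functional` operations are stateless: any weights they use are tensors that you create and pass in yourself, so `net.parameters()` won’t find them unless you register them explicitly (for example as `nn.Parameter` attributes of the module). `torch.nn` layers such as `nn.Conv2d` create and register their own parameters.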

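`torch.nn` only supports mini-batches: the entire `torch.nn` package expects inputs that are a mini-batch of samples, not a single sample. A single sample therefore needs a leading batch dimension of size 1 (for example via `sample.unsqueeze(0)`).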

In [4]:
class MNISTConvNet(nn.Module):
    """Small convolutional classifier for single-channel square images.

    Layout: (conv -> ReLU -> 2x2 max-pool) twice, followed by two fully
    connected layers that produce 10 values per sample.

    Args:
        kernel: Kernel size passed to both convolutions (an int, or anything
            else ``nn.Conv2d`` accepts as ``kernel_size``).
        image_size: Height and width of the square input images. It is only
            used to size the first fully connected layer; the default of 28
            gives the 320 input features the layer had when hard-coded.

    Raises:
        ValueError: If the kernel is too large for ``image_size``, i.e. no
            pixels would remain after the second pooling stage.
    """

    def __init__(self, kernel=5, image_size=28):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(10, 20, kernel)
        self.pool2 = nn.MaxPool2d(2, 2)

        # Work out the spatial size that reaches fc1 instead of hard-coding
        # 320, which was only valid for some kernel sizes. Each stage is an
        # unpadded stride-1 convolution (side shrinks by kernel - 1) followed
        # by a 2x2 pool with stride 2 (side is halved, rounding down).
        height = width = image_size
        for conv in (self.conv1, self.conv2):
            kernel_h, kernel_w = conv.kernel_size
            height = (height - kernel_h + 1) // 2
            width = (width - kernel_w + 1) // 2
        if height < 1 or width < 1:
            raise ValueError(
                "kernel {!r} is too large for {}x{} inputs".format(
                    kernel, image_size, image_size
                )
            )

        self.fc1 = nn.Linear(self.conv2.out_channels * height * width, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x_input):
        """Run a mini-batch through the network.

        Args:
            x_input: Tensor laid out as (batch, 1, image_size, image_size).

        Returns:
            Tensor of shape (batch, 10).
        """
        features = self.pool1(F.relu(self.conv1(x_input)))
        features = self.pool2(F.relu(self.conv2(features)))

        # Flatten everything except the batch dimension for the linear layers.
        flat = features.view(features.size(0), -1)
        hidden = F.relu(self.fc1(flat))

        # NOTE(review): the ReLU on the last layer clamps negative scores to
        # zero before they reach the loss; it is kept so existing results do
        # not change, but returning the raw fc2 output is the usual choice
        # when the values are fed to nn.CrossEntropyLoss.
        return F.relu(self.fc2(hidden))

In [5]:
# Build the network with its default kernel size and show the registered
# layers; `model` is reused by the cells below.
model = MNISTConvNet()
print(model)

MNISTConvNet(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)


In [11]:
# Random stand-in for one input: a mini-batch of one 'image' with one channel
# and 28 x 28 pixels.
data = torch.randn(1, 1, 28, 28)
# NOTE(review): this prints all 784 values; printing data.shape would be
# enough to confirm the layout.
print(data)

tensor([[[[ 1.1647e-01, -2.4016e-02,  8.2265e-01,  5.8840e-01,  9.3602e-01,
           -1.0705e+00, -8.1622e-01, -8.6447e-01, -1.7625e+00,  1.2132e+00,
           -1.2187e+00, -2.1086e+00, -3.2551e-01, -4.8256e-01,  1.4002e+00,
           -1.7066e+00, -1.4391e+00,  9.5715e-01, -2.1572e-01, -4.3356e-01,
            9.3303e-01,  8.2530e-01, -2.7688e-01,  1.6313e+00, -3.1962e-01,
            1.6004e-01, -2.5263e-01, -5.4804e-01],
          [-6.7414e-01, -6.4626e-01,  1.3171e+00,  1.7061e+00, -2.0916e-01,
           -1.2428e+00, -4.9815e-01,  7.2055e-01,  3.1403e+00,  3.8767e-01,
           -1.1841e+00, -1.2291e+00, -1.9625e-01,  1.9245e+00, -4.4848e-01,
            5.9923e-01,  8.9646e-02, -2.5067e+00,  4.2229e-01,  5.9621e-01,
           -4.7169e-01, -3.7023e-01,  4.1579e-01, -1.9380e+00,  1.9104e+00,
            1.0815e+00, -9.2229e-02,  2.6908e-01],
          [-2.8345e-01, -1.2002e+00,  5.2296e-01, -1.9429e-01, -8.3012e-01,
            4.4031e-01, -7.9882e-01,  2.4363e+00,  5.0394e-01,

In [15]:
# Forward pass on the single-sample batch; the result has one row per sample
# and one column per output of the last linear layer (10).
output = model(data)
print(output)
print("Size:", output.size())

tensor([[0.0879, 0.0665, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0804, 0.0981,
         0.0000]], grad_fn=<ReluBackward0>)
Size: torch.Size([1, 10])


The output of the `MNISTConvNet`, `output`, is a `Tensor`. We compute the loss from it, which yields `error`, also a `Tensor`. Calling `.backward()` on `error` therefore propagates gradients all the way back through the network to its weights.

In [26]:
# Class index (3) used as the label for the single sample in the batch.
target = torch.tensor([3], dtype=torch.long)
# `loss` is the criterion object; calling it on (output, target) produces the
# scalar loss tensor stored in `error`.
loss = nn.CrossEntropyLoss()
error = loss(output, target)
print(error)

tensor(2.3367, grad_fn=<NllLossBackward>)


In [32]:
# Backpropagate from the loss; afterwards parameters such as
# model.conv1.weight expose their gradient through .grad.
error.backward()

You can view the gradient of the loss function with respect to the weights of a particular layer of the neural network.

In [46]:
# Show the layer names again so a specific layer can be picked for inspection.
print(model)

MNISTConvNet(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)


In [39]:
# Inspect the learnable kernels of the first convolution.
print("Weights of Convolutional Layer 1:")
print(model.conv1.weight)

Weights of Convolutional Layer 1:
Parameter containing:
tensor([[[[ 0.0562, -0.1601,  0.0164,  0.0033,  0.1643],
          [-0.1841, -0.1978,  0.1821,  0.1325,  0.0489],
          [ 0.1297, -0.1906, -0.1269, -0.0091, -0.0100],
          [-0.1798,  0.0059, -0.0802, -0.1052, -0.0168],
          [ 0.0935, -0.0918,  0.0033,  0.1594,  0.0652]]],


        [[[-0.0239, -0.0972,  0.0784, -0.1828,  0.1466],
          [ 0.0736, -0.1965, -0.0071, -0.1852,  0.1947],
          [ 0.0207, -0.1911, -0.0825, -0.1290, -0.1501],
          [-0.0681,  0.1771,  0.0485, -0.0522,  0.0555],
          [ 0.1747,  0.1146,  0.0424, -0.1735,  0.1356]]],


        [[[-0.0876, -0.0053,  0.0763, -0.0148, -0.0269],
          [-0.1985, -0.1403, -0.0419, -0.0963,  0.0780],
          [-0.0180,  0.1770,  0.1339,  0.0108, -0.1867],
          [ 0.1621,  0.1126, -0.1600,  0.0907, -0.0726],
          [ 0.1401,  0.1833,  0.1344, -0.1302, -0.1990]]],


        [[[-0.0182, -0.1262, -0.1849, -0.1129,  0.0548],
          [-0.1422, 

In [38]:
# Gradient of the loss with respect to the conv1 weights (the printed label
# words it the other way round). .grad is read after error.backward() has run.
print("Gradients of Convolutional Layer 1 w.r.t. the cost function:")
print(model.conv1.weight.grad)

Gradients of Convolutional Layer 1 w.r.t. the cost function:
tensor([[[[-1.1439e-03, -1.2803e-02,  7.2938e-03,  6.1427e-03,  1.2401e-02],
          [-4.5631e-03,  7.4339e-03, -7.0621e-03, -8.3033e-03,  1.2871e-02],
          [ 1.2920e-02, -1.2860e-03, -2.2707e-02, -5.4728e-03, -4.2530e-03],
          [ 2.1059e-02,  1.2690e-03, -1.0923e-02,  6.9698e-03, -3.8244e-03],
          [-3.9072e-03,  7.9409e-03, -6.7774e-03, -2.1175e-03,  1.1871e-02]]],


        [[[-1.2667e-02, -4.0576e-03, -4.2889e-04, -5.9545e-04,  1.5678e-02],
          [ 7.8073e-04,  5.6169e-03, -1.0715e-02, -1.0355e-02,  4.9708e-03],
          [-4.4124e-03,  7.2172e-03, -1.8157e-03,  1.3564e-02,  9.1230e-03],
          [-6.7082e-03,  9.9631e-03,  2.7031e-03, -3.9082e-03,  3.2153e-04],
          [ 1.8470e-03,  6.3617e-03,  9.8859e-04, -8.4417e-03,  5.8462e-03]]],


        [[[-6.8187e-03, -3.3707e-04,  7.0637e-03, -6.9428e-03, -3.3960e-03],
          [-6.7471e-03, -5.7037e-03,  7.0404e-03, -5.4531e-03,  8.2024e-03],
       

In [43]:
# Compare the overall magnitude of the conv1 weights with that of their
# gradient. detach() replaces the discouraged `.data` attribute: it yields the
# same values without autograd history, so the norm prints as a plain tensor.
print("Norm of the weights:", model.conv1.weight.detach().norm())
print("Norm of the gradients:", model.conv1.weight.grad.norm())

Norm of the weights: tensor(1.9315)
Norm of the gradients: tensor(0.1165)


In [45]:
def printnorm(self, input, output):
    """Forward hook that prints the types and sizes seen by a module call.

    Intended to be passed to ``register_forward_hook``. It returns nothing,
    so the module's output is left untouched.

    Args:
        self: The module the hook is attached to.
        input: Tuple of packed inputs given to the module; only the first
            element is inspected.
        output: Tensor produced by the module.
    """
    # detach() replaces the discouraged `.data` attribute: same values, no
    # autograd history, so the norm below prints as a plain tensor.
    result = output.detach()

    print('Inside ' + self.__class__.__name__ + ' forward')
    print('')
    print('input: ', type(input))
    print('input[0]: ', type(input[0]))
    print('output: ', type(output))
    print('')
    print('input size:', input[0].size())
    print('output size:', result.size())
    print('output norm:', result.norm())

# Keep the handle returned by register_forward_hook so the hook can be
# detached later with forward_hook_handle.remove(); otherwise it stays on
# conv2 and every re-run of this cell adds another copy.
forward_hook_handle = model.conv2.register_forward_hook(printnorm)

# The forward pass now calls printnorm when conv2 runs.
out = model(data)

Inside Conv2d forward

input:  <class 'tuple'>
input[0]:  <class 'torch.Tensor'>
output:  <class 'torch.Tensor'>

input size: torch.Size([1, 10, 12, 12])
output size: torch.Size([1, 20, 8, 8])
output norm: tensor(15.9687)
