In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class Net(nn.Module):
    """LeNet-style convolutional network.

    Two 5x5 convolutions, each followed by ReLU and 2x2 max pooling, feed
    three fully connected layers. ``fc1`` expects 16 * 5 * 5 = 400 flattened
    features, which is what a 32x32 input produces
    (32 -> conv 28 -> pool 14 -> conv 10 -> pool 5). Inputs whose pooled
    feature map does not flatten to 400 values raise a shape error in ``fc1``.

    Args:
        in_channels: number of channels of the input image (default 1).
        num_classes: length of the output vector per sample (default 10).
    """

    def __init__(self, in_channels=1, num_classes=10):
        super(Net, self).__init__()
        # Convolutional feature extractor: in_channels -> 6 -> 16 channels,
        # both with 5x5 square kernels.
        self.conv1 = nn.Conv2d(in_channels, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Classifier head; each layer is an affine map y = Wx + b.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """Run a batch through the network and return the raw fc3 outputs.

        No activation is applied to the final layer.
        """
        # Max pooling over a (2, 2) window.
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # A single number is enough when the pooling window is square.
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Collapse everything except the batch dimension into one axis.
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first."""
        num_features = 1
        for dim in x.size()[1:]:  # skip the batch dimension
            num_features *= dim
        return num_features

In [60]:
# Build a freshly initialised network and print its layer summary.
net = Net()
print(net)
# Print every learnable tensor in full, in the order parameters() yields them.
# NOTE(review): this produces a very long output; printing names and shapes
# via named_parameters() may be enough — confirm what the reader needs.
for p in net.parameters():
    print(p)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)
Parameter containing:
tensor([[[[-0.0132, -0.1039,  0.0939, -0.0463, -0.1229],
          [-0.0794, -0.0342, -0.0200, -0.1478,  0.1956],
          [-0.1941, -0.0917, -0.0272,  0.1258, -0.1312],
          [ 0.0430,  0.1758, -0.1660, -0.0348,  0.0997],
          [ 0.1053,  0.1829,  0.1441, -0.1472, -0.1699]]],


        [[[-0.1685,  0.1572,  0.0442, -0.0696,  0.0771],
          [-0.0060, -0.1901, -0.1677, -0.1493, -0.1893],
          [-0.1687,  0.0420,  0.1810,  0.1781, -0.1861],
          [ 0.0869,  0.0996, -0.0208, -0.1343, -0.0938],
          [ 0.0314, -0.0003, -0.0835, -0.1746, -0.1309]]],


        [[[-0.0529,  0.1301, -0.1217,  0.1884, -0.1691],
          [ 0.1471,  0.1125, -0.01

In [52]:
# Push one random single-channel 32x32 image through the network.
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)

# Dummy regression target: 10 random values reshaped to a single row so it
# lines up with the network output.
target = torch.randn(10).view(1, -1)

# Mean-squared error between the prediction and the dummy target.
criterion = nn.MSELoss()
loss = criterion(out, target)
print(loss)

tensor([[ 0.0196,  0.0484,  0.0252, -0.0380, -0.0672,  0.0108, -0.0160, -0.0453,
          0.0982,  0.0375]], grad_fn=<ThAddmmBackward>)
tensor(1.1491, grad_fn=<MseLossBackward>)


In [53]:
# Entry point of the backward graph: the autograd node attached to `loss`.
root = loss.grad_fn

def print_tree(node, prefix=""):
    """Recursively print a graph of autograd nodes, one node per line.

    Args:
        node: an object exposing ``next_functions`` (for example a tensor's
            ``grad_fn``), or ``None``.
        prefix: string prepended to the printed line. Every level of
            recursion appends four spaces to it. Defaults to no indentation.

    A ``None`` node is printed as ``INPUT`` and ends that branch.
    """
    if node is None:
        print(prefix + "INPUT")
        return
    print(prefix + str(node))
    child_prefix = prefix + "    "
    # Each entry of next_functions is a sequence whose first element is the
    # child node (or None); any remaining elements are ignored here.
    for entry in node.next_functions:
        print_tree(entry[0], child_prefix)
        
# Dump the backward graph reachable from `root`, with the root line unindented.
print_tree(root, "")

<MseLossBackward object at 0x11b4c4470>
    <ThAddmmBackward object at 0x11b4c45f8>
        <ExpandBackward object at 0x11b4c4668>
            <AccumulateGrad object at 0x11b4869b0>
        <ReluBackward object at 0x11b4c46a0>
            <ThAddmmBackward object at 0x11b4c44e0>
                <ExpandBackward object at 0x11b4c4748>
                    <AccumulateGrad object at 0x11b4c4828>
                <ReluBackward object at 0x11b4c4780>
                    <ThAddmmBackward object at 0x11b4c4828>
                        <ExpandBackward object at 0x11b4c4898>
                            <AccumulateGrad object at 0x11b4c4978>
                        <ViewBackward object at 0x11b4c48d0>
                            <MaxPool2DWithIndicesBackward object at 0x11b4c4978>
                                <ReluBackward object at 0x11b4c49e8>
                                    <ThnnConv2DBackward object at 0x11b4c4a90>
                                        <MaxPool2DWithIndicesBackward obje

In [54]:
# Clear gradients left over from earlier backward passes so the values printed
# after backward() below come from this pass only.
net.zero_grad()     # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
# NOTE(review): depending on the PyTorch version and on whether backward() has
# already run in this kernel, this may print None instead of zeros — confirm.
print(net.conv1.bias.grad)

# Backpropagate from the loss; autograd fills .grad on the parameters.
loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)
print(net.conv2.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0017,  0.0095, -0.0024, -0.0027,  0.0100, -0.0053])
tensor([-0.0055,  0.0036,  0.0120, -0.0104,  0.0111, -0.0060, -0.0036,  0.0059,
         0.0000, -0.0099,  0.0122,  0.0075,  0.0010, -0.0089,  0.0092,  0.0022])


In [61]:
# Plain stochastic gradient descent over every parameter of `net`.
optimizer = optim.SGD(net.parameters(), lr=0.01)

# A single optimisation step; inside a training loop this block is repeated
# once per batch.
optimizer.zero_grad()             # clear gradients from earlier backward passes
output = net(input)               # forward pass
loss = criterion(output, target)  # loss between prediction and target
loss.backward()                   # compute gradients for the parameters
optimizer.step()                  # apply the SGD update to the parameters

In [62]:
# Show the bias gradients currently stored on the two convolution layers.
print(net.conv1.bias.grad)
print(net.conv2.bias.grad)

tensor([-0.0056, -0.0009, -0.0075, -0.0055,  0.0069, -0.0068])
tensor([ 0.0008,  0.0423,  0.0079,  0.0000,  0.0223, -0.0292,  0.0065, -0.0107,
        -0.0028,  0.0214,  0.0003, -0.0011, -0.0075,  0.0091,  0.0107, -0.0079])
