In [None]:
"""
https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html
"""

In [None]:
"""A typical training procedure for a neural network is as follows:

Define the neural network that has some learnable parameters (or weights)
Iterate over a dataset of inputs
Process input through the network
Compute the loss (how far is the output from being correct)
Propagate gradients back into the network’s parameters
Update the weights of the network, typically using a simple update rule: weight = weight - learning_rate * gradient
"""

In [62]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution kernel
        self.conv1 = nn.Conv2d(1, 6, 3)     #[1, 6, 30, 30]
        self.conv2 = nn.Conv2d(6, 16, 3)    #[1, 16, 13, 13]
        # and affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)   # 6*6 from image dimension after maxpool [1,16,6,6]
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max Pooling over a (2,2) Window
        x = F.max_pool2d( F.relu( self.conv1(x) ), (2,2) )
        # if the size is a square you can only specify a single number
        x = F.max_pool2d( F.relu( self.conv2(x) ), 2 )
        x = x.view(-1, self.num_flat_features(x) )
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


    def num_flat_features(self, x):
        size = x.size()[1:] # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features*= s
        return num_features

In [3]:
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [7]:
params = list(net.parameters())
print(len(params))
print(params[0].size())

10
torch.Size([6, 1, 3, 3])


In [27]:
input = torch.randn(1, 1, 32, 32)
print(input.size())
out = net(input)
print(out)

torch.Size([1, 1, 32, 32])
tensor([[-0.0144,  0.0146, -0.0414, -0.1396, -0.0345, -0.0638,  0.0163, -0.0200,
         -0.0287,  0.0328]], grad_fn=<AddmmBackward>)


## Independent size testing

In [28]:
conv1 = nn.Conv2d(1, 6, 3)
out_conv1 = conv1(input)
print("out conv1 size:",out_conv1.size())

out_relu1 = F.relu(out_conv1)
print("out relu1 size:", out_relu1.size())

out_maxpool1 = F.max_pool2d(out_relu1, 2)
print("out maxpool1 size:", out_maxpool1.size())

out conv1 size: torch.Size([1, 6, 30, 30])
out relu1 size: torch.Size([1, 6, 30, 30])
out maxpool1 size: torch.Size([1, 6, 15, 15])


In [29]:
conv2 = nn.Conv2d(6, 16, 3)
out_conv2 = conv2(out_maxpool1)
print("out conv2 size:",out_conv2.size())

out_relu2 = F.relu(out_conv2)
print("out relu2 size:", out_relu2.size())

out_maxpool2 = F.max_pool2d(out_relu2, 2)
print("out maxpool2 size:", out_maxpool2.size())

out conv2 size: torch.Size([1, 16, 13, 13])
out relu2 size: torch.Size([1, 16, 13, 13])
out maxpool2 size: torch.Size([1, 16, 6, 6])


In [30]:
def num_flat_features(x):
        size = x.size()[1:] # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features*= s
        return num_features

x = out_maxpool2.view(-1, num_flat_features(out_maxpool2) )
print(x.size())
print(16*6*6)
print(out_maxpool2.view(1, -1).size())

torch.Size([1, 576])
576
torch.Size([1, 576])


In [32]:
fc1 = nn.Linear(16 * 6 * 6, 120)
out_fc1 = fc1(x)
print("out fc1 size:", out_fc1.size())
out_relufc1 = F.relu(out_fc1)
print("out relufc1 size:",out_relufc1.size())

out fc1 size: torch.Size([1, 120])
out relufc1 size: torch.Size([1, 120])


In [33]:
fc2 = nn.Linear(120, 84)
out_fc2 = fc2(out_relufc1)
print("out fc size:", out_fc2.size())
out_relufc2 = F.relu(out_fc2)
print("out relufc1 size:",out_relufc2.size())

out fc size: torch.Size([1, 84])
out relufc1 size: torch.Size([1, 84])


In [35]:
fc3 = nn.Linear(84, 10)
out_fc3 = fc3(out_relufc2)
print("out fc3 size:", out_fc3.size())
print(out_fc3)

out fc3 size: torch.Size([1, 10])
tensor([[ 0.0868,  0.0574, -0.0073, -0.0273, -0.0678, -0.0369, -0.0270, -0.1091,
         -0.0716, -0.0719]], grad_fn=<AddmmBackward>)


In [44]:
softmax = nn.Softmax()
out_softmax = softmax(out_fc3)
print("out softmax shape", out_softmax.size())
print(out_softmax)
print(out_softmax.argmax())

out softmax shape torch.Size([1, 10])
tensor([[0.1119, 0.1087, 0.1019, 0.0999, 0.0959, 0.0989, 0.0999, 0.0920, 0.0955,
         0.0955]], grad_fn=<SoftmaxBackward>)
tensor(0)


  


### Continue with the original tutorial

In [45]:
net.zero_grad() #zero the gradient buffers of all parameters
out.backward(torch.randn(1,10)) #backprops with random gradients

torch.nn only supports mini-batches. The entire torch.nn package only supports inputs that are a mini-batch of samples, and not a single sample.

For example, nn.Conv2d will take in a 4D Tensor of nSamples x nChannels x Height x Width.

If you have a single sample, just use input.unsqueeze(0) to add a fake batch dimension.

In [48]:
output = net(input)
print(output)
target = torch.randn(10)    #dummy target
print(target)
target = target.view(1,-1)  #make it the same shape as the output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

tensor([[-0.0144,  0.0146, -0.0414, -0.1396, -0.0345, -0.0638,  0.0163, -0.0200,
         -0.0287,  0.0328]], grad_fn=<AddmmBackward>)
tensor([-0.9150, -0.7652, -0.1000, -0.3159,  0.0601, -2.3648, -1.1659, -0.7460,
         1.1578, -1.1692])
tensor(1.1534, grad_fn=<MseLossBackward>)


In [53]:
print(loss.grad_fn) #MSELoss
print(loss.grad_fn.next_functions[0][0])    #linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0]) # ReLU

<MseLossBackward object at 0x7fe24b96b2e8>
<AddmmBackward object at 0x7fe249e96668>
<AccumulateGrad object at 0x7fe249e96358>


## Backprop

In [56]:
net.zero_grad()
print("conv1.bias.grad before backward")
print(net.conv1.bias.grad)

loss.backward()

print("conv1.bias.grad after backward")
print(net.conv1.bias.grad)

Parameter containing:
tensor([ 0.3234, -0.0111, -0.2086,  0.1248,  0.1516, -0.1964],
       requires_grad=True)
conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])


RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

In [61]:
print(net.fc3.bias.size())
print(net.fc3.bias)

torch.Size([10])
Parameter containing:
tensor([ 0.0035,  0.0540, -0.0504, -0.0721, -0.0003, -0.0448,  0.0374, -0.0060,
        -0.0680, -0.0323], requires_grad=True)


### Update the weights

weight = weight - learning_rate * gradient

In [None]:
# using only Python
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)

In [63]:
# create the optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop
optimizer.zero_grad()   #zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()