### A typical training procedure:
    
1. Define the neural network with learnable parameters (weights)
2. Iterate over the input dataset
3. Process the input through the network
4. Calculate the loss
5. Backpropagate the gradients into the parameters of the network
6. Update the weights of the network, using the formula
    `weight = weight - (learning_rate * gradient)`

#### A network that classifies digits
![MNIST network](https://pytorch.org/tutorials/_images/mnist.png)

In [35]:
#1.  Define network

import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small convolutional network that maps an image batch to 10 scores.

    Two 5x5 convolutions, each followed by ReLU and 2x2 max pooling, feed
    three fully connected layers. ``fc1`` takes 16 * 5 * 5 flattened
    features, which is what a 1-channel 32x32 input produces after the
    two convolution/pooling stages.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Convolutions: 1 input channel -> 6 channels -> 16 channels,
        # each using a 5x5 square kernel.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Fully connected head: 400 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run ``x`` through the network and return the fc3 output."""
        # The pooling window can be given as a (height, width) tuple ...
        pooled = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # ... or as a single number when the window is square.
        pooled = F.max_pool2d(F.relu(self.conv2(pooled)), 2)
        # Flatten everything except the leading (batch) dimension.
        flat = pooled.view(-1, self.num_flat_features(pooled))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first."""
        total = 1
        for extent in x.shape[1:]:
            total = total * extent
        return total

# Instantiate the network and print its layer summary.
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [36]:
# net.parameters() yields the learnable tensors of the model; the count
# printed below is 10 (a weight and a bias for each of the five layers).
params = list(net.parameters())
print(len(params))
# The first entry is conv1's weight: 6 output channels, 1 input channel, 5x5 kernel.
print(params[0].size())

10
torch.Size([6, 1, 5, 5])


In [37]:
# Feed one random 1-channel 32x32 image through the network.
# fc1 expects 16 * 5 * 5 features, which a 32x32 input yields:
# 32 -> conv 5x5 -> 28 -> pool -> 14 -> conv 5x5 -> 10 -> pool -> 5.
# NOTE: the name `input` shadows the Python builtin; it is kept because
# later cells reference it.
input = torch.randn(1,1,32,32)
out = net(input)
print(out)

tensor([[ 0.0062,  0.0226,  0.0658,  0.0094,  0.0797,  0.0285, -0.1165, -0.0573,
          0.0428,  0.1720]], grad_fn=<ThAddmmBackward>)


In [38]:
# Reset the gradients of all parameters, then backpropagate from the
# (1, 10) output using random values as the incoming gradient.
net.zero_grad()
out.backward(torch.randn(1,10))

In [39]:
# Loss function: mean squared error between the network output and a target.
criterion = nn.MSELoss()
output = net(input)
# Dummy target, reshaped from (10,) to (1, 10) so it matches the output.
target = torch.randn(10).view(1, -1)

loss = criterion(output, target)
print(loss)

tensor(0.6551, grad_fn=<MseLossBackward>)


In [40]:
# Walk a few steps backwards through the autograd graph, starting at the
# loss: each grad_fn exposes the functions that produced its inputs via
# next_functions, and [0][0] selects the function object of the first one.
print(loss.grad_fn)
print(loss.grad_fn.next_functions[0][0])
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])

<MseLossBackward object at 0x119268b70>
<ThAddmmBackward object at 0x119268c50>
<ExpandBackward object at 0x119268b70>


In [41]:
# Backprop
# Clear the existing gradients first; otherwise the new gradients would be
# accumulated into the existing ones.
net.zero_grad()

# NOTE(review): depending on the PyTorch version, zero_grad() may reset
# gradients to None rather than to zeros, in which case this prints None
# instead of a zero tensor - confirm against the installed version.
print("conv1.bias.grad before backward")
print(net.conv1.bias.grad)

# Backpropagate the loss; this fills in .grad on the network's parameters.
loss.backward()

print("conv1.bias.grad after backward")
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0195,  0.0080, -0.0245,  0.0114,  0.0069,  0.0024])


In [43]:
# Updating the weights of the network by hand (plain SGD):
# weight = weight - learning_rate * gradient
#
# The update runs under torch.no_grad() so that it is not recorded by
# autograd. This replaces the legacy `.data` access, which bypasses
# autograd's checks on in-place modification.

learning_rate = 0.01
with torch.no_grad():
    for f in net.parameters():
        # A parameter that took no part in the last backward pass has no
        # gradient; skip it instead of failing on None.
        if f.grad is None:
            continue
        f.sub_(f.grad * learning_rate)

In [45]:
# To use other update rules such as SGD or Adam, we use the optim module
import torch.optim as optim

# Create an SGD optimizer over all of the network's parameters.
optimizer = optim.SGD(net.parameters(), lr = 0.01)

# One training step: clear old gradients, run the forward pass, compute
# the loss, backpropagate, then let the optimizer apply the update.
optimizer.zero_grad()
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()