In [3]:
import torch
import torchvision
from torchvision import transforms, datasets, models
import matplotlib.pyplot as plt
import torch.optim as optim


import torch.nn as nn
import torch.nn.functional as F



import os 
import time
import cv2
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd

# NOTE(review): not referenced by any cell visible here — presumably a switch for
# regenerating a cached dataset; confirm it is still needed.
rebuild_data = False

In [4]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
# The device string is written as "cuda:0" with no whitespace after the colon;
# stricter device-string parsing in newer PyTorch releases rejects "cuda: 0"
# with "Invalid device string".
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# With several GPUs, individual layers could be placed on different devices.
# The count is left as the cell's last expression so it is displayed.
torch.cuda.device_count()
    

1

In [6]:
class Net(nn.Module):
    """Small convolutional network: two conv/pool stages followed by three linear layers.

    Input is a batch of single-channel images, shape (N, 1, H, W). The first
    linear layer is fixed at 16*6*6 input features, so H and W must leave a
    6x6 map after both conv/pool stages (a 32x32 image does:
    32 -> 30 -> 15 -> 13 -> 6). Output is 10 raw scores per sample.
    """

    def __init__(self):
        super().__init__()

        # Feature extractor: 3x3 kernels, 1 -> 6 -> 16 channels.
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)

        # nn.Linear needs its input width up front: 16 channels times the 6x6
        # spatial map left over after the second pooling step.
        # How that size is derived:
        #   https://pytorch.org/docs/master/generated/torch.nn.Conv2d.html
        #   https://stackoverflow.com/questions/53784998/how-are-the-pytorch-dimensions-for-linear-layers-calculated
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run a batch through the network and return the (N, 10) score tensor."""
        # Each stage: convolution -> ReLU -> 2x2 max-pool.
        stage1 = F.max_pool2d(F.relu(self.conv1(x)), 2)
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), 2)

        # Collapse everything except the batch dimension before the linear layers.
        flat = stage2.view(-1, self.num_flat_features(stage2))

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the number of elements per sample, i.e. the product of all dims after the first."""
        return x.size()[1:].numel()
    

In [7]:
# Instantiate the network; printing an nn.Module lists its registered sub-layers.
net = Net()
print(net)
        
        

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [8]:
# All learnable tensors of the model (weights and biases).
# NOTE: this dumps every value; inspecting the shapes alone is usually enough.
list(net.parameters())

[Parameter containing:
 tensor([[[[ 0.2201,  0.0985, -0.1770],
           [ 0.3067, -0.0331, -0.2608],
           [-0.1463,  0.0611,  0.0389]]],
 
 
         [[[-0.2057, -0.2625,  0.2223],
           [-0.1649, -0.1147,  0.2816],
           [ 0.2135, -0.2841,  0.2121]]],
 
 
         [[[-0.2673, -0.0658, -0.1994],
           [ 0.0227,  0.3152,  0.0683],
           [ 0.0493, -0.2714, -0.1086]]],
 
 
         [[[-0.3091,  0.1030,  0.1806],
           [ 0.2342, -0.3220, -0.0632],
           [ 0.2490,  0.3095,  0.0663]]],
 
 
         [[[-0.1216, -0.3290,  0.1539],
           [ 0.2009, -0.3059, -0.0697],
           [-0.2566,  0.0379, -0.0338]]],
 
 
         [[[-0.1338,  0.2232, -0.0758],
           [ 0.0030, -0.1669, -0.1668],
           [ 0.1776,  0.1693,  0.0383]]]], requires_grad=True),
 Parameter containing:
 tensor([ 0.1773, -0.2163, -0.0495,  0.2434,  0.0128,  0.0785],
        requires_grad=True),
 Parameter containing:
 tensor([[[[-0.0178,  0.0042,  0.0968],
           [-0.0671, -0.

In [10]:
params = list(net.parameters())
params[0].size()  # conv1's weight: (out_channels, in_channels, kernel_h, kernel_w)

torch.Size([6, 1, 3, 3])

In [11]:
# A random batch holding one single-channel 32x32 image; 32x32 is the size that
# leaves the 6x6 feature map fc1's 16*6*6 input width is built for.
# NOTE: `input` shadows the Python builtin; the name is kept because later cells reuse it.
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)

tensor([[ 0.1113, -0.0649, -0.0035, -0.0635,  0.1246, -0.0999, -0.0174,  0.1339,
          0.1270,  0.0493]], grad_fn=<AddmmBackward>)


In [12]:
# Reset the gradient buffers of all parameters before the backward pass in the next cell.
net.zero_grad()

In [13]:
# `out` is not a scalar (shape [1, 10]), so backward() needs an explicit gradient
# tensor of the same shape; random values are used here purely for demonstration.
out.backward(torch.randn(1, 10))

In [14]:
# backward() does not change the forward result: this prints the same values as before.
print(out)

tensor([[ 0.1113, -0.0649, -0.0035, -0.0635,  0.1246, -0.0999, -0.0174,  0.1339,
          0.1270,  0.0493]], grad_fn=<AddmmBackward>)


In [15]:
out.shape  # (batch size, 10 output scores)

torch.Size([1, 10])

In [None]:
'''
Note

torch.nn only supports mini-batches. The entire torch.nn package only supports inputs that are a mini-batch of samples, and not a single sample.

For example, nn.Conv2d will take in a 4D Tensor of nSamples x nChannels x Height x Width.

If you have a single sample, just use input.unsqueeze(0) to add a fake batch dimension.
'''

In [21]:
output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output


# nn.MSELoss is a module class: its constructor takes configuration options, not
# tensors, so `nn.MSELoss(output, target)` does not compute a loss. Instantiate
# it first, then call the instance on (prediction, target). The functional
# one-liner is F.mse_loss(output, target).
lossfn = nn.MSELoss()
loss = lossfn(output, target)
print(loss)

tensor(0.4248, grad_fn=<MseLossBackward>)


In [None]:
'''Now, if you follow loss in the backward direction, using its .grad_fn attribute, you will see a graph of computations that looks like this:

input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
      -> view -> linear -> relu -> linear -> relu -> linear
      -> MSELoss
      -> loss

'''

In [22]:
# Walk backwards through the autograd graph, starting from the loss node.
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # final Linear layer (appears as Addmm)
# The first input of that Addmm node is an AccumulateGrad node (see the output),
# not the ReLU — presumably the layer's bias parameter; the ReLU sits on another
# entry of next_functions.
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # AccumulateGrad

<MseLossBackward object at 0x0000013DD67D5A88>
<AddmmBackward object at 0x0000013DD67D5988>
<AccumulateGrad object at 0x0000013DD67D5A88>


In [23]:
# Show that backward() is what populates the parameters' .grad buffers.
net.zero_grad()     # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

# Backpropagate from the scalar loss; gradients are written into each parameter's .grad.
loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0151,  0.0065, -0.0062, -0.0079,  0.0113,  0.0068])


In [None]:
# Backprop (loss.backward()) only computes the gradients and stores them in each
# parameter's .grad. Applying them to the weights is the optimisation step.
# Plain SGD (weight = weight - learning_rate * gradient) written by hand:

learning_rate = 0.01
with torch.no_grad():  # the in-place update must not be recorded by autograd
    for f in net.parameters():
        if f.grad is None:  # this parameter received no gradient; nothing to apply
            continue
        f.sub_(f.grad * learning_rate)

# torch.optim provides this update rule (and more elaborate ones) as optimisers.


In [24]:
import torch.optim as optim

In [25]:
# Same update as the hand-written SGD loop, delegated to torch.optim.
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
# net.zero_grad() would also work here; optimizer.zero_grad() is preferable when
# different optimisers are assigned to different layers.
output = net(input)
loss = lossfn(output, target)
loss.backward()
optimizer.step()    # Does the update

#GRADIENTS ARE ACCUMULATED (i.e. summed into each parameter's .grad) every time backward is called
#Observe how gradient buffers had to be manually set to zero using optimizer.zero_grad(). This is because gradients are accumulated 