In [None]:
import numpy as np
import torch

In [None]:
torch.__version__

'1.8.0+cu101'

In [None]:
torch.cuda.is_available()

False

1. Tensors (torch.Tensor, Tensor Attributes)

In [None]:
# Build a tensor from a (nested) Python list or sequence.
a = [[1_000, 0], [3, 4]]
# Available dtypes: int8, uint8, int16, int32, int64, float16, float32, float64, bool.
# dtype and device are given explicitly here instead of being inferred from the data.
x = torch.tensor(a, dtype=torch.bool, device='cpu')
x.dtype, x.device, x.stride() # stride() describes the layout inside the underlying torch.Storage

(torch.bool, device(type='cpu'), (2, 1))

In [None]:
# 9 cpu constructors with specific dtype [torch.FloatTensor ...]
# 9 gpu constructors with specific dtype [torch.cuda.FloatTensor ...]

torch.float32

In [None]:
# Build a tensor from array-like data (here a NumPy array).
a = np.array([[1_000, 0], [3, 4]])
q = torch.as_tensor(a) # avoids a copy: the tensor reuses the array's memory
q[0, 0] = -1 # write through the tensor ...
a[0, 0], q.device # ... and the NumPy array sees the new value as well
# This trick does not work on the GPU.

(-1, device(type='cpu'))

In [None]:
#from numpy (see bridge section)

In [None]:
# Build new tensors that take their shape from another tensor.
data = torch.tensor([[1, 2], [3, 4]])
# ones_like keeps the dtype of `data` (int64 here).
x_ones = torch.ones_like(data)

# rand_like gets an explicit floating-point dtype, overriding the int64 dtype of `data`.
x_rand = torch.rand_like(data, dtype=torch.float16)

x_ones, x_ones.dtype, x_rand, x_rand.dtype

(tensor([[1, 1],
         [1, 1]]), torch.int64, tensor([[0.7793, 0.8838],
         [0.1787, 0.9209]], dtype=torch.float16), torch.float16)

In [None]:
# NumPy's `axis` argument plays the same role as torch's `dim`.
a = np.array([
    [[1, 2, 3], [3, 4, 0]],
    [[1, 2, 3], [3, 4, 0]],
])
# Shape first, then the sum along each axis in turn (axis 0, 1, 2).
(a.shape, *(a.sum(axis=axis) for axis in range(a.ndim)))

((2, 2, 3), array([[2, 4, 6],
        [6, 8, 0]]), array([[4, 6, 3],
        [4, 6, 3]]), array([[6, 7],
        [6, 7]]))

In [None]:
# Same reductions as the NumPy cell, this time with torch's `dim` keyword.
b = torch.tensor(a)
tuple(b.sum(dim=dim) for dim in range(b.dim()))

(tensor([[2, 4, 6],
         [6, 8, 0]]), tensor([[4, 6, 3],
         [4, 6, 3]]), tensor([[6, 7],
         [6, 7]]))

In [None]:
# Tensor operations whose name ends with an underscore work in-place,
# e.g. x.copy_(y) or x.t_().
tensor = torch.ones((3, 3))
tensor.add_(5)  # modifies `tensor` itself; no new tensor is assigned
tensor

tensor([[6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.]])

In [None]:
# NumPy <-> PyTorch bridge
# Tensors on the CPU and NumPy arrays can share their underlying memory, so changing one changes the other.
  # torch -> numpy
t = torch.ones(5)
n = t.numpy()  # NumPy array obtained from the tensor (float32, like the tensor)
t, n

(tensor([1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1.], dtype=float32))

In [None]:
# An in-place change to the tensor is visible through the NumPy array as well.
t.add_(5)
t, n

(tensor([6., 6., 6., 6., 6.]), array([6., 6., 6., 6., 6.], dtype=float32))

In [None]:
  # numpy -> torch
n = np.ones(3)
t = torch.from_numpy(n)  # the tensor keeps NumPy's float64 dtype
n, t

(array([1., 1., 1.]), tensor([1., 1., 1.], dtype=torch.float64))

In [None]:
# out=n makes NumPy update the array in place; the tensor created from it reflects the change.
np.add(n, 1, out=n)
n, t

(array([2., 2., 2.]), tensor([2., 2., 2.], dtype=torch.float64))

2. torch.Autograd

In [None]:
# torch.autograd demo: out = t^2, computed element-wise.
# requires_grad=True makes autograd record the operations applied to `t`,
# which is why `out` carries a grad_fn.
t = torch.tensor([1, 2, 3], dtype=torch.float32, requires_grad=True)
out = t.pow(2)
out, t.shape

(tensor([1., 4., 9.], grad_fn=<PowBackward0>), torch.Size([3]))

In [None]:
  # When we call .backward() on `out`, autograd calculates the gradients and stores them in the .grad attribute of the respective tensors.
  # `out` is a vector, so the `gradient` argument must be passed explicitly: it is a tensor of the same shape as `out` and represents the gradient of `out` with respect to itself.
  # torch.ones_like(out) keeps the seed's shape, dtype and device in sync with `out`,
  # instead of hard-coding an integer tensor of length 3 for a float32 output.
out.backward(gradient=torch.ones_like(out))

In [None]:
t.grad

tensor([2., 4., 6.])

In [None]:
  # Equivalently, we can aggregate the output into a scalar and call backward() without arguments
  # [the gradient can be created implicitly only for scalar outputs].
  # `t` is created afresh here, so its .grad does not contain anything from an earlier backward().
t = torch.tensor([1, 2, 3], dtype=torch.float32, requires_grad=True)
out2 = t.pow(2).sum()
out2

tensor(14., grad_fn=<SumBackward0>)

In [None]:
# out2 is a scalar, so backward() needs no gradient argument; d(sum(t^2))/dt = 2t.
out2.backward()
t.grad

tensor([2., 4., 6.])

In [None]:
out2.grad_fn

<SumBackward0 at 0x7fa5f1699110>

In [None]:
  # PyTorch implicitly differentiates only scalar functions (the derivative of a scalar with respect to itself is 1), so if we want to start backward from a non-scalar value, we must pass a torch.Tensor via backward(gradient=...)

In [None]:
  # Example 2: Q = 3a^3 - b^2 (element-wise), built from two tensors that both require gradients.
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)
Q = 3*a**3 - b**2

In [None]:
# Q is a vector, so backward() is seeded with an explicit gradient of ones (one entry per element of Q).
Q.backward(gradient=torch.tensor([1., 1.]))

In [None]:
# Compare autograd's results with the analytic gradients dQ/da = 9a^2 and dQ/db = -2b.
print(9*a**2 == a.grad)
print(-2*b == b.grad)

tensor([True, True])
tensor([True, True])


In [None]:
  # Generally speaking, torch.autograd is an engine for computing vector-Jacobian products. That is, given any vector v, it computes the product J.T * v (see the worked example in the PyTorch autograd documentation)

3. Computational Graph

Conceptually, autograd keeps a record of data (tensors) and all executed operations (along with the resulting tensors) in a **directed acyclic graph (DAG)** consisting of autograd.Function objects. In this DAG, leaves are the input tensors and roots are the output tensors. By tracing this graph from roots to leaves, you can automatically compute the gradients using the chain rule.


In a forward pass, autograd does two things simultaneously:
  - run the requested operation to compute a resulting tensor
  - maintain the operation's gradient function in the DAG

The backward pass kicks off when .backward is called on the DAG root. autograd then:
  - computes the gradients from each .grad_fn
  - accumulates them in the respective tensor's .grad attribute
  - using the chain rule, propagates all the way to the leaf tensors

**DAGs are dynamic in PyTorch: after each .backward() call, autograd starts populating a new graph.**

In [None]:
  # Exclusion from the DAG: set requires_grad = False (frozen parameters - we don't need their gradients)
  # This is important for finetuning a pretrained network

In [None]:
from torch import nn, optim
import torchvision

model = torchvision.models.resnet18(pretrained=True)

# Freeze the whole network: Module.requires_grad_(False) turns off gradient
# tracking for every parameter of the model in a single call.
# The trailing semicolon keeps the notebook from echoing the returned module.
model.requires_grad_(False);

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth


HBox(children=(FloatProgress(value=0.0, max=46827520.0), HTML(value='')))




Let's say we want to finetune the model on a new dataset with 10 labels. In resnet, the classifier is the last linear layer, model.fc. We can simply replace it with a new linear layer (unfrozen by default).

In [None]:
# Replace the classifier head with a fresh linear layer for the 10 new labels.
# Reading in_features from the existing head avoids hard-coding 512, so this line
# keeps working if a backbone with a different feature width is loaded instead.
model.fc = nn.Linear(model.fc.in_features, 10)

In [None]:
# Optimize only the classifier: just the parameters of model.fc are handed to SGD.
optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9)
# The same exclusion from gradient tracking is also available as a context manager: torch.no_grad()

4. Neural network

A typical training procedure for a neural network is as follows:
  - define the neural network that has some learnable parameters (as weights)
  - iterate over a dataset or inputs
  - process input through the network
  - compute the loss
  - propagate gradients back into the network's parameters
  - update the weights of the network, typically using a simple update rule: weight -= lr * gradient

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [67]:
class Net(nn.Module):
  """LeNet-style CNN: two conv -> ReLU -> 2x2 max-pool stages, then three linear layers.

  fc1 expects 16 * 6 * 6 = 576 flattened features, which is what a
  single-channel 32x32 input yields after both conv/pool stages.
  """

  def __init__(self):
    super().__init__()

    # Feature extractor: 1 input channel -> 6 -> 16 feature maps, 3x3 kernels.
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3)
    self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3)
    # Classifier: 576 -> 120 -> 84 -> 10 outputs.
    self.fc1 = nn.Linear(in_features=16 * 6 * 6, out_features=120)
    self.fc2 = nn.Linear(in_features=120, out_features=84)
    self.fc3 = nn.Linear(in_features=84, out_features=10)

  def forward(self, x):
    """Run a batch of single-channel images through the network; returns one row of 10 scores per sample."""
    # Both conv stages follow the same pattern; a pool size of 2 means a 2x2 window.
    for conv in (self.conv1, self.conv2):
      x = F.max_pool2d(F.relu(conv(x)), 2)
    # Flatten everything except the batch dimension before the linear layers.
    x = x.view(-1, self.num_flat_features(x))
    for hidden in (self.fc1, self.fc2):
      x = F.relu(hidden(x))
    # No activation on the last layer.
    return self.fc3(x)

  def num_flat_features(self, x):
    """Return the number of elements per sample: the product of all dimensions except the first."""
    # torch.Size.numel() multiplies the remaining dimensions (576 for a 1x32x32 input).
    return x.size()[1:].numel()

net = Net()
print(net)    

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [32]:
##### view #####
tensor = torch.tensor([[1, 2], [3, 4]])
b = tensor.view((-1, 4)) # -1 means "inferred": that dimension is computed from the remaining ones

In [18]:
b.storage().data_ptr() == tensor.storage().data_ptr()

True

In [33]:
b, tensor

(tensor([[1, 2, 3, 4]]), tensor([[1, 2],
         [3, 4]]))

In [None]:
################

In [36]:
# Learnable parameters of the network.
# Net has 5 layers (2 conv + 3 linear), each with a weight and a bias -> 10 parameter tensors.
params = list(net.parameters())
len(params), params[0].size()  # params[0] is conv1's weight

(10, torch.Size([6, 1, 3, 3]))

In [50]:
params[0].__class__

torch.nn.parameter.Parameter

In [61]:
# Random dummy batch: one single-channel 32x32 image, matching conv1's single input channel.
# NOTE(review): `input` shadows the Python builtin; the name is kept because a later cell reads it.
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)

576
tensor([[-0.1896, -0.0440, -0.0467,  0.0132,  0.0551, -0.1311,  0.0959, -0.0061,
         -0.1312,  0.0648]], grad_fn=<AddmmBackward>)


In [63]:
# Zero the gradient buffers of all parameters, then backpropagate a random gradient.
# `out` has shape (1, 10), so the gradient passed to backward() has the same shape.
net.zero_grad()
out.backward(torch.randn(1, 10))

In [62]:
# Loss function

In [68]:
# Mean-squared error between the network output and a random dummy target.
criterion = nn.MSELoss()
output = net(input)
# 10 random values reshaped to (1, 10) so the target lines up with the output
target = torch.randn(10).view(1, -1)

print(output.shape, target.shape)
loss = criterion(output, target)
print(loss)

torch.Size([1, 10]) torch.Size([1, 10])
tensor(1.1860, grad_fn=<MseLossBackward>)


input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
      -> view -> linear -> relu -> linear -> relu -> linear
      -> MSELoss
      -> loss

In [70]:
# Walk the autograd graph backwards from the loss, one node at a time.
mse_node = loss.grad_fn
linear_node = mse_node.next_functions[0][0]
first_input_node = linear_node.next_functions[0][0]

print(mse_node)  # MSELoss
print(linear_node)  # Linear (recorded as Addmm)
# Prints AccumulateGrad rather than ReLU: entry [0] of the Addmm node is a leaf
# (presumably fc3's bias); the ReLU is expected at a later index - TODO confirm.
print(first_input_node)

<MseLossBackward object at 0x7ff20e687390>
<AddmmBackward object at 0x7ff20e6b1850>
<AccumulateGrad object at 0x7ff20e687390>


In [None]:
def print_graph(grad_fn):
  