## Pytorch Introduction
Will follow this guide https://docs.pytorch.org/tutorials/beginner/blitz

### Tensors Basics

In [1]:
# Tensors are similar to NumPy’s ndarrays, except that tensors can run on GPUs or other specialized hardware
import torch
import numpy as np

In [19]:
# initializing tensors
# 1. from array
data = [[0, 2], [1, 3]]
tensor = torch.Tensor(data)

# 2. from numpy array
data = np.array(data)
tensor = torch.Tensor(data)

# 3. from another tensor
# - retains shape, datatype, unless overwritten
tensor = torch.ones_like(tensor)
print(f"Ones Tensor {tensor.shape} | {tensor.dtype}")

tensor2 = torch.rand_like(tensor, dtype=torch.bfloat16)
print(f"Tensor 2 {tensor2.shape} | {tensor2.dtype}")
print(tensor2)

# 4. based on a shape, with random/const values
print('------')

shape = (2, 3)
ones_tensor = torch.ones(shape)
rand_tensor = torch.rand(shape)

print(ones_tensor)
print(rand_tensor)

Ones Tensor torch.Size([2, 2]) | torch.float32
Tensor 2 torch.Size([2, 2]) | torch.bfloat16
tensor([[0.6172, 0.7617],
        [0.3438, 0.2539]], dtype=torch.bfloat16)
------
tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[7.4106e-04, 1.3401e-01, 2.6259e-01],
        [9.8301e-01, 6.6625e-01, 5.4750e-01]])


In [21]:
# Attribuets
print(tensor.shape, tensor.dtype, tensor.device)

torch.Size([2, 2]) torch.float32 cpu


In [23]:
# Operations
# Over 100, fuck
# https://docs.pytorch.org/docs/stable/torch.html
# transposing, indexing, slicing, math ops, lin alg, rand sampling, ....

if torch.cuda.is_available():
    tensor = tensor.to("cuda") # moving a tensor to gpu

In [30]:
tensor

tensor([[1., 1.],
        [1., 1.]])

In [39]:
a = torch.floor(torch.rand((4,3)) * 10)
a

tensor([[7., 7., 9.],
        [8., 4., 8.],
        [8., 7., 2.],
        [3., 5., 2.]])

In [43]:
a[1], a[2:], a[1,2], a[:,2]

(tensor([8., 4., 8.]),
 tensor([[8., 7., 2.],
         [3., 5., 2.]]),
 tensor(8.),
 tensor([9., 8., 2., 2.]))

In [44]:
b = torch.tensor([0,2])
a[b]

tensor([[7., 7., 9.],
        [8., 7., 2.]])

In [45]:
c = torch.tensor(np.arange(1))
a[b,c]

tensor([7., 8.])

In [46]:
a

tensor([[7., 7., 9.],
        [8., 4., 8.],
        [8., 7., 2.],
        [3., 5., 2.]])

In [84]:
b = torch.rand((4,1))
b

tensor([[0.9754],
        [0.6188],
        [0.2690],
        [0.9048]])

In [85]:
a.shape, b.shape

(torch.Size([4, 3]), torch.Size([4, 1]))

In [87]:
torch.concat((a,b), axis=0) # requires a,b same dimension (except the one joining)

RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 3 but got size 1 for tensor number 1 in the list.

In [101]:
torch.concat((a,b), axis=1) # requires a,b not on concatenating one

tensor([[7.0000, 7.0000, 9.0000, 0.9754],
        [8.0000, 4.0000, 8.0000, 0.6188],
        [8.0000, 7.0000, 2.0000, 0.2690],
        [3.0000, 5.0000, 2.0000, 0.9048]])

In [102]:
torch.stack((a,b)) # stack, requires equal sizes, so b should be fully 4,3

RuntimeError: stack expects each tensor to be equal size, but got [4, 3] at entry 0 and [4, 1] at entry 1

In [103]:
torch.hstack((a,b))

tensor([[7.0000, 7.0000, 9.0000, 0.9754],
        [8.0000, 4.0000, 8.0000, 0.6188],
        [8.0000, 7.0000, 2.0000, 0.2690],
        [3.0000, 5.0000, 2.0000, 0.9048]])

In [105]:
a.shape, b.shape

(torch.Size([4, 3]), torch.Size([4, 1]))

In [111]:
b= torch.rand(1,3)
b, b.shape

(tensor([[0.0616, 0.5283, 0.6863]]), torch.Size([1, 3]))

In [112]:
torch.vstack((a,b))

tensor([[7.0000, 7.0000, 9.0000],
        [8.0000, 4.0000, 8.0000],
        [8.0000, 7.0000, 2.0000],
        [3.0000, 5.0000, 2.0000],
        [0.0616, 0.5283, 0.6863]])

In [113]:
a, b

(tensor([[7., 7., 9.],
         [8., 4., 8.],
         [8., 7., 2.],
         [3., 5., 2.]]),
 tensor([[0.0616, 0.5283, 0.6863]]))

In [114]:
a * b

tensor([[0.4310, 3.6983, 6.1768],
        [0.4925, 2.1133, 5.4905],
        [0.4925, 3.6983, 1.3726],
        [0.1847, 2.6416, 1.3726]])

In [116]:
a @ b.T

tensor([[10.3060],
        [ 8.0963],
        [ 5.5634],
        [ 4.1989]])

In [118]:
# in place operations
a.add_(5) # _ underscore suffix
a

tensor([[17., 17., 19.],
        [18., 14., 18.],
        [18., 17., 12.],
        [13., 15., 12.]])

### Intro to `torch.autograd`
https://docs.pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html

What notices every operation during forward passes, and then computes gradients on backward prop

In [120]:
import torch
from torchvision.models import resnet18, ResNet18_Weights
model = resnet18(weights=ResNet18_Weights.DEFAULT)
data = torch.rand(1, 3, 64, 64)

labels = torch.rand(1, 1000)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /Users/joancabezas/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:01<00:00, 45.0MB/s]


In [121]:
prediction = model(data) # forward pass

In [122]:
loss = (prediction - labels).sum() 
loss.backward() # backward pass

In [123]:
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

In [124]:
optim.step() # gradient descent

In [142]:
# let autograd now that every op with tensor should be tracked
a = torch.tensor([2.,3.], requires_grad=True)
b = torch.tensor([3.,4.], requires_grad=True)

In [143]:
Q = 3*a**3 - b**2

In [144]:
grad = torch.tensor([1.,1.])
Q.backward(grad) # will execute chain rule, and compute gradients
Q.grad, a.grad, b.grad

  Q.grad, a.grad, b.grad


(None, tensor([36., 81.]), tensor([-6., -8.]))

In [145]:
# when to .requires_grad = False?
# frozer parameters, e.g. on finetunning we freee most params

In [146]:
from torch import nn, optim

model = resnet18(weights=ResNet18_Weights.DEFAULT)

# Freeze all the parameters in the network
for param in model.parameters():
    param.requires_grad = False

In [147]:
model.fc = nn.Linear(512, 10) # replacing the classifier layer

# this would be now the only parameters with gradients

### Neural Networks

`torch.nn` package

https://docs.pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html


- [ ] Requires a deeper understanding of CNN's

In [149]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, input):
        # Convolution layer C1: 1 input image channel, 6 output channels,
        # 5x5 square convolution, it uses RELU activation function, and
        # outputs a Tensor with size (N, 6, 28, 28), where N is the size of the batch
        c1 = F.relu(self.conv1(input))
        # Subsampling layer S2: 2x2 grid, purely functional,
        # this layer does not have any parameter, and outputs a (N, 6, 14, 14) Tensor
        s2 = F.max_pool2d(c1, (2, 2))
        # Convolution layer C3: 6 input channels, 16 output channels,
        # 5x5 square convolution, it uses RELU activation function, and
        # outputs a (N, 16, 10, 10) Tensor
        c3 = F.relu(self.conv2(s2))
        # Subsampling layer S4: 2x2 grid, purely functional,
        # this layer does not have any parameter, and outputs a (N, 16, 5, 5) Tensor
        s4 = F.max_pool2d(c3, 2)
        # Flatten operation: purely functional, outputs a (N, 400) Tensor
        s4 = torch.flatten(s4, 1)
        # Fully connected layer F5: (N, 400) Tensor input,
        # and outputs a (N, 120) Tensor, it uses RELU activation function
        f5 = F.relu(self.fc1(s4))
        # Fully connected layer F6: (N, 120) Tensor input,
        # and outputs a (N, 84) Tensor, it uses RELU activation function
        f6 = F.relu(self.fc2(f5))
        # Gaussian layer OUTPUT: (N, 84) Tensor input, and
        # outputs a (N, 10) Tensor
        output = self.fc3(f6)
        return output


net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [155]:
# TODO: replace above with a simpler nn
# defining forward function

In [152]:
# what is `torch.nn.functional` `nn.Module` `nn.Parameter` `autograd.Function`

In [154]:
# explore .grad_fn
# .grad_fn.next_functions ... and so on
# zero_grad
# access bias, weights from layers
# update weights computing loss manually