# PyTorch Basics
- automatic derivatives with tensors
- tensors as neural network abstractions: `torch.nn`
- optimizers: `torch.optim`

## Package imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# import torchvision

In [2]:
# from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np

# from IPython.core.debugger import set_trace

In [3]:
from numpy.linalg import inv
from numpy.linalg import multi_dot as mdot

# Automatic differentiation with `autograd`
Since `v0.4`, `Tensor` can record gradients directly if you tell it to do so, e.g. `torch.ones(3, requires_grad=True)`.

Ref:
- https://pytorch.org/docs/stable/autograd.html
- https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html

We rarely use `torch.autograd` directly.
Pretty much everything is part of `torch.Tensor`.
Simply add `requires_grad=True` to the tensors you want to calculate the gradients for.
`nn.Module` tracks gradients automatically.

In [4]:
from torch import autograd

In [5]:
x = torch.tensor(2.)
x

tensor(2.)

In [6]:
# requires_grad = True -> tracks all operations on the tensor
x = torch.tensor(2., requires_grad=True) # first show with False, then True
x

tensor(2., requires_grad=True)

In [7]:
print(x.requires_grad)

True


In [8]:
print(x.grad)

None


In [9]:
y = x ** 2

print("Grad of x:", x.grad)

Grad of x: None


In [10]:
# y was created as a result of an operation, so it has a grad_fn attribute.
# grad_fn: references a Function that has created the Tensor
y

tensor(4., grad_fn=<PowBackward0>)

In [11]:
# Let's compute the gradients with backpropagation
# When we finish our computation we can call .backward() and have all the gradients computed automatically.
# The gradient for this tensor will be accumulated into .grad attribute.
# It is the partial derivative of the function w.r.t. the tensor

y = x ** 2
y.backward()

print("Grad of x:", x.grad)

Grad of x: tensor(4.)


In [12]:
# What is going to happen here?
# x = torch.tensor(2.)
# x.backward()

In [13]:
# Don't record the gradient - useful for inference

# Stop a tensor from tracking history:
# For example during our training loop when we want to update our weights
# then this update operation should not be part of the gradient computation
# - x.requires_grad_(False)
# - x.detach()
# - wrap in 'with torch.no_grad():'

# .requires_grad_(...) changes an existing flag in-place.

params = torch.tensor(2., requires_grad=True)

with torch.no_grad():
    # Operate on `params` and inspect the *result* of the operation:
    # a leaf tensor's own grad_fn is always None, so printing it would not
    # show whether history tracking was actually switched off.
    y = params * params
    print(y.grad_fn)

None


## Gradient with intermediate tensor

In [14]:
a = torch.tensor(2., requires_grad=True)

In [15]:
b, c = a**2, a**3

In [16]:
d = b + c

In [17]:
d.backward() # b.backward() and c.backward() also work

In [18]:
a.grad # b.grad and c.grad stay None: .grad is an attribute (not a method) and b, c are non-leaf tensors

tensor(16.)

In [19]:
# Rebuild the same small graph from a fresh leaf tensor: d = a^2 + a^3.
a = torch.tensor(2.0, requires_grad=True)

b = a ** 2
c = a ** 3
d = b + c

In [20]:
handle = b.register_hook(print) # prints the gradient with respect to the tensor

In [21]:
d.backward() # gradient of d w.r.t. b is printed as output

tensor(1.)


In [22]:
handle

<torch.utils.hooks.RemovableHandle at 0x7f41ae12a610>

In [23]:
a.grad

tensor(16.)

## Non-scalar output

In [24]:
# Model with non-scalar output:
# If a Tensor is non-scalar (more than one element), backward() needs a
# `gradient` argument: a tensor of matching shape.
# It is the vector used in the vector-Jacobian product.

torch.manual_seed(0)

x = torch.randn(3, requires_grad=True)

In [25]:
x

tensor([ 1.5410, -0.2934, -2.1788], requires_grad=True)

In [26]:
y = x*2

In [27]:
y

tensor([ 3.0820, -0.5869, -4.3576], grad_fn=<MulBackward0>)

In [28]:
for _ in range(5):
    y = y * 2

In [29]:
y

tensor([  98.6237,  -18.7794, -139.4425], grad_fn=<MulBackward0>)

In [30]:
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)

y.backward(v)

print(x.grad)

tensor([6.4000e+00, 6.4000e+01, 6.4000e-03])


### scalar leaf tensor

In [31]:
a = torch.tensor(2., requires_grad=True)

In [32]:
b = torch.tensor([1., 2., 3.], requires_grad=True)

In [33]:
c = a*b

In [34]:
c.backward(torch.tensor([2., 2., 2.]))

In [35]:
a.grad # jacobian only has a row --> row vector * column vector

tensor(12.)

In [36]:
b.grad # jacobian is a matrix

tensor([4., 4., 4.])

`nn.Module` and `nn.Parameter` keep track of gradients for you.

## `torch.nn`
The neural network module contains many different layers.

In [37]:
lin = nn.Linear(2, 1, bias=True)
lin.weight

Parameter containing:
tensor([[-0.0628,  0.1871]], requires_grad=True)

In [38]:
type(lin.weight)

torch.nn.parameter.Parameter

In [39]:
# Parameter is a Tensor subclass. Check against torch.Tensor rather than the
# legacy torch.FloatTensor type, which is tied to CPU float32 tensors.
isinstance(lin.weight, torch.Tensor)

True

In [40]:
lin_reg = nn.Linear(1, 1, bias=True)
lin_reg

Linear(in_features=1, out_features=1, bias=True)

In [41]:
nn.Conv2d

torch.nn.modules.conv.Conv2d

In [42]:
nn.Conv3d

torch.nn.modules.conv.Conv3d

In [43]:
nn.BatchNorm2d

torch.nn.modules.batchnorm.BatchNorm2d

### Activations

In [44]:
nn.ReLU

torch.nn.modules.activation.ReLU

In [45]:
nn.Sigmoid

torch.nn.modules.activation.Sigmoid

### Losses

In [46]:
nn.Softmax

torch.nn.modules.activation.Softmax

In [47]:
nn.CrossEntropyLoss

torch.nn.modules.loss.CrossEntropyLoss

In [48]:
nn.BCELoss

torch.nn.modules.loss.BCELoss

In [49]:
nn.MSELoss

torch.nn.modules.loss.MSELoss

### Functional (stateless) alternatives

In [50]:
F.mse_loss

<function torch.nn.functional.mse_loss>

In [51]:
F.relu

<function torch.nn.functional.relu>

In [52]:
F.relu6

<function torch.nn.functional.relu6>

## `torch.optim`

In [53]:
optim.SGD

torch.optim.sgd.SGD

In [54]:
optim.Adam

torch.optim.adam.Adam

In [55]:
optim.AdamW

torch.optim.adamw.AdamW