# AUTOMATIC DIFFERENTIATION WITH `TORCH.AUTOGRAD`

https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html

In [1]:
import torch

In [2]:
x = torch.ones(5)
x

tensor([1., 1., 1., 1., 1.])

In [4]:
y = torch.zeros(3)
y

tensor([0., 0., 0.])

In [5]:
w = torch.randn(5, 3, requires_grad=True)
w

tensor([[ 0.4103,  0.1622, -0.9526],
        [ 0.2691,  2.0971,  0.5406],
        [-0.7844,  0.7690, -2.3479],
        [ 0.1660, -0.7140,  0.5251],
        [ 1.7142, -1.4625,  1.7677]], requires_grad=True)

In [13]:
w.shape

torch.Size([5, 3])

In [6]:
b = torch.randn(3, requires_grad=True)
b

tensor([ 0.6557, -1.4608, -0.4922], requires_grad=True)

In [8]:
z = torch.matmul(x, w) + b
z 

tensor([ 2.4308, -0.6091, -0.9593], grad_fn=<AddBackward0>)

In [9]:
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
loss 

tensor(1.0913, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

## Tensors, Functions and Computational graph

A function that we apply to tensors to construct computational graph is in fact an object of class Function. 

This object knows how to compute the function in the forward direction, and also how to compute its derivative during the backward propagation step. 

A reference to the backward propagation function is stored in grad_fn property of a tensor. 

In [14]:
print('gradient func for z =', z.grad_fn)
print('gradient func for loss =', loss.grad_fn)

gradient func for z = <AddBackward0 object at 0x7f9081512a90>
gradient func for loss = <BinaryCrossEntropyWithLogitsBackward object at 0x7f9060beca60>


## Computing Gradients

In [17]:
loss.backward(torch.ones_like(loss))

In [18]:
print(w.grad)
print(b.grad)

tensor([[0.3064, 0.1174, 0.0923],
        [0.3064, 0.1174, 0.0923],
        [0.3064, 0.1174, 0.0923],
        [0.3064, 0.1174, 0.0923],
        [0.3064, 0.1174, 0.0923]])
tensor([0.3064, 0.1174, 0.0923])


Note:

- We can only obtain the grad properties for the leaf nodes of the computational graph, which have requires_grad property set to True. For all other nodes in our graph, gradients will not be available.

- We can only perform gradient calculations using `backward` once on a given graph, for performance reasons. If we need to do several `backward` calls on the same graph, we need to pass `retain_graph=True` to the `backward` call.

## Disabling gradient tracking

In [22]:
z = torch.matmul(x, w) + b
print(z.requires_grad)

with torch.no_grad():
    z = torch.matmul(x, w) + b
    
print(z.requires_grad)

True
False


In [23]:
z = torch.matmul(x, w) + b
z_det = z.detach()
print(z_det.requires_grad)

False


## Tensor gradients and Jacobian products

In [55]:
x = torch.eye(5, requires_grad=True)
x 

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]], requires_grad=True)

In [38]:
y = (x + 1).pow(2) 
y 

tensor([[4., 1., 1., 1., 1.],
        [1., 4., 1., 1., 1.],
        [1., 1., 4., 1., 1.],
        [1., 1., 1., 4., 1.],
        [1., 1., 1., 1., 4.]], grad_fn=<PowBackward0>)

In [39]:
y.backward(torch.ones_like(x), retain_graph=True)

In [40]:
print('first call\n', x.grad)

first call
 tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.],
        [2., 2., 2., 2., 4.]])


In [41]:
y.backward(torch.ones_like(x), retain_graph=True)

In [42]:
print('second call\n', x.grad)

second call
 tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.],
        [4., 4., 4., 4., 8.]])


In [43]:
x.grad.zero_()
y.backward(torch.ones_like(x), retain_graph=True)
print('call after zeroing gradients\n', x.grad)

call after zeroing gradients
 tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.],
        [2., 2., 2., 2., 4.]])


In [15]:
x = torch.eye(5, requires_grad=True)

def func_y(x):
    y = (x + 1).pow(2)
    return y

y = func_y(x)

J = torch.autograd.functional.jacobian(func_y, x)
J.shape

torch.Size([5, 5, 5, 5])

In [17]:
x_grad = torch.matmul( 
    torch.transpose(J.view(5, -1), 0, 1), 
    torch.ones_like(y)
)

print(x_grad.shape)

torch.Size([125, 5])


In [7]:
J.view(5, -1).shape

torch.Size([5, 125])