In [13]:
# Automatic differentiation

# A simple Example
# Let y = 2 * x^T * x

import torch

# Build the input vector [0., 1., 2., 3.] with an explicit start and end.
x = torch.arange(0.0, 4.0)
x

tensor([0., 1., 2., 3.])

In [14]:
# It is important that we do not allocate new memory every time we take a
# derivative with respect to a parameter, because we will often update the
# same parameters thousands or millions of times and could quickly run out
# of memory.

# Mark x in place as a tensor for which gradients should be tracked.
x.requires_grad_(True)
# Alternatively, initialize x with 'x = torch.arange(4.0, requires_grad=True)'.
x.grad   # Default value is None (no backward pass has been run yet)

In [15]:
# y = 2 * x^T * x, i.e. twice the dot product of x with itself (a scalar).
y = 2 * x.dot(x)
y

tensor(28., grad_fn=<MulBackward0>)

In [16]:
# PyTorch automatically calculates the gradient of y with respect to each
# component of x via backpropagation; the result is read from x.grad.

y.backward()
x.grad

tensor([ 0.,  4.,  8., 12.])

<h5>Differentiating y = 2 * x^T * x with respect to x gives the gradient 4 * x</h5>

In [17]:
# Element-wise check that the computed gradient matches the analytic 4 * x.
torch.eq(x.grad, 4 * x)

tensor([True, True, True, True])

In [20]:
# PyTorch accumulates gradients by default, so reset x.grad in place
# before differentiating a new function of x.
x.grad.zero_()
y = torch.sum(x)  # y = x1 + x2 + x3 + x4
y.backward()
x.grad  # every partial derivative of the sum is 1

tensor([1., 1., 1., 1.])

In [22]:
# Backward for non-scalar variables: y is a vector here, so it is reduced
# to a scalar with a sum before back-propagating.

x.grad.zero_()
y = torch.mul(x, x)
torch.sum(y).backward()
x.grad

tensor([0., 2., 4., 6.])

In [23]:
# Detaching Computation
# u carries the same values as y but is detached from the graph, so the
# backward pass below treats it as a constant instead of a function of x.

x.grad.zero_()   # clear the gradient accumulated by earlier cells
y = x * x
u = y.detach()   # treating u as a constant
z = u * x

z.sum().backward()
x.grad == u   # with u constant, d(u * x)/dx = u

tensor([True, True, True, True])

In [24]:
# y itself (computed as x * x in the previous cell) was not detached, so
# back-propagating through y still reaches x and gives dy/dx = 2 * x.
x.grad.zero_()   # clear the gradient left over from the detach example
y.sum().backward()
x.grad == 2 * x

tensor([True, True, True, True])