In [24]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import torch

In [39]:
# lets build a graph using pytorch tensors
# weight-w and bias-b are parameters that need to be optimized. so we need the gradient of the loss function with respect to them with requires_grad=True
x = torch.ones(5)
y = torch.zeros(3)
w = torch.randn(5,3,requires_grad=True)
b = torch.randn(3, requires_grad=True)
z = torch.matmul(x,w) + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z,y)

In [33]:
z.shape

torch.Size([3])

In [11]:
# variables with respect to which gradients can be calculated will have the method .grad_fn
z.grad_fn, loss.grad_fn, x.grad_fn

(<AddBackward0 at 0x7fd5e59ca0b0>,
 <BinaryCrossEntropyWithLogitsBackward0 at 0x7fd5e59ca0e0>,
 None)

In [13]:
# lets calculate gradients for loss function with loss.backward(), Note that loss can only be created for scalar outputs. to get the values we use the method w.grad and b.grad
loss.backward(), w.grad, b.grad

(None,
 tensor([[0.3108, 0.0524, 0.1636],
         [0.3108, 0.0524, 0.1636],
         [0.3108, 0.0524, 0.1636],
         [0.3108, 0.0524, 0.1636],
         [0.3108, 0.0524, 0.1636]]),
 tensor([0.3108, 0.0524, 0.1636]))

In [14]:
#once the model has been trained, we can ensure that only forward computations happen and requires_grad is disabled through the following
#we can also use this to freeze parameters
with torch.no_grad():
    z=torch.matmul(x,w)+b
print(z.requires_grad)

False


In [15]:
# we can also achieve the same thing with the detach method
z = torch.matmul(x,w) + b
z_det = z.detach()
print(z_det.requires_grad)

False


In [20]:
# in case the loss function is also a vector, torch calculates the jacobian product given by vT.J, where v is the same size as the original tensor
input = torch.eye(4,5, requires_grad=True)
output = (input+1).pow(2).t()
output.backward(torch.ones_like(output), retain_graph=True)
output.backward(torch.ones_like(output), retain_graph=True)
input.grad.zero_()
output.backward(torch.ones_like(output), retain_graph=True)
input, output, input.grad

(tensor([[1., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0.],
         [0., 0., 1., 0., 0.],
         [0., 0., 0., 1., 0.]], requires_grad=True),
 tensor([[4., 1., 1., 1.],
         [1., 4., 1., 1.],
         [1., 1., 4., 1.],
         [1., 1., 1., 4.],
         [1., 1., 1., 1.]], grad_fn=<TBackward0>),
 tensor([[4., 2., 2., 2., 2.],
         [2., 4., 2., 2., 2.],
         [2., 2., 4., 2., 2.],
         [2., 2., 2., 4., 2.]]))

In [40]:
z.backward(torch.ones_like(z).t())
w.grad, b.grad

(tensor([[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]]),
 tensor([1., 1., 1.]))

To understand this properly, read https://pytorch.org/docs/stable/notes/autograd.html 