In [1]:
import torch

### Requires grad: True

The requires_grad attribute is set to True when we need to compute gradients with respect to the tensor, for example, for backpropagation. The requires_grad attribute is set to False by default.

In [15]:
x = torch.rand(2, 2, requires_grad=True)
print(x)

tensor([[0.6868, 0.5720],
        [0.0277, 0.7773]], requires_grad=True)


In [16]:
y = x + 2
print(y)

tensor([[2.6868, 2.5720],
        [2.0277, 2.7773]], grad_fn=<AddBackward0>)


In [19]:
z = y * y * 2
#z = z.mean()
print(z)

tensor([[14.4383, 13.2300],
        [ 8.2234, 15.4272]], grad_fn=<MulBackward0>)


In [21]:
v = torch.tensor([[1, 0.1], [0.01, 0.001]], dtype=torch.float32)

z.backward(v)
print(x.grad)

tensor([[13.4342,  3.6007],
        [ 2.1088,  2.7884]])


### Simple example

Values are accumulated (summed up) 

In [24]:
weights = torch.ones(4, requires_grad=True)

for epoch in range(5):

    # dummy operation
    model_ouput = (weights * 3).sum()

    model_ouput.backward()

    print(weights.grad)

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])
tensor([12., 12., 12., 12.])
tensor([15., 15., 15., 15.])


We must empty the gradients before each iteration, otherwise the gradients will be accumulated to existing gradients.

In [25]:
weights = torch.ones(4, requires_grad=True)

for epoch in range(5):

    # dummy operation
    model_ouput = (weights * 3).sum()

    model_ouput.backward()

    print(weights.grad)

    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


### Optimizer

In [30]:
weigths = torch.ones(4, requires_grad=True)

# stochastic gradient descent (SGD)
optimizer = torch.optim.SGD([weights], lr=0.01)

optimizer.step()  # update weights

# zero the gradient buffers
optimizer.zero_grad()
