In [1]:
import torch

In [2]:
# requires_grad=True is essential here: it tells autograd to track every
# operation on x so that gradients with respect to x can be computed later.
x = torch.randn(3, requires_grad=True)
print(x)

tensor([ 0.5887, -1.4111,  1.0695], requires_grad=True)


In [3]:
# Build a small computation on top of x. Every intermediate result carries a
# grad_fn naming the operation that produced it (visible in the printed repr).
# print("", t, sep="\n") emits one blank line followed by the tensor.
y = x + 2
print("", y, sep="\n")

z = y * y * 2
print("", z, sep="\n")

# Reduce to a single scalar so backward() can later be called without arguments.
z = z.mean()
print("", z, sep="\n")


tensor([2.5887, 0.5889, 3.0695], grad_fn=<AddBackward0>)

tensor([13.4025,  0.6935, 18.8436], grad_fn=<MulBackward0>)

tensor(10.9799, grad_fn=<MeanBackward0>)


In [4]:
# Backpropagate from the scalar z back to the leaf tensor x.
z.backward()  # computes dz/dx and stores it in x.grad
print(x.grad) 

tensor([3.4516, 0.7851, 4.0927])


In [5]:
# Bare last expression: the notebook displays the gradient that z.backward() stored on x.
x.grad

tensor([3.4516, 0.7851, 4.0927])

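#### `backward()` without arguments only works when the output is a scalar. If the output tensor has more than one element, we have to pass `backward()` a `gradient` tensor of the same shape as the output; the result stored in `x.grad` is then the vector-Jacobian product.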

In [8]:
# requires_grad=True is essential: without it autograd will not track x.
x = torch.randn(3, requires_grad=True)
print(x)

# print("", t, sep="\n") emits one blank line followed by the tensor.
y = x + 2
print("", y, sep="\n")

# z is deliberately left as a 3-element tensor (no .mean()), i.e. non-scalar.
z = y * y * 2
print("", z, sep="\n")

# For a non-scalar output, backward() needs a tensor of the same shape as z.
# Each entry of v weights the corresponding element of z, so x.grad ends up
# holding the vector-Jacobian product (here v * 4 * y), not a plain dz/dx.
v = torch.tensor([0.1, 1.0, 0.001], dtype=torch.float32)
z.backward(gradient=v)
print(x.grad)

tensor([0.9884, 0.9265, 0.3388], requires_grad=True)

tensor([2.9884, 2.9265, 2.3388], grad_fn=<AddBackward0>)

tensor([17.8607, 17.1286, 10.9400], grad_fn=<MulBackward0>)
tensor([1.1953e+00, 1.1706e+01, 9.3552e-03])


### Preventing PyTorch from storing gradient history

1. `x.requires_grad_(False)`
2. `x.detach()`
3. `with torch.no_grad():`

In [9]:
x = torch.randn(3, requires_grad=True)
print(x)

# Option 1: switch tracking off on x itself. The trailing underscore marks an
# in-place operation, so the second print shows x without requires_grad=True.
x.requires_grad_(False)
print(x)

tensor([ 0.2794, -0.1959, -0.1313], requires_grad=True)
tensor([ 0.2794, -0.1959, -0.1313])


In [11]:
x = torch.randn(3, requires_grad=True)
print(x)

# Option 2: detach() returns a new tensor with the same values that does not
# require gradients; x itself keeps requires_grad=True.
y = x.detach()
print(y)

tensor([0.1599, 1.2845, 1.1861], requires_grad=True)
tensor([0.1599, 1.2845, 1.1861])


In [12]:
x = torch.randn(3, requires_grad=True)
print(x)

# Option 3: operations executed inside the no_grad() context are not recorded,
# so y comes out without a grad_fn even though x requires gradients.
with torch.no_grad():
    y = x + 2
print(y)

tensor([ 1.3507,  0.8644, -1.4195], requires_grad=True)
tensor([3.3507, 2.8644, 0.5805])


Before the next optimization step, we have to reset the gradients with `weights.grad.zero_()`.

reason from Groq:
When you're training a model, each call to `backward()` adds the newly computed gradients to whatever is already stored in `weights.grad` instead of overwriting it. If you don't reset the gradients, they accumulate across iterations and can cause the model to diverge or converge to a suboptimal solution. By setting the gradients to zero, you ensure that each iteration starts from a clean slate.

In [17]:
# Run the same three-step loop twice: first letting the gradients pile up,
# then clearing them after every step, so the two printouts can be compared.
for zero_after_step in (False, True):
    if zero_after_step:
        print()  # blank line separating the two runs

    weights = torch.ones(4, requires_grad=True)
    for epoch in range(3):
        model_output = (weights * 3).sum()

        # backward() adds d(model_output)/d(weights) to weights.grad rather
        # than overwriting it, which is why the first run prints 3, 6, 9.
        model_output.backward()
        print(weights.grad)

        if zero_after_step:
            # Reset the stored gradient in place so the next step starts from zero.
            weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


#### Summary

1. If you want to compute the gradient, then use requires_grad = True while creating a tensor and use tensor.backward() to compute the gradient.
2. If you want to prevent PyTorch from storing gradient history, use `x.requires_grad_(False)`, `x.detach()`, or `with torch.no_grad():`.
3. If you don't reset the gradients, they can accumulate and cause the model to diverge or converge to a suboptimal solution. By setting the gradients to zero, you ensure that the model starts from a clean slate at each iteration. Use weights.grad.zero_()
