# AutoGrad
We use autograd to compute the gradients of our weight vector during optimization. PyTorch implements it with automatic differentiation (autodiff).

In [1]:
import torch

In [2]:
# Three uniform random values in [0, 1); requires_grad=True asks autograd to
# record every operation performed on x.
x = torch.rand(size=(3,), requires_grad=True)

In [3]:
# Display x; the repr includes requires_grad=True because it was requested at creation.
x

tensor([0.0220, 0.2922, 0.1679], requires_grad=True)

In [4]:
# Adding a constant to a tracked tensor records an AddBackward0 node as y.grad_fn.
y = x + 2
y

tensor([2.0220, 2.2922, 2.1679], grad_fn=<AddBackward0>)

In [5]:
# Element-wise 2 * y^2; the result stays attached to the graph (MulBackward0).
z = 2 * (y * y)
z

tensor([ 8.1770, 10.5088,  9.3998], grad_fn=<MulBackward0>)

In [6]:
# Reduce z to a scalar so that backward() can be called without arguments.
z = torch.mean(z)
z

tensor(9.3619, grad_fn=<MeanBackward0>)

In [7]:
z.backward() #dz/dx: backpropagates from the scalar z and stores the result in x.grad

In [8]:
x.grad #this is the gradient of z with respect to x, calculated using reverse mode autodiff

tensor([2.6960, 3.0563, 2.8906])

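In the background, `backward()` computes a vector-Jacobian product. For a scalar output the vector is implicitly 1, but for a non-scalar output we cannot call `backward()` without arguments, because the chain rule needs a gradient to propagate back from the output. So if we remove the `mean()` call, `z` is a vector and we need to pass a gradient vector `v` (with the same shape as `z`) to `backward()`, as shown here.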

In [10]:
# Same pipeline as before, but without the final mean(): z stays a 3-element
# vector instead of being reduced to a scalar.
x = torch.rand(3, requires_grad=True)
y = x + 2
z = 2 * y.pow(2)

In [11]:
# Display z; it is a 3-element vector here, not a scalar.
z

tensor([ 8.1038, 14.0323,  9.8810], grad_fn=<MulBackward0>)

In [43]:
# backward() without arguments only works for scalar outputs. z is a vector
# here, so the call raises RuntimeError. Catch and print it so the error is
# still shown but "Restart & Run All" can continue past this cell.
try:
    z.backward()  # this gives error
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: grad can be implicitly created only for scalar outputs

In [16]:
# For a non-scalar z, backward() needs the vector of the vector-Jacobian
# product; it must have the same shape as z.
z.backward(gradient=torch.tensor([0.1, 1.0, 0.001], dtype=torch.float32))

In [18]:
# x.grad now holds the result of backpropagating the vector passed to backward().
print(x.grad)

tensor([8.0517e-01, 1.0595e+01, 8.8909e-03])


Preventing torch from tracking the gradient history. There are three ways to do this: `requires_grad_(False)`, `detach()` and `torch.no_grad()`.

In [24]:
# Three samples from a standard normal distribution, tracked by autograd.
x = torch.randn(size=(3,), requires_grad=True)
x

tensor([-0.3003, -0.1851,  1.6188], requires_grad=True)

In [25]:
# using x.requires_grad_(False)
# The trailing underscore means the flag is changed in place; the call returns
# the same tensor, so it can be printed directly.
print(x.requires_grad_(False))

tensor([-0.3003, -0.1851,  1.6188])


In [26]:
#using detach function
# The requires_grad_(False) call in the previous cell switched tracking off in
# place, which would leave detach() with nothing to strip. Turn tracking back
# on so this cell demonstrates detach() on a tensor that is actually tracked.
x.requires_grad_(True)
# detach() returns a new tensor that is cut off from the graph; x is unchanged.
print(x.detach())

tensor([-0.3003, -0.1851,  1.6188])


In [27]:
# using torch.no_grad()
# Make sure x is tracked before the demo: an earlier cell disabled tracking in
# place, and with an untracked x the no_grad() block would have no visible effect.
x.requires_grad_(True)
with torch.no_grad():
    # Operations inside the block are not recorded, so y has no grad_fn.
    y = x + 2
    print(y)

tensor([1.6997, 1.8149, 3.6188])


Whenever we call the `backward()` function, the gradient for the tensor is accumulated in its `grad` attribute, i.e. the new values are added to whatever is already stored there.

In [31]:
import torch
def showSumGrad(epochs):
    """Show that repeated ``backward()`` calls add up in ``weights.grad``.

    Creates a fresh tensor of four ones with ``requires_grad=True`` and prints
    it. It then runs ``epochs`` backward passes over ``(weights * 3).sum()``,
    printing ``weights.grad`` after each pass. The gradient is never reset, so
    every pass adds another 3 to each entry. A blank separator is printed last.
    """
    weights = torch.ones(4, requires_grad=True)
    print(weights)
    for _ in range(epochs):
        # The derivative of sum(3 * w) is 3 for every element of w.
        loss = torch.sum(weights * 3)
        loss.backward()
        print(weights.grad)
    print('\n')

In [32]:
# Run the demo for 1, 2 and 3 backward passes to watch the gradient grow.
for epochs in (1, 2, 3):
    showSumGrad(epochs)

tensor([1., 1., 1., 1.], requires_grad=True)
tensor([3., 3., 3., 3.])


tensor([1., 1., 1., 1.], requires_grad=True)
tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])


tensor([1., 1., 1., 1.], requires_grad=True)
tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])




To prevent the gradients from being summed across calls, we reset them to zero after each step:

In [41]:
def showSumGrad(epochs):
    """Show that zeroing ``weights.grad`` after each pass stops accumulation.

    Creates a fresh tensor of four ones with ``requires_grad=True`` and prints
    it. For each of the ``epochs`` iterations it prints the scalar output
    ``(weights * 3).sum()``, backpropagates, prints ``weights.grad`` and then
    resets the gradient in place. A blank separator is printed last.
    """
    weights = torch.ones(4, requires_grad=True)
    print(weights)
    for _ in range(epochs):
        loss = torch.sum(weights * 3)
        print(loss)
        loss.backward()
        grad = weights.grad
        print(grad)
        # Zero the gradient in place after each step so that the next
        # backward() starts from zero instead of adding to the old value.
        grad.zero_()
    print('\n')

In [40]:
# Run the demo for 1, 2 and 3 backward passes; the gradient printed after each pass stays the same.
for epochs in (1, 2, 3):
    showSumGrad(epochs)

tensor([1., 1., 1., 1.], requires_grad=True)
tensor(12., grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])


tensor([1., 1., 1., 1.], requires_grad=True)
tensor(12., grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])
tensor(12., grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])


tensor([1., 1., 1., 1.], requires_grad=True)
tensor(12., grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])
tensor(12., grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])
tensor(12., grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])




We have to do the same thing when training a neural network with an optimizer: after each optimization step, the gradients must be reset before the next backward pass.
```python
weights = torch.ones(4, requires_grad=True)
# SGD expects an iterable of parameters (or parameter-group dicts), not a bare
# tensor, so the weights are wrapped in a list.
optimizer = torch.optim.SGD([weights], lr=0.01)
# Compute gradients first; step() only updates parameters that have a grad.
loss = (weights * 3).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad() # reset the gradients after each step so that they do not accumulate
```