In [9]:
import torch

In [22]:
# requires_grad=True asks autograd to record the operations applied to x so
# that derivatives with respect to x can be computed later.
x = torch.tensor(3.0, requires_grad=True)
y = x.pow(2)

In [23]:
# Inspect both tensors: y carries a grad_fn because it was computed from x.
x, y

(tensor(3., requires_grad=True), tensor(9., grad_fn=<PowBackward0>))

In [24]:
# Backpropagate from y to fill x.grad with dy/dx = 2*x (6 at x = 3).
# By default this can be done only once per graph: a second backward() through
# the same graph raises RuntimeError unless retain_graph=True was passed.
y.backward()
x.grad

tensor(6.)

In [34]:
# Two-step chain x -> y = x^2 -> z = sin(y), so that differentiating z with
# respect to x has to go through the chain rule.
x = torch.tensor(3.0, requires_grad=True)
y = x.pow(2)
z = y.sin()

In [35]:
# Inspect the leaf and both intermediate results together with their grad_fn.
x, y, z

(tensor(3., requires_grad=True),
 tensor(9., grad_fn=<PowBackward0>),
 tensor(0.4121, grad_fn=<SinBackward0>))

In [36]:
# Backpropagate from z all the way to the leaf x.
# Chain rule: dz/dx = cos(x^2) * 2x = cos(9) * 6 at x = 3.
z.backward()
dz_dx = x.grad
dz_dx

tensor(-5.4668)

##### By default you can call .backward() only once per computation graph in PyTorch, because the intermediate values saved for the backward pass are freed as soon as the gradients have been computed. This is done to save memory during training. The graph is built dynamically during the forward pass and kept in memory until .backward() is called. Once the backward pass is completed, the graph is discarded unless you explicitly request to retain it with retain_graph=True.

In [37]:
# y belongs to the same graph that z.backward() has already traversed, so
# going through it again is expected to raise RuntimeError.
try:
    y.backward()
except RuntimeError as err:
    print(f"Runtime error:{err}")
else:
    # Only reached if the backward pass unexpectedly succeeds.
    dy_dx = x.grad

Runtime error:Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.


In [38]:
def binary_cross_entropy_loss(prediction, target):
    """Compute the element-wise (unreduced) binary cross-entropy.

    Args:
        prediction: Tensor of predicted probabilities. Values are clamped into
            ``[epsilon, 1 - epsilon]`` before the logarithms are taken.
        target: Tensor of target values (typically 0 or 1) that broadcasts
            against ``prediction``.

    Returns:
        Tensor holding ``-(target * log(p) + (1 - target) * log(1 - p))``.
    """
    # A fixed 1e-8 is smaller than float32 machine epsilon, so ``1 - 1e-8``
    # rounds to exactly 1.0 and the upper clamp would not prevent log(0):
    # a prediction of 1.0 then produces inf (target 0) or nan (target 1).
    # Use the larger of 1e-8 and the epsilon of the working dtype instead.
    if prediction.is_floating_point():
        working_dtype = prediction.dtype
    else:
        working_dtype = torch.get_default_dtype()
    epsilon = max(1e-8, torch.finfo(working_dtype).eps)
    prediction = torch.clamp(prediction, epsilon, 1 - epsilon)
    return -(target * torch.log(prediction) + (1 - target) * torch.log(1 - prediction))

In [None]:


# One training example: input x and the label y it is compared against.
# Neither is created with requires_grad, so no gradients are tracked for them.
x, y = torch.tensor(6.7), torch.tensor(0.0)

# Parameters: the independent variables with respect to which the gradients
# of the loss are calculated.
w = torch.tensor(1.0, requires_grad=True)
b = torch.tensor(0.0, requires_grad=True)


In [41]:
# Forward pass: linear score -> sigmoid probability -> binary cross-entropy
# against the label y.
z = w * x + b
y_pred = z.sigmoid()
loss = binary_cross_entropy_loss(y_pred, y)

z, y_pred, loss

(tensor(6.7000, grad_fn=<AddBackward0>),
 tensor(0.9988, grad_fn=<SigmoidBackward0>),
 tensor(6.7012, grad_fn=<NegBackward0>))

In [42]:
# Backward pass: fills .grad on the leaf parameters w and b.
loss.backward()

# For sigmoid followed by BCE, dL/dz = y_pred - y, so
# dL/dw = x * (y_pred - y) and dL/db = y_pred - y.
dl_dw, dl_db = w.grad, b.grad

dl_dw, dl_db

(tensor(6.6918), tensor(0.9988))

### Vector input tensors for autograd

In [43]:
# backward() needs a scalar to start from, so the element-wise squares of the
# vector are reduced with mean().
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x.pow(2).mean()

x, y

(tensor([1., 2., 3.], requires_grad=True),
 tensor(4.6667, grad_fn=<MeanBackward0>))

In [44]:
# y = mean(x^2) over 3 elements, so dy/dx_i = 2 * x_i / 3 for each element.
y.backward()
dy_dx = x.grad
dy_dx

tensor([0.6667, 1.3333, 2.0000])

### Clearing grad

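##### If you want to call .backward() more than once on the same graph (e.g., to backpropagate several losses that share one forward pass), you can use the retain_graph=True argument. This tells PyTorch to keep the computation graph in memory after the backward pass. Note that gradient accumulation across separate forward passes does not need it, because every forward pass builds a fresh graph.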

In [None]:

# Leaf tensor and a simple computation on it: y = x^2.
x = torch.tensor(2.0, requires_grad=True)
y = x.pow(2)

# First backward pass: keep the graph alive so it can be traversed again.
y.backward(retain_graph=True)
print(x.grad)  # tensor(4.)

# Second backward pass: the graph is not retained this time.
# The new gradient is added on top of the existing x.grad.
y.backward()
print(x.grad)  # tensor(8.)


tensor(4.)
tensor(8.)


##### Gradients Are Accumulated:

By default, gradients are added to the .grad attribute. This is why the gradient values in the second call above are cumulative.

Use x.grad.zero_() to reset gradients if needed.


##### retain_graph=True:

Keeping the computation graph can lead to increased memory usage. Use it cautiously to avoid memory leaks or inefficiency.


##### Typical Use Case:

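In most training scenarios, you only call .backward() once per forward pass. Retaining the graph is generally not needed except for specific tasks, such as backpropagating several outputs or losses through the same forward pass. Higher-order gradients are a separate case: they additionally require create_graph=True, so that the backward pass itself is recorded.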

In [None]:


# Leaf tensor tracked by autograd and a computation on it: y = x^2.
x = torch.tensor(2.0, requires_grad=True)
y = x.pow(2)

# First backward pass, keeping the graph for a second traversal.
y.backward(retain_graph=True)
print("Gradient after first backward pass:", x.grad)  # tensor(4.)

# Clear the stored gradient in place so the next pass does not accumulate.
x.grad.zero_()

# Second backward pass (graph not retained): starts again from zero.
y.backward()
print("Gradient after second backward pass:", x.grad)  # tensor(4.)


Gradient after first backward pass: tensor(4.)
Gradient after second backward pass: tensor(4.)


### Disabling gradient tracking

##### Disabling gradient tracking in PyTorch is useful when you want to perform operations on tensors without tracking their computation history or consuming memory for gradient calculations. This is commonly used in scenarios like inference, evaluation, or when you need to manipulate tensors without involving gradients.

In [56]:
# Baseline with tracking enabled: y = x^2 carries a grad_fn, so backward() works.
x = torch.tensor(2.0, requires_grad=True)
y = x.pow(2)
print(x, y)

# dy/dx = 2x = 4 at x = 2.
y.backward()
dy_dx = x.grad
dy_dx

tensor(2., requires_grad=True) tensor(4., grad_fn=<PowBackward0>)


tensor(4.)

##### Method 1: using requires_grad_(False) on the original tensor to not keep track of the gradients

In [59]:
# Turn tracking off on x in place; a result computed from it afterwards
# carries no grad_fn.
x.requires_grad_(False)
y = x.pow(2)
x, y


(tensor(2.), tensor(4.))

In [None]:
# x no longer tracks gradients, so y has no grad_fn and calling backward()
# on it raises RuntimeError.
try:
    y.backward()
except RuntimeError as err:
    print(err)

element 0 of tensors does not require grad and does not have a grad_fn


##### Method 2: Using detach() to create a new tensor that shares the same data as the original (it is not a copy) but is excluded from gradient tracking

In [67]:
# x is tracked by autograd; z is obtained from it with detach() and is not.
x = torch.tensor(2.0, requires_grad=True)
z = x.detach()
print(x, z)

# Same computation on both tensors: only y1 is attached to a graph.
y1, y2 = x.pow(3), z.pow(3)
print(y1, y2)

# dy1/dx = 3 * x^2 = 12 at x = 2.
y1.backward()
print(x.grad)

# y2 has no grad_fn, so backward() on it raises RuntimeError.
try:
    y2.backward()
    print(z.grad)
except RuntimeError:
    print(f"Runtime error because there is no gradient tracking for z")




tensor(2., requires_grad=True) tensor(2.)
tensor(8., grad_fn=<PowBackward0>) tensor(8.)
tensor(12.)
Runtime error because there is no gradient tracking for z


##### Method 3: Using torch.no_grad()

In [77]:
x = torch.tensor(2.0, requires_grad=True)

# Anything computed inside torch.no_grad() is not recorded by autograd.
with torch.no_grad():
    y = x.pow(2)

# Computed outside the context manager, so z stays attached to x.
z = x.log()

print(x, y)

# y has no grad_fn, so backward() on it raises RuntimeError.
try:
    y.backward()
    dy_dx = x.grad
    print(dy_dx)
except RuntimeError:
    print(f"Calculated y by applying torch.no_grad(), so there is no tracking of gradient of y wrt. x")

# z was computed without torch.no_grad(), so its gradient with respect to x
# is tracked: dz/dx = 1/x = 0.5 at x = 2.
z.backward()
dz_dx = x.grad
dz_dx
    

tensor(2., requires_grad=True) tensor(4.)
Calculated y by applying torch.no_grad(), so there is no tracking of gradient of y wrt. x


tensor(0.5000)