In [1]:
import torch

In [2]:
# Two leaf tensors that autograd will track.
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = torch.tensor([6.0, 4.0], requires_grad=True)

# Element-wise function Q = 3a^3 - b^2 (a vector, not a scalar).
Q = 3 * a.pow(3) - b.pow(2)

# Q is non-scalar, so backward() needs an explicit upstream gradient dQ/dQ = 1.
external_grad = torch.ones(2)
Q.backward(gradient=external_grad)

# Compare autograd's result with the analytic derivatives:
#   dQ/da = 9a^2   and   dQ/db = -2b
print(torch.eq(9 * a**2, a.grad))
print(torch.eq(-2 * b, b.grad))

tensor([True, True])
tensor([True, True])


### Linear Jacobian

In [5]:
# Seed the global RNG so the sampled input and weights are reproducible.
torch.manual_seed(0)

# x: a single sample with 4 integer-valued float features drawn from [0, 255)
# (randint's `high` bound is exclusive).
x = torch.randint(low=0, high=255, size=(1, 4), dtype=torch.float32, requires_grad=True)
# w: 4x2 weight matrix with entries drawn uniformly from [0, 1).
w = torch.rand((4, 2), requires_grad=True)

print(x)
print(w)

# Linear map; y has shape (1, 2).
y = x @ w

# Back-propagate an all-ones upstream gradient, so x.grad = ones(1, 2) @ w.T,
# i.e. the row sums of w.
# retain_graph=True keeps the tensors saved for backward alive; without it the
# graph is freed here and any later differentiation of this same `y`
# (e.g. a torch.autograd.grad call in a following cell) raises
# "Trying to backward through the graph a second time".
y.backward(gradient=torch.ones(1, 2), retain_graph=True)
x.grad

tensor([[194.,  54., 233., 135.]], requires_grad=True)
tensor([[0.3074, 0.6341],
        [0.4901, 0.8964],
        [0.4556, 0.6323],
        [0.3489, 0.4017]], requires_grad=True)


In [6]:
# Assemble the Jacobian dy/dx row by row: row i holds d y_i / d x.
# Relies on `x` and `y` already being defined and connected by an autograd graph.
num_outputs, num_inputs = y.size(1), x.size(1)
jacobian = torch.zeros(num_outputs, num_inputs)

for i in range(num_outputs):
    # One-hot upstream gradient that picks out output component i.
    selector = torch.zeros_like(y)
    selector[:, i] = 1.0

    # The vector-Jacobian product with a one-hot vector is exactly row i.
    # retain_graph=True keeps the graph alive for the remaining components.
    (row_grad,) = torch.autograd.grad(
        outputs=y,
        inputs=x,
        grad_outputs=selector,
        retain_graph=True,
    )

    # Store d y_i / d x as the i-th row of the Jacobian.
    jacobian[i, :] = row_grad

print("Jacobian Matrix:", jacobian, sep="\n")

Jacobian Matrix:
tensor([[0.3074, 0.4901, 0.4556, 0.3489],
        [0.6341, 0.8964, 0.6323, 0.4017]])


### Sigmoid gradient

In [None]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), built from differentiable torch ops."""
    denominator = 1 + torch.exp(-x)
    return 1 / denominator


# Un-normalized input: integer-valued floats drawn from [0, 255).
x = torch.randint(low=0, high=255, size=(3, 3), dtype=torch.float32, requires_grad=True)

y = sigmoid(x)
# Non-scalar output, so pass an all-ones upstream gradient; x.grad is then the
# element-wise derivative of the sigmoid at each entry of x.
y.backward(gradient=torch.ones_like(y))

x.grad

tensor([[2.3195e-16, 5.1091e-12, 8.3153e-07],
        [2.2603e-06, 0.0000e+00, 0.0000e+00],
        [4.4738e-38, 4.3596e-28, 2.0612e-09]])

In [None]:
# Sanity check: one row of 12 standard-normal samples; display its sample
# mean and standard deviation along the feature dimension.
x = torch.randn(size=(1, 12), requires_grad=True)
torch.mean(x, dim=1), torch.std(x, dim=1)

(tensor([-0.1529], grad_fn=<MeanBackward1>),
 tensor([0.6644], grad_fn=<StdBackward0>))

In [None]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), built from differentiable torch ops."""
    denominator = 1 + torch.exp(-x)
    return 1 / denominator


# Standard-normal input: values concentrated around zero.
x = torch.randn(size=(3, 3), dtype=torch.float32, requires_grad=True)

y = sigmoid(x)
# Non-scalar output, so pass an all-ones upstream gradient; x.grad is then the
# element-wise derivative of the sigmoid at each entry of x.
y.backward(gradient=torch.ones_like(y))

x.grad

tensor([[0.1981, 0.2500, 0.1559],
        [0.2416, 0.2105, 0.2472],
        [0.2082, 0.2500, 0.1983]])

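An activation function can suffer from vanishing or exploding gradients when its input is not normalized, i.e. not approximately standard-normally distributed. Compare the two sigmoid experiments above: with raw inputs in the range 0–254 the gradients collapse to (nearly) zero, whereas with standard-normal inputs they stay close to the sigmoid's maximum slope of 0.25.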