In [51]:
import torch

# Leaf tensors tracked by autograd: s (3x4) and b (1 element); X is a plain input.
s = torch.randn(3, 4).requires_grad_(True)
X = torch.randn(4, 3)
# Intermediate product and its trace, kept under their own names for inspection.
S = torch.matmul(s, X)
t = S.trace()
b = torch.randn(1).requires_grad_(True)
# Scalar-like objective f = trace(s @ X) + b, built from a fresh matmul.
f = torch.matmul(s, X).trace() + b


In [52]:
# Inspect s.grad before any backward() call; the recorded output below is None.
print(s.grad)

None


In [53]:
# Backpropagate from f, then display the gradient that is now stored on s.
f.backward()
s.grad

tensor([[-0.4361, -0.5809, -0.6347, -0.3099],
        [ 0.1746, -0.6219,  1.5531,  1.5621],
        [-1.1245,  0.2617,  0.8457, -0.8959]])

In [56]:
# State of s and its gradient before the update.
print(s)
print(s.grad)
# One plain SGD step over the two leaf tensors, using the gradients already stored on them.
optimizer = torch.optim.SGD(params=[s, b], lr=0.01)
optimizer.step()
# Same view after the update, for comparison with the pair printed above.
print(s)
print(s.grad)

tensor([[ 1.2087,  1.1355,  0.1602, -1.9971],
        [-1.8136,  0.3177,  0.2591, -0.4667],
        [-0.3390, -2.4810, -0.5078, -0.4501]], requires_grad=True)
tensor([[-0.4361, -0.5809, -0.6347, -0.3099],
        [ 0.1746, -0.6219,  1.5531,  1.5621],
        [-1.1245,  0.2617,  0.8457, -0.8959]])
tensor([[ 1.2130,  1.1413,  0.1666, -1.9940],
        [-1.8154,  0.3240,  0.2435, -0.4824],
        [-0.3277, -2.4836, -0.5163, -0.4411]], requires_grad=True)
tensor([[-0.4361, -0.5809, -0.6347, -0.3099],
        [ 0.1746, -0.6219,  1.5531,  1.5621],
        [-1.1245,  0.2617,  0.8457, -0.8959]])


In [57]:
# Reset the gradients of the tensors handed to the optimizer; the recorded
# output below shows s.grad is None afterwards.
optimizer.zero_grad()
print(s.grad)

None


In [58]:
class MyModule(torch.nn.Module):
    """Toy module computing ``trace(s @ X) + b`` with learnable ``s`` and ``b``."""

    def __init__(self):
        """Create the two parameters: ``s`` of shape (3, 4) and ``b`` of shape (1,)."""
        super().__init__()
        # Both are drawn from torch.randn; s is created first, then b.
        self.s = torch.nn.Parameter(torch.randn(3, 4))
        self.b = torch.nn.Parameter(torch.randn(1))

    def forward(self, X):
        """Return ``trace(self.s @ X) + self.b``.

        X is right-multiplied onto the (3, 4) parameter ``s``; the notebook
        calls this with a (4, 3) tensor.
        """
        product = torch.matmul(self.s, X)
        return torch.trace(product) + self.b

In [64]:
# Fresh module and input, followed by a single forward pass.
model = MyModule()
X = torch.randn((4, 3))
y = model(X)
# No backward pass has run on this model yet; show each parameter with its .grad.
for v in model.parameters():
    print(v, v.grad, sep='\n')

Parameter containing:
tensor([[ 0.4932, -1.0155,  0.9066, -0.9174],
        [ 1.4060, -0.7470, -0.9215, -1.1042],
        [-0.6361,  0.2985, -0.3791, -0.4518]], requires_grad=True)
None
Parameter containing:
tensor([1.0643], requires_grad=True)
None


In [65]:
# Bind an SGD optimizer to the module's parameters, then backpropagate from y.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
y.backward()
# Parameter values are unchanged at this point; only their .grad is filled in.
for v in model.parameters():
    print(v, v.grad, sep='\n')
    


Parameter containing:
tensor([[ 0.4932, -1.0155,  0.9066, -0.9174],
        [ 1.4060, -0.7470, -0.9215, -1.1042],
        [-0.6361,  0.2985, -0.3791, -0.4518]], requires_grad=True)
tensor([[-2.0575e+00,  7.8419e-01, -1.3891e+00,  2.9726e-01],
        [ 2.6775e-01,  9.7127e-01, -7.6050e-01, -6.7354e-01],
        [-3.8233e-01, -9.6500e-04,  7.6826e-01,  3.4220e-01]])
Parameter containing:
tensor([1.0643], requires_grad=True)
tensor([1.])


In [None]:
# Apply one optimizer update using the gradients currently stored on the parameters.
optimizer.step()
# Show each parameter next to its gradient after the update.
for v in model.parameters():
    print(v, v.grad, sep='\n')

Parameter containing:
tensor([[ 0.5138, -1.0233,  0.9205, -0.9203],
        [ 1.4034, -0.7567, -0.9139, -1.0974],
        [-0.6323,  0.2985, -0.3868, -0.4552]], requires_grad=True)
tensor([[-2.0575e+00,  7.8419e-01, -1.3891e+00,  2.9726e-01],
        [ 2.6775e-01,  9.7127e-01, -7.6050e-01, -6.7354e-01],
        [-3.8233e-01, -9.6500e-04,  7.6826e-01,  3.4220e-01]])
Parameter containing:
tensor([1.0543], requires_grad=True)
tensor([1.])


In [67]:
# Reset the stored gradients; the recorded output below shows each .grad as None.
optimizer.zero_grad()
for v in model.parameters():
    print(v, v.grad, sep='\n')

Parameter containing:
tensor([[ 0.5138, -1.0233,  0.9205, -0.9203],
        [ 1.4034, -0.7567, -0.9139, -1.0974],
        [-0.6323,  0.2985, -0.3868, -0.4552]], requires_grad=True)
None
Parameter containing:
tensor([1.0543], requires_grad=True)
None


Why is calling zero_grad() necessary?
PyTorch is designed so that gradients accumulate in `.grad` across backward passes, but how and why?
 - How does accumulation happen, given that a second backward() through the same graph is not allowed?
 - How are gradients accumulated across different batches? They seem to live in different computational graphs.
 - Why was it designed this way?

In [74]:
import torch

# Rebuild the small graph f = trace(s @ X) + b with s and b as tracked leaves.
s = torch.randn(3, 4)
s.requires_grad = True
X = torch.randn(4, 3)
S = s@X
t = torch.trace(S)
b = torch.randn(1)
b.requires_grad = True
f = torch.trace(s@X)+b

# The first backward pass succeeds.
f.backward()

# A second pass through the same graph is expected to fail. The error itself is
# what this cell demonstrates, so it is caught and displayed rather than left
# to abort a "Restart & Run All". Per the error message, passing
# retain_graph=True to the first call is what permits a second pass.
try:
    f.backward()
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [72]:
import torch
w = torch.rand(5).requires_grad_()
print(w)
s = torch.sum(w)
# Call backward() on the same sum four times. The recorded output shows w.grad
# growing by one per call: all 1s, then 2s, 3s and 4s.
for _call in range(4):
    s.backward()
    print(w.grad)

tensor([0.8990, 0.3552, 0.0863, 0.0066, 0.0147], requires_grad=True)
tensor([1., 1., 1., 1., 1.])
tensor([2., 2., 2., 2., 2.])
tensor([3., 3., 3., 3., 3.])
tensor([4., 4., 4., 4., 4.])


In [76]:
# A tracked leaf tensor, then a Parameter constructed from it.
t = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0]).requires_grad_(True)
p = torch.nn.Parameter(data=t)

In gradient_descent, PyTorch was observed to be much slower than NumPy. TODO: investigate why.