In [46]:
import torch

# Seed PyTorch's global random number generator so that the torch.randint
# draws in this notebook are repeatable from one run to the next.
SEED = 0
torch.manual_seed(SEED)

<torch._C.Generator at 0x5a02c10>

# tensor，叶子节点

In [47]:
# For convenience, draw random integer tensors and cast them to float.
# X is the input (no gradient tracking); W_1 and W_2 are the weights that
# autograd should track. Z and Y are intermediate / final results computed
# from them, so they become non-leaf nodes of the graph.
X = torch.randint(5, (4,)).float()
X.requires_grad_(False)

W_1 = torch.randint(5, (2, 4)).float()
W_1.requires_grad_(True)
Z = torch.matmul(X, W_1.T)

W_2 = torch.randint(5, (2,)).float()
W_2.requires_grad_(True)
Y = torch.matmul(Z, W_2)

summary_fmt = " X: \t%s \n W_1: \t%s \n Z: \t%s \n W_2: \t%s \n Y: \t%s \n"
print(summary_fmt % (X, W_1, Z, W_2, Y))

 X: 	tensor([4., 4., 3., 0.]) 
 W_1: 	tensor([[3., 4., 2., 3.],
        [2., 3., 1., 1.]], requires_grad=True) 
 Z: 	tensor([34., 23.], grad_fn=<SqueezeBackward3>) 
 W_2: 	tensor([1., 4.], requires_grad=True) 
 Y: 	tensor(126., grad_fn=<DotBackward>) 



In [48]:
# Report, for each tensor created so far, whether autograd considers it a
# leaf of the computation graph.
leaf_flags = tuple(t.is_leaf for t in (X, W_1, Z, W_2, Y))
print(" is leaf?")
print(" X: \t%s \n W_1: \t%s \n Z: \t%s \n W_2: \t%s \n Y: \t%s \n" % leaf_flags)

 is leaf?
 X: 	True 
 W_1: 	True 
 Z: 	False 
 W_2: 	True 
 Y: 	False 



# 梯度，反向传播

In [49]:
# Back-propagate from the scalar Y, then show the .grad attribute of every
# tensor in the graph.
Y.backward()

grads = tuple(t.grad for t in (X, W_1, Z, W_2, Y))
print(" grad")
print(" X: \t%s \n W_1: \t%s \n Z: \t%s \n W_2: \t%s \n Y: \t%s \n" % grads)

 grad
 X: 	None 
 W_1: 	tensor([[ 4.,  4.,  3.,  0.],
        [16., 16., 12.,  0.]]) 
 Z: 	None 
 W_2: 	tensor([34., 23.]) 
 Y: 	None 



# 梯度累积

In [50]:
# Detach Z and W_2 from the earlier graph in place, then switch gradient
# tracking back on for W_2 only.
Z.detach_()
W_2.detach_()
W_2.requires_grad_(True)

# Second forward/backward pass, this time starting from the detached Z.
Z_2 = Z @ W_2
Z_2.backward()

print(" grad2")
print(" Z: \t%s \n W_2: \t%s \n" % tuple(t.grad for t in (Z, W_2)))

# Observation: detach_() clears the requires_grad flag, but the .grad that was
# already stored is kept. After the second backward pass the new gradient is
# therefore added to the old one, i.e. the two gradients accumulate.

 grad2
 Z: 	None 
 W_2: 	tensor([68., 46.]) 



# 参数更新

In [52]:
# Forward pass. Every argument is a leaf node of the graph; the trailing ones
# (W_1, W_2) are the trainable parameters.
def forward(X, W_1, W_2):
    """Return ``(X @ W_1.T) @ W_2``.

    Args:
        X: input tensor.
        W_1: first weight tensor; its transpose is right-multiplied onto ``X``.
        W_2: second weight tensor, right-multiplied onto the intermediate result.

    Returns:
        The result of the two chained matrix products. With the shapes used in
        this notebook (X: (4,), W_1: (2, 4), W_2: (2,)) this is a scalar tensor.
    """
    hidden = X @ W_1.T
    return hidden @ W_2

# Initialisation: X is the fixed input (no gradient tracking); W_1 and W_2 are
# the trainable parameters, created as leaf tensors with requires_grad=True.
X   = torch.randint(5, (4,)  ).float().requires_grad_(False)
W_1 = torch.randint(5, (2, 4)).float().requires_grad_(True)
W_2 = torch.randint(5, (2,)  ).float().requires_grad_(True)

learning_rate = 0.01

print(" X: \t%s \n W_1: \t%s \n W_2: \t%s \n" % (X, W_1, W_2))
print(" ====================================")

# Update loop: plain gradient descent that drives the scalar Y downwards.
for epoch in range(10):
    # Zero the gradients left over from the previous epoch. backward()
    # accumulates into .grad, and because the parameters are now updated in
    # place (same tensor objects every epoch) the old values would otherwise
    # be added to the new ones. On the very first epoch .grad is still None.
    if W_1.grad is not None:
        W_1.grad.zero_()
    if W_2.grad is not None:
        W_2.grad.zero_()

    # Forward pass
    Y = forward(X, W_1, W_2)

    # Backward pass
    Y.backward()

    # Parameter update. Done in place under no_grad() so that the update
    # itself is not recorded by autograd and W_1 / W_2 stay the same leaf
    # tensors; no detach_() / requires_grad_() round trip is needed.
    with torch.no_grad():
        W_1 -= learning_rate * W_1.grad
        W_2 -= learning_rate * W_2.grad

    print(" for epoch %d, Y is %s" % (epoch, Y.detach()))

 X: 	tensor([4., 0., 1., 2.]) 
 W_1: 	tensor([[3., 0., 0., 0.],
        [2., 4., 1., 3.]], requires_grad=True) 
 W_2: 	tensor([3., 3.], requires_grad=True) 

 for epoch 0, Y is tensor(81.)
 for epoch 1, Y is tensor(73.7001)
 for epoch 2, Y is tensor(67.0496)
 for epoch 3, Y is tensor(60.9899)
 for epoch 4, Y is tensor(55.4677)
 for epoch 5, Y is tensor(50.4343)
 for epoch 6, Y is tensor(45.8455)
 for epoch 7, Y is tensor(41.6608)
 for epoch 8, Y is tensor(37.8434)
 for epoch 9, Y is tensor(34.3598)
