In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


class SimpleModel(nn.Module):
    """Minimal one-input, one-output model: a linear layer followed by a sigmoid."""

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in the same order as before: linear, then activation.
        self.fc = nn.Linear(1, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through the linear layer and return the sigmoid of the result."""
        logits = self.fc(x)
        return self.sigmoid(logits)


# The very same data point is fed to the model on every iteration.
input_data = torch.tensor([[1.0]])
label_data = torch.tensor([[1.0]])

print("================ 梯度累积测试 ================")
print("不使用 optimizer.zero_grad() - 梯度应持续累积")
model = SimpleModel()
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.BCELoss()

# Remember the starting weight and bias so the second experiment can start
# from exactly the same parameters.
initial_weight = model.fc.weight.item()
initial_bias = model.fc.bias.item()

# Ten iterations in which .grad is never cleared, so each backward() call
# adds its gradient on top of what is already stored.
for step in range(1, 11):
    output = model(input_data)
    loss = criterion(output, label_data)

    # Backward pass (gradients are deliberately NOT reset beforehand).
    loss.backward()

    # Report the gradient and the parameter values before this step's update.
    grad_w = model.fc.weight.grad.item()
    grad_b = model.fc.bias.grad.item()
    param_w = model.fc.weight.item()
    param_b = model.fc.bias.item()
    print(
        f"迭代 {step}: 梯度值 - w={grad_w:.6f}, b={grad_b:.6f} | "
        f"参数值 - w={param_w:.6f}, b={param_b:.6f}"
    )

    # Apply the SGD update using the accumulated gradient.
    optimizer.step()

print("\n================ 正常训练测试 ================")
print("使用 optimizer.zero_grad() - 每次迭代梯度应相同")
model = SimpleModel()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Restore the recorded starting weight and bias so this run starts from the
# same parameters as the accumulation run. The weight is 1x1 and the bias
# has one element, so filling each tensor sets its only entry.
with torch.no_grad():
    model.fc.weight.fill_(initial_weight)
    model.fc.bias.fill_(initial_bias)

# Ten iterations, clearing the gradients at the start of each one.
# NOTE: the printed gradient is then only the current iteration's gradient;
# it still changes a little from step to step because the parameters are
# updated, so "identical" in the banner holds only approximately.
for step in range(1, 11):
    # Reset gradients left over from the previous iteration.
    optimizer.zero_grad()

    output = model(input_data)
    loss = criterion(output, label_data)

    # Backward pass.
    loss.backward()

    # Report the gradient and the parameter values before this step's update.
    grad_w = model.fc.weight.grad.item()
    grad_b = model.fc.bias.grad.item()
    param_w = model.fc.weight.item()
    param_b = model.fc.bias.item()
    print(
        f"迭代 {step}: 梯度值 - w={grad_w:.6f}, b={grad_b:.6f} | "
        f"参数值 - w={param_w:.6f}, b={param_b:.6f}"
    )

    # Apply the SGD update.
    optimizer.step()

不使用 optimizer.zero_grad() - 梯度应持续累积
迭代 1: 梯度值 - w=-0.587940, b=-0.587940 | 参数值 - w=-0.247159, b=-0.108295
迭代 2: 梯度值 - w=-1.173027, b=-1.173027 | 参数值 - w=-0.241279, b=-0.102416
迭代 3: 梯度值 - w=-1.752409, b=-1.752409 | 参数值 - w=-0.229549, b=-0.090685
迭代 4: 梯度值 - w=-2.323226, b=-2.323226 | 参数值 - w=-0.212025, b=-0.073161
迭代 5: 梯度值 - w=-2.882625, b=-2.882625 | 参数值 - w=-0.188793, b=-0.049929
迭代 6: 梯度值 - w=-3.427769, b=-3.427769 | 参数值 - w=-0.159966, b=-0.021103
迭代 7: 梯度值 - w=-3.955868, b=-3.955868 | 参数值 - w=-0.125689, b=0.013175
迭代 8: 梯度值 - w=-4.464216, b=-4.464216 | 参数值 - w=-0.086130, b=0.052734
迭代 9: 梯度值 - w=-4.950247, b=-4.950247 | 参数值 - w=-0.041488, b=0.097376
迭代 10: 梯度值 - w=-5.411602, b=-5.411602 | 参数值 - w=0.008015, b=0.146878

使用 optimizer.zero_grad() - 每次迭代梯度应相同
迭代 1: 梯度值 - w=-0.587940, b=-0.587940 | 参数值 - w=-0.247159, b=-0.108295
迭代 2: 梯度值 - w=-0.585088, b=-0.585088 | 参数值 - w=-0.241279, b=-0.102416
迭代 3: 梯度值 - w=-0.582244, b=-0.582244 | 参数值 - w=-0.235428, b=-0.096565
迭代 4: 梯度值 - w=-0.579