我们已经简要了解了自动求导的工作原理，但它在实际使用时会是什么样子呢？让我们定义一个小模型，并检查它在单个训练批次之后的变化。首先，定义一些常量、我们的模型以及一些输入和输出的替代项：

In [15]:
import torch

BATCH_SIZE = 16
DIM_IN = 1000
HIDDEN_SIZE = 100
DIM_OUT = 10

class TinyModel(torch.nn.Module):

    def __init__(self):
        super(TinyModel, self).__init__()

        self.layer1 = torch.nn.Linear(DIM_IN, HIDDEN_SIZE)
        self.relu = torch.nn.ReLU()
        self.layer2 = torch.nn.Linear(HIDDEN_SIZE, DIM_OUT)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

some_input = torch.randn(BATCH_SIZE, DIM_IN, requires_grad=False)
ideal_output = torch.randn(BATCH_SIZE, DIM_OUT, requires_grad=False)
print(some_input, ideal_output)

model = TinyModel()
print(model)

tensor([[-0.4517,  0.8055,  1.2638,  ...,  0.2466,  1.3601,  0.6728],
        [ 0.9455,  0.0191, -0.7938,  ...,  0.5257, -0.0212, -1.7546],
        [-2.1998,  0.4150, -1.2633,  ..., -0.3488, -0.1271,  0.7015],
        ...,
        [-2.1264,  1.0510, -0.7034,  ...,  0.2061,  0.1904,  1.4052],
        [-2.2582, -1.0296,  0.3881,  ..., -0.1438, -1.1002,  1.5507],
        [ 0.2572, -0.8941, -1.7708,  ...,  0.9145,  0.4288, -0.8288]]) tensor([[-0.1669, -1.5310, -0.4940, -0.2785,  2.1457,  0.1470,  1.4160,  0.2929,
         -0.1113,  2.0991],
        [ 0.0299, -0.7963,  0.4011, -0.8442,  1.0319,  0.3571,  0.9438, -1.5946,
         -1.2612,  2.8205],
        [-1.0454, -0.4214,  0.6213, -0.5689,  0.3092, -0.2154, -1.7659, -1.1801,
         -1.7296, -1.0570],
        [-0.2911, -0.2951,  0.8784, -0.0929,  0.5217, -0.8062,  1.2285,  0.3308,
         -1.3082, -1.2791],
        [ 1.7916,  1.0126, -1.1930, -0.9950,  0.4908, -0.2688, -1.0246, -0.2773,
          1.4136,  0.4667],
        [-1.1482, -0.

In [None]:
model.layer2.weight.requires_grad = True
print(model.layer2.weight[0][0:10]) # just a small slice
print(model.layer2.weight.grad)

tensor([ 0.0999, -0.0094,  0.0643, -0.0948,  0.0137,  0.0808,  0.0927, -0.0529,
        -0.0587,  0.0812], grad_fn=<SliceBackward0>)
None


: 

In [13]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

prediction = model(some_input)
print(prediction)  # just a small slice

loss = (ideal_output - prediction).pow(2).sum()
print(loss)

tensor([[-1.6028e-01, -1.4254e-01,  3.4088e-01,  2.1120e-01,  2.6653e-01,
          3.1831e-01,  1.0657e-01, -4.2870e-01,  3.1283e-01, -1.0202e-01],
        [ 5.4637e-02, -1.0615e-01,  2.1667e-01,  1.3969e-02,  4.9250e-01,
          1.5333e-01, -4.2216e-01, -8.5377e-02,  2.7756e-01,  2.2978e-01],
        [-5.5558e-02,  2.5129e-02, -2.3543e-02,  1.8904e-01,  2.9002e-01,
          5.0901e-02,  1.1447e-01, -2.4179e-01,  1.5651e-01,  1.0266e-01],
        [-3.3176e-03,  6.4015e-05,  2.2084e-01,  4.1213e-02, -5.8534e-02,
          8.5144e-02, -1.9813e-01, -1.9020e-01, -1.7577e-01,  3.4645e-01],
        [ 1.0792e-02, -5.5817e-02,  6.3418e-01,  3.3974e-01,  1.7913e-01,
         -6.1391e-02, -1.4051e-01, -2.7041e-01,  1.3786e-01,  1.4114e-01],
        [-9.7367e-02, -1.2809e-01,  7.0580e-01,  2.8596e-01,  1.1787e-01,
          3.7970e-01, -1.6863e-01, -3.9833e-01,  1.7485e-01,  3.0770e-01],
        [ 2.9676e-01, -4.5516e-02,  3.3812e-01,  2.2322e-01,  3.4246e-02,
         -1.0658e-01, -4.3172e-0

In [5]:
loss.backward()
print(model.layer2.weight[0][0:10])
print(model.layer2.weight.grad[0][0:10])

tensor([ 0.0144, -0.0668,  0.0679,  0.0963,  0.0944,  0.0809, -0.0991,  0.0250,
         0.0481, -0.0958], grad_fn=<SliceBackward0>)
tensor([ 1.0759, -1.2435, -6.2087, -6.4404, -1.9673, -1.4760, -2.8918, -1.9802,
        -1.7162, -2.4007])


In [10]:
print(model.layer2.weight.grad[0][0:10])

for i in range(0, 5):
    prediction = model(some_input)
    loss = (ideal_output - prediction).pow(2).sum()
    loss.backward()

print(model.layer2.weight.grad[0][0:10])

optimizer.zero_grad(set_to_none=False)

print(model.layer2.weight.grad[0][0:10])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([  5.3793,  -6.2173, -31.0437, -32.2019,  -9.8364,  -7.3798, -14.4591,
         -9.9010,  -8.5810, -12.0037])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


In [12]:
a = torch.ones(2, 3, requires_grad=False)
print(a)

b1 = 2 * a
print(b1)

a.requires_grad = False
b2 = 2 * a
print(b2)

tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[2., 2., 2.],
        [2., 2., 2.]])
tensor([[2., 2., 2.],
        [2., 2., 2.]])
