# PyTorch: Tensors and autograd

https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_autograd.html

In [1]:
import torch

dtype = torch.float
# Run on the first CUDA GPU when one is available; otherwise fall back to the
# CPU so the script still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False): we do not need gradients with
# respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU (clamp at 0) -> linear layer.
    # Intermediate values are not kept in variables because autograd records
    # what it needs for the backward pass; nothing is differentiated by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the loss (sum of squared errors).
    # loss is a 0-dimensional (scalar) Tensor; loss.item() returns the Python
    # number it holds.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call computes the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad hold the gradient of the loss with
    # respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because the weights have requires_grad=True, but the update itself must
    # not be tracked by autograd.
    # torch.optim.SGD could be used instead of this manual update.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so clear the gradients before the
        # next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

0 28981736.0
1 25295786.0
2 26014494.0
3 27076048.0
4 25646932.0
5 20736526.0
6 14197837.0
7 8490062.0
8 4762887.0
9 2698136.0
10 1634063.0
11 1084809.25
12 787170.4375
13 611964.9375
14 498466.28125
15 418270.53125
16 357532.3125
17 309262.875
18 269685.6875
19 236565.453125
20 208490.609375
21 184523.296875
22 163885.859375
23 146006.0625
24 130450.640625
25 116861.90625
26 104972.359375
27 94524.0234375
28 85305.84375
29 77143.9140625
30 69899.3828125
31 63452.6875
32 57703.08203125
33 52559.0390625
34 47947.34375
35 43806.640625
36 40078.7109375
37 36719.75390625
38 33690.55078125
39 30951.375
40 28469.171875
41 26215.51171875
42 24166.58984375
43 22300.900390625
44 20600.056640625
45 19047.421875
46 17630.541015625
47 16333.7998046875
48 15146.4541015625
49 14058.2197265625
50 13060.623046875
51 12144.3212890625
52 11301.296875
53 10524.4072265625
54 9808.0234375
55 9145.5283203125
56 8533.2548828125
57 7966.84765625
58 7442.7626953125
59 6957.10888671875
60 6506.5859375
61 6088.7