In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
# NOTE(review): no visible cell in this notebook plots anything — confirm
# whether this magic is still needed.
%matplotlib inline


PyTorch: Tensors and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.


A PyTorch Tensor represents a node in a computational graph. If ``x`` is a
Tensor that has ``x.requires_grad=True`` then, after a backward pass,
``x.grad`` is another Tensor holding the gradient of some scalar value
(typically the loss) with respect to ``x``.



In [1]:
import torch

# Seed PyTorch's global random number generator so that "Restart Kernel ->
# Run All" reproduces the same data, the same initial weights and therefore
# the same sequence of losses.
torch.manual_seed(0)

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False), so autograd will not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# Step size used by the manual gradient-descent update in the training loop.
learning_rate = 1e-6
for t in range(500):
    # Forward pass. Only Tensor operations are used, so autograd records the
    # graph for us: input -> linear (w1) -> ReLU (clamp at 0) -> linear (w2).
    # No intermediate result has to be saved by hand because the backward
    # pass is not implemented manually.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum of squared errors. The result is a 0-dimensional (scalar) Tensor;
    # loss.item() extracts the Python number it holds so it can be printed.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Backward pass via autograd: fills in .grad for every Tensor that has
    # requires_grad=True and took part in computing the loss, i.e. w1.grad
    # and w2.grad hold d(loss)/d(w1) and d(loss)/d(w2) after this call.
    loss.backward()

    # Plain gradient-descent step. The update itself must not be recorded in
    # the autograd graph, hence torch.no_grad(). (Operating on weight.data /
    # weight.grad.data, or using torch.optim.SGD, are alternative ways of
    # doing the same thing.)
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # backward() accumulates into .grad, so reset it once the
            # update has been applied.
            weight.grad.zero_()

0 27474704.0
1 22330140.0
2 23600544.0
3 27958116.0
4 32199080.0
5 31709912.0
6 25043572.0
7 15293150.0
8 7846143.5
9 3789707.0
10 1991932.625
11 1217223.875
12 861286.375
13 671950.5625
14 553402.9375
15 468706.03125
16 402946.65625
17 349473.125
18 304871.3125
19 267170.875
20 235213.78125
21 207819.578125
22 184179.671875
23 163682.5
24 145844.734375
25 130260.46875
26 116615.3046875
27 104618.3515625
28 94040.0
29 84690.7265625
30 76407.7109375
31 69046.609375
32 62495.8203125
33 56659.8046875
34 51450.01953125
35 46784.12109375
36 42600.71484375
37 38843.14453125
38 35460.13671875
39 32410.9140625
40 29658.623046875
41 27168.87109375
42 24914.515625
43 22871.806640625
44 21018.744140625
45 19335.005859375
46 17802.59375
47 16406.576171875
48 15134.1123046875
49 13972.43359375
50 12909.6796875
51 11937.19140625
52 11046.666015625
53 10230.4189453125
54 9481.30859375
55 8793.6806640625
56 8161.458984375
57 7580.62158203125
58 7045.541015625
59 6552.29052734375
60 6097.357421875
61 5