### Warm-up numpy

In [1]:
import numpy as np

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets drawn from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Both weight matrices also start out as standard-normal noise.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Sum-of-squared-errors loss, reported on every iteration.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, walking from the loss
    # back to each weight matrix.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T.dot(grad_h)

    # Plain gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33079764.198836215
1 29951373.876757078
2 26360632.10616112
3 20474050.97698617
4 13843201.127183635
5 8406877.878090357
6 4937730.063095343
7 3000923.818089542
8 1968738.9837876451
9 1403389.935375949
10 1071455.5587089558
11 858044.3140799856
12 708630.0817151754
13 596986.6336130439
14 509527.61695981945
15 438800.8604611723
16 380580.14360568335
17 331951.2959693894
18 290846.37633976
19 255866.13968714897
20 225967.25223144417
21 200225.1723848087
22 177963.30766701838
23 158643.41406910797
24 141819.79169617285
25 127105.9613115446
26 114191.11804531614
27 102814.69063983353
28 92757.48378038734
29 83854.61498431809
30 75942.94152277379
31 68898.33685522765
32 62608.94158425846
33 56980.46013453228
34 51935.12816501553
35 47402.67943013302
36 43326.70751071628
37 39651.68257011518
38 36334.69141546126
39 33331.30029653511
40 30611.187794845908
41 28141.26228051528
42 25896.000707481806
43 23853.34117806792
44 21992.530430267576
45 20291.65131225987
46 18738.571442285975
47 1731

### Tensors

In [2]:
import torch


# Every tensor below is created in single precision on the selected device.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weight matrices, initialised with standard-normal noise.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss; .item() turns it into a Python float.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: gradients of the loss w.r.t. w1 and w2, derived by hand.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 37342216.0
1 36216748.0
2 38202828.0
3 35884000.0
4 27145782.0
5 16059579.0
6 8128121.5
7 4066789.75
8 2296781.5
9 1519419.375
10 1135806.125
11 911296.125
12 758589.0
13 643934.1875
14 552814.0625
15 478309.875
16 416288.375
17 364176.53125
18 320073.9375
19 282479.0
20 250225.9375
21 222419.15625
22 198333.765625
23 177351.34375
24 159024.625
25 142955.46875
26 128802.140625
27 116301.0625
28 105227.1171875
29 95391.5390625
30 86640.515625
31 78841.34375
32 71870.4609375
33 65624.2265625
34 60007.48046875
35 54950.4296875
36 50384.515625
37 46256.8828125
38 42522.83203125
39 39136.703125
40 36061.3984375
41 33264.0625
42 30715.9765625
43 28391.0546875
44 26267.255859375
45 24325.1875
46 22544.943359375
47 20913.4609375
48 19415.8359375
49 18038.8203125
50 16771.962890625
51 15605.439453125
52 14530.1796875
53 13537.7685546875
54 12621.1943359375
55 11774.3876953125
56 10991.2734375
57 10265.732421875
58 9593.42578125
59 8970.4814453125
60 8393.119140625
61 7857.40673828125
62 7359.

### Autograd

In [3]:
# -*- coding: utf-8 -*-
import torch

# Every tensor below is created in single precision on the selected device.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. They are created without requires_grad=True, so the
# backward pass computes no gradients with respect to them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to compute gradients with respect
# to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass, written as one expression: the backward pass is not
    # implemented by hand here, so no intermediate values need to be kept.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum-of-squared-errors loss. The result is a zero-dimensional Tensor;
    # loss.item() extracts the Python scalar it holds.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Let autograd run the backward pass. Afterwards w1.grad and w2.grad hold
    # the gradient of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Gradient-descent step. The weights have requires_grad=True, so the
    # in-place update is wrapped in torch.no_grad() to keep it out of the
    # autograd graph. torch.optim.SGD could be used to do the same thing.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Clear the gradient so the next backward() starts from zero
            # instead of adding to this iteration's values.
            weight.grad.zero_()

0 41403324.0
1 42509220.0
2 47719360.0
3 45270784.0
4 31782720.0
5 15864936.0
6 6642548.0
7 3018841.5
8 1748895.375
9 1239477.75
10 975533.625
11 803160.5
12 675025.9375
13 574086.875
14 492208.0
15 424778.1875
16 368685.125
17 321632.96875
18 281953.625
19 248166.078125
20 219254.78125
21 194389.578125
22 172877.25
23 154199.296875
24 137933.8125
25 123718.5859375
26 111283.5546875
27 100324.40625
28 90630.6171875
29 82033.515625
30 74389.1484375
31 67576.2890625
32 61484.21875
33 56027.25390625
34 51131.44921875
35 46728.0546875
36 42762.2578125
37 39184.58203125
38 35950.65234375
39 33021.27734375
40 30364.064453125
41 27950.73828125
42 25754.50390625
43 23753.0
44 21930.171875
45 20264.6953125
46 18741.1328125
47 17346.458984375
48 16068.099609375
49 14901.560546875
50 13830.4580078125
51 12845.2138671875
52 11938.068359375
53 11102.06640625
54 10330.9931640625
55 9618.6728515625
56 8960.685546875
57 8352.376953125
58 7789.27490234375
59 7267.93994140625
60 6784.96435546875
61 6336

443 0.00015193539729807526
444 0.00014896478387527168
445 0.00014627404743805528
446 0.00014355262101162225
447 0.00014055434439796954
448 0.00013788981596007943
449 0.00013549387222155929
450 0.00013263363507576287
451 0.00013020168989896774
452 0.0001279444113606587
453 0.0001253294904017821
454 0.00012295128544792533
455 0.00012030347716063261
456 0.00011822432861663401
457 0.00011604028259171173
458 0.00011410725710447878
459 0.00011237499711569399
460 0.00011002835526596755
461 0.00010814625420607626
462 0.00010638387175276875
463 0.0001045749377226457
464 0.00010260196722811088
465 0.00010074355668621138
466 9.933175169862807e-05
467 9.794071229407564e-05
468 9.652990411268547e-05
469 9.4842747785151e-05
470 9.316245996160433e-05
471 9.147608943749219e-05
472 9.008785127662122e-05
473 8.874096238287166e-05
474 8.711177360964939e-05
475 8.586076728533953e-05
476 8.44487949507311e-05
477 8.322008943650872e-05
478 8.15112070995383e-05
479 8.04613227955997e-05
480 7.930185529403389e-

### Defining new autograd function

In [4]:
# -*- coding: utf-8 -*-
import torch


class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function: a subclass of
    torch.autograd.Function that supplies static forward and backward
    methods operating on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Forward pass: take the input Tensor and return it with every element
        clamped to be at least zero. The input is stashed on the context
        object ctx via ctx.save_for_backward so that backward can read it.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Backward pass: given the gradient of the loss with respect to the
        output, return the gradient of the loss with respect to the input.
        """
        (saved_input,) = ctx.saved_tensors
        # The incoming gradient passes through unchanged, except that it is
        # zeroed wherever the forward input was negative.
        return grad_output.masked_fill(saved_input < 0, 0)


# Every tensor below is created in single precision on the selected device.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Trainable weights, tracked by autograd.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom Function is invoked through Function.apply; alias it as 'relu'.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass, with the ReLU computed by the custom autograd operation.
    y_pred = torch.mm(relu(torch.mm(x, w1)), w2)

    # Sum-of-squared-errors loss, reported on every iteration.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Autograd computes the backward pass.
    loss.backward()

    # Gradient-descent step, kept out of the autograd graph.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Clear the gradient so the next backward() starts from zero.
            weight.grad.zero_()

0 37178640.0
1 39569988.0
2 46869148.0
3 48044828.0
4 35979864.0
5 18396344.0
6 7354672.0
7 3144675.0
8 1779766.75
9 1267424.5
10 1005498.875
11 832255.0625
12 701406.25
13 597146.5625
14 512095.875
15 441900.75
16 383403.25
17 334359.0625
18 292898.03125
19 257608.734375
20 227413.828125
21 201527.34375
22 179156.40625
23 159739.0625
24 142834.390625
25 128073.15625
26 115129.53125
27 103724.28125
28 93649.3125
29 84718.484375
30 76794.1328125
31 69740.03125
32 63437.0859375
33 57800.8828125
34 52748.34765625
35 48205.3828125
36 44109.67578125
37 40415.88671875
38 37075.62109375
39 34050.515625
40 31306.517578125
41 28813.54296875
42 26545.892578125
43 24480.0390625
44 22595.517578125
45 20874.1640625
46 19300.494140625
47 17860.5
48 16540.55078125
49 15329.4345703125
50 14217.6279296875
51 13195.5234375
52 12254.96875
53 11389.14453125
54 10591.240234375
55 9854.869140625
56 9175.103515625
57 8547.1728515625
58 7967.0029296875
59 7430.08203125
60 6932.880859375
61 6471.9814453125
62 

### TensorFlow: static graphs

In [5]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np

# First we set up the computational graph:
# NOTE(review): tf.placeholder, tf.random_normal, tf.Session and
# tf.global_variables_initializer are TensorFlow 1.x graph-mode APIs; this cell
# presumably needs TF 1.x (or the tf.compat.v1 equivalents) -- confirm against
# the installed version.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph. The leading dimension is None, so
# the batch size is not fixed in the graph.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
# ReLU expressed as an element-wise maximum against a one-element zero tensor.
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss (sum of squared errors) using operations on TensorFlow Tensors
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Create numpy arrays holding the actual data for the inputs x and targets
    # y
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for _ in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays.
        # Only loss_value is used; the two fetched weight arrays are discarded
        # into `_`, which also rebinds the loop variable (harmless, because
        # range() drives the iteration).
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)

W0731 18:39:42.543005 4488967616 deprecation.py:323] From /Users/ksriharikota/.local/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1205: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


37028760.0
33683800.0
33592844.0
30739378.0
23353538.0
14479026.0
7827870.0
4150501.5
2393015.2
1563443.5
1138472.8
890413.25
725743.7
605489.1
512228.84
437336.06
376067.38
325186.38
282622.47
246676.94
216135.27
190035.0
167643.72
148365.75
131673.53
117165.11
104505.38
93445.2
83737.67
75192.35
67649.164
60973.93
55054.023
49787.203
45094.465
40905.242
37156.85
33797.9
30784.566
28076.012
25634.38
23429.79
21437.156
19635.797
18004.717
16524.67
15182.264
13963.639
12854.557
11844.004
10921.272
10078.001
9307.002
8600.789
7953.79
7360.4253
6815.4814
6314.604
5854.157
5430.372
5039.8696
4679.9277
4348.1367
4041.8855
3759.1296
3498.561
3257.6917
3034.7393
2828.219
2636.859
2459.4482
2294.754
2141.87
1999.882
1868.0063
1745.3363
1631.2294
1525.0317
1426.2129
1334.2695
1248.5531
1168.6499
1094.1482
1024.6774
959.85876
899.3396
842.8104
790.01794
740.70306
694.6124
651.5005
611.1934
573.48346
538.2125
505.1835
474.26657
445.31628
418.19748
392.8011
368.9976
346.68307
325.77426
306.1689
28

### nn module

In [6]:
# -*- coding: utf-8 -*-
import torch

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The model is an nn.Sequential: a Module that holds other Modules and applies
# them in order. Each Linear Module computes a linear function of its input
# and owns the weight and bias Tensors it uses.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# nn also ships common loss functions. This is the squared-error loss; with
# reduction='sum' the per-element errors are summed rather than averaged.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass. Modules are callable: calling the model on a Tensor of
    # inputs returns a Tensor of outputs.
    y_pred = model(x)

    # The loss function takes the predicted and true values of y and returns
    # a Tensor holding the loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients left over from the previous iteration.
    model.zero_grad()

    # Backward pass: the parameters of each Module are Tensors with
    # requires_grad=True, so this computes a gradient for every learnable
    # parameter of the model.
    loss.backward()

    # Gradient-descent step, applied in place to every parameter and kept
    # out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

0 627.3693237304688
1 578.9525756835938
2 537.3444213867188
3 500.9615478515625
4 468.4933166503906
5 439.5971984863281
6 413.3749694824219
7 389.4493713378906
8 367.4945373535156
9 347.161376953125
10 328.3087463378906
11 310.7049560546875
12 294.221435546875
13 278.7095642089844
14 264.0848693847656
15 250.30126953125
16 237.25132751464844
17 224.84739685058594
18 213.07313537597656
19 201.8680877685547
20 191.1895294189453
21 181.00918579101562
22 171.32334899902344
23 162.1163330078125
24 153.34661865234375
25 144.99632263183594
26 137.0426025390625
27 129.4884033203125
28 122.29558563232422
29 115.45553588867188
30 108.94844818115234
31 102.77508544921875
32 96.93603515625
33 91.4049072265625
34 86.1700210571289
35 81.20249938964844
36 76.5050048828125
37 72.06610107421875
38 67.87068176269531
39 63.9106559753418
40 60.17069625854492
41 56.65071105957031
42 53.336219787597656
43 50.21283721923828
44 47.26587677001953
45 44.4904899597168
46 41.87096405029297
47 39.4067497253418
48 

395 0.00018246960826218128
396 0.00017781404312700033
397 0.00017328087415080518
398 0.0001688602933427319
399 0.00016455714649055153
400 0.00016036772285588086
401 0.00015628428081981838
402 0.00015230606368277222
403 0.0001484347158111632
404 0.00014465772255789489
405 0.00014098048268351704
406 0.0001373966078972444
407 0.00013390628737397492
408 0.000130505402921699
409 0.00012719024380203336
410 0.0001239659613929689
411 0.00012082533066859469
412 0.00011775465827668086
413 0.00011476932559162378
414 0.00011186233314219862
415 0.00010903152724495158
416 0.00010626956645864993
417 0.00010357986320741475
418 0.00010095650213770568
419 9.840157872531563e-05
420 9.591453999746591e-05
421 9.349308675155044e-05
422 9.113053965847939e-05
423 8.882868860382587e-05
424 8.658363367430866e-05
425 8.43995512695983e-05
426 8.22693036752753e-05
427 8.019506640266627e-05
428 7.817048026481643e-05
429 7.620045653311536e-05
430 7.428346725646406e-05
431 7.240910781547427e-05
432 7.058760820655152e

### optim

In [7]:
# -*- coding: utf-8 -*-
import torch

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Model and loss function, both taken from the nn package.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# The optim package provides Optimizers that update the model's weights for
# us. Adam is used here; optim contains many other optimization algorithms.
# The first constructor argument tells the optimizer which Tensors to update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Compute the loss and report it on every iteration.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients of every Tensor the optimizer manages (the learnable
    # weights of the model) before the backward pass. By default .backward()
    # accumulates into the existing gradient buffers rather than overwriting
    # them; see the torch.autograd.backward docs for details.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the model parameters.
    loss.backward()

    # step() applies one optimizer update to the parameters.
    optimizer.step()

0 730.4883422851562
1 712.4772338867188
2 694.9523315429688
3 677.9525146484375
4 661.41650390625
5 645.3283081054688
6 629.6404418945312
7 614.517578125
8 599.8334350585938
9 585.4839477539062
10 571.5645141601562
11 558.0635986328125
12 545.0640869140625
13 532.5257568359375
14 520.3631591796875
15 508.5727233886719
16 497.1874694824219
17 486.0825500488281
18 475.22711181640625
19 464.59747314453125
20 454.20220947265625
21 444.0964050292969
22 434.22235107421875
23 424.5998840332031
24 415.2012939453125
25 406.02838134765625
26 397.1226806640625
27 388.4356994628906
28 379.9325256347656
29 371.6165466308594
30 363.4718017578125
31 355.4936828613281
32 347.7178039550781
33 340.1342468261719
34 332.71588134765625
35 325.4467468261719
36 318.3362731933594
37 311.390869140625
38 304.578369140625
39 297.905029296875
40 291.390380859375
41 285.0202941894531
42 278.77740478515625
43 272.64532470703125
44 266.6549987792969
45 260.7748107910156
46 255.02955627441406
47 249.38111877441406
48

467 2.9057457595627056e-07
468 2.707708404159348e-07
469 2.5273706683037744e-07
470 2.3569702989334473e-07
471 2.199645763312219e-07
472 2.0497395780694205e-07
473 1.9112381721697602e-07
474 1.7817598063629703e-07
475 1.658648187685685e-07
476 1.5468283720565523e-07
477 1.441727448536767e-07
478 1.342334599030437e-07
479 1.2499963020218274e-07
480 1.163888114774636e-07
481 1.0825410612369524e-07
482 1.0087982360573733e-07
483 9.385891530655499e-08
484 8.737654155765995e-08
485 8.124153083599595e-08
486 7.556066350389301e-08
487 7.035971094637716e-08
488 6.536642160881456e-08
489 6.081457115669764e-08
490 5.6614709365021554e-08
491 5.256175228396387e-08
492 4.8912113470578333e-08
493 4.5492523526036166e-08
494 4.2209279627059004e-08
495 3.91900059071304e-08
496 3.642513846102702e-08
497 3.386853109077492e-08
498 3.140334214890572e-08
499 2.9101926202201867e-08


### Custom nn modules

In [8]:
# -*- coding: utf-8 -*-
import torch


class TwoLayerNet(torch.nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """
        Build the two nn.Linear layers (D_in -> H and H -> D_out) and store
        them as the member variables linear1 and linear2.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Take a Tensor of input data and return a Tensor of output data.
        Modules created in the constructor can be freely combined here with
        arbitrary Tensor operations.
        """
        # ReLU on the first layer's output, written as a clamp at zero.
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above.
model = TwoLayerNet(D_in, H, D_out)

# Loss function and optimizer. model.parameters() hands SGD the learnable
# parameters of the two nn.Linear modules that are members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Compute the loss and report it on every iteration.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then apply the update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 685.4006958007812
1 633.806396484375
2 589.1121215820312
3 549.9537353515625
4 515.08447265625
5 483.62945556640625
6 454.81005859375
7 428.3700256347656
8 404.2173767089844
9 381.78070068359375
10 360.807373046875
11 340.9759521484375
12 322.2793884277344
13 304.67816162109375
14 288.0943298339844
15 272.3668518066406
16 257.3733215332031
17 243.18524169921875
18 229.7479248046875
19 216.9298095703125
20 204.7373046875
21 193.143798828125
22 182.12249755859375
23 171.66860961914062
24 161.76678466796875
25 152.37307739257812
26 143.47950744628906
27 135.06698608398438
28 127.10296630859375
29 119.54712677001953
30 112.39842224121094
31 105.63764953613281
32 99.2680435180664
33 93.25701141357422
34 87.58985137939453
35 82.24028015136719
36 77.2007827758789
37 72.4593505859375
38 68.0042724609375
39 63.816898345947266
40 59.87974548339844
41 56.182518005371094
42 52.72125244140625
43 49.46517562866211
44 46.40460968017578
45 43.53996658325195
46 40.85391616821289
47 38.33686828613281


392 4.6209599531721324e-05
393 4.483644443098456e-05
394 4.3507890950422734e-05
395 4.2216270230710506e-05
396 4.09654930990655e-05
397 3.9753649616613984e-05
398 3.857824776787311e-05
399 3.744020068552345e-05
400 3.6333211028249934e-05
401 3.52566821675282e-05
402 3.4216034691780806e-05
403 3.320402174722403e-05
404 3.222244413336739e-05
405 3.1273055355995893e-05
406 3.035113877558615e-05
407 2.945431333500892e-05
408 2.8588676286744885e-05
409 2.7746116757043637e-05
410 2.6928635634249076e-05
411 2.6134028303204104e-05
412 2.5365774490637705e-05
413 2.4618295356049202e-05
414 2.3893622710602358e-05
415 2.3192404114524834e-05
416 2.2510717826662585e-05
417 2.1847725292900577e-05
418 2.1209101760177873e-05
419 2.0583500372595154e-05
420 1.9979732314823195e-05
421 1.9394599803490564e-05
422 1.882352080428973e-05
423 1.827218511607498e-05
424 1.7736569134285673e-05
425 1.7218671928276308e-05
426 1.671328027441632e-05
427 1.6224768842221238e-05
428 1.5747931684018113e-05
429 1.528787470

### Control Flow + Weight sharing

In [9]:
# -*- coding: utf-8 -*-
import random
import torch


class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Build the three nn.Linear layers used by the forward pass:
        input (D_in -> H), middle (H -> H) and output (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Draw a random integer from 0 to 3 inclusive and apply the
        middle_linear Module (followed by a clamp at zero) that many times
        between the input and output layers.

        Each forward pass builds a dynamic computation graph, so ordinary
        Python control flow such as loops or conditionals can be used here.

        The same Module is reused several times within one graph, which is
        perfectly safe. This is a big improvement from Lua Torch, where each
        Module could be used only once.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            # The same layer, and therefore the same weights, on every pass.
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)


# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above.
model = DynamicNet(D_in, H, D_out)

# Loss function and optimizer. Training this strange model with vanilla
# stochastic gradient descent is tough, so momentum is used.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Compute the loss and report it on every iteration.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then apply the update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 651.4310302734375
1 642.5762939453125
2 644.22314453125
3 683.4178466796875
4 638.5966796875
5 639.9195556640625
6 638.3102416992188
7 584.932373046875
8 512.1100463867188
9 624.0691528320312
10 430.57366943359375
11 560.863525390625
12 339.63250732421875
13 613.1778564453125
14 608.558837890625
15 526.618408203125
16 195.59478759765625
17 618.2294921875
18 613.4796142578125
19 606.7532348632812
20 597.8316650390625
21 431.25421142578125
22 573.9872436523438
23 558.2959594726562
24 110.28905487060547
25 479.9520568847656
26 459.16424560546875
27 434.3299865722656
28 469.9855651855469
29 295.9674072265625
30 278.80706787109375
31 341.4176330566406
32 226.59463500976562
33 291.4370422363281
34 163.94522094726562
35 323.25311279296875
36 295.6117248535156
37 151.20220947265625
38 140.95242309570312
39 124.34688568115234
40 254.89404296875
41 165.8935546875
42 118.97591400146484
43 143.68336486816406
44 148.20098876953125
45 172.3243408203125
46 162.99644470214844
47 167.14041137695312
4

387 0.6687149405479431
388 0.6279309391975403
389 0.54644376039505
390 1.5882680416107178
391 0.6780098080635071
392 1.2595725059509277
393 0.9658536911010742
394 1.502620816230774
395 1.553446650505066
396 0.3895246088504791
397 0.41579189896583557
398 0.7966228127479553
399 1.5085381269454956
400 1.0778653621673584
401 0.8613473176956177
402 0.6874183416366577
403 0.32525670528411865
404 0.3244551122188568
405 1.7710522413253784
406 3.6018173694610596
407 0.4650043547153473
408 2.5233259201049805
409 2.8258211612701416
410 0.9032678008079529
411 1.4292421340942383
412 0.5467759966850281
413 1.3062469959259033
414 2.4292547702789307
415 0.2540760040283203
416 1.487181305885315
417 1.2046949863433838
418 1.5236375331878662
419 0.1489960104227066
420 0.6962140202522278
421 0.2561681270599365
422 0.8578364849090576
423 0.8475244045257568
424 0.8651139736175537
425 0.42848432064056396
426 0.24386551976203918
427 0.8049486875534058
428 0.3943071663379669
429 0.3267848789691925
430 0.740501