In [1]:
import numpy as np

# Problem sizes: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squares loss; the residual is reused by the backward pass.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, layer by layer.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes the gradient through only where its input was not negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 37069338.8554
1 38289062.1259
2 45366842.9674
3 47663109.8065
4 37623663.8251
5 20478833.7958
6 8534599.18629
7 3563377.55207
8 1897981.78497
9 1288000.94293
10 996864.341715
11 815103.919562
12 682381.712652
13 578496.903872
14 494576.920612
15 425725.678751
16 368650.495236
17 320967.727425
18 280791.9537
19 246770.93038
20 217756.460208
21 192889.600848
22 171500.253488
23 152998.200247
24 136910.979397
25 122859.590079
26 110541.843462
27 99706.755678
28 90149.1153941
29 81688.7380817
30 74177.4730939
31 67487.0009793
32 61519.6836596
33 56177.2332386
34 51376.3260545
35 47061.5670913
36 43174.1888047
37 39661.2517254
38 36485.6547685
39 33608.565474
40 30999.0937585
41 28626.6552788
42 26465.2967365
43 24494.0241523
44 22694.0271895
45 21047.6075565
46 19539.1291403
47 18155.8388422
48 16885.580316
49 15718.0954246
50 14644.415979
51 13656.9255855
52 12746.6875769
53 11906.6982566
54 11130.1709015
55 10411.6526894
56 9746.01865053
57 9129.29875628
58 8556.95651938
59 8025.605148

In [3]:
import torch

# Run everything as 32-bit floats on the CPU.
dtype = torch.float
device = torch.device("cpu")

# Problem sizes: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights of the two linear layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, converted to a Python float for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, layer by layer.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes the gradient through only where its input was not negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33173898.0
1 27770522.0
2 25229012.0
3 21833886.0
4 17032176.0
5 11773340.0
6 7500088.5
7 4595137.0
8 2865578.75
9 1875916.75
10 1311095.75
11 974367.625
12 761259.5
13 616691.4375
14 512112.875
15 432507.78125
16 369495.0
17 318375.46875
18 276073.03125
19 240621.8125
20 210639.046875
21 185084.296875
22 163169.8125
23 144283.0
24 127949.0078125
25 113774.421875
26 101424.3828125
27 90627.96875
28 81158.25
29 72837.0859375
30 65501.44921875
31 59019.234375
32 53281.484375
33 48185.08984375
34 43647.265625
35 39603.3828125
36 35991.9453125
37 32767.0546875
38 29876.2265625
39 27277.419921875
40 24937.380859375
41 22827.8359375
42 20922.865234375
43 19201.248046875
44 17647.48828125
45 16239.05078125
46 14959.7529296875
47 13795.4501953125
48 12734.4521484375
49 11769.623046875
50 10888.8212890625
51 10083.1328125
52 9345.162109375
53 8668.5830078125
54 8047.6240234375
55 7477.32373046875
56 6952.76025390625
57 6469.82275390625
58 6024.841796875
59 5614.6298828125
60 5235.65673828125


In [3]:
import torch

# Run everything as 32-bit floats on the CPU.
dtype = torch.float
device = torch.device('cpu')

# Problem sizes: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets are plain tensors; no gradients are tracked for them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# The weights are created with requires_grad=True so that autograd records
# the operations applied to them.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    hidden = x.mm(w1).clamp(min=0)
    y_pred = hidden.mm(w2)

    # Sum-of-squares loss, kept as a tensor so backward() can be called on it.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Populate w1.grad and w2.grad.
    loss.backward()

    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Clear the gradient so the next backward() starts from zero.
            weight.grad.zero_()

0 28242194.0
1 20269592.0
2 17397024.0
3 16423134.0
4 15807363.0
5 14587559.0
6 12548584.0
7 9909153.0
8 7265505.5
9 5030337.0
10 3386805.75
11 2266871.75
12 1544011.375
13 1083776.625
14 790939.3125
15 600625.25
16 473417.65625
17 384954.40625
18 320883.1875
19 272546.375
20 234765.59375
21 204348.4375
22 179327.5625
23 158322.15625
24 140453.640625
25 125094.8125
26 111785.265625
27 100177.9609375
28 90003.234375
29 81078.6484375
30 73198.046875
31 66219.2578125
32 60010.640625
33 54478.43359375
34 49539.51953125
35 45119.828125
36 41162.33203125
37 37605.83203125
38 34402.296875
39 31513.40625
40 28905.443359375
41 26543.86328125
42 24404.37109375
43 22465.283203125
44 20704.685546875
45 19100.931640625
46 17640.8046875
47 16308.6181640625
48 15090.2275390625
49 13974.763671875
50 12953.595703125
51 12015.990234375
52 11154.5966796875
53 10362.6337890625
54 9634.1689453125
55 8962.3701171875
56 8342.7353515625
57 7770.7294921875
58 7242.39599609375
59 6754.2265625
60 6302.6499023437

497 6.588878750335425e-05
498 6.494774424936622e-05
499 6.396775279426947e-05


In [7]:
import torch

class MyReLU(torch.autograd.Function):
    """Custom autograd Function computing ReLU with a hand-written backward pass."""

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` clamped below at zero.

        The input tensor is stored on ``ctx`` so that ``backward`` can tell
        which entries were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the forward input.

        The incoming gradient is passed through unchanged, except that entries
        whose forward input was strictly negative are set to zero. A new
        tensor is returned; ``grad_output`` itself is not modified.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)
    
# Run everything as 32-bit floats on the CPU.
dtype = torch.float
device = torch.device('cpu')

# Problem sizes: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets are plain tensors; no gradients are tracked for them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# The weights are created with requires_grad=True so that autograd records
# the operations applied to them.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

# A custom Function is invoked through its `apply` attribute; bind it once.
relu = MyReLU.apply

for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear.
    hidden = relu(x.mm(w1))
    y_pred = hidden.mm(w2)

    # Sum-of-squares loss, kept as a tensor so backward() can be called on it.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Populate w1.grad and w2.grad (routing through MyReLU.backward).
    loss.backward()

    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Clear the gradient so the next backward() starts from zero.
            weight.grad.zero_()

0 28551752.0
1 22212358.0
2 19483092.0
3 17456258.0
4 14970377.0
5 11961065.0
6 8862199.0
7 6203367.0
8 4204403.0
9 2836529.25
10 1942733.5
11 1372676.0
12 1006700.5
13 767660.0
14 606293.75
15 493234.59375
16 410661.96875
17 348039.75
18 298904.0625
19 259355.078125
20 226806.09375
21 199546.203125
22 176406.328125
23 156560.90625
24 139422.34375
25 124521.359375
26 111489.40625
27 100064.7890625
28 89999.3359375
29 81119.0546875
30 73241.578125
31 66241.140625
32 60005.94921875
33 54440.75390625
34 49466.8203125
35 45004.9140625
36 40998.7421875
37 37397.2890625
38 34154.0
39 31228.990234375
40 28583.443359375
41 26188.16796875
42 24017.712890625
43 22046.037109375
44 20254.390625
45 18623.869140625
46 17138.12890625
47 15783.419921875
48 14546.78125
49 13416.701171875
50 12383.1142578125
51 11436.8271484375
52 10569.5341796875
53 9774.4248046875
54 9044.912109375
55 8374.8076171875
56 7758.66650390625
57 7191.76611328125
58 6669.861328125
59 6189.06494140625
60 5745.93359375
61 5337

467 0.00013991778541821986
468 0.0001372237893519923
469 0.0001346765347989276
470 0.00013250744086690247
471 0.00013061496429145336
472 0.0001281696168007329
473 0.00012627290561795235
474 0.0001238555705640465
475 0.00012202305515529588
476 0.00011992910003755242
477 0.00011790908320108429
478 0.00011609439388848841
479 0.00011404794349800795
480 0.00011254840501351282
481 0.00011061784607591107
482 0.00010903421207331121
483 0.00010703482257667929
484 0.0001055055545293726
485 0.00010403601481812075
486 0.0001026265017571859
487 0.00010081991058541462
488 9.920202137436718e-05
489 9.768557356437668e-05
490 9.635070455260575e-05
491 9.489077638136223e-05
492 9.353721543448046e-05
493 9.203131048707291e-05
494 9.051195229403675e-05
495 8.920389518607408e-05
496 8.790560968918726e-05
497 8.666569192428142e-05
498 8.553014777135104e-05
499 8.408464054809883e-05


In [1]:
import tensorflow as tf
import numpy as np

# First we set up the computational graph:

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
# ReLU written as an elementwise maximum of h against a one-element zero tensor.
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors
# (sum of squared differences between targets and predictions).
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Create numpy arrays holding the actual data for the inputs x and targets
    # y
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    # The loop variable `_` is rebound by the tuple unpacking below; this is
    # harmless because the iteration index is never used.
    for _ in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays.
        # Only the loss is kept; the two updated-weight results are discarded.
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)

2.79977e+07
2.16616e+07
1.95398e+07
1.86002e+07
1.72736e+07
1.49221e+07
1.17448e+07
8.4879e+06
5.75078e+06
3.76912e+06
2.45882e+06
1.63622e+06
1.12741e+06
811199.0
609490.0
476164.0
384191.0
317835.0
267939.0
229098.0
197998.0
172543.0
151349.0
133465.0
118221.0
105128.0
93815.0
83994.5
75420.5
67896.8
61269.6
55414.0
50223.8
45609.7
41500.2
37830.4
34544.5
31598.2
28947.5
26558.9
24399.8
22445.5
20673.7
19063.9
17599.3
16265.1
15047.4
13935.0
12917.2
11984.7
11129.4
10343.1
9620.12
8954.47
8341.07
7775.17
7252.37
6769.1
6322.28
5908.61
5525.29
5169.5
4839.15
4532.44
4247.24
3981.94
3735.04
3505.06
3290.69
3090.79
2904.34
2730.39
2567.71
2415.63
2273.42
2140.32
2015.74
1899.05
1789.68
1687.16
1591.01
1500.81
1416.15
1336.75
1262.06
1191.87
1125.88
1063.84
1005.47
950.537
898.828
850.145
804.286
761.076
720.341
681.933
645.739
611.571
579.317
548.879
520.142
493.008
467.37
443.162
420.271
398.634
378.174
358.833
340.534
323.226
306.85
291.358
276.683
262.782
249.623
237.154
225.337
214.

In [3]:
import torch

# Problem sizes: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two linear layers with a ReLU in between; each Linear owns its own
# weight and bias parameters.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Sum (rather than mean) of squared errors. `reduction='sum'` is the
# supported spelling of the deprecated `size_average=False`, which emits a
# deprecation warning on current PyTorch releases.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4

for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them first.
    model.zero_grad()

    # Populate .grad on every parameter of the model.
    loss.backward()

    # Manual gradient-descent step; the update must not be tracked by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 736.23388671875
1 678.4370727539062
2 629.1464233398438
3 586.5814208984375
4 549.156005859375
5 516.10302734375
6 486.2945556640625
7 459.2853698730469
8 434.47564697265625
9 411.59625244140625
10 390.16455078125
11 370.0075988769531
12 350.8526611328125
13 332.8504333496094
14 315.799560546875
15 299.51849365234375
16 283.9769287109375
17 269.0727844238281
18 254.8993682861328
19 241.4175262451172
20 228.62088012695312
21 216.4160614013672
22 204.7119903564453
23 193.50592041015625
24 182.8052978515625
25 172.5855712890625
26 162.8506317138672
27 153.57435607910156
28 144.7512664794922
29 136.349853515625
30 128.37600708007812
31 120.8272705078125
32 113.65332794189453
33 106.8646011352539
34 100.4197998046875
35 94.3327865600586
36 88.59671783447266
37 83.20555877685547
38 78.14373016357422
39 73.3489990234375
40 68.84947967529297
41 64.62625122070312
42 60.6623649597168
43 56.947261810302734
44 53.463863372802734
45 50.1958122253418
46 47.119510650634766
47 44.24424362182617
48 4

In [7]:
import torch

class TwoLayerNet(torch.nn.Module):
    """Fully connected network: linear layer, ReLU, linear layer."""

    def __init__(self, D_in, H, D_out):
        """
        Build the two nn.Linear layers and register them as submodules.

        D_in is the input width, H the hidden width and D_out the output width.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map an input Tensor to the predicted output Tensor.

        The first linear layer is applied, negative activations are clamped to
        zero, and the result is fed through the second linear layer.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)

# Problem sizes: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

# Sum (rather than mean) of squared errors. `reduction='sum'` is the
# supported spelling of the deprecated `size_average=False`, which emits a
# deprecation warning on current PyTorch releases.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# Adam updates every parameter registered on the model.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them first.
    model.zero_grad()

    # Populate .grad on every parameter of the model.
    loss.backward()

    # Let the optimizer apply one update using the freshly computed gradients.
    optimizer.step()

0 698.00341796875
1 680.39453125
2 663.3118896484375
3 646.7330932617188
4 630.7099609375
5 615.3352661132812
6 600.4649658203125
7 586.0576171875
8 572.1348266601562
9 558.56005859375
10 545.3370971679688
11 532.5280151367188
12 520.145263671875
13 508.094970703125
14 496.361328125
15 484.90240478515625
16 473.7643737792969
17 462.9311828613281
18 452.4148254394531
19 442.1537170410156
20 432.1737060546875
21 422.4618835449219
22 413.0622253417969
23 403.87451171875
24 394.9458312988281
25 386.1817626953125
26 377.6141662597656
27 369.2471923828125
28 361.0851135253906
29 353.1434631347656
30 345.4161071777344
31 337.86090087890625
32 330.4488525390625
33 323.1906433105469
34 316.1276550292969
35 309.2127685546875
36 302.4154968261719
37 295.7884216308594
38 289.3097229003906
39 282.9577331542969
40 276.74432373046875
41 270.6851501464844
42 264.7418518066406
43 258.9107360839844
44 253.2032470703125
45 247.6062469482422
46 242.13116455078125
47 236.7576141357422
48 231.4783935546875


371 0.00014726974768564105
372 0.0001384907227475196
373 0.00013021094491705298
374 0.00012241475633345544
375 0.00011507039744174108
376 0.00010815958376042545
377 0.00010164578270632774
378 9.551699622534215e-05
379 8.974776574177667e-05
380 8.43136222101748e-05
381 7.920099596958607e-05
382 7.438966713380069e-05
383 6.98640214977786e-05
384 6.560195470228791e-05
385 6.159828626550734e-05
386 5.7827954151434824e-05
387 5.428288932307623e-05
388 5.095042070024647e-05
389 4.781799725606106e-05
390 4.4869106204714626e-05
391 4.2103078158106655e-05
392 3.949769597966224e-05
393 3.705019480548799e-05
394 3.4751985367620364e-05
395 3.25927248923108e-05
396 3.05616958939936e-05
397 2.8656355425482616e-05
398 2.6865347535931505e-05
399 2.518344444979448e-05
400 2.360321923333686e-05
401 2.2120158973848447e-05
402 2.0728950403281488e-05
403 1.942310882441234e-05
404 1.819393946789205e-05
405 1.7043121260940097e-05
406 1.5964322301442735e-05
407 1.4948864190955646e-05
408 1.399790562572889e-05

In [8]:
import random

import torch

class DynamicNet(torch.nn.Module):
    """Fully connected ReLU network whose depth is chosen at random per forward pass.

    Requires the standard-library ``random`` module to be imported in the
    enclosing module; ``forward`` draws the number of hidden layers from it.
    """

    def __init__(self, D_in, H, D_out):
        """
        In the constructor we construct three nn.Linear instances that we will use
        in the forward pass.

        D_in is the input width, H the hidden width and D_out the output width.
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
        and reuse the middle_linear Module that many times to compute hidden layer
        representations.

        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same Module many
        times when defining a computational graph. This is a big improvement from Lua
        Torch, where each Module could be used only once.

        Returns the output Tensor produced by output_linear.
        """
        h_relu = self.input_linear(x).clamp(min=0)
        # randint is inclusive on both ends, so this yields 0, 1, 2 or 3.
        num_middle_layers = random.randint(0, 3)
        for _ in range(num_middle_layers):
            # The same middle_linear weights are shared by every repetition.
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred

# Problem sizes: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = DynamicNet(D_in, H, D_out)

# Sum (rather than mean) of squared errors. `reduction='sum'` is the
# supported spelling of the deprecated `size_average=False`, which emits a
# deprecation warning on current PyTorch releases.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# Adam updates every parameter registered on the model.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them first.
    model.zero_grad()

    # Populate .grad on every parameter of the model.
    loss.backward()

    # Let the optimizer apply one update using the freshly computed gradients.
    optimizer.step()

0 606.9224853515625
1 630.7542114257812
2 612.0521240234375
3 594.744384765625
4 610.9837036132812
5 610.26123046875
6 609.3720703125
7 584.9010009765625
8 582.2395629882812
9 578.98193359375
10 575.3099365234375
11 578.544921875
12 605.6307373046875
13 565.2733764648438
14 604.6633911132812
15 604.092041015625
16 556.9542236328125
17 553.0677490234375
18 547.3174438476562
19 601.7824096679688
20 532.9429321289062
21 546.6819458007812
22 517.4017944335938
23 599.7786865234375
24 501.26458740234375
25 540.4779052734375
26 598.31982421875
27 478.2379455566406
28 535.9032592773438
29 534.1178588867188
30 531.927978515625
31 529.3870239257812
32 595.6445922851562
33 523.9718627929688
34 521.1152954101562
35 517.9906616210938
36 433.2706604003906
37 511.7545166015625
38 592.9989624023438
39 421.52850341796875
40 416.68377685546875
41 501.2197265625
42 498.7532043457031
43 590.5896606445312
44 493.4849853515625
45 393.1357116699219
46 388.2869873046875
47 588.3651733398438
48 377.32827758789

429 34.1163215637207
430 8.769438743591309
431 3.7478768825531006
432 8.475934982299805
433 8.284961700439453
434 8.037867546081543
435 32.3483772277832
436 32.06574630737305
437 31.600624084472656
438 30.981069564819336
439 30.236162185668945
440 7.548725128173828
441 7.572643280029297
442 28.16847038269043
443 7.532630920410156
444 5.184507369995117
445 7.434255123138428
446 26.20253562927246
447 7.290620803833008
448 5.557984828948975
449 5.571108341217041
450 24.857593536376953
451 7.086109161376953
452 24.208038330078125
453 23.8068790435791
454 23.298038482666016
455 22.703716278076172
456 5.532103538513184
457 5.501139163970947
458 21.02039909362793
459 20.485923767089844
460 8.393680572509766
461 19.42645835876465
462 18.897132873535156
463 18.322141647338867
464 17.71451759338379
465 5.482295989990234
466 9.847711563110352
467 5.4884796142578125
468 10.106987953186035
469 15.6009521484375
470 10.124311447143555
471 5.29252290725708
472 5.19181489944458
473 14.88025951385498
47