# Tensors

### Warm-up: numpy

In [1]:
import numpy as np

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and targets sampled from a standard normal distribution.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weight matrices of the two linear layers, sampled the same way.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.clip(h, 0, None)
    y_pred = h_relu @ w2

    # Sum-of-squared-errors loss, printed once per step.
    residual = y_pred - y
    loss = (residual * residual).sum()
    print(t, loss)

    # Backward pass, written out by hand with the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 30013392.058496468
1 28082361.084647972
2 30175607.685382314
3 31380406.525137167
4 28269767.87991444
5 20586518.895238332
6 12339558.844099157
7 6519588.328472834
8 3423615.902798177
9 1956650.9131299644
10 1268377.832901427
11 918644.5931144
12 717530.3257053932
13 585575.3076905775
14 489894.01180568384
15 415863.27894756745
16 356375.1734263998
17 307566.58953292976
18 266883.1185833
19 232631.86663543008
20 203587.89191340652
21 178855.03647419813
22 157696.96248821352
23 139512.8817015809
24 123797.95611544125
25 110144.05731891372
26 98253.849183934
27 87874.23770969531
28 78807.13333444041
29 70839.8636459379
30 63813.067639174245
31 57595.59711011514
32 52083.901534011944
33 47185.704246292284
34 42816.83705564499
35 38917.4352459254
36 35428.72397734082
37 32302.22920257105
38 29492.305134854563
39 26962.895310571592
40 24683.4478877051
41 22626.25175534031
42 20766.602072974005
43 19081.58710536235
44 17553.064105841462
45 16164.885788328636
46 14901.957529460728
47 13751.

445 0.0003423890852842839
446 0.00032941850181181736
447 0.00031693202864196816
448 0.00030491937580226784
449 0.0002933630095353592
450 0.00028224641505925313
451 0.0002715527318990977
452 0.00026126434128777754
453 0.0002513667756443877
454 0.0002418455960057307
455 0.00023268714337159766
456 0.00022387491321368034
457 0.00021539730785255048
458 0.0002072421749945621
459 0.00019939715013169056
460 0.00019184887763990297
461 0.0001845871231421828
462 0.00017760160511807812
463 0.0001708816261729252
464 0.00016441587326119478
465 0.00015819556027916754
466 0.0001522140138745387
467 0.00014645629054726625
468 0.0001409170003418993
469 0.00013558729605725814
470 0.00013045961190737745
471 0.00012552673703128907
472 0.00012078093342971662
473 0.0001162146945413291
474 0.00011182151225465779
475 0.00010759485283915948
476 0.00010352841315856047
477 9.961582976981758e-05
478 9.585146691583213e-05
479 9.22296773728022e-05
480 8.874594002231128e-05
481 8.539327190176354e-05
482 8.216749652919

### PyTorch: Tensors

In [2]:
import torch

In [3]:
# Floating-point type used for every tensor created in the cells below.
dtype = torch.float
# Use the first CUDA GPU when one is usable; otherwise fall back to the CPU so
# the notebook still runs top to bottom on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [4]:
# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and targets, created with the configured device and dtype.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weight matrices of the two linear layers, randomly initialised.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss, converted to a Python number for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass, written out by hand with the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)
    

0 44589148.0
1 47807200.0
2 49765508.0
3 40272152.0
4 23363580.0
5 10320578.0
6 4501752.0
7 2412513.0
8 1617921.25
9 1234532.75
10 998351.5
11 828756.8125
12 697726.4375
13 592748.8125
14 507170.03125
15 436521.75
16 377777.9375
17 328586.125
18 287067.0
19 251824.375
20 221734.65625
21 195958.65625
22 173825.125
23 154670.125
24 138015.125
25 123487.328125
26 110774.25
27 99620.515625
28 89790.421875
29 81095.03125
30 73381.453125
31 66522.953125
32 60417.1953125
33 54961.51171875
34 50074.84375
35 45692.1328125
36 41756.3203125
37 38209.6640625
38 35009.83984375
39 32115.60546875
40 29494.083984375
41 27116.5078125
42 24956.41015625
43 22992.58203125
44 21204.10546875
45 19571.974609375
46 18081.5546875
47 16719.9453125
48 15475.1083984375
49 14333.85546875
50 13287.1005859375
51 12325.40234375
52 11441.0546875
53 10627.19140625
54 9879.6708984375
55 9191.107421875
56 8555.947265625
57 7969.0693359375
58 7426.41943359375
59 6924.2763671875
60 6459.72802734375
61 6029.2783203125
62 56

# Autograd

### PyTorch: Tensors and autograd

In [5]:
# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. They are created without requires_grad, so autograd
# does not compute gradients with respect to them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to compute gradients with respect
# to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear, written as one expression.
    # Intermediate values need no names here because the backward pass is
    # not implemented by hand.
    y_pred = torch.clamp(x.mm(w1), min=0).mm(w2)

    # Sum-of-squared-errors loss. loss is a Tensor holding a single value;
    # loss.item() extracts that value as a Python number for printing.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Let autograd run the backward pass. Afterwards w1.grad and w2.grad hold
    # the gradient of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manual gradient-descent step. It runs under torch.no_grad() because the
    # weights have requires_grad=True and the update itself must not be
    # tracked by autograd. torch.optim.SGD can do this step as well.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Reset the gradient once the weight has been updated, so the
            # next backward() call starts from zero.
            weight.grad.zero_()
    

    

0 37756080.0
1 34870064.0
2 33191706.0
3 27608946.0
4 19055258.0
5 11006836.0
6 5893689.5
7 3238826.75
8 1974515.0
9 1350509.625
10 1011164.25
11 801721.125
12 657418.875
13 549820.75
14 465576.9375
15 397644.9375
16 341900.84375
17 295670.5
18 256982.90625
19 224442.03125
20 196879.09375
21 173366.4375
22 153220.546875
23 135894.78125
24 120896.515625
25 107907.421875
26 96583.125
27 86666.96875
28 77949.9921875
29 70264.4296875
30 63476.421875
31 57459.2578125
32 52108.01953125
33 47336.8203125
34 43071.66015625
35 39250.140625
36 35818.68359375
37 32736.82421875
38 29958.4765625
39 27449.142578125
40 25179.26953125
41 23122.8828125
42 21256.443359375
43 19560.3125
44 18017.5390625
45 16612.57421875
46 15330.365234375
47 14158.8515625
48 13086.9755859375
49 12105.5166015625
50 11205.7109375
51 10379.62109375
52 9620.791015625
53 8923.1552734375
54 8281.1044921875
55 7689.6220703125
56 7144.40185546875
57 6642.052734375
58 6178.2744140625
59 5750.4013671875
60 5355.185546875
61 4989.4

### PyTorch: Defining new autograd functions

We can easily define our own autograd operator by subclassing `torch.autograd.Function`.

We can then use our new autograd operator by calling its `apply` method like a function, passing Tensors containing input data.

In [6]:
class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    A custom Function subclasses torch.autograd.Function and provides static
    forward and backward methods that operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the ReLU of the input Tensor.

        ctx is the context object shared with backward; the input Tensor is
        stashed on it with ctx.save_for_backward so the backward pass can
        read it back. Returns the input with negative entries clamped to 0.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss with respect to the output into the
        gradient of the loss with respect to the input.

        The incoming gradient is passed through unchanged, except that it is
        set to 0 wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new Tensor, so grad_output is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)
    
    

# This cell sets its own dtype and device and runs on the CPU by default.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights; requires_grad=True so autograd computes
# gradients with respect to them.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method, aliased as 'relu'.
# The alias is the same on every iteration, so it is bound once before the
# loop instead of being re-bound on each of the 500 steps.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print the sum-of-squared-errors loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass; this fills w1.grad and w2.grad.
    loss.backward()

    # Update weights using gradient descent. The update runs under
    # torch.no_grad() so that it is not tracked by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights, so the next
        # backward() call starts from zero.
        w1.grad.zero_()
        w2.grad.zero_()

0 31893116.0
1 30850552.0
2 31210794.0
3 28917272.0
4 22566736.0
5 14725250.0
6 8380022.5
7 4590935.0
8 2640690.5
9 1684833.75
10 1195375.5
11 920590.25
12 746875.1875
13 625070.5
14 532750.0625
15 459374.28125
16 399199.625
17 348871.125
18 306291.03125
19 269963.375
20 238751.4375
21 211869.21875
22 188596.8125
23 168307.75
24 150574.0625
25 135023.375
26 121339.59375
27 109271.53125
28 98592.2421875
29 89117.78125
30 80699.4453125
31 73190.8125
32 66469.328125
33 60448.65625
34 55048.828125
35 50212.0859375
36 45854.109375
37 41921.9609375
38 38367.5703125
39 35149.7421875
40 32234.0546875
41 29586.88671875
42 27180.771484375
43 24992.5546875
44 23000.40234375
45 21181.8125
46 19521.296875
47 18003.994140625
48 16618.283203125
49 15352.4150390625
50 14191.7734375
51 13127.263671875
52 12149.857421875
53 11251.5361328125
54 10425.51171875
55 9665.4482421875
56 8965.74609375
57 8320.9658203125
58 7726.2685546875
59 7177.3974609375
60 6670.3525390625
61 6201.798828125
62 5768.602050781

# nn module

### PyTorch: nn

In [2]:
import torch

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Build the model from nn Modules. nn.Sequential applies the Modules it
# contains one after another; each Linear Module computes a linear function
# of its input and holds its own weight and bias Tensors.
hidden_layer = torch.nn.Linear(D_in, H)
activation = torch.nn.ReLU()
output_layer = torch.nn.Linear(H, D_out)
model = torch.nn.Sequential(hidden_layer, activation, output_layer)

# The nn package also provides loss functions. Here the squared errors are
# summed (reduction='sum') rather than averaged.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass. Modules are callable: calling the model on a Tensor of
    # input data returns a Tensor of output data.
    y_pred = model(x)

    # The loss function takes the predicted and true values of y and returns
    # a Tensor holding the loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients left from the previous step before backpropagating.
    model.zero_grad()

    # Backward pass: compute the gradient of the loss with respect to every
    # learnable parameter of the model.
    loss.backward()

    # Gradient-descent step on each parameter, outside of autograd tracking.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)
    

0 697.44775390625
1 649.9174194335938
2 608.752685546875
3 572.3272705078125
4 539.5880126953125
5 510.0417785644531
6 483.2933044433594
7 458.7454833984375
8 435.98980712890625
9 414.7358093261719
10 394.985595703125
11 376.4670715332031
12 358.8547058105469
13 341.9606018066406
14 325.8065490722656
15 310.31732177734375
16 295.4791564941406
17 281.216796875
18 267.4828186035156
19 254.31190490722656
20 241.62332153320312
21 229.4487762451172
22 217.8243408203125
23 206.68846130371094
24 196.02383422851562
25 185.78668212890625
26 175.96632385253906
27 166.55618286132812
28 157.54464721679688
29 148.92686462402344
30 140.70033264160156
31 132.84523010253906
32 125.3919677734375
33 118.28794860839844
34 111.5387954711914
35 105.12967681884766
36 99.05128479003906
37 93.28731536865234
38 87.82860565185547
39 82.66034698486328
40 77.7466049194336
41 73.09430694580078
42 68.66226959228516
43 64.4865951538086
44 60.558624267578125
45 56.85906982421875
46 53.377418518066406
47 50.1102867126

347 0.00039692482096143067
348 0.0003863483725581318
349 0.00037605816032737494
350 0.00036604978959076107
351 0.00035631965147331357
352 0.00034685261198319495
353 0.0003376510285306722
354 0.00032870544237084687
355 0.00031999763450585306
356 0.00031153514282777905
357 0.0003033046086784452
358 0.00029529567109420896
359 0.000287507165921852
360 0.00027992550167255104
361 0.00027255769236944616
362 0.00026539660757407546
363 0.00025841634487733245
364 0.00025162738165818155
365 0.0002450241590850055
366 0.0002385997213423252
367 0.00023234864056576043
368 0.0002262663620058447
369 0.00022035726578906178
370 0.00021459651179611683
371 0.00020899248193018138
372 0.0002035416109720245
373 0.0001982410904020071
374 0.000193075742572546
375 0.00018805291620083153
376 0.00018316211935598403
377 0.00017840614600572735
378 0.00017377342737745494
379 0.0001692634541541338
380 0.00016487709945067763
381 0.00016060493362601846
382 0.00015645318489987403
383 0.00015240821812767535
384 0.00014846

### PyTorch: optim

In [3]:

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Model and loss function built from the nn package.
hidden_layer = torch.nn.Linear(D_in, H)
activation = torch.nn.ReLU()
output_layer = torch.nn.Linear(H, D_out)
model = torch.nn.Sequential(hidden_layer, activation, output_layer)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Let an Optimizer from the optim package update the weights for us. Adam is
# used here. The parameters handed to the constructor are the Tensors the
# optimizer will update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Evaluate the loss and print it.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients of every Tensor the optimizer updates. This is
    # needed because .backward() accumulates into the existing gradient
    # buffers instead of overwriting them.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the model parameters.
    loss.backward()

    # One optimizer step updates the parameters from their gradients.
    optimizer.step()

0 696.7754516601562
1 679.078125
2 662.0038452148438
3 645.5186157226562
4 629.5328979492188
5 613.9044799804688
6 598.8106079101562
7 584.157958984375
8 569.8890380859375
9 556.0236206054688
10 542.6332397460938
11 529.6138916015625
12 516.935302734375
13 504.6131286621094
14 492.6385192871094
15 481.0657653808594
16 469.8335876464844
17 458.8351745605469
18 448.1546630859375
19 437.77593994140625
20 427.6322021484375
21 417.77801513671875
22 408.240478515625
23 398.95111083984375
24 389.8852844238281
25 381.0438232421875
26 372.4076232910156
27 363.96502685546875
28 355.69451904296875
29 347.6182861328125
30 339.7381896972656
31 332.02783203125
32 324.48223876953125
33 317.0998840332031
34 309.895751953125
35 302.8872985839844
36 296.0647888183594
37 289.35162353515625
38 282.76708984375
39 276.33941650390625
40 270.0298767089844
41 263.8493347167969
42 257.7806701660156
43 251.83810424804688
44 246.01632690429688
45 240.3097381591797
46 234.71380615234375
47 229.231689453125
48 223.

354 4.584368798532523e-05
355 4.2742438381537795e-05
356 3.984822251368314e-05
357 3.7141027860343456e-05
358 3.461831511231139e-05
359 3.2256801205221564e-05
360 3.0057055482757278e-05
361 2.8001877581118606e-05
362 2.6084569981321692e-05
363 2.4295755792991258e-05
364 2.2623584300163202e-05
365 2.1063880922156386e-05
366 1.9611241441452876e-05
367 1.8255856048199348e-05
368 1.699204949545674e-05
369 1.581368087499868e-05
370 1.471284940635087e-05
371 1.3687214050150942e-05
372 1.2731773495033849e-05
373 1.1841263585665729e-05
374 1.1010687558155041e-05
375 1.0236796697427053e-05
376 9.51744459598558e-06
377 8.846008313412312e-06
378 8.221948519349098e-06
379 7.638888746441808e-06
380 7.096963145158952e-06
381 6.593207672267454e-06
382 6.1234245549712796e-06
383 5.685489668394439e-06
384 5.278431217448087e-06
385 4.900764452031581e-06
386 4.54777546110563e-06
387 4.2210390347463544e-06
388 3.916708465112606e-06
389 3.632911557360785e-06
390 3.3698431707307464e-06
391 3.125458079011878

### PyTorch: Custom nn Modules

In [6]:
class TwoLayerNet(torch.nn.Module):
    """Two nn.Linear layers with a ReLU (clamp at 0) between them."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear modules and store them as attributes:
        linear1 maps D_in -> H features and linear2 maps H -> D_out features.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Tensor of input data to a Tensor of output data.

        The input goes through linear1, negative values are clamped to 0,
        and the result goes through linear2.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)
    
    
# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above.
model = TwoLayerNet(D_in=D_in, H=H, D_out=D_out)

# Loss function and optimizer. model.parameters() hands the SGD optimizer the
# learnable parameters of the two nn.Linear modules held by the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Evaluate the loss and print it.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then take one optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 662.1619873046875
1 614.8142700195312
2 574.49365234375
3 539.0364379882812
4 506.9942321777344
5 477.70361328125
6 451.2509765625
7 426.9744873046875
8 404.3953552246094
9 383.466064453125
10 363.9256591796875
11 345.4716796875
12 327.8823547363281
13 311.1994934082031
14 295.3193359375
15 280.18792724609375
16 265.689453125
17 251.82852172851562
18 238.52769470214844
19 225.81143188476562
20 213.634033203125
21 202.00192260742188
22 190.8517303466797
23 180.20333862304688
24 170.05409240722656
25 160.37081909179688
26 151.13084411621094
27 142.33526611328125
28 133.9647979736328
29 126.0158462524414
30 118.502197265625
31 111.37481689453125
32 104.6221694946289
33 98.24649810791016
34 92.21812438964844
35 86.53280639648438
36 81.17533874511719
37 76.14189910888672
38 71.41434478759766
39 66.97129821777344
40 62.81193161010742
41 58.907962799072266
42 55.24562072753906
43 51.81462097167969
44 48.60533142089844
45 45.583831787109375
46 42.75654983520508
47 40.11912536621094
48 37.655

348 0.0007192319608293474
349 0.0007029078551568091
350 0.0006869611097499728
351 0.0006713875336572528
352 0.0006561788031831384
353 0.0006413192022591829
354 0.0006268154247663915
355 0.0006126409280113876
356 0.0005987979238852859
357 0.0005852779722772539
358 0.0005720648332498968
359 0.0005591609515249729
360 0.0005465539288707078
361 0.0005342350341379642
362 0.000522211950737983
363 0.0005104601732455194
364 0.0004989831359125674
365 0.0004877553437836468
366 0.0004768062208313495
367 0.0004661013081204146
368 0.00045563935418613255
369 0.00044541829265654087
370 0.0004354302363935858
371 0.00042567678610794246
372 0.0004161387914791703
373 0.0004068283305969089
374 0.0003977249434683472
375 0.000388829386793077
376 0.00038013330777175725
377 0.00037164081004448235
378 0.0003633358282968402
379 0.00035522753023542464
380 0.00034730200422927737
381 0.00033955424441955984
382 0.00033198061282746494
383 0.00032458422356285155
384 0.00031734848744235933
385 0.00031028277589939535
38

### PyTorch: Control Flow + Weight Sharing

In [7]:
import random

class DynamicNet(torch.nn.Module):
    """Network whose number of hidden layers is chosen at random per forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear modules used by the forward pass:
        input_linear (D_in -> H), middle_linear (H -> H) and
        output_linear (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Compute the output for x with a randomly chosen depth.

        A number between 0 and 3 (inclusive) is drawn on every call, and the
        same middle_linear Module is applied that many times, each time
        followed by a clamp at 0. Ordinary Python control flow such as this
        loop can be used directly in the forward pass, and reusing one
        Module several times shares its weights across those applications.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)
        
        
# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above.
model = DynamicNet(D_in=D_in, H=H, D_out=D_out)

# Loss function and optimizer. SGD is used with momentum here because this
# model is hard to train with plain stochastic gradient descent.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Evaluate the loss and print it.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then take one optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 635.7162475585938
1 611.4730224609375
2 551.4341430664062
3 611.5872802734375
4 436.3512268066406
5 376.50494384765625
6 610.4963989257812
7 269.50909423828125
8 598.03271484375
9 588.4447631835938
10 573.4440307617188
11 601.8612670898438
12 586.7355346679688
13 598.30517578125
14 574.39990234375
15 592.1998901367188
16 159.4378662109375
17 450.24517822265625
18 536.3029174804688
19 571.2371826171875
20 95.78076171875
21 552.5015258789062
22 475.9610595703125
23 334.2368469238281
24 504.48004150390625
25 93.62947082519531
26 458.37811279296875
27 259.950927734375
28 78.7154541015625
29 398.2374572753906
30 58.09040451049805
31 356.8712463378906
32 292.4539489746094
33 307.9762878417969
34 40.44586944580078
35 37.3007698059082
36 243.77610778808594
37 156.63467407226562
38 139.66073608398438
39 127.04591369628906
40 113.00645446777344
41 183.75502014160156
42 83.87735748291016
43 159.13565063476562
44 206.80625915527344
45 96.06564331054688
46 95.88032531738281
47 498.7696533203125
4

370 0.5061831474304199
371 0.6863328814506531
372 0.4557819962501526
373 2.0216357707977295
374 1.3122254610061646
375 1.1545212268829346
376 1.3099915981292725
377 1.2684053182601929
378 0.9754418134689331
379 0.29574304819107056
380 0.7595024108886719
381 0.22988243401050568
382 3.209639310836792
383 0.9608185887336731
384 0.693142831325531
385 1.6297680139541626
386 0.2974812388420105
387 0.8370693922042847
388 0.8415535688400269
389 0.5577976703643799
390 0.5902547836303711
391 0.5682117342948914
392 1.2694697380065918
393 0.47668346762657166
394 0.30192136764526367
395 0.5077135562896729
396 0.4163951277732849
397 0.393092542886734
398 0.3677048683166504
399 2.142930507659912
400 0.3701348602771759
401 0.5117154717445374
402 0.6650006771087646
403 1.7096329927444458
404 0.28262171149253845
405 0.2177334725856781
406 1.2332484722137451
407 0.3354977071285248
408 0.35215750336647034
409 0.3175017833709717
410 0.2670741081237793
411 0.23324669897556305
412 0.16482923924922943
413 0.1