In [2]:
# simple layers using numpy

import numpy as np

# initialize variables
batch_size = 10
input_dim = 1000
output_dim = 10
hidden_dim = 100

# input and output dat
x = np.random.randn(batch_size, input_dim)
y = np.random.randn(batch_size, output_dim)

#initialize weights
w1 = np.random.randn(input_dim,hidden_dim)
w2 = np.random.randn(hidden_dim, output_dim)

#learning rate
alpha = 1e-6

#model
for t in range(200):
    h = x.dot(w1)
    relu = np.maximum(h,0)
    pre_y = relu.dot(w2)

    #compute and print loss
    loss = np.square(pre_y - y).sum()
    print(loss,t)
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (pre_y - y)
    grad_w2 = relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= alpha * grad_w1
    w2 -= alpha * grad_w2


3547705.9025954576 0
1933563.5143963834 1
1222896.2987174229 2
829313.673069263 3
584299.6973279217 4
421377.04292408866 5
308813.4396692697 6
229537.02135933656 7
172587.93327340166 8
131059.5018572198 9
100340.78039140307 10
77405.83322446288 11
60117.78082678976 12
46971.66544081336 13
36938.64061429133 14
29199.717441289908 15
23188.137428033082 16
18517.898109380505 17
14853.205343892096 18
11957.789074963881 19
9659.342140260556 20
7826.84511671458 21
6360.09757848203 22
5181.647126200905 23
4231.625046684206 24
3463.4061405096386 25
2840.419516646893 26
2333.874551511089 27
1921.003114420194 28
1583.772293118158 29
1307.6620039833067 30
1081.1968171008366 31
895.1164243233733 32
741.9634944072496 33
615.7150184558371 34
511.49333236232206 35
425.34211104707276 36
354.040480189505 37
294.9439027106213 38
245.91629421513105 39
205.19926454346324 40
171.35060059491195 41
143.18544663587238 42
119.72886070019219 43
100.17744930961673 44
83.86826930126719 45
70.25496945248321 46
58.8

In [8]:
x.shape

(10, 1000)

In [15]:
# it cannot utilize GPU to accelerate its numerical computations.

In [18]:
# Simple layers using PyTorch

import torch

#it needs to convert the datatype
dtype = torch.float
device = torch.device("cpu")

x_new =torch.randn(batch_size, input_dim, dtype = dtype, device = device)
y_new = torch.randn(batch_size, output_dim, dtype = dtype, device =device)

# intialize weights
w1_new = torch.randn(input_dim, hidden_dim)
w2_new = torch.randn(hidden_dim, output_dim)

for t in range(500):
    h_new = x_new.mm(w1_new)
    relu_new = h_new.clamp(min = 0)
    pre_y_new = relu_new.mm(w2_new)
    
    # calculate loss
    loss_new = (pre_y_new - y_new).pow(2).sum().item()
    print(loss, t)
    
    # backprop and gradient (minimizing loss)
    grad_y_pred = 2.0 * (pre_y_new - y_new)
    grad_w2 = relu_new.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2_new.t())
    grad_h = grad_h_relu.clone()
    grad_h[h_new < 0] = 0
    grad_w1 = x_new.t().mm(grad_h)
    
    #update weights
    w1_new -= alpha * grad_w1
    w2_new -= alpha * grad_w2

    

1.4333150448950228e-08 0
1.4333150448950228e-08 1
1.4333150448950228e-08 2
1.4333150448950228e-08 3
1.4333150448950228e-08 4
1.4333150448950228e-08 5
1.4333150448950228e-08 6
1.4333150448950228e-08 7
1.4333150448950228e-08 8
1.4333150448950228e-08 9
1.4333150448950228e-08 10
1.4333150448950228e-08 11
1.4333150448950228e-08 12
1.4333150448950228e-08 13
1.4333150448950228e-08 14
1.4333150448950228e-08 15
1.4333150448950228e-08 16
1.4333150448950228e-08 17
1.4333150448950228e-08 18
1.4333150448950228e-08 19
1.4333150448950228e-08 20
1.4333150448950228e-08 21
1.4333150448950228e-08 22
1.4333150448950228e-08 23
1.4333150448950228e-08 24
1.4333150448950228e-08 25
1.4333150448950228e-08 26
1.4333150448950228e-08 27
1.4333150448950228e-08 28
1.4333150448950228e-08 29
1.4333150448950228e-08 30
1.4333150448950228e-08 31
1.4333150448950228e-08 32
1.4333150448950228e-08 33
1.4333150448950228e-08 34
1.4333150448950228e-08 35
1.4333150448950228e-08 36
1.4333150448950228e-08 37
1.4333150448950228e-08

1.4333150448950228e-08 350
1.4333150448950228e-08 351
1.4333150448950228e-08 352
1.4333150448950228e-08 353
1.4333150448950228e-08 354
1.4333150448950228e-08 355
1.4333150448950228e-08 356
1.4333150448950228e-08 357
1.4333150448950228e-08 358
1.4333150448950228e-08 359
1.4333150448950228e-08 360
1.4333150448950228e-08 361
1.4333150448950228e-08 362
1.4333150448950228e-08 363
1.4333150448950228e-08 364
1.4333150448950228e-08 365
1.4333150448950228e-08 366
1.4333150448950228e-08 367
1.4333150448950228e-08 368
1.4333150448950228e-08 369
1.4333150448950228e-08 370
1.4333150448950228e-08 371
1.4333150448950228e-08 372
1.4333150448950228e-08 373
1.4333150448950228e-08 374
1.4333150448950228e-08 375
1.4333150448950228e-08 376
1.4333150448950228e-08 377
1.4333150448950228e-08 378
1.4333150448950228e-08 379
1.4333150448950228e-08 380
1.4333150448950228e-08 381
1.4333150448950228e-08 382
1.4333150448950228e-08 383
1.4333150448950228e-08 384
1.4333150448950228e-08 385
1.4333150448950228e-08 386
1

In [22]:
# Simple 2 layer model using PyTorch and Autograd 

import torch 

# initialize variables
batch_size = 10
input_dim = 1000
output_dim = 10
hidden_dim = 100

# input and output dat
x = torch.randn(batch_size, input_dim, device = device, dtype = dtype)
y = torch.randn(batch_size, output_dim, device = device, dtype = dtype)

#initialize weights
w1 = torch.randn(input_dim,hidden_dim, device = device , dtype = dtype, requires_grad=True)
w2 = torch.randn(hidden_dim, output_dim, device = device, dtype = dtype, requires_grad=True)

#learning rate
alpha = 1e-6

for t in range(500):
    #h =x.mm(w1)
    #relu = h.clamp(min=0)
    #pred_y = relu.mm(w2)
    
    pred_y = x.mm(w1).clamp(min=0).mm(w2)
    
    #calculating loss 
    loss = (pred_y - y).pow(2).sum()
    print(t, loss.item() )
    
    #gradient and backpropogation using autograd
    loss.backward()
    
    with torch.no_grad():
        w1 -= alpha * w1.grad
        w2 -= alpha * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()
    

0 3684150.25
1 2325161.75
2 1580509.125
3 1120072.25
4 817638.3125
5 609893.3125
6 462933.1875
7 356537.125
8 278091.15625
9 219431.390625
10 174923.6875
11 140705.171875
12 114165.8515625
13 93258.734375
14 76708.921875
15 63599.87890625
16 53026.47265625
17 44407.2421875
18 37340.13671875
19 31514.6875
20 26697.845703125
21 22688.630859375
22 19336.28515625
23 16522.642578125
24 14153.5361328125
25 12151.3310546875
26 10454.255859375
27 9011.7958984375
28 7782.42138671875
29 6731.990234375
30 5835.6416015625
31 5067.44873046875
32 4406.10546875
33 3835.68603515625
34 3342.84912109375
35 2916.403076171875
36 2546.791748046875
37 2226.08740234375
38 1947.519775390625
39 1705.276123046875
40 1494.321044921875
41 1310.364013671875
42 1149.79345703125
43 1009.5045776367188
44 886.836669921875
45 779.4885864257812
46 685.4757080078125
47 603.0956420898438
48 530.844482421875
49 467.4548034667969
50 411.7970886230469
51 362.9031677246094
52 319.92572021484375
53 282.1383361816406
54 248.893

In [26]:
# simple 2 layer by defining own autograd function

import torch


class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input


dtype = torch.float
device = torch.device("cpu")
# dtype = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

for t in range(500):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()


0 29136582.0
1 22538760.0
2 18611656.0
3 15121986.0
4 11723165.0
5 8587083.0
6 6052405.0
7 4182483.25
8 2905744.0
9 2057884.5
10 1502368.875
11 1134037.75
12 884142.6875
13 708902.5625
14 581721.6875
15 486058.4375
16 411888.375
17 352851.75
18 304942.53125
19 265376.6875
20 232278.328125
21 204270.359375
22 180327.171875
23 159752.09375
24 141996.984375
25 126587.921875
26 113161.96875
27 101393.8125
28 91042.5625
29 81907.7578125
30 73829.265625
31 66660.2421875
32 60285.875
33 54605.6640625
34 49532.03515625
35 44991.2265625
36 40920.5390625
37 37265.62890625
38 33977.9765625
39 31015.828125
40 28341.30859375
41 25924.599609375
42 23737.298828125
43 21754.1328125
44 19953.9921875
45 18318.513671875
46 16830.7421875
47 15476.1318359375
48 14240.416015625
49 13112.8251953125
50 12082.6396484375
51 11140.712890625
52 10278.6513671875
53 9488.7626953125
54 8765.0498046875
55 8101.30712890625
56 7492.14306640625
57 6931.99658203125
58 6417.1337890625
59 5943.2763671875
60 5506.9760742187

452 3.0692295695189387e-05
453 3.015537549799774e-05
454 2.9689559596590698e-05
455 2.926281376858242e-05
456 2.8955206289538182e-05
457 2.860409949789755e-05
458 2.819997644110117e-05
459 2.78219176834682e-05
460 2.7439849873189814e-05
461 2.709811997192446e-05
462 2.662823499122169e-05
463 2.629314076330047e-05
464 2.596953527245205e-05
465 2.5644565539550968e-05
466 2.5411898604943417e-05
467 2.5024564820341766e-05
468 2.4618131647002883e-05
469 2.4313943868037313e-05
470 2.4207760361605324e-05
471 2.388823304499965e-05
472 2.3524016796727665e-05
473 2.3210457584355026e-05
474 2.299723564647138e-05
475 2.2639738745056093e-05
476 2.245272116851993e-05
477 2.2354051907313988e-05
478 2.1963147446513176e-05
479 2.1716292394557968e-05
480 2.1591815311694518e-05
481 2.1389201720012352e-05
482 2.1116671632626094e-05
483 2.0878609575447626e-05
484 2.079516889352817e-05
485 2.049534668913111e-05
486 2.027622758760117e-05
487 2.004164161917288e-05
488 1.9853316189255565e-05
489 1.953085848072

In [12]:
# Simple two layers net using tensorflow, computational graph are static in nature

# import libraries
import tensorflow as tf
import numpy as np

#Define output dimension, input dimension, hidden dimension and batch size
input_dim = 1000
output_dim = 10
hidden_dim = 100
batch_size = 10

#input and output
x = tf.placeholder(tf.float32, shape =(None, input_dim))
y = tf.placeholder(tf.float32, shape = (None, output_dim))

# intializing random weights
w1 = tf.Variable(tf.random_normal((input_dim,hidden_dim)))
w2 = tf.Variable(tf.random_normal((hidden_dim, output_dim)))

# learning rate
alpha = 1e-6

h = tf.matmul(x,w1)
relu = tf.maximum(h, tf.zeros(1))
pred_y = tf.matmul(h,w2)

#compute loss and print 
loss = tf.reduce_sum((pred_y -y)**2.0)


#compute gradient of loss with respect to w1 and w2
gradw1, gradw2 = tf.gradients(loss,[w1,w2])

# update weights
w1_new = w1.assign(w1-gradw1*alpha)
w2_new = w2.assign(w2 - gradw2*alpha)

#Run the model
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())
    
    #Create numpy arrays holding the actual data for the inputs x and targets y
    x_value = np.random.randn(batch_size, input_dim)
    y_value = np.random.randn(batch_size, output_dim)
    
    for _ in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays.
        loss_value, _, _ = sess.run([loss, w1_new, w2_new],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)
    


12095528.0
3608115.5
1318086.2
531681.8
229880.58
104843.19
49881.203
24551.582
12422.788
6430.5464
3392.8765
1819.4694
989.5855
544.9695
303.46942
170.72482
96.94087
55.52866
32.06572
18.661789
10.939226
6.454547
3.833002
2.2908452
1.3763499
0.83132565
0.5048953
0.30780774
0.18850113
0.11582549
0.07147863
0.044179607
0.027423657
0.01710952
0.010683816
0.006673011
0.004191803
0.002637762
0.0016634199
0.0010586367
0.0006763056
0.00043306386
0.00028288492
0.0001905504
0.00012606446
8.460724e-05
6.0620765e-05
4.283475e-05
3.134935e-05
2.2568991e-05
1.659524e-05
1.2280822e-05
8.922598e-06
6.867026e-06
5.35763e-06
4.243962e-06
3.4581183e-06
2.7948768e-06
2.2771728e-06
1.959037e-06
1.6410133e-06
1.4131788e-06
1.313321e-06
1.1187319e-06
1.000539e-06
8.7546755e-07
7.9637226e-07
7.2243654e-07
6.5444476e-07
6.3939825e-07
5.65013e-07
4.882834e-07
4.7512515e-07
4.345331e-07
4.0543887e-07
4.0674047e-07
3.9764285e-07
3.850642e-07
3.7526422e-07
3.489804e-07
3.519416e-07
3.331939e-07
3.31983e-07
3.024

In [16]:
# 2 layers using nn package of PyTorch

# import libraries
import torch

# define device nad dtype
device = torch.device("cpu")
dtype = torch.float
# define dimensions
input_dim = 1000
output_dim =10
hidden_dim = 100
batch_size = 10

#define input and output
x = torch.randn(batch_size, input_dim, device = device, dtype = dtype)
y = torch.randn(batch_size, output_dim, device = device, dtype = dtype)

#initialize weight
#w1 = torch.randn(input_dim. hidden_dim)
#w2 = torch.randn(hidden_dim, output_dim)

# model
model = torch.nn.Sequential(torch.nn.Linear(input_dim, hidden_dim), 
                            torch.nn.ReLU(), 
                            torch.nn.Linear(hidden_dim, output_dim),
                           )

#nn package also contains loss function 
lossfn = torch.nn.MSELoss(size_average=False)

#define learning rate
alpha = 1e-4

#computation
for t in range(500):
    #prediction
    pred_y = model(x)
    
    #for calculating loss
    loss = lossfn(pred_y, y)
    print(t, loss.item())
    
    #make model gradient 0
    model.zero_grad()
    
    #calculate the backward loss with respet to parameters in the model
    loss.backward()
    
    # updating weights
    with torch.no_grad():
        for param in model.parameters():
            param -= alpha*param.grad



0 98.58744049072266
1 91.84552764892578
2 85.81375122070312
3 80.46526336669922
4 75.61821746826172
5 71.17677307128906
6 67.10432434082031
7 63.34642791748047
8 59.891544342041016
9 56.71033477783203
10 53.763465881347656
11 51.0285758972168
12 48.49121856689453
13 46.133575439453125
14 43.93048095703125
15 41.868247985839844
16 39.91353225708008
17 38.060089111328125
18 36.325225830078125
19 34.68577194213867
20 33.13546371459961
21 31.668880462646484
22 30.266979217529297
23 28.927370071411133
24 27.648509979248047
25 26.43114471435547
26 25.27056884765625
27 24.169116973876953
28 23.120161056518555
29 22.120450973510742
30 21.168073654174805
31 20.2579345703125
32 19.388303756713867
33 18.559673309326172
34 17.7685489654541
35 17.012889862060547
36 16.290494918823242
37 15.600332260131836
38 14.944304466247559
39 14.319622993469238
40 13.721799850463867
41 13.149789810180664
42 12.603034973144531
43 12.080653190612793
44 11.579506874084473
45 11.100531578063965
46 10.64313507080078

In [26]:
# we can use optimization function to optimize the model

#import libraries
import torch

# define dimensions
input_dim = 1000
output_dim = 10
hidden_dim = 100
batch_size = 10

# define input and output
x = torch.randn(batch_size, input_dim)
y = torch.randn(batch_size, output_dim)

#model
model = torch.nn.Sequential(torch.nn.Linear(input_dim, hidden_dim),
                           torch.nn.ReLU(),
                           torch.nn.Linear(hidden_dim, output_dim),
                           )

#loss function from nn package
loss_fn = torch.nn.MSELoss(size_average=False)

#optimizer function(Adam)
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    pred_y = model(x)
    loss = loss_fn(pred_y, y)
    print(t, loss.item())
    
    #optimizer fuct
    optimizer.zero_grad()
    
    #compute gradient of loss with respect to model parameters
    loss.backward()
    
    optimizer.step()
    

0 77.72431945800781
1 72.34355163574219
2 67.39958953857422
3 62.76002883911133
4 58.37371826171875
5 54.26560592651367
6 50.457889556884766
7 46.95602035522461
8 43.70286178588867
9 40.66218566894531
10 37.797969818115234
11 35.12674331665039
12 32.624881744384766
13 30.291061401367188
14 28.108600616455078
15 26.06667137145996
16 24.151281356811523
17 22.347023010253906
18 20.685853958129883
19 19.144868850708008
20 17.709627151489258
21 16.375219345092773
22 15.134001731872559
23 13.971059799194336
24 12.880343437194824
25 11.860724449157715
26 10.907492637634277
27 10.019267082214355
28 9.191619873046875
29 8.42030143737793
30 7.714921951293945
31 7.057720184326172
32 6.448736667633057
33 5.883931636810303
34 5.360673904418945
35 4.880648136138916
36 4.436875343322754
37 4.026938438415527
38 3.6484901905059814
39 3.299453020095825
40 2.978501081466675
41 2.683473587036133
42 2.4126970767974854
43 2.1651828289031982
44 1.9395116567611694
45 1.734290599822998
46 1.5475472211837769
47

393 4.491556866020874e-13
394 4.4164780339805976e-13
395 4.2392933781443265e-13
396 4.4591175370201075e-13
397 4.501861123468176e-13
398 4.22708092487345e-13
399 4.020718220171249e-13
400 3.5988334708136893e-13
401 3.683071642807123e-13
402 3.5415182071674156e-13
403 4.246371049926312e-13
404 3.991991199409073e-13
405 4.582629848509656e-13
406 4.076368149280585e-13
407 4.494367118051956e-13
408 4.3927817112987544e-13
409 4.0580494693742697e-13
410 3.8071390658089843e-13
411 3.718321223838972e-13
412 3.493986783925651e-13
413 3.366311136093758e-13
414 3.3352248914042537e-13
415 3.4157160606895776e-13
416 3.3217634372306737e-13
417 3.0862573781320624e-13
418 3.1850672273237013e-13
419 3.190618342446827e-13
420 2.966908402984858e-13
421 2.948867278834699e-13
422 2.711418329442994e-13
423 2.637588498305421e-13
424 2.6514662861132354e-13
425 2.779697045457441e-13
426 2.481879719101743e-13
427 2.693377205292835e-13
428 2.6478580612832037e-13
429 2.647302949770891e-13
430 2.5851304603918823e-

In [31]:
#Defining own custom nn models

#import libraries
import torch

#define model
class TwoLayerNet(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(input_dim, hidden_dim)
        self.linear2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred
    
model = TwoLayerNet(input_dim,hidden_dim,output_dim)

criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
        
        
    
        
    

0 86.68621826171875
1 79.74000549316406
2 73.66283416748047
3 68.39078521728516
4 63.709266662597656
5 59.568153381347656
6 55.82979202270508
7 52.4925537109375
8 49.4591178894043
9 46.67131805419922
10 44.12826919555664
11 41.771366119384766
12 39.534610748291016
13 37.45335006713867
14 35.506187438964844
15 33.69004440307617
16 31.992752075195312
17 30.39280128479004
18 28.889543533325195
19 27.478649139404297
20 26.14542579650879
21 24.8833065032959
22 23.69634246826172
23 22.572839736938477
24 21.50531578063965
25 20.494352340698242
26 19.539470672607422
27 18.63178062438965
28 17.76822853088379
29 16.94482421875
30 16.161754608154297
31 15.418608665466309
32 14.71018123626709
33 14.035890579223633
34 13.391613006591797
35 12.776007652282715
36 12.188488006591797
37 11.628043174743652
38 11.093038558959961
39 10.581636428833008
40 10.092599868774414
41 9.626208305358887
42 9.183116912841797
43 8.756580352783203
44 8.34929370880127
45 7.960546493530273
46 7.588694095611572
47 7.2337

409 2.028068138315575e-06
410 1.955250581886503e-06
411 1.8852636003430234e-06
412 1.8172645468439441e-06
413 1.751934178173542e-06
414 1.6895492080948316e-06
415 1.628656491448055e-06
416 1.5699947653047275e-06
417 1.5141140465857461e-06
418 1.4596083701690077e-06
419 1.4071608802623814e-06
420 1.356715301881195e-06
421 1.30835189793288e-06
422 1.261275770048087e-06
423 1.2159514426457463e-06
424 1.1720578640961321e-06
425 1.1304848612780916e-06
426 1.089688225874852e-06
427 1.0509052117413376e-06
428 1.0134991725863074e-06
429 9.769175903784344e-07
430 9.422210496268235e-07
431 9.084932912628574e-07
432 8.758714216128283e-07
433 8.445695698355848e-07
434 8.139912210936018e-07
435 7.850723022784223e-07
436 7.570191087324929e-07
437 7.301210871446528e-07
438 7.040608238639834e-07
439 6.785460300307022e-07
440 6.546458735101623e-07
441 6.311143465609348e-07
442 6.084749770707276e-07
443 5.869368919775297e-07
444 5.657030897054938e-07
445 5.455989366964786e-07
446 5.261608180262556e-07
4

In [34]:
import  random

class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we construct three nn.Linear instances that we will use
        in the forward pass.
        """
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
        and reuse the middle_linear Module that many times to compute hidden layer
        representations.

        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same Module many
        times when defining a computational graph. This is a big improvement from Lua
        Torch, where each Module could be used only once.
        """
        h_relu = self.input_linear(x).clamp(min=0)
        for _ in range(random.randint(0, 3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 650.240966796875
1 650.5770263671875
2 649.8693237304688
3 649.5689086914062
4 647.7523803710938
5 646.4772338867188
6 645.0961303710938
7 640.032958984375
8 637.8634033203125
9 634.9379272460938
10 631.4083251953125
11 611.7835693359375
12 638.7273559570312
13 597.552978515625
14 618.871826171875
15 576.6282958984375
16 635.7840576171875
17 678.0615234375
18 633.8411865234375
19 529.9270629882812
20 601.519775390625
21 629.171875
22 591.1906127929688
23 479.6010437011719
24 462.29168701171875
25 564.9052124023438
26 434.14013671875
27 538.622314453125
28 521.4501342773438
29 328.7056884765625
30 566.6901245117188
31 548.4910888671875
32 438.9781799316406
33 202.6980743408203
34 315.1336975097656
35 456.02935791015625
36 356.5508117675781
37 409.3898620605469
38 142.82315063476562
39 131.2164764404297
40 249.6953125
41 331.6607971191406
42 303.3162536621094
43 237.72218322753906
44 252.63760375976562
45 238.42230224609375
46 191.36024475097656
47 178.93418884277344
48 201.29714965820

490 0.04224051162600517
491 0.1533915251493454
492 0.04745282977819443
493 0.29355862736701965
494 0.11741518974304199
495 0.27661290764808655
496 0.10731523483991623
497 0.10137064754962921
498 0.2485172003507614
499 0.07670194655656815
