In [2]:
import torch

### Pytorch: Tensors

Compared with numpy array, tensors can be accelerated by gpu

In [11]:
dtype = torch.float
device = torch.device("cpu")

### N is batch size ###
### D_in is input dimension ###
### H is hidden dimension ###
### D_out is output dimension ###
N, D_in, H, D_out = 64, 1000, 100, 10

### create random input and output data ###
x = torch.randn(N, D_in, device= device, dtype= dtype)
y = torch.randn(N, D_out, device= device, dtype= dtype)

### randomly initialize weights ###
w1 = torch.randn(D_in, H, device= device, dtype= dtype)
w2 = torch.randn(H, D_out, device= device, dtype= dtype)

learning_rate = 1e-6

for t in range(500):
    ### Forward ###
    h = x.mm(w1)
    h_relu = h.clamp(min = 0)
    y_pred = h_relu.mm(w2)
    
    ### Compute and print loss ###
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)
    
    ### Backprop ###
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h<0] = 0 
    grad_w1 = x.t().mm(grad_h)
    
    ### Update ###
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    
    

0 36824136.0
1 34934300.0
2 33891576.0
3 28707544.0
4 20010248.0
5 11596945.0
6 6210991.0
7 3418319.75
8 2101501.0
9 1456046.5
10 1106257.25
11 890370.5625
12 740648.625
13 628023.25
14 539052.5
15 466489.15625
16 406371.0625
17 355959.3125
18 313269.96875
19 276865.46875
20 245645.953125
21 218699.46875
22 195334.3125
23 174975.484375
24 157171.28125
25 141532.90625
26 127747.6015625
27 115557.734375
28 104762.46875
29 95164.546875
30 86603.4453125
31 78954.953125
32 72117.203125
33 65978.7734375
34 60451.52734375
35 55467.921875
36 50964.5
37 46896.96875
38 43208.3359375
39 39856.04296875
40 36804.734375
41 34023.859375
42 31485.423828125
43 29165.0390625
44 27041.01953125
45 25094.859375
46 23309.654296875
47 21669.625
48 20160.9453125
49 18771.681640625
50 17491.146484375
51 16310.37890625
52 15219.99609375
53 14211.6689453125
54 13279.1865234375
55 12415.673828125
56 11615.251953125
57 10872.7001953125
58 10183.5263671875
59 9543.251953125
60 8947.775390625
61 8393.7763671875
62 7

### Pytorch: Autograd

Pytorch defines a computational graph. Tensors are nodes of this graph and operations are egdes of this graph.

In [23]:
dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, dtype=dtype, device=device)
y = torch.randn(N, D_out, dtype=dtype, device=device)

w1 = torch.randn(D_in, H, dtype=dtype, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, dtype=dtype, device=device, requires_grad=True)
learning_rate = 1e-6
for t in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    #print(w1.grad)
    
    loss = (y_pred - y).pow(2).sum()
    #print((w1**w1).requires_grad)
    print(t, loss.item())
    
    loss.backward()
    #print(w1.grad)
    
    with torch.no_grad():
        print(w1.grad)
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()
    
        

    
    

0 28129622.0
tensor([[  9784.0088,  13964.7441,     55.3807,  ...,  -2923.7559,
           9632.6816,  -7742.5547],
        [   375.4456,  -2779.0330,   6649.9067,  ...,   9196.0195,
          -8043.2349,   8879.7344],
        [  3180.8132,  -4522.4331,    405.2173,  ...,   5161.0928,
          -1245.5688,  -7972.8203],
        ...,
        [-14938.4277,   -484.7680,  -4901.8711,  ...,   -371.3183,
          -4230.4707,   9059.1445],
        [ -6596.9863,  -4648.0146,   9040.8877,  ...,   9476.5898,
          -1792.9071,    288.2370],
        [ -9836.1045,  -5579.5288,  -6117.4868,  ...,  -2568.9316,
         -11591.1846,   1656.4840]])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

#### Note: 
Any operations in no_grad() block will have no gradients, but this can't change the require_grad = True, which is a property of variable. This block will make new generated w1 and w2 with no gradients, thus, after this block, the inplacement change of w1 and w2 will only change the value, this operation will not generate new gradient to replace the gradient of initial defined variable w1 and w2. Thus, the gradient calculation can be continued. However, if don't use this block, both value and gradient will be changed and this will make the back propogation in pytorch not continue. 

### nn packages

In [5]:
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

loss_fn = torch.nn.MSELoss(reduction = "sum")

learning_rate = 1e-4

for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    model.zero_grad()
    
    loss.backward()
    
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
            


0 650.3442993164062
1 604.2210083007812
2 564.6799926757812
3 529.9041748046875
4 498.9967346191406
5 471.2443542480469
6 445.6927795410156
7 422.0105285644531
8 400.01507568359375
9 379.38787841796875
10 359.9504089355469
11 341.6209411621094
12 324.45697021484375
13 308.1536560058594
14 292.60205078125
15 277.7419128417969
16 263.58740234375
17 250.06468200683594
18 237.11968994140625
19 224.74713134765625
20 212.90447998046875
21 201.5299530029297
22 190.61366271972656
23 180.18072509765625
24 170.21881103515625
25 160.729248046875
26 151.6788787841797
27 143.0669708251953
28 134.87533569335938
29 127.09285736083984
30 119.70440673828125
31 112.69740295410156
32 106.05799102783203
33 99.77698516845703
34 93.83963775634766
35 88.2470703125
36 82.96912384033203
37 77.98987579345703
38 73.313232421875
39 68.90575408935547
40 64.76207733154297
41 60.86355972290039
42 57.19585418701172
43 53.74699783325195
44 50.509857177734375
45 47.474246978759766
46 44.62519836425781
47 41.94518661499

### optim packages

In [6]:
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)

loss_fn = torch.nn.MSELoss(reduction = "sum")

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 678.1625366210938
1 661.3023071289062
2 644.8690795898438
3 628.8681030273438
4 613.4217529296875
5 598.4525146484375
6 583.9448852539062
7 569.8159790039062
8 556.1116943359375
9 542.8263549804688
10 529.8829956054688
11 517.3135986328125
12 505.08721923828125
13 493.1727600097656
14 481.60333251953125
15 470.28369140625
16 459.2950439453125
17 448.7535400390625
18 438.49127197265625
19 428.5130920410156
20 418.79052734375
21 409.3816223144531
22 400.1676940917969
23 391.134521484375
24 382.3775939941406
25 373.82464599609375
26 365.4658508300781
27 357.30413818359375
28 349.325439453125
29 341.5034484863281
30 333.8855895996094
31 326.4191589355469
32 319.1176452636719
33 311.973876953125
34 304.99066162109375
35 298.150390625
36 291.447021484375
37 284.863525390625
38 278.3905944824219
39 272.041015625
40 265.84124755859375
41 259.76031494140625
42 253.78890991210938
43 247.9304656982422
44 242.1908721923828
45 236.571044921875
46 231.05201721191406
47 225.62875366210938
48 220.30

480 6.28144107395201e-06
481 6.019124157319311e-06
482 5.766830327047501e-06
483 5.524202606466133e-06
484 5.292487912811339e-06
485 5.0702342377917375e-06
486 4.855530733038904e-06
487 4.651216841011774e-06
488 4.454055670066737e-06
489 4.264873496140353e-06
490 4.083870862814365e-06
491 3.910216491931351e-06
492 3.743546130863251e-06
493 3.5835012113238918e-06
494 3.4299853268748848e-06
495 3.2825964808580466e-06
496 3.142198465866386e-06
497 3.0068479190958897e-06
498 2.877919769161963e-06
499 2.753517037490383e-06


### Custom nn model

In [None]:
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        h_relu = self.linear1(x).clamp(min = 0)
        y_pred = self.linear2(h_relu)
        
        return y_pred
    