Numpy is a great framework, but it cannot utilize GPUs to accelerate.

Pytorch: Tensors

In [1]:
import torch

dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

for t in range(500):
    h = x.mm(w1)
    h_relu = h.clamp(min=0)   #clamp
    h_pred = h_relu.mm(w2)
    
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)
    
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h<0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    

NameError: name 'y_pred' is not defined

## Autograd

In [4]:
import torch

dtype = torch.float
device = torch.device("cpu")

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())
    
    loss.backward()
    
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        
        w1.grad.zero_()
        w2.grad.zero_()


0 23800362.0
1 18243842.0
2 15795331.0
3 14299629.0
4 12753918.0
5 10855842.0
6 8675467.0
7 6554152.5
8 4730940.5
9 3329047.5
10 2321486.0
11 1631481.0
12 1167804.125
13 858279.5
14 649607.375
15 506301.15625
16 405158.34375
17 331699.25
18 276674.6875
19 234216.671875
20 200586.375
21 173324.25
22 150838.953125
23 132007.1875
24 116079.921875
25 102464.1015625
26 90737.765625
27 80591.109375
28 71748.4765625
29 64008.3515625
30 57215.8671875
31 51247.3125
32 45978.28125
33 41312.94921875
34 37175.65625
35 33504.84375
36 30238.958984375
37 27323.32421875
38 24715.703125
39 22380.609375
40 20288.140625
41 18409.44140625
42 16720.0703125
43 15201.4404296875
44 13833.9423828125
45 12600.6484375
46 11485.8388671875
47 10478.083984375
48 9565.83203125
49 8739.158203125
50 7989.8046875
51 7309.75341796875
52 6691.79736328125
53 6129.89794921875
54 5618.48876953125
55 5152.84033203125
56 4728.65869140625
57 4341.71240234375
58 3988.47216796875
59 3665.914306640625
60 3371.198974609375
61 3101

374 7.285514584509656e-05
375 7.11350585334003e-05
376 6.923596083652228e-05
377 6.769477477064356e-05
378 6.643423694185913e-05
379 6.509358354378492e-05
380 6.352621130645275e-05
381 6.242416566237807e-05
382 6.12626681686379e-05
383 5.9797919675474986e-05
384 5.857484939042479e-05
385 5.728284304495901e-05
386 5.604585021501407e-05
387 5.504945875145495e-05
388 5.398094072006643e-05
389 5.278392563923262e-05
390 5.169654104975052e-05
391 5.085593511466868e-05
392 4.971387897967361e-05
393 4.889824049314484e-05
394 4.8146066546905786e-05
395 4.6903045586077496e-05
396 4.6290508180391043e-05
397 4.5399952796287835e-05
398 4.476230969885364e-05
399 4.4198510295245796e-05
400 4.3130879930686206e-05
401 4.2483850847929716e-05
402 4.170231841271743e-05
403 4.1187908209394664e-05
404 4.049965718877502e-05
405 3.975131403421983e-05
406 3.927105717593804e-05
407 3.857522096950561e-05
408 3.7825422623427585e-05
409 3.730943353730254e-05
410 3.6750123399542645e-05
411 3.647327685030177e-05
412

### Define new autograd functions

In [5]:
import torch

class MyReLU(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return input.clamp(min=0)
    
    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input
    
dtype = torch.float
device = torch.device("cpu")

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    relu = MyReLU.apply
    
    y_pred = relu(x.mm(w1)).mm(w2)
    
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())
    
    loss.backward()
    
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        
        w1.grad.zero_()
        w2.grad.zero_()

0 31226824.0
1 27709170.0
2 29599224.0
3 31628710.0
4 30008158.0
5 22918044.0
6 14217057.0
7 7513337.0
8 3865465.25
9 2134033.75
10 1345837.5
11 959513.8125
12 745647.4375
13 609193.8125
14 511666.21875
15 436479.34375
16 375853.125
17 325858.5625
18 283998.59375
19 248609.921875
20 218545.578125
21 192850.359375
22 170745.3125
23 151606.46875
24 134965.140625
25 120427.09375
26 107723.0625
27 96568.109375
28 86738.5703125
29 78064.8125
30 70379.359375
31 63556.69140625
32 57484.14453125
33 52065.81640625
34 47224.14453125
35 42893.5078125
36 39010.17578125
37 35520.96484375
38 32377.927734375
39 29545.75390625
40 26988.419921875
41 24677.048828125
42 22584.73046875
43 20691.400390625
44 18972.591796875
45 17410.935546875
46 15989.548828125
47 14694.8798828125
48 13514.3359375
49 12436.314453125
50 11454.25390625
51 10558.09765625
52 9737.1044921875
53 8984.9931640625
54 8295.759765625
55 7663.53173828125
56 7083.06591796875
57 6549.5888671875
58 6058.98095703125
59 5608.13525390625
60

420 6.53534589218907e-05
421 6.405044405255467e-05
422 6.332936027320102e-05
423 6.229093560250476e-05
424 6.108878005761653e-05
425 6.005327668390237e-05
426 5.906046135351062e-05
427 5.791298099211417e-05
428 5.68311006645672e-05
429 5.5924519983818755e-05
430 5.4938172979746014e-05
431 5.4096504754852504e-05
432 5.3234794904710725e-05
433 5.24266179127153e-05
434 5.1633604016387835e-05
435 5.080763730802573e-05
436 5.006259380024858e-05
437 4.960822479915805e-05
438 4.876159437117167e-05
439 4.811721373698674e-05
440 4.729493957711384e-05
441 4.649409675039351e-05
442 4.583408372127451e-05
443 4.521634036791511e-05
444 4.4490559957921505e-05
445 4.3893116526305676e-05
446 4.322973472881131e-05
447 4.2801719246199355e-05
448 4.1910800064215437e-05
449 4.1127001168206334e-05
450 4.075022297911346e-05
451 4.0226437704404816e-05
452 3.948722223867662e-05
453 3.869270585710183e-05
454 3.839400960714556e-05
455 3.8032572774682194e-05
456 3.7348567275330424e-05
457 3.677227141452022e-05
45

### Using Tensorflow

In [6]:
import tensorflow as tf
import numpy as np

x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

loss = tf.reduce_sum((y-y_pred) ** 2.0)

grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    
    for _ in range(500):
        loss_value, _, _ = sess.run([loss, new_w1, new_w2], feed_dict={x: x_value, y: y_value})
        print(loss_value)

ModuleNotFoundError: No module named 'tensorflow'

### NN Module

In [8]:
import torch

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(torch.nn.Linear(D_in, H),
                           torch.nn.ReLU(),
                           torch.nn.Linear(H, D_out),)
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    y_pred = model(x)
    
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    model.zero_grad()
    
    loss.backward()
    
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 683.4682006835938
1 630.852783203125
2 585.6353149414062
3 546.2060546875
4 511.0404052734375
5 479.5335998535156
6 450.97100830078125
7 424.6772766113281
8 400.48193359375
9 377.8827209472656
10 356.7005615234375
11 336.8962097167969
12 318.3818054199219
13 301.0243835449219
14 284.66937255859375
15 269.1233825683594
16 254.43490600585938
17 240.5471954345703
18 227.3851318359375
19 214.99722290039062
20 203.2389678955078
21 192.0908966064453
22 181.52566528320312
23 171.4949951171875
24 161.9365997314453
25 152.88116455078125
26 144.30616760253906
27 136.17933654785156
28 128.47706604003906
29 121.20630645751953
30 114.32954406738281
31 107.83792877197266
32 101.70552825927734
33 95.90848541259766
34 90.43562316894531
35 85.26400756835938
36 80.38306427001953
37 75.77965545654297
38 71.43455505371094
39 67.34085845947266
40 63.47260665893555
41 59.8231086730957
42 56.39030075073242
43 53.16056442260742
44 50.11922073364258
45 47.25902557373047
46 44.56632995605469
47 42.02878952026

351 0.00045019216486252844
352 0.00043794853263534606
353 0.0004260540590621531
354 0.000414488953538239
355 0.0004032533906865865
356 0.000392311456380412
357 0.0003816797398030758
358 0.0003713498590514064
359 0.00036130339140072465
360 0.0003515415301080793
361 0.00034204983967356384
362 0.00033282325603067875
363 0.00032383776851929724
364 0.00031510673579759896
365 0.00030662788776680827
366 0.00029837567126378417
367 0.00029035002808086574
368 0.00028254688368178904
369 0.0002749532286543399
370 0.0002675735449884087
371 0.00026039074873551726
372 0.0002534142113290727
373 0.0002466182049829513
374 0.00024001749989110976
375 0.00023359058832284063
376 0.00022734401863999665
377 0.00022126792464405298
378 0.0002153595123672858
379 0.00020960667461622506
380 0.00020401281653903425
381 0.00019857703591696918
382 0.00019328513008076698
383 0.00018813755013979971
384 0.00018312540487386286
385 0.0001782534527592361
386 0.0001735174300847575
387 0.0001689096534391865
388 0.000164418161

### PyTorch: optim

In [10]:
import torch

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(torch.nn.Linear(D_in, H),
                           torch.nn.ReLU(),
                           torch.nn.Linear(H, D_out),
                           )
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    y_pred = model(x)
    
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()  #calling the step function
    

0 731.1836547851562
1 712.4283447265625
2 694.2645874023438
3 676.6832275390625
4 659.5474243164062
5 642.9369506835938
6 626.8805541992188
7 611.3248901367188
8 596.2207641601562
9 581.5968017578125
10 567.4736328125
11 553.7990112304688
12 540.60693359375
13 527.7333984375
14 515.440185546875
15 503.5059509277344
16 491.8835754394531
17 480.53704833984375
18 469.4756164550781
19 458.8651123046875
20 448.5372314453125
21 438.5068054199219
22 428.8114929199219
23 419.32647705078125
24 410.1085205078125
25 401.1551513671875
26 392.4711608886719
27 384.0137023925781
28 375.73028564453125
29 367.64990234375
30 359.7162780761719
31 351.9682922363281
32 344.4225769042969
33 337.0555419921875
34 329.8369445800781
35 322.8043518066406
36 315.911376953125
37 309.1890869140625
38 302.6123352050781
39 296.1492004394531
40 289.8258056640625
41 283.62066650390625
42 277.52764892578125
43 271.5631103515625
44 265.6977233886719
45 259.9376525878906
46 254.26828002929688
47 248.72486877441406
48 243.

372 0.0016209495952352881
373 0.0015497079584747553
374 0.001481699524447322
375 0.001416563056409359
376 0.0013543283566832542
377 0.0012948302319273353
378 0.001237936899997294
379 0.0011835639597848058
380 0.0011315783485770226
381 0.0010818841401487589
382 0.0010343471076339483
383 0.0009889513021335006
384 0.0009455165127292275
385 0.0009040085133165121
386 0.0008643213077448308
387 0.0008263929048553109
388 0.0007900940836407244
389 0.000755423738155514
390 0.0007222677231766284
391 0.000690582615789026
392 0.0006602748180739582
393 0.0006313036428764462
394 0.0006036281702108681
395 0.0005771565483883023
396 0.0005518379621207714
397 0.00052764912834391
398 0.0005045186262577772
399 0.0004824130155611783
400 0.0004612554039340466
401 0.0004410454130265862
402 0.0004217214591335505
403 0.0004032519645988941
404 0.00038558835512958467
405 0.00036869439645670354
406 0.00035254578688181937
407 0.0003370978811290115
408 0.0003223505336791277
409 0.0003082451585214585
410 0.0002947371

### Define your own modules

In [12]:
import torch

class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred
    
model = TwoLayerNet(D_in, H, D_out)

criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    y_pred = model(x)
    
    loss = criterion(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 692.4212646484375
1 638.226318359375
2 591.80517578125
3 551.4996948242188
4 515.8740234375
5 483.9986267089844
6 455.16510009765625
7 428.81878662109375
8 404.5206604003906
9 382.0644836425781
10 361.1020812988281
11 341.43878173828125
12 322.9793395996094
13 305.4423522949219
14 288.8076477050781
15 272.9966735839844
16 257.951171875
17 243.69097900390625
18 230.13938903808594
19 217.22789001464844
20 204.9268341064453
21 193.21893310546875
22 182.0924835205078
23 171.52210998535156
24 161.49134826660156
25 151.9659423828125
26 142.9309844970703
27 134.3842315673828
28 126.3046875
29 118.6611557006836
30 111.43988037109375
31 104.63069915771484
32 98.21143341064453
33 92.16549682617188
34 86.46083068847656
35 81.09092712402344
36 76.04642486572266
37 71.3086929321289
38 66.8655776977539
39 62.70244216918945
40 58.80543899536133
41 55.14900207519531
42 51.73414993286133
43 48.535552978515625
44 45.54789733886719
45 42.75592803955078
46 40.14765548706055
47 37.70569610595703
48 35.42

404 0.0004492470470722765
405 0.0004383505729492754
406 0.0004277042462490499
407 0.0004173365014139563
408 0.00040721750701777637
409 0.0003973401035182178
410 0.00038771596155129373
411 0.0003783171414397657
412 0.00036915286909788847
413 0.0003602170618250966
414 0.00035148669849149883
415 0.00034297205274924636
416 0.000334674259647727
417 0.0003265751583967358
418 0.0003186678804922849
419 0.00031095847953110933
420 0.00030343918479047716
421 0.00029609978082589805
422 0.0002889308671001345
423 0.0002819534274749458
424 0.00027513952227309346
425 0.0002684963110368699
426 0.00026200260617770255
427 0.00025567569537088275
428 0.0002494933141861111
429 0.00024347596627194434
430 0.00023760365729685873
431 0.000231865284149535
432 0.0002262626658193767
433 0.00022081001952756196
434 0.00021548496442846954
435 0.00021028821356594563
436 0.00020521738042589277
437 0.00020026316633448005
438 0.0001954441104317084
439 0.0001907314290292561
440 0.00018613330030348152
441 0.000181649040314