# Table of Contents
### Tensors
- Warm-up: numpy
- Pytorch: Tensors

### Autograd
- Pytorch: Tensors and autograd
- Pytorch: Defining new autograd functions
- TensorFlow: Static Graphs

### nn module 
- Pytorch: nn
- Pytorch: optim
- Pytorch: Custom nn Modules
- Pytorch: Control Flow + Weight Sharing

### Examples
- Tensors
- Autograd
- nn module

In [2]:
# Warm-up: numpy
# -*- coding:  utf-8 -*-
import numpy as np

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random starting weights for the two linear layers
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear gives the predicted y
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, reported every iteration
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the output back to w1
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29913837.264608484
1 23187694.460365783
2 20942201.470368057
3 19719152.989012405
4 18031973.15533874
5 15135617.186022755
6 11635652.770433625
7 8170675.513401496
8 5458269.08810668
9 3557139.8074454907
10 2354624.3382138414
11 1610865.9372185343
12 1156213.916305292
13 870011.2503119644
14 683354.3751354484
15 555393.2158363309
16 463160.8821313317
17 393577.61122498574
18 338883.7719188229
19 294562.7940284497
20 257820.64975826387
21 226887.47108979482
22 200535.3804203067
23 177883.18813258916
24 158310.1699915392
25 141289.65146860186
26 126429.16471752497
27 113411.90385923596
28 101946.83941988015
29 91815.97603666614
30 82842.3929929185
31 74869.1981966443
32 67768.57730059448
33 61439.22640537152
34 55796.25799715304
35 50746.52165845057
36 46210.053662301856
37 42127.185691579776
38 38449.7151299458
39 35130.81335545928
40 32133.26043855111
41 29424.427743100776
42 26970.407543834903
43 24741.73848696771
44 22717.90816815115
45 20875.82008190883
46 19197.980437653307
47 17

358 0.0006248976778166955
359 0.0005962274449550725
360 0.0005688816019360583
361 0.0005428032696315806
362 0.000517927297094386
363 0.0004941917963879197
364 0.0004715531369168361
365 0.0004499568963885044
366 0.00042936127905218485
367 0.0004097122325847699
368 0.00039096312865743305
369 0.00037307873691938564
370 0.00035601746675565846
371 0.0003397480751520208
372 0.00032422346520170594
373 0.000309412051466174
374 0.0002952763193748406
375 0.00028179064997966067
376 0.00026892357524899017
377 0.0002566885893097347
378 0.0002449788836439917
379 0.00023380271186740627
380 0.00022313791612699378
381 0.0002129643426162547
382 0.00020325589378438201
383 0.00019399380090800072
384 0.00018515584178659995
385 0.00017672143243647518
386 0.0001686727011189825
387 0.00016099314404925154
388 0.00015366663259856056
389 0.00014667400418451473
390 0.00014000113723755412
391 0.0001336341879411083
392 0.00012755854995811372
393 0.0001217585543975581
394 0.00011622477711300434
395 0.000110943133925

In [4]:
# Pytorch implementation
# -*- coding: utf-8 -*-

import torch

dtype = torch.float
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data on the selected device
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    
    # Compute and print loss; .item() converts the 0-dim tensor to a Python float
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)
    
    # Backprop to compute gradients of the loss with respect to w1 and w2
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    # ReLU passes no gradient where its input was negative
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

cuda:0
0 31572772.0
1 30490226.0
2 36632312.0
3 43286404.0
4 42356664.0
5 30180974.0
6 15533451.0
7 6443276.0
8 2768567.75
9 1473899.375
10 988380.6875
11 761336.5
12 624807.875
13 527033.8125
14 450719.4375
15 388678.875
16 337261.09375
17 294149.1875
18 257665.640625
19 226597.609375
20 200020.828125
21 177127.59375
22 157320.859375
23 140113.453125
24 125097.375
25 111946.984375
26 100442.3125
27 90298.59375
28 81333.21875
29 73385.0390625
30 66322.359375
31 60037.98828125
32 54427.33984375
33 49411.09375
34 44916.73828125
35 40878.328125
36 37245.6015625
37 33972.4609375
38 31019.353515625
39 28350.7265625
40 25936.927734375
41 23749.34375
42 21765.228515625
43 19966.572265625
44 18332.548828125
45 16845.279296875
46 15489.943359375
47 14254.0263671875
48 13127.720703125
49 12098.970703125
50 11159.12890625
51 10299.453125
52 9511.833984375
53 8789.884765625
54 8127.40625
55 7518.740234375
56 6959.46728515625
57 6445.28076171875
58 5971.9072265625
59 5535.97412109375
60 5134.203613

385 0.00022142445959616452
386 0.00021607897360809147
387 0.00021068232308607548
388 0.00020469364244490862
389 0.00020010405569337308
390 0.00019518259796313941
391 0.000189638085430488
392 0.00018481061852071434
393 0.00017975247465074062
394 0.00017562255379743874
395 0.0001717964478302747
396 0.00016739318380132318
397 0.00016353395767509937
398 0.00015943382459226996
399 0.00015617164899595082
400 0.0001522297679912299
401 0.00014893831394147128
402 0.00014585506869480014
403 0.00014280990581028163
404 0.0001392417907482013
405 0.000136174974613823
406 0.0001331544917775318
407 0.0001311245432589203
408 0.00012835997040383518
409 0.0001254254166269675
410 0.00012307323049753904
411 0.00012039248395012692
412 0.00011759381595766172
413 0.00011542386346263811
414 0.00011269657989032567
415 0.0001104784314520657
416 0.00010854622814804316
417 0.00010692102659959346
418 0.00010438983736094087
419 0.00010256177483825013
420 0.00010039289190899581
421 9.858924022410065e-05
422 9.6557923