
Warm-up: numpy
--------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x using Euclidean error.

This implementation uses numpy to manually compute the forward pass, loss, and
backward pass.

A numpy array is a generic n-dimensional array; it does not know anything about
deep learning or gradients or computational graphs, and is just a way to perform
generic numeric computations.

Source Link: http://pytorch.org/tutorials/beginner/examples_tensor/two_layer_net_numpy.html


<h1 style="background-image: linear-gradient( 135deg, #ABDCFF 10%, #0396FF 100%);"> Orinal Tutorial code

In [1]:
%matplotlib inline

In [2]:
import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 27134129.1492
1 22763629.5156
2 21288268.6851
3 20084319.3526
4 17630734.9061
5 14139132.1337
6 10198268.5412
7 6875436.3943
8 4441344.30624
9 2883367.64557
10 1918415.78307
11 1337582.17532
12 979192.661553
13 751848.045015
14 599913.532731
15 493255.593274
16 414607.602011
17 354156.014454
18 306105.045991
19 266941.405281
20 234313.330396
21 206667.966218
22 183069.019013
23 162739.46491
24 145126.656406
25 129765.902562
26 116315.990973
27 104495.703445
28 94080.0363571
29 84869.0120901
30 76702.0624814
31 69440.7195776
32 62970.9432794
33 57195.8917614
34 52032.6783146
35 47403.9475426
36 43245.4784712
37 39502.6001431
38 36129.5051176
39 33081.2233377
40 30324.6091618
41 27829.102591
42 25566.1945454
43 23510.8376215
44 21641.4124103
45 19938.7005338
46 18386.1868093
47 16969.3937777
48 15674.6996543
49 14491.7098573
50 13408.3398872
51 12414.7914754
52 11502.934711
53 10665.3338878
54 9895.36630323
55 9186.82962877
56 8534.23169176
57 7932.75859373
58 7377.97444103
59 6865.982

---

<h1 style="background-image: linear-gradient( 135deg, #ABDCFF 10%, #0396FF 100%);"> An One-move Example

### Demo Setting

In [3]:
# Demo set
N = 4      # N     is batch size
D_in = 3   # D_in  is input dimension
H = 2      # H     is hidden dimension
D_out = 1  # D_out is output dimension

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

# Learning rate
learning_rate = 1e-6

In [4]:
# Check input and output
print("x shape: ", x.shape)
print(x)
print("----------------------------------------")
print("y shape: ", y.shape)
print(y)

print("----------------------------------------")
print("----------------------------------------")

# Check initialized weights
print("w1 shape: ", w1.shape)
print(w1)
print("----------------------------------------")
print("w2 shape: ", w2.shape)
print(w2)

x shape:  (4, 3)
[[ 1.46630273 -0.28439528 -2.17351802]
 [ 0.9221081   0.67609038 -0.81827607]
 [-0.34743241 -0.33185863  0.10973094]
 [ 0.23806759 -1.19525403 -0.45219163]]
----------------------------------------
y shape:  (4, 1)
[[ 0.38602718]
 [-1.62320251]
 [-0.02611463]
 [-1.12400456]]
----------------------------------------
----------------------------------------
w1 shape:  (3, 2)
[[-0.00765758 -0.40056082]
 [-1.0192426   0.98049355]
 [ 0.32065702  0.86188859]]
----------------------------------------
w2 shape:  (2, 1)
[[-0.1960287 ]
 [ 0.54521736]]


### Forward Pass

In [5]:
# Forward pass: compute predicted y
h = x.dot(w1)
h_relu = np.maximum(h, 0)
y_pred = h_relu.dot(w2)

# # Compute and print loss
loss = np.square(y_pred - y).sum()
print(loss)

3.62141684539


### Backprop

In [6]:
# Backprop to compute gradients of w1 and w2 with respect to loss
grad_y_pred = 2.0 * (y_pred - y)
grad_w2 = h_relu.T.dot(grad_y_pred)
grad_h_relu = grad_y_pred.dot(w2.T)
grad_h = grad_h_relu.copy()
grad_h[h < 0] = 0
grad_w1 = x.T.dot(grad_h)

# Update weights
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2

### Combined..?

In [7]:
# Demo set
N = 2      # N     is batch size
D_in = 5   # D_in  is input dimension
H = 2      # H     is hidden dimension
D_out = 1  # D_out is output dimension

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

# Learning rate
learning_rate = 1e-6


for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 2.54744656042
1 2.54737997933
2 2.54731340047
3 2.54724682385
4 2.54718024947
5 2.54711367732
6 2.5470471074
7 2.54698053972
8 2.54691397427
9 2.54684741106
10 2.54678085008
11 2.54671429134
12 2.54664773483
13 2.54658118055
14 2.54651462851
15 2.5464480787
16 2.54638153113
17 2.54631498579
18 2.54624844268
19 2.54618190181
20 2.54611536316
21 2.54604882676
22 2.54598229258
23 2.54591576064
24 2.54584923093
25 2.54578270345
26 2.54571617821
27 2.5456496552
28 2.54558313442
29 2.54551661587
30 2.54545009956
31 2.54538358548
32 2.54531707363
33 2.54525056401
34 2.54518405662
35 2.54511755146
36 2.54505104854
37 2.54498454785
38 2.54491804939
39 2.54485155316
40 2.54478505916
41 2.54471856739
42 2.54465207786
43 2.54458559055
44 2.54451910547
45 2.54445262263
46 2.54438614202
47 2.54431966363
48 2.54425318748
49 2.54418671356
50 2.54412024186
51 2.5440537724
52 2.54398730517
53 2.54392084016
54 2.54385437739
55 2.54378791685
56 2.54372145853
57 2.54365500245
58 2.54358854859
59 2.543522