### 作業目標: 使用Pytorch進行微分與倒傳遞
這份作業我們會實作微分與倒傳遞以及使用Pytorch的Autograd。

### 使用Pytorch實作微分與倒傳遞

這裡我們簡單地實作一個兩層的神經網路來處理回歸問題，其中loss function為L2 loss(對所有元素的平方誤差加總)

$$
L2\_loss = \sum (y_{pred}-y)^2
$$

兩層神經網路如下所示
$$
y_{pred} = ReLU(XW_1)W_2
$$

In [1]:
import torch
# Device on which the tensors created in the following cells are placed (CPU here).
device = torch.device('cpu')

In [2]:
# Two-layer regression network trained with hand-written gradients.
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets
x = torch.randn((N, D_in)).to(device)
y = torch.randn((N, D_out)).to(device)

# Randomly initialised weights of the two layers
W1 = torch.randn((D_in, H)).to(device)
W2 = torch.randn((H, D_out)).to(device)

# Gradient-descent step size
learning_rate = 1e-6

# Train for 500 epochs (each epoch is one full-batch step)
for t in range(500):
    # Forward pass: y_pred = ReLU(x W1) W2
    h = x @ W1
    h_relu = h.relu()
    y_pred = h_relu @ W2

    # Loss: squared error summed over every element
    residual = y_pred - y
    loss = residual.square().sum()
    print(t, loss.item())

    # Backward pass: apply the chain rule by hand, output layer first
    y_pred_grad = residual * 2.
    W2_grad = h_relu.t() @ y_pred_grad
    # ReLU only lets gradient through where its input was positive
    h_grad = (y_pred_grad @ W2.t()) * (h > 0.)
    W1_grad = x.t() @ h_grad

    # Parameter update, applied in place
    W1.data.sub_(learning_rate * W1_grad)
    W2.data.sub_(learning_rate * W2_grad)

0 28857712.0
1 22943972.0
2 23173282.0
3 26145276.0
4 28730274.0
5 27749460.0
6 22061424.0
7 14291092.0
8 7917722.0
9 4134870.0
10 2241319.75
11 1344878.875
12 907821.375
13 675370.375
14 536491.5
15 443457.09375
16 375306.34375
17 322192.6875
18 279157.90625
19 243500.84375
20 213413.4375
21 187884.953125
22 166061.28125
23 147276.609375
24 131022.8515625
25 116902.6875
26 104590.3203125
27 93802.734375
28 84323.25
29 75983.5625
30 68612.65625
31 62075.57421875
32 56261.015625
33 51078.828125
34 46449.4609375
35 42302.578125
36 38579.58984375
37 35230.80078125
38 32217.525390625
39 29496.923828125
40 27036.54296875
41 24809.052734375
42 22788.005859375
43 20951.74609375
44 19281.5703125
45 17760.5078125
46 16374.841796875
47 15110.03125
48 13953.6865234375
49 12895.6162109375
50 11926.34375
51 11037.37109375
52 10221.3798828125
53 9472.1865234375
54 8783.15625
55 8149.1123046875
56 7564.990234375
57 7026.73095703125
58 6530.4365234375
59 6072.52197265625
60 5649.5341796875
61 5258.472

### 使用Pytorch的Autograd

In [None]:
import torch
# Device on which the tensors created in the following cells are placed (CPU here).
device = torch.device('cpu')

In [3]:
# Two-layer regression network trained with gradients computed by autograd.
# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Randomly generate x, y directly on the target device
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

# Initialise weights W1, W2.
# They are created on `device` in the same call that sets requires_grad=True.
# Calling .to(device) afterwards would, whenever a real device/dtype change
# happens, return a non-leaf copy whose .grad is never populated by backward().
W1 = torch.randn((D_in, H), device=device, requires_grad=True)
W2 = torch.randn((H, D_out), device=device, requires_grad=True)

# Set the learning rate
learning_rate = 1e-6

# Train for 500 epochs
for t in range(500):
    # Forward pass: compute y_pred = ReLU(x W1) W2
    y_pred = torch.matmul(torch.relu(torch.matmul(x, W1)), W2)

    # Compute the loss (sum of squared errors)
    loss = torch.square(y_pred - y).sum()
    print(t, loss.item())

    # Backward pass: autograd fills W1.grad and W2.grad
    loss.backward()

    # Parameter update: the update itself must not be recorded by autograd,
    # so it runs under torch.no_grad(). Inside this context the leaf tensors
    # can be modified in place directly; going through .data is unnecessary.
    with torch.no_grad():
        # Update W1 and W2
        W1 -= learning_rate * W1.grad
        W2 -= learning_rate * W2.grad

        # Clear the accumulated gradients now that they have been applied;
        # backward() adds to .grad rather than overwriting it.
        W1.grad.zero_()
        W2.grad.zero_()

0 31796488.0
1 25652004.0
2 22268644.0
3 18775970.0
4 14643703.0
5 10463907.0
6 7032214.5
7 4600789.0
8 3043196.75
9 2085861.75
10 1499474.125
11 1130572.0
12 888193.8125
13 720739.5625
14 598869.375
15 506321.5625
16 433601.96875
17 374908.1875
18 326509.9375
19 286024.4375
20 251765.125
21 222615.359375
22 197528.265625
23 175819.03125
24 156935.890625
25 140513.171875
26 126149.015625
27 113516.6171875
28 102412.34375
29 92586.8515625
30 83859.8984375
31 76096.9453125
32 69165.8203125
33 62961.9375
34 57399.94921875
35 52400.83984375
36 47900.66796875
37 43843.56640625
38 40178.20703125
39 36861.83203125
40 33854.5625
41 31127.1328125
42 28647.369140625
43 26389.556640625
44 24333.080078125
45 22457.802734375
46 20743.92578125
47 19176.3203125
48 17739.966796875
49 16422.318359375
50 15213.9052734375
51 14104.2685546875
52 13085.685546875
53 12148.9169921875
54 11285.77734375
55 10491.0537109375
56 9758.5849609375
57 9081.9814453125
58 8457.0009765625
59 7879.1982421875
60 7344.5932

In [None]:
# Autograd version of the two-layer regression network training loop.
# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Randomly generate x, y directly on the target device
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

# Initialise weights W1, W2 as leaf tensors on `device`.
# Passing device= at creation (instead of .to(device) on a requires_grad
# tensor) keeps them leaves on every device, so backward() populates .grad.
W1 = torch.randn((D_in, H), device=device, requires_grad=True)
W2 = torch.randn((H, D_out), device=device, requires_grad=True)

# Set the learning rate
learning_rate = 1e-6

# Train for 500 epochs
for t in range(500):
    # Forward pass: compute y_pred
    y_pred = torch.matmul(torch.relu(torch.matmul(x, W1)), W2)

    # Compute the loss
    loss = torch.square(y_pred - y).sum()
    print(t, loss.item())

    # Backward pass: compute the gradients of the loss w.r.t. W1 and W2
    loss.backward()

    # Parameter update: run under torch.no_grad() so the update is not
    # tracked by autograd.
    with torch.no_grad():
        # Update W1 and W2 in place (no new tensor allocated per step)
        W1 -= learning_rate * W1.grad
        W2 -= learning_rate * W2.grad
        # Reset the stored gradients, since backward() accumulates into .grad
        W1.grad.zero_()
        W2.grad.zero_()