# Backpropagation

7 x 5 x 2 의 3계층 신경망 구조를 작성하여 오차 역전파로 학습이 잘 진행되고, 편미분이 수치 미분의 결과와 동일한지 검증한다.

In [1]:
import numpy as np
np.set_printoptions(suppress=True)

def init():
    global i, w_2, b_2, w_3, b_3, t
        
    i = np.array([0.4,-0.2,0.1,0.1,-0.15,0.6,-0.9]).reshape(-1, 1)

    np.random.seed(12)
    w_2 = np.random.rand(5, 7)
    b_2 = np.random.rand(5).reshape(-1, 1)
    w_3 = np.random.rand(2, 5)
    b_3 = np.random.rand(2).reshape(-1, 1)
    
    t = np.array([[0.87503811],[0.83690408]])

# 정답 행렬
np.random.seed(10)
t_w_2 = np.random.rand(5, 7)
t_b_2 = np.random.rand(5).reshape(-1, 1)
t_w_3 = np.random.rand(2, 5)
t_b_3 = np.random.rand(2).reshape(-1, 1)

In [2]:
def sigmoid(z: np.ndarray):
    return 1 / (1 + np.exp(-z))

def d_sigmoid(z: np.ndarray):
    return sigmoid(z) * (1.0 - sigmoid(z))

def gradient_check(analytic, numeric):
    numerator = abs(analytic-numeric)
    denominator = max(analytic,numeric)
    difference = numerator/denominator
    
    if difference < 1e-2: # cs231n의 권장 수치는 1e-7인데 여기서는 좀 더 높인다.
        print ("The gradient is correct!")
    else:
        print ("The gradient is wrong!")
        
def forward(i):
    global w_2, b_2, w_3, b_3
    
    net_h = np.dot(w_2, i) + b_2
    out_h = sigmoid(net_h)
    
    net_o = np.dot(w_3, out_h) + b_3
    out_o = sigmoid(net_o)
    
    return net_h, out_h, net_o, out_o

In [3]:
def train(i, t):
    lr = 0.1
    
    global w_2, w_3
    
    net_h, out_h, net_o, out_o = forward(i)

    d_output_errors = - (t - out_o)
    delta_w_3 = np.dot(d_output_errors * d_sigmoid(net_o), out_h.T)
    w_3 += - lr * delta_w_3
    
    # backpropagation!
    d_hidden_errors = np.dot(w_3.T, d_output_errors * d_sigmoid(net_o))
    delta_w_2 = np.dot(d_hidden_errors * d_sigmoid(net_h), i.T)
    w_2 += - lr * delta_w_2
    
    # bias 학습 필요
    
    return delta_w_2, delta_w_3

def query(i, t):
    _, _, _, out_o = forward(i)

    # E 출력
    print(t - out_o)

In [4]:
init()
train(i,t)
query(i,t)

[[ 0.02447658]
 [-0.01159232]]


In [5]:
# Gradient Checking
# w_2 검증

init()
delta_w_2, delta_w_3 = train(i,t)
h = 1e-7

init()
for j in range(7):
    for k in range(5):
        # 수치 미분(numerical gradient) 진행
        init()
        w_2[k][j] += h
        _, _, _, out_o = forward(i)
        e1 = np.sum((t - out_o) ** 2) / 2

        init()
        w_2[k][j] -= h
        _, _, _, out_o = forward(i)
        e2 = np.sum((t - out_o) ** 2) / 2

        # 수치 미분 결과가 해석적 미분(analytic gradient)과 동일한지 검증
        numeric_delta_w_2 = (e1 - e2) / (2 * h)
        print("%.16f, %.16f" % (delta_w_2[k][j], numeric_delta_w_2))
        gradient_check(delta_w_2[k][j], numeric_delta_w_2)
    print()

-0.0000551309011710, -0.0000550599404394
The gradient is correct!
-0.0002346868297633, -0.0002346188692002
The gradient is correct!
0.0000003325025091, 0.0000003956931354
The gradient is wrong!
-0.0002301421339975, -0.0002300861183437
The gradient is correct!
-0.0001182083089648, -0.0001181382365523
The gradient is correct!

0.0000275654505855, 0.0000275299732013
The gradient is correct!
0.0001173434148817, 0.0001173094580460
The gradient is correct!
-0.0000001662512546, -0.0000001978568676
The gradient is correct!
0.0001150710669987, 0.0001150430624245
The gradient is correct!
0.0000591041544824, 0.0000590691281695
The gradient is correct!

-0.0000137827252927, -0.0000137649966295
The gradient is correct!
-0.0000586717074408, -0.0000586547119468
The gradient is correct!
0.0000000831256273, 0.0000000989112221
The gradient is wrong!
-0.0000575355334994, -0.0000575215344648
The gradient is correct!
-0.0000295520772412, -0.0000295345738426
The gradient is correct!

-0.0000137827252927, -0

In [6]:
# 실제 학습이 잘 되는지 확인
for _ in range(100): 
    train(i,t)
query(i,t)

[[ 0.01847292]
 [-0.00897506]]


In [7]:
# 학습된 가중치와 정답 가중치 비교
w_2

array([[0.15464142, 0.73981041, 0.26343466, 0.53385904, 0.0143955 ,
        0.91946488, 0.89963805],
       [0.03543062, 0.95594474, 0.13771162, 0.28433065, 0.60532974,
        0.94723892, 0.84821486],
       [0.00226082, 0.52122523, 0.55203803, 0.48537781, 0.76813356,
        0.16071913, 0.76455688],
       [0.02277663, 0.13422676, 0.11676472, 0.31038929, 0.67071508,
        0.47418002, 0.81174293],
       [0.29062181, 0.73260847, 0.70288111, 0.32782823, 0.3342594 ,
        0.97961061, 0.62225321]])

In [8]:
t_w_2

array([[0.77132064, 0.02075195, 0.63364823, 0.74880388, 0.49850701,
        0.22479665, 0.19806286],
       [0.76053071, 0.16911084, 0.08833981, 0.68535982, 0.95339335,
        0.00394827, 0.51219226],
       [0.81262096, 0.61252607, 0.72175532, 0.29187607, 0.91777412,
        0.71457578, 0.54254437],
       [0.14217005, 0.37334076, 0.67413362, 0.44183317, 0.43401399,
        0.61776698, 0.51313824],
       [0.65039718, 0.60103895, 0.8052232 , 0.52164715, 0.90864888,
        0.31923609, 0.09045935]])

# 미분 계산

In [9]:
import sympy
sympy.init_printing(use_latex='mathjax')
w11, x1, w12, x2, w13, x3, b3 = sympy.symbols('w11 x1 w12 x2 w13 x3 b3')
z = w11 * x1 + w12 * x2 + w13 * x3 + b3
z

b₃ + w₁₁⋅x₁ + w₁₂⋅x₂ + w₁₃⋅x₃

In [10]:
sympy.Derivative(z, w11).doit()

x₁