# Backpropagation

7 x 6 x 5 x 2 의 4계층 신경망 구조를 작성하여 오차 역전파로 학습이 잘 진행되고, 편미분이 수치 미분의 결과와 동일한지 검증한다.

In [1]:
import numpy as np
np.set_printoptions(suppress=True)

def init():
    global i, w_1, b_1, w_2, b_2, w_3, b_3, t
        
    i = np.array([0.4,-0.2,0.1,0.1,-0.15,0.6,-0.9]).reshape(-1, 1)

    np.random.seed(12)
    w_1 = np.random.rand(6, 7)
    b_1 = np.random.rand(6).reshape(-1, 1)
    w_2 = np.random.rand(5, 6)
    b_2 = np.random.rand(5).reshape(-1, 1)
    w_3 = np.random.rand(2, 5)
    b_3 = np.random.rand(2).reshape(-1, 1)
    
    t = np.array([[0.87503811],[0.83690408]])

In [2]:
def sigmoid(z: np.ndarray):
    return 1 / (1 + np.exp(-z))

def d_sigmoid(a: np.ndarray):
    return a * (1.0 - a)

def gradient_check(analytic, numeric):
    numerator = abs(analytic-numeric)
    denominator = max(analytic,numeric)
    difference = numerator/denominator
    
    if difference < 1e-3: # cs231n의 권장 수치는 1e-7인데 여기서는 좀 더 높인다.
        print ("The gradient is correct!")
    else:
        print ("The gradient is wrong!")
        
def forward(i):
    global w_1, b_1, w_2, b_2, w_3, b_3
    
    net_h1 = np.dot(w_1, i) + b_1
    out_h1 = sigmoid(net_h1)
    
    net_h2 = np.dot(w_2, out_h1) + b_2
    out_h2 = sigmoid(net_h2)
    
    net_o = np.dot(w_3, out_h2) + b_3
    out_o = sigmoid(net_o)
    
    return net_h1, out_h1, net_h2, out_h2, net_o, out_o

In [3]:
def train(i, t):
    lr = 0.1
    
    global w_1, b_1, w_2, b_2, w_3, b_3
    global delta_w_1, delta_b_1, delta_w_2, delta_b_2, delta_w_3, delta_b_3
    
    net_h1, out_h1, net_h2, out_h2, net_o, out_o = forward(i)

    # backpropagation!
    d_o_errors = - (t - out_o)
    delta_w_3 = np.dot(d_o_errors * d_sigmoid(out_o), out_h2.T)
    w_3 += - lr * delta_w_3
    delta_b_3 = d_o_errors * d_sigmoid(out_o)
    b_3 += - lr * delta_b_3

    d_h2_errors = np.dot(w_3.T, d_o_errors * d_sigmoid(out_o))
    delta_w_2 = np.dot(d_h2_errors * d_sigmoid(out_h2), out_h1.T)
    w_2 += - lr * delta_w_2
    delta_b_2 = d_h2_errors * d_sigmoid(out_h2)
    b_2 += - lr * delta_b_2

    d_h1_errors = np.dot(w_2.T, d_h2_errors * d_sigmoid(out_h2))
    delta_w_1 = np.dot(d_h1_errors * d_sigmoid(out_h1), i.T)
    w_1 += - lr * delta_w_1
    delta_b_1 = d_h1_errors * d_sigmoid(out_h1)
    b_1 += - lr * delta_b_1
    

def query(i, t):
    _, _, _, _, _, out_o = forward(i)

    # E 출력
    print(t - out_o)

In [4]:
init()
train(i,t)
query(i,t)

[[-0.09422695]
 [-0.12718581]]


In [5]:
# Gradient Checking(w_1, b_1)
init()
train(i,t)
h = 1e-7

init()
for k in range(6):
    for j in range(7):
        # 수치 미분(numerical gradient) 진행
        init()
        w_1[k][j] += h
        _, _, _, _, _, out_o = forward(i)
        e1 = np.sum((t - out_o) ** 2) / 2

        init()
        w_1[k][j] -= h
        _, _, _, _, _, out_o = forward(i)
        e2 = np.sum((t - out_o) ** 2) / 2

        # 수치 미분 결과가 해석적 미분(analytic gradient)과 동일한지 검증
        numeric_delta_w_1 = (e1 - e2) / (2 * h)
        print("%.16f, %.16f" % (delta_w_1[k][j], numeric_delta_w_1))
        gradient_check(delta_w_1[k][j], numeric_delta_w_1)

    init()
    b_1[k] += h
    _, _, _, _, _, out_o = forward(i)
    e1 = np.sum((t - out_o) ** 2) / 2

    init()
    b_1[k] -= h
    _, _, _, _, _, out_o = forward(i)
    e2 = np.sum((t - out_o) ** 2) / 2

    # 수치 미분 결과가 해석적 미분(analytic gradient)과 동일한지 검증
    numeric_delta_b_1 = (e1 - e2) / (2 * h)
    print()
    print("%.16f, %.16f" % (delta_b_1[k], numeric_delta_b_1))
    gradient_check(delta_b_1[k], numeric_delta_b_1)
    print()

0.0000974372978874, 0.0000974974961876
The gradient is correct!
-0.0000487186489437, -0.0000487487654410
The gradient is correct!
0.0000243593244718, 0.0000243742439426
The gradient is correct!
0.0000243593244718, 0.0000243742439426
The gradient is correct!
-0.0000365389867078, -0.0000365616738274
The gradient is correct!
0.0001461559468311, 0.0001462461922397
The gradient is correct!
-0.0002192339202466, -0.0002193693057068
The gradient is correct!

0.0002435932447185, 0.0002437437317954
The gradient is correct!

0.0000913394440743, 0.0000913891792104
The gradient is correct!
-0.0000456697220372, -0.0000456946720045
The gradient is correct!
0.0000228348610186, 0.0000228473663599
The gradient is correct!
0.0000228348610186, 0.0000228473663599
The gradient is correct!
-0.0000342522915279, -0.0000342708066786
The gradient is correct!
0.0001370091661115, 0.0001370838078468
The gradient is correct!
-0.0002055137491673, -0.0002056257334543
The gradient is correct!

0.0002283486101858, 0.000

In [6]:
# 실제 학습이 잘 되는지 확인
for _ in range(100): 
    train(i,t)
query(i,t)

[[-0.08964921]
 [-0.11842518]]


In [7]:
w_1

array([[0.15315712, 0.74055256, 0.26306359, 0.53348796, 0.01495211,
        0.91723843, 0.90297772],
       [0.0324718 , 0.95742415, 0.13697192, 0.28359095, 0.60643929,
        0.9428007 , 0.85487219],
       [0.00186352, 0.52142388, 0.5519387 , 0.48527849, 0.76828255,
        0.16012318, 0.76545081],
       [0.01956525, 0.13583245, 0.11596188, 0.30958645, 0.67191935,
        0.46936296, 0.81896852],
       [0.28857552, 0.73363161, 0.70236954, 0.32731666, 0.33502675,
        0.97654118, 0.62685746],
       [0.94967597, 0.76779443, 0.82484986, 0.40648091, 0.45154749,
        0.3996753 , 0.99657266]])

# 미분 계산

In [8]:
import sympy
sympy.init_printing(use_latex='mathjax')
w11, x1, w12, x2, w13, x3, b3 = sympy.symbols('w11 x1 w12 x2 w13 x3 b3')
z = w11 * x1 + w12 * x2 + w13 * x3 + b3
z

b₃ + w₁₁⋅x₁ + w₁₂⋅x₂ + w₁₃⋅x₃

In [9]:
sympy.Derivative(z, b3).doit()

1