In [26]:
import numpy as np
import sys, os
sys.path.append(os.pardir)
from common.layers import *
from collections import OrderedDict
from tqdm import tqdm

수치 미분으로 기울기를 구하면 계산 시간이 오래 걸림. 따라서 오차역전파법으로 기울기를 계산하도록 구현

## 5.4 단순한 계층 구현

### 곱셈 계층

In [1]:
class MulLayer:
    """Multiplication node of a computational graph.

    Both forward operands are cached, because the gradient with respect to
    one operand is the upstream gradient scaled by the *other* operand.
    """

    def __init__(self):
        # Forward operands; populated by forward().
        self.x = self.y = None

    def forward(self, x, y):
        """Remember ``x`` and ``y`` and return their product."""
        self.x, self.y = x, y
        return self.x * self.y

    def backward(self, dout):
        """Return ``(dx, dy)`` given the upstream gradient ``dout``."""
        # d(x*y)/dx = y and d(x*y)/dy = x, so the cached operands are swapped.
        return dout * self.y, dout * self.x

In [2]:
# Example 1: price of two apples including tax, and the gradient of that
# price with respect to every input.

apple, apple_num, tax = 100, 2, 1.1

# One MulLayer per multiplication node; each keeps its own cached operands.
mul_apple_layer, mul_tax_layer = MulLayer(), MulLayer()

# Forward pass: (apple * apple_num) * tax
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

print(f'price : {price}')

# Backward pass: start from d(price)/d(price) = 1 and walk the graph in
# the reverse order of the forward pass.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(dapple, dapple_num, dtax, dapple_price)

price : 220.00000000000003
2.2 110.00000000000001 200 1.1


부동소수점 연산의 표현 오차 때문에 `220.00000000000003`처럼 소수점 아래에 미세한 값이 붙어서 출력됨.

### 덧셈 계층

In [9]:
class AddLayer:
    """Addition node of a computational graph.

    The layer is stateless: the backward pass forwards the upstream gradient
    unchanged to both inputs, so nothing from the forward pass is stored.
    """

    def __init__(self):
        """Create the layer; there is no state to initialise."""

    def forward(self, x, y):
        """Return the sum ``x + y``."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``, each equal to ``dout`` times one."""
        # d(x+y)/dx = d(x+y)/dy = 1
        return dout * 1, dout * 1

In [10]:
# Example 2: two apples plus three oranges with tax, combining the
# multiplication and addition layers.
apple, num_a = 100, 2
orange, num_o = 150, 3
tax = 1.1

# One layer instance per node of the graph.
mal, mol = MulLayer(), MulLayer()  # apple subtotal, orange subtotal
aaol = AddLayer()                  # apple subtotal + orange subtotal
mtl = MulLayer()                   # apply tax

# Forward pass
apple_price = mal.forward(apple, num_a)
orange_price = mol.forward(orange, num_o)
ao_price = aaol.forward(apple_price, orange_price)
price = mtl.forward(ao_price, tax)
print(f'price : {price}')

# Backward pass: visit the layers in the reverse order of the forward pass.
dprice = 1
dao_price, dtax = mtl.backward(dprice)
dapple_price, dorange_price = aaol.backward(dao_price)
dapple, dnum_a = mal.backward(dapple_price)
dorange, dnum_o = mol.backward(dorange_price)

print(dapple, dnum_a, dorange, dnum_o, dtax)

price : 715.0000000000001
2.2 110.00000000000001 3.3000000000000003 165.0 650


## 5.5 활성화 함수 계층 구현 

### 5.5.1 ReLU

In [12]:
class Relu:
    """ReLU activation layer.

    ``forward`` records which input elements were ``<= 0``; ``backward``
    blocks the gradient at exactly those positions.
    """

    def __init__(self):
        # Boolean array, True where the last forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element ``<= 0`` replaced by 0.

        ``x`` must support ``.copy()`` and boolean-mask assignment (e.g. a
        NumPy array). The input itself is left untouched.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return the gradient w.r.t. the forward input.

        Positions that were ``<= 0`` in the forward pass receive a zero
        gradient; all other positions pass ``dout`` through unchanged.
        """
        # Work on a copy so the caller's upstream-gradient array is not
        # silently modified (forward() already treats its input this way).
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

### 5.5.2 Sigmoid

In [14]:
class Sigmoid:
    """Sigmoid activation layer.

    The forward output is cached because the derivative of the sigmoid can
    be written purely in terms of that output: ``y * (1 - y)``.
    """

    def __init__(self):
        # Forward output; populated by forward(). (Previously this read an
        # undefined name ``out`` and raised NameError on construction.)
        self.out = None

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` and cache it for the backward pass."""
        out = 1 / (1 + np.exp(-x))
        self.out = out

        return out

    def backward(self, dout):
        """Return ``dout * (1 - y) * y`` where ``y`` is the cached output."""
        dx = dout * (1 - self.out) * self.out

        return dx

## 5.6 Affine, Softmax

### 5.6.2 Affine

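`Affine` : 순전파에서 수행하는 행렬 곱과 편향의 합, 즉 $Y = XW + b$ 를 의미  

역전파 시에는 전치행렬과의 곱으로 표현 가능: $\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y} W^{T}$, $\frac{\partial L}{\partial W} = X^{T} \frac{\partial L}{\partial Y}$ (p.172)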

In [15]:
class Affine:
    """Affine (fully connected) layer computing ``x . W + b``.

    ``W`` and ``b`` are stored by reference, not copied. After ``backward``
    the parameter gradients are available as ``dW`` and ``db``.
    """

    def __init__(self, W, b):
        self.W, self.b = W, b
        # Input of the most recent forward pass (needed to compute dW).
        self.x = None
        # Parameter gradients, filled in by backward().
        self.dW = self.db = None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW``/``db`` and return the gradient w.r.t. the input."""
        # Weight gradient: transposed input times the upstream gradient.
        self.dW = np.dot(self.x.T, dout)
        # Bias gradient: upstream gradient summed over axis 0.
        self.db = np.sum(dout, axis=0)
        # Input gradient: upstream gradient times the transposed weights.
        return np.dot(dout, self.W.T)

### 5.6.3 Softmax + CEE

In [16]:
# softmax + CEE 
class SoftmaxWithLoss:
    """Softmax followed by cross-entropy error, fused into one layer.

    ``softmax`` and ``cross_entropy_error`` are helpers defined outside this
    class (presumably provided by ``common.layers`` — verify).
    """

    def __init__(self):
        self.loss = None
        self.y = None  # softmax output
        self.t = None  # target labels

    def forward(self, x, t):
        """Return the loss for scores ``x`` and targets ``t``.

        The softmax output and the targets are cached for ``backward``.
        """
        self.t = t
        # Softmax is applied to the incoming scores ``x``. (Previously this
        # referenced an undefined name ``y`` and raised NameError.)
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the scores.

        Targets may be one-hot (same number of elements as the softmax
        output) or integer class indices, one per sample. ``dout`` is
        accepted for interface symmetry with other layers but is not used.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            # One-hot targets. Dividing by the batch size propagates the
            # per-sample error.
            dx = (self.y - self.t) / batch_size
        else:
            # Class-index targets: subtract 1 at each sample's true class,
            # which is what subtracting a one-hot row would do.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        return dx

## 5.7 오차역전파를 활용한 최종 구현

In [23]:
class TwoLayerNet:
    """Two-layer network: Affine -> ReLU -> Affine, with a softmax loss head.

    ``params`` holds the weight/bias arrays under the keys ``'W1'``,
    ``'b1'``, ``'W2'`` and ``'b2'``. The Affine layers are given those same
    array objects, so in-place updates to ``params`` are seen by the layers.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01):
        # Parameters: scaled standard-normal weights, zero biases.
        w1 = weight_init_std * np.random.randn(input_size, hidden_size)
        b1 = np.zeros(hidden_size)
        w2 = weight_init_std * np.random.randn(hidden_size, output_size)
        b2 = np.zeros(output_size)
        self.params = {'W1': w1, 'b1': b1, 'W2': w2, 'b2': b2}

        # Layers are kept in forward order; the backward pass walks them in
        # reverse.
        self.layers = OrderedDict([
            ('Affine1', Affine(w1, b1)),
            ('Relu1', Relu()),
            ('Affine2', Affine(w2, b2)),
        ])

        # Final loss layer, kept apart so predict() can skip it.
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer except the final loss layer."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss-layer output for inputs ``x`` and targets ``t``."""
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose argmax matches ``t``.

        ``t`` is used directly when it is 1-D; otherwise it is reduced with
        argmax along axis 1 (one-hot targets).
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return parameter gradients via the external ``numerical_gradient``.

        Inside this method the bare name ``numerical_gradient`` resolves to
        the module-level function, not to this method.
        """
        def loss_W(W):
            # The argument is ignored: the loss is re-evaluated from the
            # network's own parameter arrays.
            return self.loss(x, t)

        return {
            name: numerical_gradient(loss_W, self.params[name])
            for name in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return parameter gradients computed by backpropagation."""
        # Forward pass, so every layer caches what its backward pass needs.
        self.loss(x, t)

        # Backward pass: loss layer first, then the layers in reverse order.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        affine1, affine2 = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': affine1.dW,
            'b1': affine1.db,
            'W2': affine2.dW,
            'b2': affine2.db,
        }

### 5.7.3 수치미분을 통해 오차역전파 검증

In [25]:
from dataset.mnist import load_mnist
import time 
from common.gradient import numerical_gradient

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# A tiny batch keeps the slow numerical differentiation affordable.
x_batch, t_batch = x_train[:3], t_train[:3]

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Gradient by numerical differentiation (timed)
start_gn = time.time()
grad_numerical = network.numerical_gradient(x_batch, t_batch)
end_gn = time.time()
print(f'수치미분: {end_gn - start_gn!s}')

# Gradient by backpropagation (timed)
start_gb = time.time()
grad_backprop = network.gradient(x_batch, t_batch)
end_gb = time.time()
print(f'역전파: {end_gb - start_gb!s}')

# Compare the two: mean absolute difference for each parameter.
for key in grad_backprop:
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(f"{key} : {diff!s}")

수치미분: 5.403294086456299
역전파: 0.002431154251098633
W1 : 4.1146087306120684e-10
b1 : 2.55322563792737e-09
W2 : 5.8462063189670085e-09
b2 : 1.404320693537686e-07


역전파가 수치 미분보다 훨씬 빠르고(약 5.4초 vs 0.002초), 두 방식으로 구한 기울기의 차이도 거의 0에 가까움

### 5.7.4 오차역전파를 통한 학습 구현

In [28]:
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Iterations per epoch as an integer. With true division this was a float,
# and ``i % iter_per_epoch == 0`` would only fire at i == 0 whenever
# train_size is not an exact multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

for i in tqdm(range(iters_num)):
    # Draw a random mini-batch of indices.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradients via backpropagation
    grad = network.gradient(x_batch, t_batch)

    # Update in place so the layers, which hold references to these same
    # arrays, see the new values.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Loss on the current mini-batch, evaluated after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch, record accuracy on the full train and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

  0%|▏                                      | 39/10000 [00:00<01:31, 108.72it/s]

0.1015 0.0981


  6%|██▍                                   | 646/10000 [00:02<00:44, 208.63it/s]

0.9053333333333333 0.908


 12%|████▌                                | 1240/10000 [00:04<00:44, 196.54it/s]

0.92635 0.9281


 19%|██████▉                              | 1878/10000 [00:06<00:38, 210.16it/s]

0.9364833333333333 0.9379


 25%|█████████▏                           | 2477/10000 [00:08<00:36, 204.44it/s]

0.94535 0.9446


 31%|███████████▎                         | 3063/10000 [00:10<00:35, 197.30it/s]

0.9518333333333333 0.9495


 37%|█████████████▌                       | 3654/10000 [00:12<00:30, 209.89it/s]

0.95715 0.9541


 43%|███████████████▋                     | 4254/10000 [00:14<00:27, 208.67it/s]

0.9608333333333333 0.9568


 49%|█████████████████▉                   | 4853/10000 [00:16<00:24, 211.30it/s]

0.9650333333333333 0.9621


 55%|████████████████████▏                | 5452/10000 [00:18<00:21, 209.19it/s]

0.9677833333333333 0.9627


 60%|██████████████████████▍              | 6048/10000 [00:20<00:20, 195.94it/s]

0.9692833333333334 0.965


 66%|████████████████████████▌            | 6641/10000 [00:22<00:16, 209.03it/s]

0.9725833333333334 0.9671


 73%|██████████████████████████▉          | 7277/10000 [00:24<00:13, 205.73it/s]

0.9744166666666667 0.9679


 78%|████████████████████████████▉        | 7834/10000 [00:26<00:12, 169.43it/s]

0.97585 0.9682


 85%|███████████████████████████████▎     | 8468/10000 [00:28<00:07, 203.17it/s]

0.9774333333333334 0.9685


 91%|█████████████████████████████████▌   | 9066/10000 [00:30<00:04, 212.22it/s]

0.9784 0.9713


 97%|███████████████████████████████████▊ | 9672/10000 [00:32<00:01, 200.66it/s]

0.9788166666666667 0.9699


100%|████████████████████████████████████| 10000/10000 [00:33<00:00, 302.61it/s]
