In [23]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Backpropagation 직접 구현하기

### 1. 단순한 계층(덧셈과 곱셈) 구현하기

$ z = x * y $  
$ \frac{\partial z}{\partial x} = y $
$ \frac{\partial z}{\partial y} = x $

In [None]:
class MulLayer:
    def __init__(self):
        self.x = None
        self.y = None
        
    def forward(self, x, y):
        self.x = x
        self.y = y
        out = x*y # z = x * y
        
        return out
    
    def backward(self, dout):
        dx = dout * self.y # dz/dx = y
        dy = dout * self.x # dz/dy = x
        
        return dx, dy

In [6]:
apple = 100
apple_num = 2
tax = 1.1

# 계층들
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# Forward
apple_preice = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_preice, tax)

print(price)

220.00000000000003


In [10]:
# Backward
dprice = 1
dapple_price, dtax = mul_apple_layer.backward(dprice)
dapple, dapple_num = mul_tax_layer.backward(dapple_price)

print(dapple, dapple_num, dtax)

2.2 400 100


$ z = x + y $  

$ \frac{\partial z}{\partial x} = 1$
$ \frac{\partial z}{\partial y} = 1$


In [11]:
class AddLayer:
    def __init__(self):
        pass
        
    def forward(self, x, y):
        out = x + y # z = x + y
        
        return out
    
    def backward(self, dout):
        dx = dout * 1 # dz/dx = 1
        dy = dout * 1 # dz/dy = 1
        
        return dx, dy

In [15]:
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# Layers (그래프에서 결과 노드를 뜻한다.)
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# Forward
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)

# Backward
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(price)
print(dapple_num, dapple, dorange, dorange_num, dtax)

715.0000000000001
110.00000000000001 2.2 3.3000000000000003 165.0 650


### 2. 활성화 함수 계층 구현하기

#### (1)ReLU Layer  
$ y = \begin{cases} x & (x > 0) \\ 0 & (x \le 0) \end{cases} $  
$ \frac{\partial y}{\partial x} = \begin{cases} 1 & (x > 0) \\ 0 & (x \le 0) \end{cases} $

In [17]:
class Relu:
    def __init__(self):
        self.mask = None
     
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask == 0] = 0
        #mask는 True/False로 구성된 넘파이 배열
        #x값이 0이하면 True, 아니면 False를 유지
        
        #x = np.array([[1.0, -0.5], [-2.0, 3.0]])
        #mask = (x<=0)
        #[[False True]
        #[True False]] False는 0
                
        return out
    
    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        
        return dx

#### (2) Sigmoid  Layer
$ y = sigmoid(x) = \frac{1}{1 + e^{-x}} $  
$ \frac{\partial y}{\partial x} = y(1-y) $ 

#####  참고) 
$ y = \frac{1}{x} $     $  \frac{\partial y}{\partial x} = -\frac{1}{x^2} = -y^2$  
  

$ y = e^x$   $ \frac{\partial y}{\partial x} = e^x $

In [18]:
class Sigmoid:
    def __init__(self):
        self.out = None
        
    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        
        return out

    def backward(self, dout):
        dx = dout * (1- self.out) * self.out
        
        return dx

### 3. Affine/Softmax 계층 구현하기  
Affine 계층이란 보통의 WX + B 계층을 뜻한다.

#### 참고) 다차원 배열의 계산

In [27]:
import numpy as np
X = np.random.rand(2)
W = np.random.rand(2, 3)
B = np.random.rand(3)
X.shape #여기서 (2,) 는 (1, 2) 의 의미
W.shape
B.shape #여기서 (3,) 는 (1, 3) 의 의미

Y = np.dot(X, W) + B
Y.shape

(2,)

(2, 3)

(3,)

(3,)

$ L = WX + B \text{　　　　}$  　　　      W, X, B 는 행렬  
$ \frac{\partial L}{\partial \mathbf{X}} = \frac{\partial L}{\partial \mathbf{Y}} \cdot \mathbf{W}^T $ 　　　(1,2) = (1,3) * (3,2)  
$ \frac{\partial L}{\partial \mathbf{W}} = \mathbf{X}^T \cdot \frac{\partial L}{\partial \mathbf{Y}} $  　　　(2,3) = (2,1) * (1,3)  
$ W = \begin{pmatrix} w_{11} & w_{21} & w_{31} \\ w_{12} & w_{22} & w_{32} \end{pmatrix}$  
$ W^T = \begin{pmatrix} w_{11} & w_{12} \\ w_{21} & w_{22} \\ w_{31} & w_{32} \end{pmatrix}$

#### (1)배치용 Affine Layer  


In [28]:
X_dot_W = np.array([[0, 0, 0], [10, 10, 10]])
B = np.array([1, 2, 3])

X_dot_W

X_dot_W + B

array([[ 0,  0,  0],
       [10, 10, 10]])

array([[ 1,  2,  3],
       [11, 12, 13]])

In [32]:
dY = np.array([[1, 2, 3], [4, 5, 6]])
dY

dB = np.sum(dY, axis = 0)
dB

array([[1, 2, 3],
       [4, 5, 6]])

array([5, 7, 9])

In [33]:
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None
        
    def forward(self, x):
        self.x = x
        out = np.dot(x, self.W) + self.b
        
        return out
    
    def backward(self, dout):
        dx = np.dot(dout, self.W.T) # dL/dx
        self.dW = np.dot(self.x.T, dout) #dL/dW
        self.db = np.sum(dout, axis = 0) # dL/db
        
        return dx

#### (2)Softmax_with_Loss Layer

$ y = \text{softmax}(a)$  
$ L = \text{cross_entropy}(y) $  

$ \frac{\partial L}{\partial a} = y - t $



In [34]:
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None # loss값
        self.y = None #softmax 출력값(확률)
        self.t = None #정답 레이블(1 or 0)
        
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss
    
    def backward(self, dout = 1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        
        return dx

### 4. 신경망 구현하기

In [46]:
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

class TwoLayerNet:
    
    def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01):
        
        #가중치 초기화
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)
        
        #계층 생성
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        
        self.lastLayer = SoftmaxWithLoss()
        
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
            
        return x
    
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis = 1)
        if t.ndim != 1 : t = np.argmax(t, axis = 1)
        accuracy = np.sum(y == t) / float(x.shape[0])
            
        return accuracy
    
    def numerical_gradient(self, x, t):
        loss_W = lambda W:self.loss(x, t)
        
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        
        return grads
    
    def gradient(self, x, t):
        #forward
        self.loss(x, t)
        
        #backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
            
        #결과 저장
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        
        return grads

### 5. 구현한 신경망으로 MNIST 데이터 학습하기!

In [48]:
from dataa.mnist import load_mnist


(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    #backpropagation으로 기울기를 구한다.
    grad = network.gradient(x_batch, t_batch)
    
    #갱신해준다!!!
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
        
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(i, train_acc, test_acc)

0 0.07731666666666667 0.0772
600 0.89905 0.9037
1200 0.9222166666666667 0.9264
1800 0.9370833333333334 0.9375
2400 0.9438333333333333 0.9439
3000 0.9515666666666667 0.9498
3600 0.9575333333333333 0.9547
4200 0.9610833333333333 0.9574
4800 0.9644666666666667 0.9587
5400 0.9662833333333334 0.9617
6000 0.9678333333333333 0.9623
6600 0.96915 0.9636
7200 0.9727 0.9677
7800 0.9747666666666667 0.9669
8400 0.97585 0.9668
9000 0.9770666666666666 0.9707
9600 0.9778 0.9669
