### 誤差逆伝播法
* 全体の計算をする必要がなく、局所的な計算をすれば良い。（問題を単純化できる）
* 逆方向の伝播によって効率よく微分を行える。

#### 連鎖律
* 連鎖律は合成関数の微分についての性質で、合成関数の微分は、それを構成する各関数の微分の積で表せる。
$z=(x+y)^2$ という式があったとする。この式は以下の様に２つの式に分けることができる。
$$z=t^2$$
$$t=x+y$$

$$\frac{\delta{z}}{\delta{x}}=\frac{\delta{z}}{\delta{t}} * \frac{\delta{t}}{\delta{x}}$$
* 上記の式ではδtが互いに打ち消しあうため、局所的な微分の積として全体の微分を求めることができます。
$$\frac{\delta{z}}{\delta{t}}=2t$$
$$\frac{\delta{t}}{\delta{x}}=1$$

$$\frac{\delta{z}}{\delta{x}}=\frac{\delta{z}}{\delta{t}} * \frac{\delta{t}}{\delta{x}}=2t*1=2(x+y)$$


#### 加算ノードの逆伝播
$$\frac{\delta{z}}{\delta{x}}=1$$
$$\frac{\delta{z}}{\delta{y}}=1$$
* 上記の様にどちらも１になります。

#### 乗算ノードの逆伝播
$$\frac{\delta{z}}{\delta{x}}=y$$
$$\frac{\delta{z}}{\delta{y}}=x$$

* ひっくり返した値を乗算して下流に流します。

In [16]:
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [2]:
# Multiplication layer
class MulLayer:
    """Multiplication node of a computational graph.

    ``forward`` remembers both inputs because ``backward`` needs them: the
    gradient with respect to one input is the upstream gradient times the
    *other* input.
    """

    def __init__(self):
        # Inputs of the most recent forward call (None until forward runs).
        self.x, self.y = None, None

    def forward(self, x, y):
        """Store ``x`` and ``y`` and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``.

        The stored inputs are swapped: ``dx`` uses ``y`` and ``dy`` uses ``x``.
        """
        return dout * self.y, dout * self.x

In [3]:
# Forward pass of the "buy apples" graph: (apple * apple_num) * tax.
apple = 100
apple_num = 2
tax = 1.1

# One MulLayer per multiplication node; each layer keeps its own inputs
# so the backward pass can reuse them.
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)
print(price)

220.00000000000003


In [4]:
# Backward pass: start from d(price)/d(price) = 1 and call the layers in the
# reverse order of the forward pass.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
print(dapple, dapple_num, dtax)

2.2 110.00000000000001 200


In [5]:
# Addition layer
class AddLayer:
    """Addition node of a computational graph.

    Nothing has to be remembered between ``forward`` and ``backward``: the
    local derivative of ``x + y`` is 1 for both inputs.
    """

    def __init__(self):
        """Stateless layer; there is nothing to initialise."""

    def forward(self, x, y):
        """Return ``x + y``."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``, each equal to the upstream gradient times 1."""
        return dout * 1, dout * 1

In [6]:
# Forward pass of the "buy apples and oranges" graph:
# ((apple * apple_num) + (orange * orange_num)) * tax.
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# One layer object per node of the graph.
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
# NOTE(review): "oragnge" is a typo for "orange"; the backward cell refers to
# the same name, so both places must be renamed together.
add_apple_oragnge_layer = AddLayer()
mul_tax_layer = MulLayer()

apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_oragnge_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)
print(price)

715.0000000000001


In [7]:
# Backward pass: seed with 1 and call the layers in the reverse order of the
# forward pass (tax -> add -> the two per-fruit multiplications).
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_oragnge_layer.backward(dall_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
print(dapple_num, dapple, dorange, dorange_num, dtax)

110.00000000000001 2.2 3.3000000000000003 165.0 650


### 活性化関数のレイヤの実装
* SigmoidとReLU

#### ReLUレイヤ
$$y=x \quad (x > 0)$$
$$y=0 \quad (x \le 0)$$

$$\frac{\delta{y}}{\delta{x}}=1 \quad (x > 0)$$
$$\frac{\delta{y}}{\delta{x}}=0 \quad (x \le 0)$$

In [8]:
class ReLU:
    """ReLU activation layer: passes positive inputs through, zeroes the rest.

    ``forward`` records which input elements were ``<= 0``; ``backward``
    blocks the gradient at exactly those positions.
    """

    def __init__(self):
        # Boolean array, True where the last forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element ``<= 0`` replaced by 0.

        ``x`` must support ``x <= 0``, ``.copy()`` and boolean-mask
        assignment (e.g. a NumPy array). The input itself is not modified.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return ``dout`` with the gradient zeroed where the input was ``<= 0``.

        Works on a copy so the caller's ``dout`` array is left untouched; the
        previous in-place masking silently modified the upstream gradient.
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

In [10]:
# Demonstrate the boolean mask used by the ReLU layer: True marks the
# elements that are <= 0.
x = np.array([[1.0, -.5], [-2.0, 3.0]])
print(x)
mask = (x <= 0)
print(mask)

[[ 1.  -0.5]
 [-2.   3. ]]
[[False  True]
 [ True False]]


#### Sigmoidレイヤ
$$y=\frac{1}{1+\exp(-x)}$$

##### Step1
「/」ノードは $y=\frac{1}{x}$ を表し、その微分は次の式で表す（ここでの $x$ と $y$ はこのノードの入力と出力）
$$\frac{\delta{y}}{\delta{x}}=-\frac{1}{x^2}=-y^2$$

##### Step2
* ＋ノードは上流の値をそのまま流す

##### Step3
「exp」ノードはy=exp(x)を表し、次の式で表す
$$\frac{\delta{y}}{\delta{x}}=\exp(x)$$

##### Step4
「×」ノードは順伝播時の値をひっくり返して乗算する。ここでは−１を乗算する。

In [12]:
class Sigmoid:
    """Sigmoid activation layer, ``y = 1 / (1 + exp(-x))``.

    The forward output is cached because the derivative can be written
    purely in terms of it: ``dy/dx = y * (1 - y)``.
    """

    def __init__(self):
        # Output of the most recent forward call (None until forward runs).
        self.out = None

    def forward(self, x):
        """Apply the sigmoid element-wise, cache the result and return it."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return the upstream gradient scaled by ``(1 - y) * y``."""
        y = self.out
        return dout * (1.0 - y) * y

In [13]:
# Affine transform on random data: a length-2 input times a 2x3 weight
# matrix plus a length-3 bias gives a length-3 output.
X = np.random.rand(2)
W = np.random.rand(2, 3)
B = np.random.rand(3)
Y = np.dot(X, W) + B

In [14]:
# Batch case: the length-3 bias is broadcast and added to every row of the
# 2x3 matrix.
X_dot_W = np.array([[0, 0, 0], [10, 10, 10]])
B = np.array([1, 2, 3])
print(X_dot_W)
print(X_dot_W + B)

[[ 0  0  0]
 [10 10 10]]
[[ 1  2  3]
 [11 12 13]]


In [15]:
class Affine:
    """Fully connected layer computing ``out = x . W + b``.

    After ``backward`` the parameter gradients are available as ``dW`` and
    ``db``.
    """

    def __init__(self, W, b):
        self.W = W
        self.b = b
        # Input of the most recent forward call, needed to compute dW.
        self.x = None
        # Parameter gradients, filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Store ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        out = np.dot(x, self.W) + self.b

        return out

    def backward(self, dout):
        """Back-propagate ``dout``; return the gradient w.r.t. the input.

        Also stores the gradients w.r.t. ``W`` and ``b`` in ``self.dW`` and
        ``self.db``. The bias gradient sums ``dout`` over axis 0, because the
        same bias is added to every row in ``forward``.
        """
        dx = np.dot(dout, self.W.T)
        # Fixed: this was written to ``self.dw``, leaving ``self.dW`` (the
        # attribute initialised in __init__) at None.
        self.dW = np.dot(self.x.T, dout)
        # Fixed: ``no.sum`` was a typo for ``np.sum`` and raised NameError.
        self.db = np.sum(dout, axis=0)

        return dx

In [26]:
# Change into the book's "common" directory before importing its helper
# modules (presumably so the imports below resolve from there — confirm how
# sys.path is set up in this environment).
os.chdir("/work/dl_learning/deep-learning-from-scratch-master/common")
from functions import cross_entropy_error, softmax
# NOTE(review): a star import rebinds every public name that `layers` exports.
# If it defines classes with the same names as ones defined earlier in this
# notebook (e.g. Affine), the notebook versions are silently replaced — verify.
from layers import *
from gradient import numerical_gradient

In [27]:
class SoftmaxWithLoss:
    """Output layer combining softmax with cross-entropy loss.

    Relies on ``softmax`` and ``cross_entropy_error`` being importable names
    in the enclosing module.
    """

    def __init__(self):
        self.loss = None  # loss value of the most recent forward call
        self.y = None     # softmax output of the most recent forward call
        self.t = None     # teacher data of the most recent forward call

    def forward(self, x, t):
        """Compute and return the loss for scores ``x`` and teacher data ``t``.

        Fixed: the helpers were called as ``functions.softmax`` and
        ``functions.cross_entropy_error``, but only the two functions are
        imported, not a ``functions`` module, so this raised NameError.
        """
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the scores.

        ``dout`` is accepted for interface symmetry with the other layers but
        is not used. The result is divided by the batch size (``t.shape[0]``)
        so it is the per-sample average.

        ``t`` may be one-hot (same number of elements as ``y``) or an array of
        integer class indices, one per sample.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            # One-hot teacher data: plain element-wise difference.
            dx = (self.y - self.t) / batch_size
        else:
            # Integer labels: subtract 1 only at each sample's correct class.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        return dx

In [28]:
from collections import OrderedDict

In [29]:
class TwoLayerNet:
    """Two-layer network: Affine -> ReLU -> Affine, with a softmax/loss head.

    Parameters live in ``self.params`` under the keys ``'W1'``, ``'b1'``,
    ``'W2'`` and ``'b2'``; the same array objects are handed to the Affine
    layers. Gradients can be obtained by numerical differentiation
    (``numerical_gradient``) or by backpropagation (``gradient``).
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Parameter initialisation: weights are standard-normal samples scaled
        # by weight_init_std, biases are zeros. W1 is drawn before W2.
        self.params = {
            'W1': weight_init_std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            'W2': weight_init_std * np.random.randn(hidden_size, output_size),
            'b2': np.zeros(output_size),
        }

        # Layer construction, in forward order.
        # NOTE(review): `Relu` is not defined in this notebook (the class above
        # is spelled `ReLU`); presumably it comes from `from layers import *`.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
        ])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Feed ``x`` through every layer in order and return the result."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    # x: input data, t: teacher data
    def loss(self, x, t):
        """Return the loss of the network's prediction for ``x`` against ``t``."""
        scores = self.predict(x)
        return self.lastLayer.forward(scores, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose arg-max prediction matches ``t``.

        If ``t`` is not 1-dimensional it is reduced with arg-max along axis 1
        first.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    # x: input data, t: teacher data
    def numerical_gradient(self, x, t):
        """Return a dict of gradients computed by the module-level ``numerical_gradient``."""

        def loss_W(W):
            # The argument is ignored: the loss is re-evaluated from the
            # network's own parameter arrays.
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return a dict of gradients computed by backpropagation."""
        # forward (run for its side effects on the layers)
        self.loss(x, t)

        # backward: start at the loss layer, then walk the layers in reverse
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # collect the gradients the Affine layers stored during backward
        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW, 'b1': first.db,
            'W2': second.dW, 'b2': second.db,
        }

In [30]:
# Change into the dataset directory before importing the MNIST loader
# (presumably so `mnist` resolves from there — confirm).
os.chdir("/work/dl_learning/Mnist_data/dataset")
from mnist import load_mnist
os.chdir("/work/dl_learning/deep-learning-from-scratch-master/ch05")
# NOTE: this import rebinds the name TwoLayerNet, so a TwoLayerNet class
# defined earlier in the session is replaced by the imported one from here on.
from two_layer_net import TwoLayerNet

In [31]:
# Gradient check: compute the gradients of the same 3-sample batch both
# numerically and by backpropagation so the two can be compared.
(X_train, t_train), (X_test, t_test) = load_mnist(one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
X_batch = X_train[:3]
t_batch = t_train[:3]

grad_numerical = network.numerical_gradient(X_batch, t_batch)
grad_backprop = network.gradient(X_batch, t_batch)

In [32]:
# For every parameter, print the mean absolute difference between the
# backpropagation gradient and the numerical gradient.
for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))

W1:4.826524701581082e-10
b1:2.694753378730194e-09
W2:5.64312285552036e-09
b2:1.3990573663191784e-07


In [33]:
# Load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

# Histories: loss for every iteration, accuracies once per epoch
train_loss_list = []
train_acc_list = []
test_acc_list = []

# Iterations per epoch. True division makes this a float; the modulo test
# in the loop below works on floats as well.
iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch of batch_size indices from [0, train_size)
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # Gradient (backpropagation; the numerical variant is kept for reference)
    #grad = network.numerical_gradient(x_batch, t_batch)
    grad = network.gradient(x_batch, t_batch)
    
    # Parameter update: in-place gradient-descent step on each parameter array
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    
    # Loss on the same mini-batch, evaluated after the update
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    # Once per epoch: record and print accuracy on the full train and test sets
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)


0.10491666666666667 0.11
0.9039 0.9057
0.9239666666666667 0.9266
0.9355666666666667 0.9365
0.9449666666666666 0.9449
0.9516333333333333 0.9495
0.9568666666666666 0.9526
0.9589333333333333 0.9546
0.96345 0.9581
0.9662166666666666 0.96
0.9698 0.9614
0.9716166666666667 0.9633
0.97295 0.9651
0.97485 0.9662
0.9757333333333333 0.9659
0.9781333333333333 0.9681
0.97885 0.9689
