In [95]:
"""
[誤差逆伝播法(Back Propagation)]
勾配を数値微分で計算する方法は, シンプルで実装が容易だが, 計算に時間がかかるという難点がある.
逆伝播は計算グラフの各ノードにおいて, 局所的な微分を信号に乗算して, 次のノードに伝達していく.
この計算を行うことによって, 目的とする微分の値を効率よく求めることができる.

[連鎖律の原理]
ある関数が合成関数で表される場合, その合成関数の微分は, 合成関数を構成するそれぞれの関数の微分の積によって表すことができる.
これを連鎖律の原理という.

"""
pass

In [96]:
"""
計算グラフにおける乗算レイヤの実装
"""
class MulLayer:
    """Multiplication node of a computational graph.

    The two forward operands are cached because the gradient with respect
    to each operand is the upstream gradient multiplied by the *other*
    operand.
    """

    def __init__(self):
        # Forward operands; filled in by forward().
        self.x, self.y = None, None

    def forward(self, x, y):
        """Store both operands and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``.

        Each input receives ``dout`` times the opposite forward operand.
        """
        grad_x = dout * self.y
        grad_y = dout * self.x
        return grad_x, grad_y

In [97]:
"""
りんご２個の買い物
"""
# Inputs of the graph: unit price, quantity and tax multiplier.
# (The unused orange quantity that used to be defined here was removed;
# this example only involves apples.)
apple = 100
apple_num = 2
tax = 1.1

# Layers: one multiplication for price * quantity, one for applying tax.
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# Forward pass
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

print(price)

# Backward pass: derivative of the final price with respect to each variable.
# Layers are visited in the reverse order of the forward pass.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print("Diff for variables: ", dapple, "," , dapple_num, ",", dtax)

220.00000000000003
Diff for variables:  2.2 , 110.00000000000001 , 200


In [98]:
"""
計算グラフにおける加算レイヤの実装
"""
class AddLayer:
    """Addition node of a computational graph; it keeps no state."""

    def __init__(self):
        # Nothing to cache: the local derivative of a sum is the constant 1.
        pass

    def forward(self, x, y):
        """Return the sum of the two inputs."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``; each is the upstream gradient times 1."""
        grad_x, grad_y = dout * 1, dout * 1
        return grad_x, grad_y

In [99]:
"""
りんご２個とみかん３個の買い物
"""
# Unit prices, quantities and the tax multiplier.
apple, apple_num = 100, 2
orange, orange_num = 150, 3
tax = 1.1

# One multiplication layer per product, an addition layer for the subtotal
# and a final multiplication layer that applies the tax.
mul_apple_layer, mul_orange_layer = MulLayer(), MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# Forward pass
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)

# Backward pass, visiting the layers in reverse order.
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)

# Report the result and every derivative, one per line.
report = (
    ("Ouput Price:", price),
    ("Diff of Apple num:", dapple_num),
    ("Diff of Apple price:", dapple),
    ("Diff of Orange num:", dorange_num),
    ("Diff of Orange price:", dorange),
    ("Diff of Tax:", dtax),
)
for label, value in report:
    print(label, value)

Ouput Price: 715.0000000000001
Diff of Apple num: 110.00000000000001
Diff of Apple price: 2.2
Diff of Orange num: 165.0
Diff of Orange price: 3.3000000000000003
Diff of Tax: 650


In [100]:
"""
活性化関数レイヤの実装
RelU(Rectified Linear Unit)レイヤー
"""
class Relu:
    """ReLU (Rectified Linear Unit) activation layer.

    ``forward`` remembers which input elements were <= 0 so that
    ``backward`` can block the gradient at exactly those positions.
    """

    def __init__(self):
        # Boolean array, True where the last forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element <= 0 replaced by 0.

        The input array itself is left unmodified.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return the gradient w.r.t. the forward input.

        Positions masked during ``forward`` receive a zero gradient; all
        others pass ``dout`` through unchanged. The result is a new array:
        the caller's ``dout`` is not modified (the previous implementation
        zeroed entries of ``dout`` in place and returned that same object).
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

In [101]:
"""
Sigmoidレイヤー
"""
class Sigmoid:
    """Sigmoid activation layer, y = 1 / (1 + exp(-x))."""

    def __init__(self):
        # Output of the most recent forward pass; reused by backward().
        self.out = None

    def forward(self, x):
        """Compute the sigmoid of ``x``, cache it and return it."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Scale ``dout`` by the local derivative y * (1 - y).

        ``y`` is the output cached by the last ``forward`` call.
        """
        y = self.out
        return dout * (1.0 - y) * y

In [102]:
"""
Affineレイヤーの実装(バッチ版)
# ニューラルネットワークの順伝播で行う行列の積は, アフィン変換と呼ばれる.
"""
class Affine:
    """Affine layer (batch version): ``out = x . W + b``.

    The matrix product performed in a neural network's forward pass is
    called an affine transformation.
    """

    def __init__(self, W, b):
        # Parameters are stored by reference, not copied.
        self.W, self.b = W, b
        # Input of the last forward pass, needed to compute dW.
        self.x = None
        # Parameter gradients, filled in by backward().
        self.dW, self.db = None, None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store the parameter gradients and return the input gradient.

        ``dW`` is ``x.T . dout`` and ``db`` is ``dout`` summed over axis 0;
        the returned value is ``dout . W.T``.
        """
        grad_input = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return grad_input

In [103]:
############################
# ソフトマックス関数
############################
def softmax(x):
    """Softmax over the last axis of ``x``.

    Args:
        x: NumPy array of scores. For a 1-D array the whole vector is
            normalised; for 2-D and higher each slice along the last axis
            is normalised independently (previously, arrays with more than
            two dimensions were normalised over the entire tensor).

    Returns:
        Array with the same shape as ``x`` whose entries along the last
        axis are non-negative and sum to 1.
    """
    # A 0-d input has no axis to reduce over; reduce globally in that case.
    axis = -1 if x.ndim > 0 else None

    # Subtracting the maximum leaves the result unchanged mathematically
    # but keeps np.exp from overflowing.
    shifted = x - np.max(x, axis=axis, keepdims=True)

    # Evaluate the exponential once and reuse it for the normaliser.
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)

############################
# 交差エントロピー誤差
############################
def cross_entropy_error(y, t):
    """Mean cross-entropy error of predictions ``y`` against targets ``t``.

    Args:
        y: Predicted probabilities; a 1-D array is treated as a batch of one.
        t: Targets, either one-hot (same number of elements as ``y``) or
            class indices.

    Returns:
        The negative log of the probability assigned to the target class,
        averaged over the batch.
    """
    if y.ndim == 1:
        # Promote a single sample to a batch containing one row.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # One-hot targets have as many elements as y; turn them into indices.
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    rows = np.arange(batch_size)
    target_prob = y[rows, t]
    # The small constant keeps np.log away from log(0).
    log_prob = np.log(target_prob + 1e-7)
    return -np.sum(log_prob) / batch_size

############################
# 数値微分
############################
def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` by central differences.

    Every element of ``x`` is perturbed in place by +h and -h, ``f(x)`` is
    evaluated for both, and the element is then restored.

    Args:
        f: Callable taking ``x`` and returning a scalar.
        x: NumPy array, modified temporarily in place. It needs a floating
            dtype: with an integer dtype the +/-h perturbation would be
            truncated away on assignment.

    Returns:
        Array shaped like ``x`` (``np.zeros_like(x)``) holding
        ``(f(x+h) - f(x-h)) / (2h)`` for each element. An empty ``x``
        yields an empty gradient.

    Raises:
        Whatever ``f`` raises; ``x`` is restored to its original values
        before the exception propagates.
    """
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    # np.ndindex visits every multi-index of x, yields nothing for a
    # zero-size array and a single () for a 0-d array.
    for idx in np.ndindex(x.shape):
        tmp_val = x[idx]
        try:
            # Both perturbed values are computed the same way so the two
            # steps are symmetric around the original value.
            x[idx] = float(tmp_val) + h
            fxh1 = f(x)  # f(x+h)

            x[idx] = float(tmp_val) - h
            fxh2 = f(x)  # f(x-h)
        finally:
            # Put the original value back even when f raises, so the
            # caller's array is never left perturbed.
            x[idx] = tmp_val

        grad[idx] = (fxh1 - fxh2) / (2 * h)

    return grad

In [104]:
"""
Softmax-with-Lossレイヤー(出力層)の実装
Softmax-レイヤは, 入力された値を正規化(出力の和が1になるように変形)する.
交差エントロピー誤差レイヤー(Cross Entropy Error)も含めて実装する.
"""
class SoftmaxWithLoss:
    """Output layer combining softmax with the cross-entropy loss.

    ``forward`` normalises the scores with ``softmax`` and evaluates
    ``cross_entropy_error`` against the targets; ``backward`` returns the
    gradient of that loss with respect to the scores.
    """

    def __init__(self):
        self.loss = None  # loss value from the last forward pass
        self.y = None     # softmax output
        self.t = None     # targets (one-hot vectors or integer class indices)

    def forward(self, x, t):
        """Store targets and softmax output and return the loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the scores.

        Args:
            dout: Upstream gradient of the loss (default 1). The result is
                scaled by it; it was previously ignored.

        Returns:
            ``(y - t) / batch_size`` scaled by ``dout``, where ``t`` is
            interpreted as one-hot. Integer class-index targets, which
            ``forward`` already accepts, are handled as well; previously
            they were subtracted from ``y`` directly.
        """
        y, t = self.y, self.t

        # A 1-D output is a single sample, mirroring how
        # cross_entropy_error treats 1-D input as a batch of one.
        batch_size = y.shape[0] if y.ndim > 1 else 1

        if t.size == y.size:
            # One-hot targets: same test cross_entropy_error uses.
            dx = (y - t) / batch_size
        else:
            # Class-index targets: subtract 1 at each row's target column.
            dx = y.copy()
            rows = dx.reshape(batch_size, -1)  # view onto dx (fresh contiguous copy)
            rows[np.arange(batch_size), t.reshape(batch_size)] -= 1
            dx = dx / batch_size  # average so each sample contributes its share

        return dx * dout

In [127]:
import sys, os
sys.path.append("./original")
import numpy as np
from collections import OrderedDict

class TwoLayerNet:
    """Two-layer network: Affine -> ReLU -> Affine, followed by softmax-with-loss."""

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and the layer stack.

        Weights are standard-normal draws scaled by ``weight_init_std``;
        biases start at zero. The shape of every parameter is printed.
        """
        # W1 is drawn before W2, so the random stream is consumed in that order.
        w1 = weight_init_std * np.random.randn(input_size, hidden_size)
        b1 = np.zeros(hidden_size)
        w2 = weight_init_std * np.random.randn(hidden_size, output_size)
        b2 = np.zeros(output_size)
        self.params = {'W1': w1, 'b1': b1, 'W2': w2, 'b2': b2}

        for name, value in self.params.items():
            print("param {0} shape: ".format(name), value.shape)

        # Layers in forward order. The Affine layers receive the very same
        # array objects that are stored in self.params.
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(w1, b1)
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(w2, b2)
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the final scores."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss of the last layer for inputs ``x`` and targets ``t``."""
        scores = self.predict(x)
        return self.lastLayer.forward(scores, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose argmax score equals the label.

        Targets that are not 1-D are reduced to indices with argmax over axis 1.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Gradients of the loss w.r.t. every parameter via numerical differentiation.

        x: input data, t: target data. Returns a dict keyed by 'W1', 'b1',
        'W2', 'b2'.
        """
        # The argument W is unused: the module-level numerical_gradient is
        # handed the parameter arrays themselves, and self.loss reads those
        # same arrays through the layers.
        loss_W = lambda W: self.loss(x, t)

        # One full loss evaluation is performed per perturbed parameter element.
        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Gradients of the loss w.r.t. every parameter via backpropagation.

        Returns a dict keyed by 'W1', 'b1', 'W2', 'b2'.
        """
        # Forward pass so that each layer caches what its backward needs.
        self.loss(x, t)

        # Backward pass: start at the loss layer, then walk the stack in reverse.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients the Affine layers stored during backward.
        affine1 = self.layers['Affine1']
        affine2 = self.layers['Affine2']
        return {
            'W1': affine1.dW,
            'b1': affine1.db,
            'W2': affine2.dW,
            'b2': affine2.db,
        }
        

In [128]:
"""
[勾配確認]
数値微分の結果と誤差逆伝播法の結果を比較して, 動作が正しいことを確認する.
"""
import sys, os
sys.path.append("./original")
sys.path.append("./original/ch05")
import numpy as np
from original.dataset.mnist import load_mnist

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# Only the first training sample is used for the comparison.
x_batch, t_batch = x_train[:1], t_train[:1]

print("x_batch shape:", x_batch.shape)
print("t_batch:", t_batch)

# Compute the same gradients two ways: numerically and by backpropagation.
grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# Mean absolute difference per parameter; values near zero mean the two agree.
for key, numeric in grad_numerical.items():
    diff = np.average(np.abs(grad_backprop[key] - numeric))
    print(key, ":", str(diff))

param W1 shape:  (784, 50)
param b1 shape:  (50,)
param W2 shape:  (50, 10)
param b2 shape:  (10,)
x_batch shape: (1, 784)
t_batch: [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]
W1 : 5.007767305992867e-10
b1 : 3.637357424792799e-09
W2 : 8.271626123200124e-09
b2 : 1.7898440200919686e-07
