In [76]:
# 計算グラフの逆伝播
#       (x)->   ->(y)
#            (f)
# E*(dy/dx)<-   <-(E)

# 連鎖率
# Z = (x + y)^2 合成関数
# Z = t^2
# t = x + y
# dZ/dx = dZ/dt * dt/dx

In [77]:
# 加算レイヤ(AddLayer)と乗算レイヤ(MulLayer)の実装
class MulLayer:
    def __init__(self):
        self.x = None
        self.y = None
        
    def forward(self, x, y):
        self.x = x
        self.y = y
        out = x * y
        return out
    
    def backward(self, dout):
        dx = dout * self.y
        dy = dout * self.x
        return dx, dy
    
class AddLayer:
    def __init__(self):
        pass
    
    def forward(self, x, y):
        out = x + y
        return out
    
    def backward(self, dout):
        dx = dout * 1.0
        dy = dout * 1.0
        return dx, dy

In [78]:
apple = 100
apple_num = 2
tax = 1.1

# layer
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

print(price)

220.00000000000003


In [79]:
# backward
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_tax_layer.backward(dapple_price)

print(dapple, dapple_num, dtax)

1.2100000000000002 220.00000000000003 200


In [80]:
# リンゴ2個とみかん3個の買い物
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# layer
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)

# backward
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
print(dall_price)
print(dtax)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
print(dapple_price)
print(dorange_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(price)
print(dapple_num, dapple, dorange, dorange_num, dtax)

1.1
650
1.1
1.1
715.0000000000001
110.00000000000001 2.2 3.3000000000000003 165.0 650


In [81]:
# 活性化関数レイヤの実装
class ReLu:
    def __init__(self):
        self.mask = None
    
    def forward(self, x):
        self.mask = (x <= 0) # X<=0の要素に対して、Trueのラベリング
        out = x.copy()
        out[self.mask] = 0 # Trueのラベリングした出力は0にする
        return out
    
    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout # x, out, doutの次元は同じ

In [82]:
import numpy as np
x = np.array([[1.0, -0.5], [-2.0, 3.0]])
print(x)

[[ 1.  -0.5]
 [-2.   3. ]]


In [83]:
mask = (x <= 0)
mask

array([[False,  True],
       [ True, False]])

In [84]:
class Sigmoid:
    def __init__(self):
        self.out = None
        
    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out
    
    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx
    

In [85]:
# Affineレイヤ
X = np.random.rand(2) # input
W = np.random.rand(2, 3) # output
B = np.random.rand(3) # バイアス

print(X.shape)
print(W.shape)
print(B.shape)

(2,)
(2, 3)
(3,)


In [86]:
Y = np.dot(X, W) + B

In [87]:
Y

array([0.90713744, 1.53839672, 0.57554688])

In [88]:
# バッチ版Affineレイヤ
# dL/dX = dL/dY・W^T
# dL/dW = X^T・dL/dY
# dL/dB = dL/dYの第0軸に関する和

X_dot_W = np.array([[0, 0, 0], [10, 10, 10]])
B = np.array([1, 2, 3])
X_dot_W

array([[ 0,  0,  0],
       [10, 10, 10]])

In [89]:
X_dot_W + B

array([[ 1,  2,  3],
       [11, 12, 13]])

In [90]:
# バイアスの逆伝播の際の値は、各サンプルをNNに投入した後の、出力側の微分値を全サンプルで総和を取った値
dY = np.array([[1, 2, 3], [4, 5, 6]])
dY

array([[1, 2, 3],
       [4, 5, 6]])

In [91]:
dB = np.sum(dY, axis=0)
dB

array([5, 7, 9])

In [92]:
# バッチ版に対応し,入力も4次元（テンソル）に対応したAffineレイヤの実装
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None
        
    def forward(self, x):
        self.x = x
        out = np.dot(x, self.W) + self.b
        return out
    
    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx

In [93]:
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T 

    x = x - np.max(x) # オーバーフロー対策
    return np.exp(x) / np.sum(np.exp(x))

In [94]:
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    # 教師データがone-hot-vectorの場合、正解ラベルのインデックスに変換
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t])) / batch_size

In [95]:
# Softmax with Loss(cross entropy error)レイヤの実装
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None # 損失
        self.y = None # softmaxの出力
        self.t = None # 教師データ(one-hot vector)
        
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss
    
    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx

In [96]:
# 勾配
# x0とx1の偏微分をまとめて計算
def numerical_gradient(f, x):
    h = 1e-4
    shape = None
    if (x.ndim != 1):
        shape = x.shape
        x = x.reshape(1, -1)
        x = x[0]
    #print(x)
    grad = np.zeros_like(x) # xと同じ形状
    #print(grad)
    # 要素それぞれに微分をする
    for idx in range(x.size):
        tmp_val = x[idx]
        # f(x+h)の計算
        x[idx] = tmp_val + h # 微分される側の変数
        fxh1 = f(x) # もう一方の変数は固定
        
        # f(x-h)の計算
        x[idx] = tmp_val - h
        fxh2 = f(x)
        
        grad[idx] = (fxh1 - fxh2) / (2*h)
        x[idx] = tmp_val # 値を元に戻す
        
    if (shape is not None):
        grad = grad.reshape(shape)
        
    return grad

In [117]:
# 活性化関数レイヤの実装
class ReLu:
    def __init__(self):
        self.mask = None
    
    def forward(self, x):
        self.mask = (x <= 0) # X<=0の要素に対して、Trueのラベリング
        out = x.copy()
        out[self.mask] = 0 # Trueのラベリングした出力は0にする
        return out
    
    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout # x, out, doutの次元は同じ
        return dx

In [126]:
# 誤差逆伝播法を実装したTwoLayerNetクラスの実装
from collections import OrderedDict

class TwoLayerNet:
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # 重みの初期化
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)
        
        # レイヤの生成
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['ReLu'] = ReLu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        self.lastLayer = SoftmaxWithLoss()
        
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x
    
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim !=1 :
            t = np.argmax(t, axis=1) # one-hot-vectorからlabel-vectorに変換   
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
    
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads
    
    def gradient(self, x, t):
        # 勾配を求めるために、誤差逆伝播法を実装
        
        # forward -> 各レイヤーにパラメータ類を保存させて、出力を得る
        self.loss(x, t)
        
        # backward -> 逆誤差伝播法で各レイヤのパラメータの勾配を保存
        dout = 1
        dout = self.lastLayer.backward(dout) # SoftmaxWithLoss
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
        
        # 勾配を取得
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        
        return grads

In [127]:
# 勾配確認：　数値微分による勾配と誤差逆伝播法による値を比較

import numpy as np
from mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

x_batch = x_train[:3]
t_batch = t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# 各重みの絶対誤差の平均を求める
for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))

W1:2.725278948960078e-13
b1:1.015198259658734e-12
W2:9.30882779117282e-13
b2:1.1968202401346771e-10


In [128]:
# 誤差逆伝播法を使ったNNの学習
import numpy as np
from mnist import load_mnist

# データ (MNIST)
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# NN
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# 学習条件
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

# 結果
train_loss_list = []
train_acc_list = []
test_acc_list = []

# エポック
iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    # バッチ学習
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # 誤差逆伝播法によって勾配を求める
    grad = network.gradient(x_batch, t_batch)
    
    # パラメータの更新
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
        
    # 誤差
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    # エポック毎に精度の確認
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("training-iter %d train_acc %f, test_acc %f", i, test_acc, test_acc)

0.09406666666666667 0.0954
0.9060166666666667 0.911
0.9224 0.924
0.9348333333333333 0.9346
0.944 0.9434
0.9508166666666666 0.9496
0.95595 0.9539
0.9585666666666667 0.9577
0.9614166666666667 0.956
0.9662166666666666 0.9629
0.96855 0.9639
0.9707166666666667 0.9646
0.97345 0.9656
0.9763333333333334 0.9684
0.97685 0.969
0.9781666666666666 0.9692
0.97965 0.9723
