2210241 上坂力輝 情報工学工房 第3回課題

In [1]:
import os
import sys
from collections import OrderedDict

import numpy as np
import numpy as num

sys.path.append(os.pardir)
from common.layers import *
from common.gradient import numerical_gradient
from dataset.mnist import load_mnist
#os.environ['HTTP_PROXY']="http://proxy.uec.ac.jp:8080"
#os.environ['HTTPS_PROXY']="http://proxy.uec.ac.jp:8080"

1.Softmax with lossレイヤーを実装する

In [2]:
# Softmax implementation
def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    Args:
        x: Input scores. A 1-D array is treated as a single sample; for a
            2-D array every row is normalised independently.

    Returns:
        An array with the same shape as ``x`` whose entries along the last
        axis are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the per-sample maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(shifted)
    # Normalise along the last axis (the built-in sum() would reduce a batch
    # along axis 0 and mix different samples together).
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

# Cross-entropy error implementation
def closs_entropy_error(y, t):
    """Return the mean cross-entropy error of predictions ``y`` against ``t``.

    Args:
        y: Predicted probabilities, either 1-D (one sample) or 2-D with one
            row per sample.
        t: Target data, given either as integer class indices (one per
            sample) or as one-hot vectors with the same number of elements
            as ``y``.

    Returns:
        The cross-entropy summed over the samples and divided by the number
        of samples.
    """
    y = np.asarray(y)
    t = np.asarray(t)
    if y.ndim == 1:
        # Promote a single sample to a batch of one.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    batch_size = y.shape[0]
    if t.size == y.size:
        # One-hot targets: convert to the index of the correct class.
        t = t.reshape(y.shape).argmax(axis=1)
    else:
        # Label targets: flatten so each sample has exactly one index.
        t = t.reshape(batch_size)

    correct_prob = y[np.arange(batch_size), t]
    # The small constant keeps log() finite when a probability is exactly 0.
    return -np.sum(np.log(correct_prob + 1e-7)) / batch_size

In [3]:
# Softmax-with-loss layer implementation
# NOTE(review): the class name is spelled `SofttmaxWithLoss`, while TwoLayerNet
# instantiates `SoftmaxWithLoss`, which is not defined in this file and
# presumably comes from `from common.layers import *` - confirm which
# implementation is meant to be used.
class SofttmaxWithLoss:
    """Softmax activation followed by cross-entropy loss."""

    def __init__(self):
        self.loss = None  # loss value from the last forward pass
        self.y = None     # softmax output from the last forward pass
        self.t = None     # target data from the last forward pass

    def forward(self, x, t):
        """Compute and store the loss for scores ``x`` and targets ``t``.

        Args:
            x: Input scores passed to ``softmax``.
            t: Target data passed to ``closs_entropy_error``.

        Returns:
            The loss value, which is also stored in ``self.loss``.
        """
        self.t = t
        self.y = softmax(x)
        self.loss = closs_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the forward input.

        Supports targets stored either as one-hot vectors (same number of
        elements as the softmax output) or as integer class indices.

        Args:
            dout: Upstream gradient that scales the result (default 1).

        Returns:
            An array with the same shape as the stored softmax output.
        """
        y = np.asarray(self.y)
        t = np.asarray(self.t)
        if y.ndim == 1:
            # A single sample is a batch of one, so it must be divided by 1,
            # not by the number of classes.
            y = y.reshape(1, y.size)
            t = t.reshape(1, t.size)

        batch_size = y.shape[0]
        if t.size == y.size:
            # One-hot targets.
            dx = y - t.reshape(y.shape)
        else:
            # Label targets: subtract 1 at each sample's correct class.
            dx = y.copy()
            dx[np.arange(batch_size), t.reshape(batch_size)] -= 1

        dx = dx * dout / batch_size
        return dx.reshape(np.shape(self.y))

2.Two layer netにおける勾配の確認

In [6]:
# Two-layer network
class TwoLayerNet:
    """Network made of Affine -> Relu -> Affine layers plus a loss layer.

    The parameter arrays live in ``self.params`` and the very same array
    objects are handed to the Affine layers.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and the layers.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output units.
            weight_init_std: Scale applied to the random normal weights.
        """
        # Parameter initialisation: scaled random weights, zero biases.
        w1 = weight_init_std * np.random.randn(input_size, hidden_size)
        b1 = np.zeros(hidden_size)
        w2 = weight_init_std * np.random.randn(hidden_size, output_size)
        b2 = np.zeros(output_size)
        self.params = {'W1': w1, 'b1': b1, 'W2': w2, 'b2': b2}

        # Layer construction; insertion order is the forward order.
        self.layers = OrderedDict([
            ('Affine1', Affine(w1, b1)),
            ('Relu1', Relu()),
            ('Affine2', Affine(w2, b2)),
        ])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Feed ``x`` through every layer in order and return the result."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss-layer output for input ``x`` and targets ``t``."""
        scores = self.predict(x)
        return self.lastLayer.forward(scores, t)

    def accuracy(self, x, t):
        """Return the fraction of rows in ``x`` whose argmax matches ``t``.

        ``t`` may be 1-D class indices; otherwise its argmax along axis 1 is
        used as the label.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        n_correct = np.sum(predicted == labels)
        return n_correct / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return gradients from the module-level ``numerical_gradient`` helper.

        Returns:
            dict with keys 'W1', 'b1', 'W2', 'b2'.
        """
        def loss_W(W):
            # The argument is unused: the loss is re-evaluated through
            # self.params, which holds the array passed to the helper.
            return self.loss(x, t)

        return {
            name: numerical_gradient(loss_W, self.params[name])
            for name in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return gradients obtained by a forward pass and a backward pass.

        Returns:
            dict with keys 'W1', 'b1', 'W2', 'b2' read from the ``dW``/``db``
            attributes of the two Affine layers.
        """
        # Forward pass.
        self.loss(x, t)

        # Backward pass: loss layer first, then the layers in reverse order.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients stored on the Affine layers.
        affine1 = self.layers['Affine1']
        affine2 = self.layers['Affine2']
        return {
            'W1': affine1.dW,
            'b1': affine1.db,
            'W2': affine2.dW,
            'b2': affine2.db,
        }

In [7]:
# Gradient check: compare backpropagation gradients with numerical ones
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
x_batch = x_train[:3]
t_batch = t_train[:3]

# Numerical differentiation (evaluated once; the original ran this call twice
# with identical arguments and discarded the first result)
grad_numerical = network.numerical_gradient(x_batch, t_batch)

# Backpropagation
grad_backprop = network.gradient(x_batch, t_batch)

# Mean absolute difference between the two gradients for each parameter
for key in grad_numerical:
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))

W1:4.785245830260185e-10
b1:2.9293192807511822e-09
W2:6.450463716742543e-09
b2:1.3963579606618426e-07


3.Two layer netの学習をする

In [10]:
# Train the two-layer network on MNIST with mini-batch gradient descent
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Integer number of iterations per epoch. Floor division keeps the modulo
# test below exact even when train_size is not a multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradients by backpropagation
    grad = network.gradient(x_batch, t_batch)

    # Parameter update (in place, so the arrays referenced by the layers
    # change as well)
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Record and print the accuracies once per epoch
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.13056666666666666 0.1349
0.9032333333333333 0.9054
0.9252 0.9275
0.9384 0.9388
0.9471833333333334 0.946
0.9551333333333333 0.9516
0.9595333333333333 0.9565
0.9629666666666666 0.9612
0.9665833333333333 0.9617
0.9653166666666667 0.9616
0.9714 0.9647
0.9727333333333333 0.9667
0.9744166666666667 0.968
0.97565 0.9687
0.9768166666666667 0.9697
0.97915 0.9704
0.9799666666666667 0.9696


感想

計算グラフを用いた誤差逆伝播では、計算をノードとエッジで表すため、複雑な計算でも各ノードの局所的な計算に分けて簡単に考えられることが分かった。また、順伝播時の各計算結果を保持しておくことで、逆伝播の計算がしやすくなることも分かった。

参考文献

斎藤 康毅,ゼロから作るDeep Learning―Pythonで学ぶディープラーニングの理論と実装,2016年09月,オライリージャパン,https://www.oreilly.co.jp/books/9784873117584/