In [44]:
# report 04 2021.6.23
import numpy as np
from common.gradient import numerical_gradient
from common.functions import sigmoid, softmax, sigmoid_grad
from dataset.mnist import load_mnist
def cross_entropy_error(y, t):
    """Return the mean cross-entropy loss of predictions ``y`` against targets ``t``.

    Args:
        y: Predicted probabilities. A 1-D array is treated as a batch of one row.
        t: Targets, either one-hot encoded (same number of elements as ``y``)
            or given as class indices.

    Returns:
        The negative log of the probability assigned to each target class,
        averaged over the batch.
    """
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    # When the targets are one-hot vectors, convert them to class indices.
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    # Probability the model assigned to the correct class of every sample.
    target_prob = y[np.arange(batch_size), t]
    # Floor the probabilities so that an exact 0 does not produce log(0) = -inf;
    # values at or above the floor are left untouched.
    min_prob = 1e-12
    return -np.sum(np.log(np.maximum(target_prob, min_prob))) / batch_size

In [45]:
class SoftmaxWithLoss:
    """Output layer that applies softmax and computes the cross-entropy loss."""

    def __init__(self):
        self.loss = None
        self.y = None  # softmax output cached by forward()
        self.t = None  # target labels cached by forward()

    def forward(self, x, t):
        """Cache the softmax of ``x`` and the targets ``t``, and return the loss.

        Forward formula: -sum(t * log(y))
        """
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the input of forward().

        Backward formula: (y_i - t_i) / batch_size, scaled by ``dout``.

        Args:
            dout: Upstream gradient; the default of 1 leaves the result unscaled.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            # One-hot targets: the gradient is simply y - t.
            dx = (self.y - self.t) / batch_size
        else:
            # Class-index targets: subtract 1 at the target position of each row.
            # A plain ``y - t`` would broadcast here and give a wrong gradient.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx * dout

In [49]:
class TwoLayerNet:
    """Two-layer fully connected network: Affine -> ReLU -> Affine -> softmax loss."""

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and build the layer stack.

        Weights are drawn with ``np.random.randn`` and scaled by
        ``weight_init_std``; biases start at zero.
        """
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # predict() walks the layers in insertion order, gradient() in reverse.
        # The layers receive the very same arrays stored in self.params.
        self.layers = OrderedDict()
        self.layers["Affine1"] = Affine(self.params['W1'], self.params['b1'])
        self.layers["Relu1"] = ReLU()
        self.layers["Affine2"] = Affine(self.params['W2'], self.params['b2'])
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer except the loss layer and return the scores."""
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        """Return the loss of the network on inputs ``x`` with targets ``t``."""
        y = self.predict(x)
        return self.lastLayer.forward(y, t)

    def accuracy(self, x, t):
        """Return the fraction of rows in ``x`` whose predicted class matches ``t``.

        ``t`` may be one-hot encoded (2-D) or a 1-D array of class indices.
        """
        y = np.argmax(self.predict(x), axis=1)
        # Only one-hot targets need converting; 1-D targets are already indices.
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        return np.sum(y == t) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return gradients of the loss for every parameter via numerical differentiation."""
        # The argument is ignored: the loss is always evaluated on the current
        # contents of self.params (presumably the imported numerical_gradient
        # helper perturbs the passed array in place - confirm in common.gradient).
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        for key in ("W1", "b1", "W2", "b2"):
            grads[key] = numerical_gradient(loss_W, self.params[key])
        return grads

    def gradient(self, x, t):
        """Return gradients of the loss for every parameter via backpropagation."""
        # Forward pass so that every layer caches what its backward pass needs.
        self.loss(x, t)

        # Backward pass: loss layer first, then the remaining layers in reverse.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        grads = {}
        grads['W1'] = self.layers["Affine1"].dW
        grads['b1'] = self.layers["Affine1"].db
        grads['W2'] = self.layers["Affine2"].dW
        grads['b2'] = self.layers["Affine2"].db
        return grads

class Affine:
    """Fully connected layer computing ``np.dot(x, W) + b``."""

    def __init__(self, W, b):
        # The parameter arrays are stored as given, without copying.
        self.W = W
        self.b = b
        # Input cached by forward() and gradients produced by backward().
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        """Remember the input and return the affine transform of it."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store the weight and bias gradients and return the input gradient."""
        self.dW = np.dot(self.x.T, dout)
        # The bias gradient is the upstream gradient summed over axis 0.
        self.db = np.sum(dout, axis=0)
        return np.dot(dout, self.W.T)

class ReLU:
    """Rectified linear unit: zeroes every non-positive element."""

    def __init__(self):
        # Boolean array marking the elements that forward() zeroed out.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with all elements <= 0 replaced by 0."""
        non_positive = (x <= 0)
        self.mask = non_positive
        activated = x.copy()
        activated[non_positive] = 0
        return activated

    def backward(self, dout):
        """Return a copy of ``dout`` zeroed wherever the forward input was <= 0."""
        grad = dout.copy()
        grad[self.mask] = 0
        return grad

import numpy as np
import sys, os
sys.path.append(os.pardir)
from dataset.mnist import load_mnist
from common.gradient import numerical_gradient
from collections import OrderedDict
# Load the data. The official site returned 503, so this was run against the
# mirror https://storage.googleapis.com/cvdf-datasets/mnist/ instead.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Gradient check on the first three training samples only.
x_batch, t_batch = x_train[:3], t_train[:3]

# Gradients by numerical differentiation
grad_numerical = network.numerical_gradient(x_batch, t_batch)
# Gradients by backpropagation
grad_backprop = network.gradient(x_batch, t_batch)

# Report the mean absolute difference between the two results per parameter.
for key in grad_numerical:
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))





W1:2.8279509313595957e-13
b1:1.0349251354990415e-12
W2:1.0053347744707535e-12
b2:1.1990409082285326e-10


## よくわからん

In [50]:
# Repeat the gradient check with a freshly initialised network.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
x_batch, t_batch = x_train[:3], t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# Mean absolute difference between backprop and numerical gradients per parameter.
for key in grad_numerical:
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))

W1:1.1432583474035328e-06
b1:1.2818677594115318e-05
W2:1.0301085083942816e-12
b2:1.1968204205459188e-10


In [None]:
def gradient(network, x, t):
    """Compute the parameter gradients of ``network`` by backpropagation.

    The loss layer is a locally created ``SoftmaxWithLoss`` rather than the
    network's own last layer.

    Args:
        network: Object exposing ``predict(x)`` and a ``layers`` mapping that
            contains 'Affine1' and 'Affine2' entries with ``dW``/``db`` attributes.
        x: Input batch.
        t: Targets for the batch.

    Returns:
        Dict with the keys 'W1', 'b1', 'W2' and 'b2'.
    """
    # Use the hand-written SoftmaxWithLoss class as the loss layer.
    lastLayer = SoftmaxWithLoss()

    # forward: the pass must go through this lastLayer instance so that it
    # caches the softmax output and the targets its backward() reads.
    lastLayer.forward(network.predict(x), t)

    # backward: loss layer first, then the network layers in reverse order.
    dout = lastLayer.backward(1)
    for layer in reversed(list(network.layers.values())):
        dout = layer.backward(dout)

    # Collect the gradients stored on the affine layers.
    grads = {}
    grads['W1'], grads['b1'] = network.layers['Affine1'].dW, network.layers['Affine1'].db
    grads['W2'], grads['b2'] = network.layers['Affine2'].dW, network.layers['Affine2'].db
    return grads

In [52]:
import numpy as np
from dataset.mnist import load_mnist
# Load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Training settings
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

# Histories: loss per iteration, accuracies per epoch
train_loss_list = []
train_acc_list = []
test_acc_list = []

# Iterations per epoch. Integer division keeps the `i % iter_per_epoch == 0`
# check below exact; with float division it would only fire when train_size
# happens to be an exact multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch (np.random.choice samples with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradients by backpropagation
    grad = network.gradient(x_batch, t_batch)

    # Gradient-descent update, in place, so the layers holding the same
    # arrays see the new values.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch, record and print accuracy on the full train and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.07441666666666667 0.0723
0.9053666666666667 0.9099
0.9243833333333333 0.9267
0.9328833333333333 0.9316
0.9447333333333333 0.9426
0.9513333333333334 0.948
0.9565666666666667 0.9537
0.96185 0.9568
0.9635666666666667 0.9593
0.96735 0.9637
0.9699166666666666 0.9624
0.97275 0.966
0.9749833333333333 0.9687
0.9759666666666666 0.9689
0.9759 0.9672
0.9788166666666667 0.97
0.9789833333333333 0.9702


### なんもわからんかったので精進したい

参考
[GitHub1](https://github.com/statpng/DL-books/blob/cd1194c07338191b5058fe75fcbd9a2285c8331e/%5Bdeep-learning-from-scratch%5D_3.py)
[GitHub検索](https://github.com/search?q=numerical_gradient%28&type=Code)