# Previous code

In [1]:
import numpy as np

class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None
        
    def forward(self, x):
        self.x = x
        out = np.dot(x, self.W) + self.b
        
        return out
    
    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        
        return dx
    
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + 1e-7)) / batch_size

class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None
        
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        
        return self.loss
    
    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        
        return dx
    
class Relu:
    def __init__(self):
        self.mask = None
        
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy() # x > 0
        out[self.mask] = 0 # x <= 0
        
        return out
    
    def backward(self, dout):
        dout[self.mask] = 0 # x <= 0
        dx = dout # x > 0
        
        return dx
    
def softmax(x):
    C = np.max(x, axis=1).reshape(x.shape[0], 1)
    e = np.exp(x - C)
    s = np.sum(e, axis=1).reshape(x.shape[0], 1)
    
    return e / s

In [2]:
from collections import OrderedDict

class TwoLayerNet:
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = weight_init_std * np.random.randn(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = weight_init_std * np.random.randn(output_size)
        
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        
        self.lastLayer = SoftmaxWithLoss()
        
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
            
        return x
    
    def loss(self, x, t):
        y = self.predict(x)
        
        return self.lastLayer.forward(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
            
        accuracy = np.sum(y == t) / float(x.shape[0])
        
        return accuracy
    
    def gradient(self, x, t):
        self.loss(x, t)
        
        dout = 1
        dout = self.lastLayer.backward(dout)
        
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
            
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        
        return grads

# SGD

In [3]:
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr
        
    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key]

In [4]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784')

dat = mnist.data.to_numpy()
tar = mnist.target.to_numpy()

x_train = dat[:60000,]
x_test = dat[60000:,]
t_train = tar[:60000]
t_test = tar[60000:]

I = np.eye(10)
t_train = I[t_train.astype(int)]
t_test = I[t_test.astype(int)]

  warn(


In [7]:
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
optimizer = SGD()

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = 300

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    x_batch = (x_batch - x_batch.mean()) / x_batch.std()
    
    grad = network.gradient(x_batch, t_batch)
    
    params = network.params
    optimizer.update(params, grad)
    
    #for key in ('W1', 'b1', 'W2', 'b2'):
    #    network.params[key] -= learning_rate * grad[key]
        
    #loss = network.loss(x_batch, t_batch)
    #train_loss_list.append(loss)
    
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc : " + str(train_acc) + " | " + str(test_acc))

train acc, test acc : 0.17291666666666666 | 0.1753
train acc, test acc : 0.7432 | 0.7499
train acc, test acc : 0.82205 | 0.8297
train acc, test acc : 0.8356833333333333 | 0.8431
train acc, test acc : 0.8358 | 0.8423
train acc, test acc : 0.8390666666666666 | 0.8473
train acc, test acc : 0.8455166666666667 | 0.8539
train acc, test acc : 0.8560833333333333 | 0.865
train acc, test acc : 0.8578666666666667 | 0.8671
train acc, test acc : 0.87075 | 0.8782
train acc, test acc : 0.8794 | 0.8841
train acc, test acc : 0.8813666666666666 | 0.8881
train acc, test acc : 0.8897833333333334 | 0.8942
train acc, test acc : 0.88995 | 0.8942
train acc, test acc : 0.9001833333333333 | 0.9036
train acc, test acc : 0.8955666666666666 | 0.8975
train acc, test acc : 0.9059 | 0.9083
train acc, test acc : 0.9013333333333333 | 0.9042
train acc, test acc : 0.9146 | 0.9164
train acc, test acc : 0.9163166666666667 | 0.9178
train acc, test acc : 0.9228166666666666 | 0.9222
train acc, test acc : 0.9213833333333333 | 

# Momentum

In [8]:
class Momentum:
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None
        
    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)
        
        for key in params.keys():
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            
            params[key] += self.v[key]

In [10]:
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
optimizer = Momentum()

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = 300

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    x_batch = (x_batch - x_batch.mean()) / x_batch.std()
    
    grad = network.gradient(x_batch, t_batch)
    
    params = network.params
    optimizer.update(params, grad)
    
    #for key in ('W1', 'b1', 'W2', 'b2'):
    #    network.params[key] -= learning_rate * grad[key]
        
    #loss = network.loss(x_batch, t_batch)
    #train_loss_list.append(loss)
    
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc : " + str(train_acc) + " | " + str(test_acc))

train acc, test acc : 0.12826666666666667 | 0.1318
train acc, test acc : 0.86565 | 0.8723
train acc, test acc : 0.9063166666666667 | 0.9096
train acc, test acc : 0.9356333333333333 | 0.9388
train acc, test acc : 0.9375666666666667 | 0.9373
train acc, test acc : 0.9449166666666666 | 0.9438
train acc, test acc : 0.94175 | 0.9427
train acc, test acc : 0.9559166666666666 | 0.9517
train acc, test acc : 0.95435 | 0.9523
train acc, test acc : 0.95275 | 0.9471
train acc, test acc : 0.94395 | 0.9379
train acc, test acc : 0.9570666666666666 | 0.9529
train acc, test acc : 0.9643166666666667 | 0.9562
train acc, test acc : 0.9562 | 0.9477
train acc, test acc : 0.9616 | 0.9561
train acc, test acc : 0.9536666666666667 | 0.9468
train acc, test acc : 0.9633833333333334 | 0.9539
train acc, test acc : 0.9647333333333333 | 0.9536
train acc, test acc : 0.9568166666666666 | 0.9484
train acc, test acc : 0.9548166666666666 | 0.9453
train acc, test acc : 0.9655833333333333 | 0.9569
train acc, test acc : 0.9648

# AdaGrad

In [11]:
class AdaGrad:
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None
        
    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)
                
        for key in params.keys():
            self.h[key] += grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)

In [12]:
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
optimizer = AdaGrad()

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = 300

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    x_batch = (x_batch - x_batch.mean()) / x_batch.std()
    
    grad = network.gradient(x_batch, t_batch)
    
    params = network.params
    optimizer.update(params, grad)
    
    #for key in ('W1', 'b1', 'W2', 'b2'):
    #    network.params[key] -= learning_rate * grad[key]
        
    #loss = network.loss(x_batch, t_batch)
    #train_loss_list.append(loss)
    
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc : " + str(train_acc) + " | " + str(test_acc))

train acc, test acc : 0.24983333333333332 | 0.2506
train acc, test acc : 0.8684333333333333 | 0.875
train acc, test acc : 0.8875833333333333 | 0.8936
train acc, test acc : 0.9081833333333333 | 0.9084
train acc, test acc : 0.9075166666666666 | 0.9111
train acc, test acc : 0.9115333333333333 | 0.9166
train acc, test acc : 0.9204333333333333 | 0.9225
train acc, test acc : 0.9243666666666667 | 0.9254
train acc, test acc : 0.93125 | 0.9311
train acc, test acc : 0.9315 | 0.9324
train acc, test acc : 0.9333666666666667 | 0.9329
train acc, test acc : 0.9385666666666667 | 0.9377
train acc, test acc : 0.9407666666666666 | 0.9415
train acc, test acc : 0.9398833333333333 | 0.9413
train acc, test acc : 0.9402666666666667 | 0.9406
train acc, test acc : 0.9431333333333334 | 0.943
train acc, test acc : 0.94435 | 0.9444
train acc, test acc : 0.9450333333333333 | 0.9453
train acc, test acc : 0.94365 | 0.9429
train acc, test acc : 0.9447333333333333 | 0.9432
train acc, test acc : 0.9443 | 0.9437
train ac