In [1]:
import sys,os
sys.path.append(os.pardir)  # 부모 디렉터리의 파일을 가져올 수 있도록 설정
import numpy as np
from collections import OrderedDict
from common.layers import *
from common.gradient import numerical_gradient

class TwoLayerNet:
    '''Two-layer network: Affine1 -> Relu1 -> Affine2, followed by SoftmaxWithLoss.'''
    def __init__(self, input_size, 
                 hidden_size, output_size, weight_init_std=0.01):
        '''
        Create the parameters and the layer pipeline.
        Params:
            - input_size: number of input-layer neurons
            - hidden_size: number of hidden-layer neurons
            - output_size: number of output-layer neurons
            - weight_init_std: scale applied to the standard-normal weight samples
        '''
        # Weights are scaled standard-normal samples; biases start at zero.
        # W1 is sampled before W2 so the random stream is consumed in that order.
        W1 = weight_init_std * np.random.randn(input_size, hidden_size)
        b1 = np.zeros(hidden_size)
        W2 = weight_init_std * np.random.randn(hidden_size, output_size)
        b2 = np.zeros(output_size)
        self.params = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

        # The layers receive the very same array objects stored in self.params.
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(W1, b1)
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(W2, b2)

        self.last_layer = SoftmaxWithLoss()

    def predict(self, x):
        '''
        Run x forward through every layer in self.layers (the loss layer is excluded).
        Params:
            - x: input data (described as image data in the original notes)
        Returns the output of the final layer in self.layers.
        '''
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        '''
        Compute the loss value for inputs x against targets t.
        Params:
            - x: input data
            - t: target labels
        '''
        scores = self.predict(x)
        return self.last_layer.forward(scores, t)

    def accuracy(self, x, t):
        '''
        Fraction of rows whose arg-max prediction equals the target.
        Params:
            - x: input data
            - t: target labels; when t is not 1-D it is reduced with argmax over axis 1
        '''
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        '''
        Gradient of the loss w.r.t. each parameter, via the module-level
        numerical_gradient helper.
        Params:
            - x: input data
            - t: target labels
        Returns a dict with keys 'W1', 'b1', 'W2', 'b2'.
        '''
        # The argument is ignored: the helper is handed the parameter arrays
        # themselves, and the loss is simply re-evaluated on self.
        def loss_W(W):
            return self.loss(x, t)

        grads = {}
        for key in ('W1', 'b1', 'W2', 'b2'):
            grads[key] = numerical_gradient(loss_W, self.params[key])
        return grads

    def gradient(self, x, t):
        '''
        Gradient of the loss w.r.t. each parameter, via backpropagation.
        Params:
            - x: input data
            - t: target labels
        Returns a dict with keys 'W1', 'b1', 'W2', 'b2'.
        '''
        # Forward pass: called for its side effects on the layers' cached state.
        self.loss(x, t)

        # Backward pass: start from the loss layer, then walk the layers in reverse.
        dout = self.last_layer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients each Affine layer stored during backward().
        affine1 = self.layers['Affine1']
        affine2 = self.layers['Affine2']
        return {
            'W1': affine1.dW, 'b1': affine1.db,
            'W2': affine2.dW, 'b2': affine2.db
        }

In [2]:
%%time
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist

# Load MNIST (normalize=True is forwarded to load_mnist as-is).
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

network = TwoLayerNet(input_size=28*28, hidden_size=50, output_size=10)

# Only the first three samples are used for the gradient check.
x_batch, t_batch = x_train[:3], t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# Report, per parameter, the mean absolute difference between the two gradients.
for key in grad_numerical:
    diff = np.mean(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key, ":", str(diff))

W1 : 3.864104453654888e-10
b1 : 2.3828968641836055e-09
W2 : 4.639392628674223e-09
b2 : 1.394510603394683e-07
Wall time: 5.25 s


# 어떻게 구동되는지 체크!

In [36]:
# Network dimensions and weight-initialisation scale used by the walkthrough cells below.
input_size = 28 * 28
hidden_size = 50
output_size = 10
weight_init_std = .01
print(input_size, hidden_size, output_size, weight_init_std)

784 50 10 0.01


In [37]:
W1 = weight_init_std * np.random.randn(input_size, hidden_size)
W1, W1.shape

(array([[-0.00341426, -0.00519704,  0.0008543 , ..., -0.00630821,
         -0.01050883,  0.0242224 ],
        [ 0.01380829,  0.00718157,  0.018027  , ...,  0.01140965,
         -0.0189462 ,  0.00074538],
        [ 0.01996407, -0.02190384, -0.01841735, ..., -0.02601784,
          0.00966829, -0.00208664],
        ...,
        [-0.0129087 , -0.00222335,  0.02354596, ...,  0.01295494,
          0.00692917,  0.00715465],
        [ 0.01750363, -0.0083064 ,  0.00989071, ...,  0.01260056,
          0.01552843, -0.00333524],
        [-0.01494274, -0.00983312, -0.01817081, ..., -0.00294322,
          0.0044816 , -0.01783037]]), (784, 50))

In [38]:
b1 = np.zeros(hidden_size)
b1, b1.shape

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 (50,))

In [39]:
W2 = weight_init_std * np.random.randn(hidden_size, output_size)
W2, W2.shape

(array([[-9.81194685e-03, -1.66586678e-02, -3.52309052e-03,
          6.09391991e-03,  4.88168477e-03,  5.51387926e-03,
          9.11183356e-03,  8.23754800e-03, -6.71383723e-03,
          1.01050152e-02],
        [-4.36406643e-03, -1.40617591e-03, -2.84864604e-03,
         -8.35538470e-03,  5.37182161e-03, -2.68788305e-03,
         -1.61904515e-02,  5.88367122e-03, -8.87600082e-03,
         -1.87540280e-03],
        [ 4.10276622e-03,  1.48087151e-03,  1.04899050e-02,
          1.15717847e-02, -1.07847112e-02,  8.09617076e-03,
         -4.53152028e-03,  1.60437035e-02, -1.05091785e-02,
         -1.36420232e-03],
        [-5.50599219e-03, -5.50648731e-03,  1.90440022e-03,
         -8.65216219e-03,  2.56673991e-03, -2.57621223e-03,
          1.03967033e-02, -1.49734761e-02,  7.18339054e-03,
          6.70338799e-03],
        [ 1.68583000e-02, -1.66124053e-03,  6.54760705e-03,
         -1.66367058e-02, -3.44755038e-03,  1.37835605e-02,
         -1.16471379e-02, -7.84328148e-04, -1.636366

In [40]:
b2 = np.zeros(output_size)
b2, b2.shape

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), (10,))

In [41]:
class Affine:
    """Affine layer computing ``x . W + b`` and caching what backward() needs."""

    def __init__(self, W, b):
        """Store the weight ``W`` and bias ``b`` by reference (no copies are made)."""
        self.W, self.b = W, b

        # Input cached by forward(), flattened to 2-D, plus its original shape.
        self.x = None
        self.original_x_shape = None
        # Gradients w.r.t. the weight and bias, filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Flatten ``x`` to ``(x.shape[0], -1)`` and return ``x . W + b``."""
        # Remember the incoming shape so backward() can restore it (tensor support).
        self.original_x_shape = x.shape
        self.x = x.reshape(x.shape[0], -1)
        return np.dot(self.x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW``/``db`` and return the gradient w.r.t. the input.

        The returned array is reshaped to the shape of the ``x`` that was
        passed to the most recent forward() call.
        """
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return np.dot(dout, self.W.T).reshape(self.original_x_shape)

In [42]:
class Relu:
    """ReLU activation layer.

    forward() zeroes every element that is <= 0 and remembers where those
    elements were; backward() zeroes the gradient at the same positions.
    """

    def __init__(self):
        # Boolean array, True where the last forward() input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with all elements <= 0 replaced by 0."""
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return ``dout`` with entries zeroed where the forward input was <= 0.

        The result is a new array: the caller's ``dout`` is left untouched
        (previously it was modified in place, which silently corrupted any
        other reference the caller still held to that array).
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

In [43]:
def cross_entropy_error(y, t):
    """Mean cross-entropy of the probabilities ``y`` against the targets ``t``.

    ``t`` may hold class indices or one-hot rows; it is treated as one-hot
    when it has the same number of elements as ``y``. A 1-D ``y`` is handled
    as a batch containing a single sample.
    """
    if y.ndim == 1:
        # Promote a single sample to a batch of one.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    if t.size == y.size:
        # One-hot targets: reduce each row to the index of its largest entry.
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    rows = np.arange(batch_size)
    # 1e-7 keeps log() finite when a selected probability is exactly 0.
    log_likelihood = np.log(y[rows, t] + 1e-7)
    return -np.sum(log_likelihood) / batch_size

def softmax(x):
    """Softmax of ``x``.

    A 2-D ``x`` is normalised row by row. Any other dimensionality is
    normalised over all elements at once. The maximum is subtracted before
    exponentiating to guard against overflow.
    """
    if x.ndim != 2:
        shifted = x - np.max(x)  # overflow guard
        exps = np.exp(shifted)
        return exps / np.sum(exps)

    # Work on the transpose so each sample is a column, then transpose back.
    cols = x.T
    cols = cols - np.max(cols, axis=0)
    exps = np.exp(cols)
    return (exps / np.sum(exps, axis=0)).T

class SoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, with a combined backward pass."""

    def __init__(self):
        self.loss = None # loss value from the last forward()
        self.y = None    # softmax output from the last forward()
        self.t = None    # targets from the last forward() (class indices or one-hot)
        
    def forward(self, x, t):
        """Apply softmax to ``x``, compare with ``t`` and return the loss.

        Stores ``t``, the softmax output and the loss on the instance so that
        backward() can use them.
        """
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss w.r.t. the input of forward().

        Params:
            - dout: upstream gradient of the loss value (default 1). It scales
              the result; previously it was accepted but ignored.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size: # targets are one-hot encoded
            dx = (self.y - self.t) / batch_size
        else:
            # Targets are class indices: subtract 1 at each sample's target class.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        
        # Chain rule: propagate the upstream gradient (a no-op for the default 1).
        return dx * dout

In [44]:
from collections import OrderedDict

# Bundle the standalone arrays created above; the dict stores references, not copies.
params = {
    'W1': W1,
    'b1': b1,
    'W2': W2,
    'b2': b2,
}

# Build the pipeline in forward order: Affine1 -> Relu1 -> Affine2.
layers = OrderedDict()
layers['Affine1'] = Affine(params['W1'], params['b1'])
layers['Relu1'] = Relu()
layers['Affine2'] = Affine(params['W2'], params['b2'])

last_layer = SoftmaxWithLoss()

In [46]:
layers

OrderedDict([('Affine1', <__main__.Affine at 0x20189db16a0>),
             ('Relu1', <__main__.Relu at 0x20189db1400>),
             ('Affine2', <__main__.Affine at 0x20189db1518>)])

In [47]:
last_layer

<__main__.SoftmaxWithLoss at 0x20189db10f0>

In [48]:
x_train.shape

(60000, 784)

In [49]:
x_batch, t_batch, x_batch.shape, t_batch.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 array([5, 0, 4], dtype=uint8),
 (3, 784),
 (3,))

In [266]:
# self.loss
## loss_W = lambda W : loss(x,t)
def predict(x):
    '''Feed x through each layer of the global `layers`, in order, and return the result.'''
    out = x
    for name in layers:
        out = layers[name].forward(out)
    return out
 
def loss(x, t):
    '''
    Compute the loss value using the global `layers` and `last_layer`.
    Params:
        - x: input data
        - t: target labels
    '''
    return last_layer.forward(predict(x), t)

In [52]:
x_batch.shape, W1.shape

((3, 784), (784, 50))

In [54]:
x_batch.dot(W1).shape

(3, 50)

In [64]:
np.dot(x_batch, W1) + np.arange(50) - np.dot(x_batch, W1) # broadcasting

array([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
        13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
        39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49.],
       [ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
        13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
        39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49.],
       [ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
        13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
        39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49.]])

In [67]:
# Hand-computed forward pass: two affine transforms applied back to back.
# NOTE(review): no ReLU is applied between the two transforms here, whereas the
# `layers` pipeline built earlier has Relu1 between Affine1 and Affine2, so these
# values are not the same as predict(x_batch) — confirm whether that is intended.
x_hidden = np.dot(x_batch, W1) + b1
x_output = np.dot(x_hidden, W2) + b2
x_output, x_output.shape

(array([[-0.00630799,  0.00232234,  0.01427654,  0.00253922, -0.01640813,
         -0.00598654, -0.00541831, -0.00349949, -0.00075041, -0.00467397],
        [-0.00057521,  0.00635793,  0.00107234,  0.00448934, -0.00405301,
         -0.0075228 ,  0.00123402, -0.00730456, -0.00057363,  0.00054867],
        [ 0.00394519,  0.00324727, -0.00044033, -0.00134341, -0.00050139,
          0.00153177, -0.003671  , -0.01426108, -0.00303498,  0.00321731]]),
 (3, 10))

In [68]:
y = softmax(x_output)
y, y.shape

(array([[0.09960621, 0.10046956, 0.1016778 , 0.10049135, 0.09860523,
         0.09963823, 0.09969486, 0.09988634, 0.10016132, 0.0997691 ],
        [0.10000482, 0.10070057, 0.10016971, 0.10051258, 0.09965762,
         0.09931243, 0.10018591, 0.09933411, 0.10000497, 0.10011727],
        [0.10050764, 0.10043752, 0.10006783, 0.0999775 , 0.10006172,
         0.10026537, 0.09974506, 0.09869433, 0.09980852, 0.10043451]]),
 (3, 10))

In [71]:
y.sum(axis=1)

array([1., 1., 1.])

In [73]:
cross_entropy_error(y, t_batch)

2.303570463083997

``` python
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    # 훈련 데이터가 원-핫 벡터라면 정답 레이블의 인덱스로 반환
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
```

In [78]:
y.ndim==1, t_batch.size==y.size

(False, False)

In [80]:
batch_size = y.shape[0]
batch_size

3

In [93]:
np.arange(batch_size), t_batch

(array([0, 1, 2]), array([5, 0, 4], dtype=uint8))

In [94]:
y

array([[0.09960621, 0.10046956, 0.1016778 , 0.10049135, 0.09860523,
        0.09963823, 0.09969486, 0.09988634, 0.10016132, 0.0997691 ],
       [0.10000482, 0.10070057, 0.10016971, 0.10051258, 0.09965762,
        0.09931243, 0.10018591, 0.09933411, 0.10000497, 0.10011727],
       [0.10050764, 0.10043752, 0.10006783, 0.0999775 , 0.10006172,
        0.10026537, 0.09974506, 0.09869433, 0.09980852, 0.10043451]])

In [86]:
y[np.arange(batch_size), t_batch] # fancy index

array([0.09963823, 0.10000482, 0.10006172])

In [101]:
error = -np.sum(np.log(y[np.arange(batch_size), t_batch] + 1e-7)) / batch_size
error

2.303570463083997

In [315]:
## numerical_gradient(loss_W, params['W1'])
def numerical_gradient(f, x):
    """Central-difference estimate of the gradient of ``f`` at ``x``.

    Each element of ``x`` is nudged by +h and -h in place, ``f(x)`` is
    evaluated both times, and the element is then restored, so ``x`` holds
    its original values when the function returns.

    Returns an array shaped like ``x`` (created with ``np.zeros_like``).
    """
    h = 1e-4 # 0.0001
    grad = np.zeros_like(x)

    for idx in np.ndindex(*x.shape):
        original = x[idx]

        x[idx] = float(original) + h
        f_plus = f(x) # f(x+h)

        x[idx] = original - h
        f_minus = f(x) # f(x-h)

        grad[idx] = (f_plus - f_minus) / (2*h)
        x[idx] = original # restore the original value

    return grad

In [310]:
h = 1e-4
grad = np.zeros_like(W1)
grad.shape

(784, 50)

In [311]:
# Iterate over W1 itself: the loop below indexes W1 and `grad` (shaped like W1)
# with it.multi_index. The previous lowercase `w1` is not defined in this notebook.
it = np.nditer(W1, flags=['multi_index'], op_flags=['readwrite'])
it

<numpy.nditer at 0x2018a439c10>

In [312]:
# The argument W is ignored: loss() always runs the global `layers`.
# Perturbing W1 in place still changes the result because Affine1 was built
# from params['W1'], which is the same array object as W1 (assuming the
# layers cell was run with the current W1).
loss_W = lambda W : loss(x_batch, t_batch)

In [313]:
# Manual unrolling of numerical_gradient(loss_W, W1): nudge one element of W1
# at a time, re-evaluate the loss on both sides, and store the central difference.
while not it.finished:
    idx = it.multi_index
    tmp_val = W1[idx]
    W1[idx] = float(tmp_val) + h
    fxh1 = loss_W(W1) # f(x+h)

    W1[idx] = tmp_val - h 
    fxh2 = loss_W(W1) # f(x-h); was `w1`, a name not defined in this notebook
    grad[idx] = (fxh1 - fxh2) / (2*h)

    W1[idx] = tmp_val # restore the original value
    it.iternext()

In [314]:
grad.sum()

-0.19138482495462128

array([[-0.00341426, -0.00519704,  0.0008543 , ..., -0.00630821,
        -0.01050883,  0.0242224 ],
       [ 0.01380829,  0.00718157,  0.018027  , ...,  0.01140965,
        -0.0189462 ,  0.00074538],
       [ 0.01996407, -0.02190384, -0.01841735, ..., -0.02601784,
         0.00966829, -0.00208664],
       ...,
       [-0.0129087 , -0.00222335,  0.02354596, ...,  0.01295494,
         0.00692917,  0.00715465],
       [ 0.01750363, -0.0083064 ,  0.00989071, ...,  0.01260056,
         0.01552843, -0.00333524],
       [-0.01494274, -0.00983312, -0.01817081, ..., -0.00294322,
         0.0044816 , -0.01783037]])