# Chapter 6. 학습 관련 기술들

## 6.1 매개변수 갱신
### 6.1.2 확률적 경사 하강법(SGD)

$ W \leftarrow W - \eta \frac{\partial L}{\partial W} $

In [3]:
import numpy as np

In [4]:
class SGD:
    """Stochastic gradient descent optimizer.

    Applies the rule ``W <- W - lr * dL/dW`` to every parameter.
    """

    def __init__(self, lr=0.01):
        """Store the learning rate.

        Args:
            lr: Step size multiplied with each gradient.
        """
        self.lr = lr

    def update(self, params, grads):
        """Update every entry of ``params`` in place.

        Args:
            params: Dict mapping parameter names to values; each value is
                modified via ``-=``.
            grads: Dict with the same keys as ``params`` holding the
                gradient for each parameter.
        """
        # Iterating a dict yields its keys.
        for key in params:
            params[key] -= self.lr * grads[key]

### 6.1.4 모멘텀

$ v \leftarrow \alpha v - \eta \frac{\partial L}{\partial W}$

$ W \leftarrow W + v$

In [5]:
class Momentum:
    """SGD with momentum.

    Update rule::

        v <- momentum * v - lr * dL/dW
        W <- W + v
    """

    def __init__(self, lr=0.01, momentum=0.9):
        """Store hyperparameters.

        Args:
            lr: Step size multiplied with each gradient.
            momentum: Factor applied to the previous velocity.
        """
        self.lr = lr
        self.momentum = momentum
        # Per-parameter velocity; created lazily on the first update()
        # because the parameter shapes are not known before then.
        self.v = None

    def update(self, params, grads):
        """Update every entry of ``params`` in place.

        Args:
            params: Dict mapping parameter names to array-like values.
            grads: Dict with the same keys as ``params`` holding the
                gradient for each parameter.
        """
        if self.v is None:
            self.v = {key: np.zeros_like(val) for key, val in params.items()}

        for key in params:
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]
            

### 6.1.5 AdaGrad

$ h \leftarrow h + \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W} $  
($\odot$ 기호는 행렬의 원소별 곱셈을 의미)

$W \leftarrow W - \eta \frac{1}{\sqrt{h}} \frac{\partial L}{\partial W}$

In [7]:
class AdaGrad:
    """AdaGrad optimizer.

    Update rule::

        h <- h + dL/dW * dL/dW          (element-wise)
        W <- W - lr * dL/dW / sqrt(h)
    """

    def __init__(self, lr=0.01):
        """Store the learning rate.

        Args:
            lr: Base step size, scaled per element by ``1 / sqrt(h)``.
        """
        self.lr = lr
        # Per-parameter running sum of squared gradients; created lazily
        # on the first update() once parameter shapes are known.
        self.h = None

    def update(self, params, grads):
        """Update every entry of ``params`` in place.

        Args:
            params: Dict mapping parameter names to array-like values.
            grads: Dict with the same keys as ``params`` holding the
                gradient for each parameter.
        """
        if self.h is None:
            self.h = {key: np.zeros_like(val) for key, val in params.items()}

        for key in params:
            self.h[key] += grads[key] * grads[key]
            # 1e-7 keeps the denominator non-zero while h is still 0.
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)

### 6.1.6 Adam  
Momentum + AdaGrad

In [9]:
class Adam:
    """Adam optimizer (momentum-style first moment + AdaGrad-style scaling).

    Keeps exponential moving averages ``m`` (of the gradient) and ``v``
    (of the squared gradient) per parameter. The bias correction for both
    averages is folded into a per-step learning rate ``lr_t``.
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        """Store hyperparameters.

        Args:
            lr: Base step size.
            beta1: Decay rate of the first-moment average ``m``.
            beta2: Decay rate of the second-moment average ``v``.
        """
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0  # number of update() calls so far
        # Moment estimates; created lazily on the first update() once
        # parameter shapes are known.
        self.m = None
        self.v = None

    def update(self, params, grads):
        """Update every entry of ``params`` in place.

        Args:
            params: Dict mapping parameter names to array-like values.
            grads: Dict with the same keys as ``params`` holding the
                gradient for each parameter.
        """
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)

        self.iter += 1
        # Bias-corrected step size for this iteration.
        lr_t = (
            self.lr
            * np.sqrt(1.0 - self.beta2 ** self.iter)
            / (1.0 - self.beta1 ** self.iter)
        )

        for key in params:
            # Equivalent to m = beta1 * m + (1 - beta1) * g (same for v with g**2).
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key] ** 2 - self.v[key])

            # 1e-7 keeps the denominator non-zero.
            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)

### 6.4.3 드롭아웃

신경망 모델이 복잡해지면 가중치 감소만으로는 대응하기 어려워진다.  
이럴 때는 흔히 **드롭아웃**이라는 기법을 이용한다.  
**드롭아웃**은 뉴런을 임의로 삭제하면서 *학습*하는 방법이다.

In [10]:
class Dropout:
    """Dropout layer.

    In training mode a fresh random boolean mask zeroes out elements of the
    input; in inference mode the input is scaled by ``1 - dropout_ratio``
    instead.
    """

    def __init__(self, dropout_ratio=0.5):
        """Store the drop threshold; no mask exists until a training forward pass."""
        self.mask = None
        self.dropout_ratio = dropout_ratio

    def forward(self, x, train_flg=True):
        """Apply dropout to ``x``.

        Args:
            x: Input array; its shape determines the mask shape.
            train_flg: When true, sample and store a new mask; otherwise
                only rescale the input.

        Returns:
            ``x`` multiplied by the mask (training) or by
            ``1 - dropout_ratio`` (inference).
        """
        if not train_flg:
            return x * (1.0 - self.dropout_ratio)

        # An element is kept when its uniform sample exceeds the ratio.
        keep = np.random.rand(*x.shape) > self.dropout_ratio
        self.mask = keep
        return x * keep

    def backward(self, dout):
        """Pass the upstream gradient through the mask stored by ``forward``."""
        stored_mask = self.mask
        return dout * stored_mask

In [16]:
# Toy 2x4 input used to illustrate how a dropout mask is built.
x = np.array([[1, 2, 2, 3], [4, 5, 6, 7]])
# One uniform [0, 1) sample per element of x; an entry is True when its sample exceeds 0.5.
mask = np.random.rand(*x.shape) > 0.5
print(mask)

[[False False  True  True]
 [False  True False  True]]


In [17]:
# Element-wise product with the boolean mask: entries where mask is False become 0.
x * mask

array([[0, 0, 2, 3],
       [0, 5, 0, 7]])

In [2]:
import numpy as np

# Lay out 0..5 in rows of 3; the -1 lets NumPy infer the row count (2 here).
ex = np.arange(6).reshape(-1, 3)
print(ex)

[[0 1 2]
 [3 4 5]]


In [6]:
# Mean along axis 1, i.e. one value per row of ex.
ex.mean(axis=1)

array([1., 4.])