In [None]:
# implemented and written by Yeoreum Lee in AI HnV Lab @ Sahmyook University in 2023
__author__ = 'leeyeoreum02'

In [2]:
from typing import Tuple, Callable

import numpy as np

In [3]:
# Toy binary-classification set: nine samples with two features each.
x_data = np.array([
    [2, 4],
    [4, 11],
    [6, 6],
    [8, 5],
    [10, 7],
    [12, 16],
    [14, 8],
    [16, 3],
    [18, 7],
])
# Labels as a column vector: the first four samples are class 0, the last five class 1.
t_data = np.array([0] * 4 + [1] * 5).reshape(9, 1)

print(x_data.shape, t_data.shape)

(9, 2) (9, 1)


### 1. 데이터 나누기(split)

In [4]:
def split_data(
    x_data: np.ndarray,
    t_data: np.ndarray,
    split_rate: float,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Split samples and labels into a training part and a test part.

    The first ``int(split_rate * len(x_data))`` samples form the test set and
    the remaining samples form the training set. The data is not shuffled.

    Args:
        x_data: Input samples, indexed along the first axis.
        t_data: Labels aligned with ``x_data`` along the first axis.
        split_rate: Fraction of the samples, taken from the front, that is
            held out for testing.

    Returns:
        ``(train_x_data, train_t_data, test_x_data, test_t_data)``.

    Raises:
        ValueError: If ``x_data`` and ``t_data`` do not have the same length.
    """
    if len(x_data) != len(t_data):
        raise ValueError(
            f'x_data and t_data must have the same length, got {len(x_data)} and {len(t_data)}'
        )

    # Compute the boundary once so inputs and labels are always cut at the same row.
    num_test = int(split_rate * len(x_data))

    test_x_data, train_x_data = x_data[:num_test], x_data[num_test:]
    test_t_data, train_t_data = t_data[:num_test], t_data[num_test:]

    return train_x_data, train_t_data, test_x_data, test_t_data


# Hold out the leading 20% of the samples for testing and keep the rest for training.
train_x_data, train_t_data, test_x_data, test_t_data = split_data(
    x_data,
    t_data,
    split_rate=0.2,
)
print(*(part.shape for part in (train_x_data, train_t_data, test_x_data, test_t_data)))

(8, 2) (8, 1) (1, 2) (1, 1)


### 2. 활성 함수(activation function)

$$sigmoid(\boldsymbol{x}) = \frac {1} {1 + e^{-\boldsymbol{x}}}$$

In [5]:
def sigmoid(x: np.ndarray) -> np.ndarray:
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    The value is computed from ``exp(-|x|)`` so that large-magnitude inputs
    cannot overflow the exponential.

    Args:
        x: Input values.

    Returns:
        The sigmoid of every element of ``x``, with the same shape as ``x``.
    """
    # exp(-|x|) lies in (0, 1], so it never overflows.
    exp_neg_abs = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x)).  x < 0: exp(x) / (1 + exp(x)), the same value rewritten.
    numerator = np.where(x >= 0, 1.0, exp_neg_abs)
    return numerator / (1.0 + exp_neg_abs)

### 3. 원핫 인코딩(One-hot Encoding)

In [6]:
def _smoothed_onehot(labels: np.ndarray, num_classes: int) -> np.ndarray:
    """Encode integer class labels as smoothed one-hot rows (0.99 on the label, 0.01 elsewhere)."""
    num_samples = labels.shape[0]
    encoded = np.full((num_samples, num_classes), 0.01, dtype=np.float32)
    if num_samples:
        row_index = np.arange(num_samples).reshape(-1, 1)
        # reshape(num_samples, -1) accepts both (N,) and (N, 1) label layouts.
        encoded[row_index, labels.reshape(num_samples, -1)] = 0.99
    return encoded


def onehot_encoding(
    train_t_data: np.ndarray,
    test_t_data: np.ndarray,
    num_classes: int = 2,
) -> Tuple[np.ndarray, np.ndarray]:
    """One-hot encode the training and test labels with label smoothing.

    Each output row holds 0.99 in the column of the sample's class and 0.01
    in every other column, stored as ``float32``.

    Args:
        train_t_data: Integer class labels of the training samples, one per row.
        test_t_data: Integer class labels of the test samples, one per row.
        num_classes: Number of columns in the encoded output.

    Returns:
        ``(train_t_data_onehot, test_t_data_onehot)``, each with one row per
        label and ``num_classes`` columns.
    """
    return (
        _smoothed_onehot(train_t_data, num_classes),
        _smoothed_onehot(test_t_data, num_classes),
    )


# Encode both label sets, then preview the first three training rows and the resulting shapes.
train_t_data_onehot, test_t_data_onehot = onehot_encoding(
    train_t_data,
    test_t_data,
)
print(train_t_data_onehot[:3])
print(*(encoded.shape for encoded in (train_t_data_onehot, test_t_data_onehot)))

[[0.99 0.01]
 [0.99 0.01]
 [0.99 0.01]]
(8, 2) (1, 2)


### 4. 신경망(neural network) 모델

$$f(W^{(2)}, b^{(2)})(\boldsymbol{x}) = sigmoid(W^{(2)}\boldsymbol{x} + b^{(2)})$$
$$g(W^{(3)}, b^{(3)})(\boldsymbol{x}) = \sigma(W^{(3)}\boldsymbol{x} + b^{(3)})$$
$$
\begin{matrix}
y &=& h(W^{(2)}, b^{(2)}, W^{(3)}, b^{(3)})(\boldsymbol{x}) \\
  &=& (g(W^{(3)}, b^{(3)}) \circ f(W^{(2)}, b^{(2)}))(\boldsymbol{x}) \\
  &=& g(W^{(3)}, b^{(3)})(f(W^{(2)}, b^{(2)})(\boldsymbol{x})) \\
  &=& \sigma(W^{(3)}sigmoid(W^{(2)}\boldsymbol{x} + b^{(2)}) + b^{(3)})
\end{matrix}
$$

In [15]:
class NeuralNetwork:
    """Fully connected network with one hidden layer and sigmoid activations.

    The forward pass computes ``sigmoid(sigmoid(x @ W2 + b2) @ W3 + b3)``.
    All parameters are initialised from a standard normal distribution.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, input_size: int = 2, hidden_size: int = 2, output_size: int = 2) -> None:
        """Create randomly initialised parameters.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output units.
        """
        # Weights are stored as (in_features, out_features) because forward uses x @ W.
        self.W2 = np.random.randn(input_size, hidden_size)
        self.b2 = np.random.randn(hidden_size)
        self.W3 = np.random.randn(hidden_size, output_size)
        self.b3 = np.random.randn(output_size)
        # Hidden activation of the most recent forward pass; None until forward is called.
        self.a2 = None

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Run the forward pass and cache the hidden activation in ``self.a2``.

        Args:
            x: Input whose last axis has ``input_size`` entries.

        Returns:
            The output-layer sigmoid activation.
        """
        a1 = x
        z2 = a1 @ self.W2 + self.b2
        self.a2 = sigmoid(z2)
        z3 = self.a2 @ self.W3 + self.b3
        return sigmoid(z3)

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Alias for :meth:`forward` so the model can be called like a function."""
        return self.forward(x)
    
    
# Network instance whose parameters are updated by `train` (backpropagation).
model_back = NeuralNetwork()

### 5. 오차 함수 (error function, loss function)

- N은 데이터 개수 (행 개수)
- $y$는 정답(label), $\hat{y}$은 예측값(prediction)

$$MSE = \frac{1} {N}\sum_{i=1} ^N (\boldsymbol{y_{i}} - \boldsymbol{\hat{y_{i}}})^2$$

In [16]:
def mean_square_error(y_data: np.ndarray, t_data: np.ndarray) -> np.ndarray:
    """Return the summed squared error divided by ``len(y_data)``.

    Args:
        y_data: Predictions.
        t_data: Targets, broadcastable against ``y_data``.

    Returns:
        ``sum((t_data - y_data) ** 2) / len(y_data)``. The divisor is the size
        of the first axis of ``y_data``, so for a single 1-D prediction it is
        the number of outputs.
    """
    residual = t_data - y_data
    squared_total = np.sum(residual ** 2)
    return squared_total / len(y_data)

### 6. 모델 학습 (train)

메인 교재와 서브 강의만을 이용하여 오차역전파를 계산하고 학습 코드를 구현하시오. (구글링 금지)

0. 배치 사이즈는 1임
1. 모델 순전파 (forward)
2. 오차 계산 (loss)
3. 모델 파라미터(가중치 + 편향) 별 오차 함수의 오차역전파 계산 (backpropagation)
4. 가중치(weight), 편향(bias) 갱신 (경사 하강법, gradient descent)

In [17]:
def train(lr: float, epochs: int = 1000, log_interval: int = 10) -> None:
    """Train the module-level ``model_back`` with backpropagation, one sample at a time.

    For every sample the function runs a forward pass, computes the loss with
    ``mean_square_error``, derives the gradient of that loss with respect to
    every parameter by the chain rule, and applies one gradient-descent step.
    All gradients are computed before any parameter is modified.

    Reads the module-level ``train_x_data`` and ``train_t_data_onehot`` and
    updates ``model_back.W2``, ``b2``, ``W3`` and ``b3`` in place.

    Args:
        lr: Learning rate of the gradient-descent step.
        epochs: Number of passes over the training samples.
        log_interval: The per-sample loss is printed on every epoch that is a
            multiple of this value; ``0`` disables printing.
    """
    for epoch in range(epochs):
        # Batch size is 1: each x_batch is a single 1-D sample.
        for x_batch, t_batch in zip(train_x_data, train_t_data_onehot):
            y_data = model_back(x_batch)
            loss = mean_square_error(y_data, t_batch)
            a2 = model_back.a2  # hidden activation cached by the forward pass

            # backpropagation
            # E = sum((t - y)^2) / len(y)  =>  dE/dy = 2 (y - t) / len(y)
            round_E_round_y = 2 * (y_data - t_batch) / len(y_data)
            # Output layer: sigmoid'(z) = a (1 - a), so dE/dz3 = dE/dy * y (1 - y).
            delta3 = round_E_round_y * y_data * (1 - y_data)
            round_E_round_b3 = delta3
            round_E_round_W3 = np.outer(a2, delta3)

            # Hidden layer: push delta3 back through W3 while W3 still holds the
            # values used in the forward pass.
            delta2 = (delta3 @ model_back.W3.T) * a2 * (1 - a2)
            round_E_round_b2 = delta2
            round_E_round_W2 = np.outer(x_batch, delta2)

            # gradient descent
            model_back.W3 -= lr * round_E_round_W3
            model_back.b3 -= lr * round_E_round_b3
            model_back.W2 -= lr * round_E_round_W2
            model_back.b2 -= lr * round_E_round_b2

            if log_interval and epoch % log_interval == 0:
                print(f'Epoch: {epoch}, loss {loss}')


# Run the backpropagation training loop with a learning rate of 1e-3.
train(lr=1e-3)

Epoch: 0, loss 0.11319923179622736
Epoch: 0, loss 0.14155131209121938
Epoch: 0, loss 0.14879140598903834
Epoch: 0, loss 0.39372061920028345
Epoch: 0, loss 0.3996936566256902
Epoch: 0, loss 0.3970173653640901
Epoch: 0, loss 0.3335363151926713
Epoch: 0, loss 0.39492746825568825
Epoch: 10, loss 0.11371131907888737
Epoch: 10, loss 0.14199742766884244
Epoch: 10, loss 0.14932905189964743
Epoch: 10, loss 0.39238601860159894
Epoch: 10, loss 0.3984288041358447
Epoch: 10, loss 0.39571338033261505
Epoch: 10, loss 0.33160561235205277
Epoch: 10, loss 0.39358534299477876
Epoch: 20, loss 0.11423521494412651
Epoch: 20, loss 0.1424543816155216
Epoch: 20, loss 0.14987972427291052
Epoch: 20, loss 0.39104817168559347
Epoch: 20, loss 0.39716259952396726
Epoch: 20, loss 0.3944068083064324
Epoch: 20, loss 0.32965847178665425
Epoch: 20, loss 0.392239262587663
Epoch: 30, loss 0.1147708543445799
Epoch: 30, loss 0.14292221178622336
Epoch: 30, loss 0.1504435375419785
Epoch: 30, loss 0.38970725287320407
Epoch: 30,

### 7. 학습 속도 비교

오차역전파를 가중치 갱신에 사용한 문제 6과 수치미분을 가중치 갱신에 사용한 문제 7 간의 속도를 비교해보시오.

In [19]:
def numerical_derivative(f: Callable, x: np.ndarray) -> np.ndarray:
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Each element of ``x`` is perturbed in place by ``+h`` and ``-h``
    (``h = 1e-4``), ``f(x)`` is evaluated at both points, and the element is
    restored afterwards, so ``x`` holds its original values on return.

    Because the perturbation is written into ``x`` and the result array takes
    the dtype of ``x``, ``x`` should be a floating-point array.

    Args:
        f: Function called as ``f(x)`` that returns a scalar.
        x: Point at which the gradient is estimated. Temporarily modified.

    Returns:
        An array shaped like ``x`` holding ``(f(x + h) - f(x - h)) / (2 h)``
        for every element. An empty ``x`` yields an empty gradient.
    """
    h = 1e-4
    grad = np.zeros_like(x)

    # np.ndindex visits every multi-index of x, for any number of dimensions.
    for idx in np.ndindex(x.shape):
        original = x[idx]

        x[idx] = float(original) + h
        fx_plus = f(x)

        x[idx] = original - h
        fx_minus = f(x)

        grad[idx] = (fx_plus - fx_minus) / (2 * h)

        # Restore the element before moving on to the next one.
        x[idx] = original

    return grad


def train_numerical_derivative(lr: float, epochs: int = 1000, log_interval: int = 10) -> None:
    """Train the module-level ``model_numer`` with numerically estimated gradients.

    For every sample the loss gradient with respect to each parameter is
    estimated with ``numerical_derivative``. All four gradients are taken at
    the same parameter values, and only then is the gradient-descent step
    applied.

    Reads the module-level ``train_x_data`` and ``train_t_data_onehot`` and
    updates ``model_numer.W2``, ``b2``, ``W3`` and ``b3`` in place.

    Args:
        lr: Learning rate of the gradient-descent step.
        epochs: Number of passes over the training samples.
        log_interval: The per-sample loss is printed on every epoch that is a
            multiple of this value; ``0`` disables printing.
    """
    for epoch in range(epochs):
        for x_batch, t_batch in zip(train_x_data, train_t_data_onehot):
            y_data = model_numer(x_batch)
            loss = mean_square_error(y_data, t_batch)

            def predict(W2, b2, W3, b3):
                # y = h(W2, b2, W3, b3)(x) = sigmoid(sigmoid(x W2 + b2) W3 + b3)
                hidden = sigmoid(x_batch @ W2 + b2)
                return sigmoid(hidden @ W3 + b3)

            def error_with(**override):
                # Loss with the model's current parameters, except those given in `override`.
                params = dict(W2=model_numer.W2, b2=model_numer.b2, W3=model_numer.W3, b3=model_numer.b3)
                params.update(override)
                return mean_square_error(predict(**params), t_batch)

            # Estimate every gradient before touching any parameter.
            grad_W2 = numerical_derivative(lambda W2: error_with(W2=W2), model_numer.W2)
            grad_b2 = numerical_derivative(lambda b2: error_with(b2=b2), model_numer.b2)
            grad_W3 = numerical_derivative(lambda W3: error_with(W3=W3), model_numer.W3)
            grad_b3 = numerical_derivative(lambda b3: error_with(b3=b3), model_numer.b3)

            # gradient descent
            model_numer.W2 -= lr * grad_W2
            model_numer.b2 -= lr * grad_b2
            model_numer.W3 -= lr * grad_W3
            model_numer.b3 -= lr * grad_b3

            if log_interval and epoch % log_interval == 0:
                print(f'Epoch: {epoch}, loss {loss}')


# Separate, freshly initialised network for the numerical-differentiation run.
model_numer = NeuralNetwork()
# Train it with numerically estimated gradients at a learning rate of 1e-3.
train_numerical_derivative(lr=1e-3)

Epoch: 0, loss 0.3249954391495161
Epoch: 0, loss 0.3208688095209839
Epoch: 0, loss 0.31797531572201915
Epoch: 0, loss 0.20321007933273952
Epoch: 0, loss 0.20003363999285848
Epoch: 0, loss 0.2021358207198346
Epoch: 0, loss 0.21911266216756606
Epoch: 0, loss 0.20373353819646461
Epoch: 10, loss 0.3256108778297751
Epoch: 10, loss 0.3215180902193899
Epoch: 10, loss 0.31867004807073546
Epoch: 10, loss 0.20182796591046936
Epoch: 10, loss 0.1987203804711819
Epoch: 10, loss 0.20075185867363332
Epoch: 10, loss 0.21734963907762572
Epoch: 10, loss 0.20226293793226569
Epoch: 20, loss 0.32622719737129924
Epoch: 20, loss 0.32216671576965983
Epoch: 20, loss 0.3193624531305483
Epoch: 20, loss 0.20046963596265716
Epoch: 20, loss 0.19742817566107906
Epoch: 20, loss 0.19939257410219693
Epoch: 20, loss 0.2156192007150933
Epoch: 20, loss 0.20082236440899173
Epoch: 30, loss 0.3268439758023222
Epoch: 30, loss 0.3228143333765377
Epoch: 30, loss 0.3200522426322927
Epoch: 30, loss 0.1991350205519084
Epoch: 30, l

### 8. 모델 추론 (evaluate)

In [20]:
def test(model):
    """Print the model's output for the held-out inputs next to the held-out labels.

    Reads the module-level ``test_x_data`` and ``test_t_data``.

    Args:
        model: Callable that maps ``test_x_data`` to predictions.
    """
    print(model(test_x_data), test_t_data)
    
    
# Print held-out predictions for the backpropagation model, then for the numerical-derivative model.
test(model_back)
test(model_numer)

[[0.43552819 0.54511349]] [[0]]
[[0.43774026 0.64535815]] [[0]]
