### custom FFNN
#### 2) 여러개의 layer가 있는 FFNN
이제 여러개의 layer가 있는 FFNN을 고려해보자.
$$\begin{aligned}
s_1 &= x \cdot w_1 + b_1,  &(b, h_1^1)\\
h_1 &= f_1(s_1), &(b, h_1^1) \\
s_2 &= h_1 \cdot w_2 + b_2,  &(b, h_2^1) \\
h_2 &= f_2(s_2),  &(b, h_2^1) \\
s_3 &= h_2 \cdot w_3 + b_3,  &(b, h_3^1 = 1) \\
\hat y &= f(s_3),  &(b, 1) \\
e &= \sum_i (\hat y - y)^2, &(1, ) \\
& \\
\cfrac {\partial e}{\partial w_3} &= \cfrac{\partial e}{\partial \hat y} \cfrac{\partial \hat y}{\partial s_3} \cfrac{\partial s_3}{\partial w_3}, &(h_2^1, h_3^1=1) \\
&= h_2^T \cdot \left[ 2(\hat y - y) \times f^{'}(s_3) \right]\\
(h_2^1, h_3^1=1) &= (b, h_2^1)^T \cdot [(b, 1) \times (b, 1)]\\
& \\
\cfrac {\partial e}{\partial h_2} &= \cfrac{\partial e}{\partial \hat y} \cfrac{\partial \hat y}{\partial s_3} \cfrac{\partial s_3}{\partial h_2}, &(b, h_2^1) \\
&= \left[ 2(\hat y - y) \times f^{'}(s_3) \right] \cdot w_3 \\
(b, h_2^1) &= [(b, 1) \times (b, 1)] \cdot (h_3^1=1, h_2^1)\\
& \\
\cfrac {\partial e}{\partial w_2} &= \cfrac {\partial e}{\partial h_2} \cfrac{\partial h_2}{\partial s_2} \cfrac{\partial s_2}{\partial w_2}, &(h_1^1, h_2^1) \\
&= h_1^T \cdot \left[ \cfrac {\partial e}{\partial h_2} \times f_2^{'}(s_2) \right]\\
(h_1^1, h_2^1) &= (b, h_1^1)^T \cdot [(b, h_2^1) \times (b, h_2^1)]\\
& \\
\text{for b, }& \text{We assume } b \cdot x_0, \text{and always } x_0 = 1\\
\cfrac {\partial e}{\partial b_2} &= \cfrac {\partial e}{\partial h_2} \cfrac{\partial h_2}{\partial s_2} \cfrac{\partial s_2}{\partial b_2}, &(h_2^1,) \\
&= \sum_b \left[ \cfrac {\partial e}{\partial h_2} \times f_2^{'}(s_2) \right]\\
(h_2^1,) &= (b, 1)^T \cdot [(b, h_2^1) \times (b, h_2^1)]
\end{aligned}$$

따라서 일반화를 하면:
$$\begin{aligned}
\cfrac {\partial e}{\partial w_i} &= \cfrac {\partial e}{\partial h_i} \cfrac{\partial h_i}{\partial s_i} \cfrac{\partial s_i}{\partial w_i}, &(h_{i-1}^1, h_i^1) \\
&= h_{i-1}^T \cdot \left[ \cfrac {\partial e}{\partial h_i} \times f_i^{'}(s_i) \right]\\
(h_{i-1}^1, h_i^1) &= (b, h_{i-1}^1)^T \cdot [(b, h_i^1) \times (b, h_i^1)]\\
& \\
\cfrac {\partial e}{\partial h_i} &= \cfrac{\partial e}{\partial h_{i+1}} \cfrac{\partial h_{i+1}}{\partial s_{i+1}} \cfrac{\partial s_{i+1}}{\partial h_i}, &(b, h_2^1) \\
&= \left[ \cfrac{\partial e}{\partial h_{i+1}}  \times f^{'}(s_{i+1}) \right] \cdot w^T_{i+1} \\
(b, h_i^1) &= [(b, h_{i+1}^1) \times (b, h_{i+1}^1)] \cdot (h_{i+1}^1, h_i^1)\\
& \\
\cfrac {\partial e}{\partial b_i} &= \cfrac {\partial e}{\partial h_i} \cfrac{\partial h_i}{\partial s_i} \cfrac{\partial s_i}{\partial b_i}, &(h_i^1, ) \\
&= \sum_b \left[ \cfrac {\partial e}{\partial h_i} \times f_i^{'}(s_i) \right]\\
(h_i^1,) &= (b, 1)^T \cdot [(b, h_i^1) \times (b, h_i^1)]
\end{aligned}$$
- $b_i$를 구할 때, np.sum(arr, axis=-1)을 사용하면 된다.

이를 위해 propagate_forward(self, x)함수를 구해보자.

In [1]:
import numpy as np
import pandas as pd
from numpy.random import shuffle, rand
%matplotlib inline

In [2]:
x_train = np.random.rand(1024,2)
y_train = np.array( [ [3 * x[0] - 1.2 * x[1] + .5] for x in x_train ] )

x_val = np.random.rand(32,2)
y_val = np.array( [ [3 * x[0] - 1.2 * x[1] + .5] for x in x_val ] )

x_test = np.array( [ [0.2, 0.1], [0.3, 0.1], [0.4, 0.1], [0.5, 0.1] ] )
y_test = np.array( [ [3 * x[0] - 1.2 * x[1] + .5] for x in x_test ])

이를 위해 multi layer를 구현할 수 있도록 다음과 같은 절차에 따라 작성한다.

1. layer를 추가할 수 있는 add 함수를 구현한다.
 - 인자로, input의 size와 activation 

In [83]:
class FFNN:
    
    def __init__(self, lr=0.01):
        self.lr = lr # learning rate
        # weights list
        self.ws = []; self.bs = []
        self.fs = []; self.f_derivs = []
        self.N_layers = 0
        
    # layer를 추가하며, weights를 초기화하기
    def add(self, units, activation=None, activation_deriv=None, input_dim=None):
        self.ws.append(np.random.rand(input_dim * units).reshape(input_dim, units))
        self.bs.append(np.zeros(units))
        self.fs.append(activation)
        self.f_derivs.append(activation_deriv)
        self.N_layers += 1
        
    # feed forwarding
    def propagate_forward(self, x):
        s = [x]; o = [x] # for l = 0
        for idx in range(self.N_layers):
            s_input = np.dot(x, self.ws[idx]) + self.bs[idx]
            y = self.fs[idx](s_input)
            x = y
            o.append(y)
            s.append(s_input)
        return s, o

    # predicting
    def predict(self, x):
        s, o = self.propagate_forward(x)
        return o[-1] # output from the last layer

    # train for one batch. x 자체가 batch
    def train_on_batch(self, x_batch, y_batch, istrain=True):
        s, o = self.propagate_forward(x_batch)
        Y = o[-1]
        N = x_batch.shape[0]
        loss = np.sum((Y - y_batch))
        if istrain:
            qq = (2 * (Y - y_batch.reshape(-1, 1)) * self.f_derivs[3](s[3]) )
            dw3 = np.dot(o[2].T, qq)
            db3 = np.sum(Y - y_batch)
            
#             dw2 = 
            print(dw3)    
        return loss

    # epochs에 대해 batch 별로 학습하기
    def fit(self, x, y, batch_size, epochs, validation_data):
        Losses = [] # validation loss after each epoch

        for epoch in range(epochs):
            Loss = 0
            N = x.shape[0]
            for j in range(0, N, batch_size):
                x_batch = x[j:j+batch_size]
                y_batch = y[j:j+batch_size]
                n = x_batch.shape[0]
                Loss += (self.train_on_batch(x_batch, y_batch) / n)
        
            print("Train Loss at Epoch %d is %.8f" %(epoch, Loss))
            Losses["train_loss"].append(Loss)                

        return Losses

다양한 activation 함수 구현:
$$\begin{aligned}
\text{Linear}(x) &= x\\
\cfrac {\partial \text{Linear}(x)}{\partial x} &= 1\\
\text{Sigmoid}(x) &= \sigma (x) = \cfrac {1}{1-\exp(-x)}\\
\cfrac {\partial \text{Sigmoid}(x)}{\partial x} &= \sigma(x) * (1-\sigma(x))\\
\text{tahn}(x) &= \cfrac {\exp(x) - \exp(-x)}{\exp(x) + \exp(-x)}\\
\cfrac {\partial \text{tahn}(x)}{\partial x} &= 1-\text{tahn}^2(x)
\end{aligned}$$

In [84]:
def linear(x): # linear y = x
    return x

def linear_deriv(x): # derivative of y = x
    return 1

def sigmoid(x): # sigmoid
    return 1. / (1 + np.exp(-x))

def sigmoid_deriv(x): # derivative of sigmoid
    return sigmoid(x) * (1 - sigmoid(x))

def tanh(x): # hyperbolicv tangent
    return np.tanh(x)

def tanh_deriv(x): # derivative of hyperbolic tangent
    return 1 - np.tanh(x) ** 2

2 > 3 > 3 > 1 layer를 갖는 FFNN의 인스턴스를 생성한다.
- 출력 2개의 선형 활성함수를 갖는 첫 hidden layer 추가
- 출력 3개의 sigmoid 활성함수를 갖는 둘째 hidden layer 추가
- 출력 3개의 tanh 활성함수를 갖는 셋째 hidden layer 추가
- 출력 1개의 선형 활성함수를 갖는 출력 layer 추가

In [85]:
fnn = FFNN()
fnn.add(2, activation=linear, activation_deriv=linear_deriv, input_dim=2)
fnn.add(3, activation=sigmoid, activation_deriv=sigmoid_deriv, input_dim=2)
fnn.add(3, activation=tanh, activation_deriv=tanh_deriv, input_dim=3)
fnn.add(1, activation=linear, activation_deriv=linear_deriv, input_dim=3)

In [86]:
Losses = fnn.fit(x_train, y_train, 20, 1000, (x_val, y_val))

[[16.45835394]
 [17.51188543]
 [17.41021938]]
[[15.85710376]
 [16.90036225]
 [16.7525848 ]]
[[19.28909527]
 [20.56067273]
 [20.41877638]]
[[32.96269037]
 [35.13226913]
 [34.8626111 ]]
[[26.17110277]
 [27.8749166 ]
 [27.65077779]]
[[15.10910823]
 [16.11781614]
 [15.98723928]]
[[22.0137833 ]
 [23.45955011]
 [23.30636787]]
[[27.53213592]
 [29.36722545]
 [29.10605003]]
[[21.13821865]
 [22.54672877]
 [22.36208499]]
[[4.9693352 ]
 [5.33095889]
 [5.22371485]]
[[17.54747917]
 [18.69266665]
 [18.56497352]]
[[16.25982829]
 [17.32427045]
 [17.19323546]]
[[27.0808948 ]
 [28.88973563]
 [28.61462813]]
[[24.68507998]
 [26.29185526]
 [26.07176781]]
[[26.09105952]
 [27.79690636]
 [27.52224102]]
[[20.54932637]
 [21.90365567]
 [21.75821435]]
[[29.60136049]
 [31.51425353]
 [31.29902563]]
[[22.71426805]
 [24.21809355]
 [23.99293241]]
[[17.36048358]
 [18.53581921]
 [18.33080322]]
[[38.28658272]
 [40.8161044 ]
 [40.46844308]]
[[30.20307218]
 [32.19669989]
 [31.91775086]]
[[23.04348401]
 [24.61357374]
 [24.37

NameError: name 'i' is not defined

In [32]:
y_pred = fnn.predict(x_test)
print('y_pred'); print(y_pred)
print('y_test'); print(y_test)

y_pred
[[0.97862965]
 [0.9794233 ]
 [0.98018522]
 [0.9809166 ]]
y_test
[0.98 1.28 1.58 1.88]


losses의 결과를 이전과 같이 시각화하라.

(1024,)
