<a href="https://colab.research.google.com/github/hyyoka/DeepLearning-models-Implemetation-from-scratch/blob/main/Fully_Connected_Layer_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fully Connected Layer from scratch

본 포스트에서는 fc layer를 numpy만을 이용해 구현하는 것을 목적으로 한다. 목차는 다음과 같다:
- activation functions
- Parameter initialization
- Models
    - Linear forward and backward
    - Linear+activation forward and backward
    - multi-layer forward and backward
- Cross-Entropy loss


In [7]:
import numpy as np

## Activation functions

In [19]:
# activation functions forward&backward

def sigmoid(Z):
    A = 1/(1+np.exp(-Z))
    return A, Z
def sigmoid_backward(dA, Z):
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)
    #(dZ.shape == Z.shape)
    return dZ

def relu(Z):
    A = np.maximum(0,Z)
    #(A.shape == Z.shape)
    return A, Z
def relu_backward(dA, Z):
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    dZ[Z > 0] = 1
    #(dZ.shape == Z.shape)
    return dZ

## Parameter initialization

In [20]:
def initialize_parameters(layer_dims):
    parameters = {}
    L = len(layer_dims)
    for l in range(1, L):
        # W = (현재 레이어 hidden_dim, 이전 레이어 hidden_dim)
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1]) #*0.01
        # b = (현재 레이어 hidden_dim,1)
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters

## Model forward and backward


### Linear 

#### Linear Forward



In [23]:
def linear_forward(x, W, b):
    # Wx+b
    Z = W.dot(x) + b
    # 사용한 값을 cache에 저장: 역전파에서 사용
    cache = (x, W, b)
    return Z, cache

#### Linear backward

**Inputs**
- dZ: Z의 변화량, cost function 에 대한 gradient
- cache : forward 과정에서 필요한 값

__Returns__

- dX_prev : 지난 레이어의 activation 을 통과한 x의 cost function에 대한 변화량입니다.

- dW : Linear 구간의 weight의 cost function에 대한 변화량 입니다.

- db : Linear 구간의 bias의 cost function 에 대한 변화량 입니다.


$$ dW^{[l]} = \frac{\partial \mathcal{L} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} X^{[l-1] T} $$

$$ db^{[l]} = \frac{\partial \mathcal{L} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}$$

$$ dX^{[l-1]} = \frac{\partial \mathcal{L} }{\partial X^{[l-1]}} = W^{[l] T} dZ^{[l]} $$


In [30]:
def linear_backward(dZ, cache):
    X_prev, W, b = cache
    m = X_prev.shape[1]
    
    dW = np.dot(dZ,X_prev.T)/m
    db = np.sum(dZ, axis=1, keepdims=True)/m
    dX_prev = np.dot(W.T, dZ)
    
    assert (dX_prev.shape == X_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    
    return dX_prev, dW, db

### Linear-Activation 

#### Linear-activation forward

In [56]:
def linear_activation_forward(X_prev, W, b, activation):
    
    Z, linear_cache = linear_forward(X_prev, W, b)
    if activation == "relu":
        X, activation_cache = relu(Z)
    elif activation == "sigmoid":
        X, activation_cache = sigmoid(Z)
    
    assert (X.shape == (W.shape[0], X_prev.shape[1]))
    
    cache = (linear_cache, activation_cache)
    return X, cache

#### Linear-activation backward

$$g(.) = Activation function$$

$$dZ^{[l]} = dA^{[l]} * g'(Z^{[l]})$$.  

</br>

__Inputs__

- dA : 현재 layer의 gradient값

- cache : forward pass에서 계산했던 linear(Z) 부분과 activation(A) 부분의 계산값



In [57]:
def linear_activation_backward(dA, cache, activation):
    linear_cache, activation_cache = cache
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db

## N-layers 

### N-layers forward

In [82]:
def N_layers_forward(X, parameters):
    # N개의 레이어
    caches = []
    # weight와 bias가 저장 => 2로 나누어주어야한다.
    L = len(parameters) // 2
    for l in range(1, L):
        X_prev = X
        X, cache = linear_activation_forward(X_prev, parameters['W' + str(l)], parameters['b' + str(l)], activation = "relu")
        caches.append(cache)
    
    # output layer는 sigmoid를 통과하게 한다
    prediction, cache = linear_activation_forward(X, parameters['W' + str(L)], parameters['b' + str(L)], activation = "sigmoid")
    caches.append(cache)
    assert (prediction.shape == (1,X.shape[1]))
    return prediction, caches

### N-layers backward

In [99]:
def N_layers_backward(prediction, Y, caches):
    grads = {} 
    L = len(caches) 
    m = prediction.shape[1]
    # Shape는 AL과 동일해야함
    Y = Y.reshape(prediction.shape) 
    
    # Initializing the backpropagation
    dAL = - (np.divide(Y,prediction)- np.divide(1-Y, 1-prediction))
    # caches index를 잡아둡니다.
    current_cache = caches[L-1] 
    grads["dX" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation="sigmoid")
    
    for l in reversed(range(L-1)):
        # indexing입니다.
        current_cache = caches[l]
        dX_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dX"+str(l+1)], current_cache, activation="relu")
        grads["dX" + str(l)] = dX_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads

## Cross-Entropy loss

우리는 신경망을 포워드해 예측값 $\hat{y}$값을 찾을 수 있었다. 우리는 이를 실제 y 레이블과 비교하여 학습을 시켜야한다.가장 대표적인 손실함수는 cross-entropy 함수이다. 

$$-\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right))$$


__Arguments__

- prediction: $\hat{y}$ = (1, number of examples)


In [100]:
def cross_entropy_loss(prediction, Y):
    m = Y.shape[1]
    cost = (-1.0/m)*np.sum(np.multiply(Y,np.log(prediction)) + np.multiply(1-Y, np.log(1-prediction)))
    cost = np.squeeze(cost)    
    return cost

## Update Parameters

 Learning rate인 $\alpha$ 에 Gradient를 곱해서 현재의 parameter에 빼준다.

$$ W^{[l]} = W^{[l]} - \alpha \text{ } dW^{[l]} $$
$$ b^{[l]} = b^{[l]} - \alpha \text{ } db^{[l]} $$


In [101]:
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2 
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate*grads["dW"+str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate*grads["db"+str(l+1)]
    return parameters

# TEST

In [102]:
# hidden_dims 정의
layer_dims = [10,3,4,1] # 마지막은 output layer니까 1

# parameter 초기화
parameters = initialize_parameters(layer_dims)
print([p.shape for p in parameters.values()])
parameters

[(3, 10), (3, 1), (4, 3), (4, 1), (1, 4), (1, 1)]


{'W1': array([[ 0.53243065,  0.00305796, -0.69612127, -0.04844105, -0.16577115,
         -0.19976574,  0.42064148,  0.00543465,  0.01085197,  0.00489016],
        [-0.13175912, -0.4122241 , -0.12556203, -0.03547758, -0.12859427,
         -0.31126873, -0.27426719,  0.22951715, -0.73416828,  0.16474125],
        [ 0.16523694, -0.01938631,  0.00299674, -0.21583193,  0.02295272,
          0.20964523,  0.09126517,  0.74511916,  0.05919713,  0.25924268]]),
 'W2': array([[-0.49674039,  0.20180174,  0.16685849],
        [ 0.54802751, -0.07481611, -0.86328035],
        [-0.33828482,  0.00778318,  0.26607548],
        [-0.59093043,  0.5587151 , -0.24957987]]),
 'W3': array([[-0.36826909,  0.26995726, -0.12282318, -0.00718011]]),
 'b1': array([[0.],
        [0.],
        [0.]]),
 'b2': array([[0.],
        [0.],
        [0.],
        [0.]]),
 'b3': array([[0.]])}

In [103]:
# input&label
X = np.random.randn(10,4)
Y = np.array([[0, 1, 1, 0]]) # 가상 레이블

In [104]:
# model forward
prediction, caches = N_layers_forward(X, parameters)

In [105]:
# loss
loss = cross_entropy_loss(prediction, Y)
print("loss = " + str(loss))

loss = 0.7142134287396995


In [106]:
# model backward
grads = N_layers_backward(prediction, Y, caches)
grads

{'dW1': array([[-0.42985543,  0.02934384, -0.53340049, -0.18540024,  0.29553729,
          0.13436646,  0.02284437,  0.27726265,  0.45119101, -0.18286423],
        [ 0.05323166, -0.059707  , -0.06076259,  0.11169388, -0.15910129,
         -0.00974531, -0.27530891,  0.39017389,  0.20109524,  0.28961388],
        [-0.13943342, -0.01119068, -0.51052298, -0.41582312,  0.23249993,
          0.23993895, -0.46652143,  0.60654381,  1.06237585, -0.21466427]]),
 'dW2': array([[0.        , 0.10989209, 0.34788942],
        [0.        , 0.        , 0.        ],
        [0.09079876, 0.10989209, 0.53764367],
        [0.        , 0.        , 0.        ]]),
 'dW3': array([[-0.04062438,  0.        , -0.05609834,  0.        ]]),
 'dX0': array([[ 0.        ,  0.03347782,  0.69766759,  0.16523694],
        [ 0.        , -0.4316104 , -0.01632835, -0.01938631],
        [ 0.        , -0.12256529, -0.69312453,  0.00299674],
        [ 0.        , -0.25130951, -0.26427298, -0.21583193],
        [ 0.        , -0.

In [107]:
parameters = update_parameters(parameters, grads, 0.05)
parameters

{'W1': array([[ 0.55392342,  0.00159077, -0.66945124, -0.03917104, -0.18054801,
         -0.20648407,  0.41949926, -0.00842848, -0.01170758,  0.01403337],
        [-0.1344207 , -0.40923875, -0.1225239 , -0.04106228, -0.12063921,
         -0.31078146, -0.26050174,  0.21000845, -0.74422304,  0.15026055],
        [ 0.17220861, -0.01882677,  0.02852289, -0.19504078,  0.01132772,
          0.19764828,  0.11459124,  0.71479197,  0.00607834,  0.2699759 ]]),
 'W2': array([[-0.49674039,  0.19630713,  0.14946402],
        [ 0.54802751, -0.07481611, -0.86328035],
        [-0.34282476,  0.00228858,  0.2391933 ],
        [-0.59093043,  0.5587151 , -0.24957987]]),
 'W3': array([[-0.36623787,  0.26995726, -0.12001826, -0.00718011]]),
 'b1': array([[-0.0125],
        [-0.0125],
        [-0.0375]]),
 'b2': array([[-0.025 ],
        [ 0.    ],
        [-0.0375],
        [ 0.    ]]),
 'b3': array([[0.00054206]])}