# Building Deep Neural Networks


## 1 - Packages

In [1]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
from testCases_v2 import *
from dnn_utils_v2 import sigmoid, sigmoid_backward, relu, relu_backward

# Render matplotlib figures inside the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
# Image display defaults: nearest-neighbour interpolation and a grayscale colormap.
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Load IPython's autoreload extension in mode 2, so edited helper modules
# are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

# Seed NumPy's global RNG so the random initialisations below are reproducible.
np.random.seed(1)

## 2 - Outline

- Initialize the parameters for a two-layer network and for an $L$-layer neural network.
- Implement the forward propagation module (shown in purple in the figure below).
     - Complete the LINEAR part of a layer's forward propagation step (resulting in $Z^{[l]}$).
     - We give you the ACTIVATION function (relu/sigmoid).
     - Combine the previous two steps into a new [LINEAR->ACTIVATION] forward function.
     - Stack the [LINEAR->RELU] forward function L-1 times (for layers 1 through L-1) and add a [LINEAR->SIGMOID] at the end (for the final layer $L$). This gives you a new L_model_forward function.
- Compute the loss.
- Implement the backward propagation module (denoted in red in the figure below).
    - Complete the LINEAR part of a layer's backward propagation step.
    - We give you the gradient of the ACTIVATION function (relu_backward/sigmoid_backward).
    - Combine the previous two steps into a new [LINEAR->ACTIVATION] backward function.
    - Stack [LINEAR->RELU] backward L-1 times and add [LINEAR->SIGMOID] backward in a new L_model_backward function
- Finally update the parameters.

<center><img src="images/final outline.png" style="width:800px;height:500px;"></center>
<caption><center> <b>Figure 1</b> </center></caption><br>

## 3 - Initialization

#### 3.1 2-layer NN

 - model structure is Linear -> RELU -> Linear -> Sigmoid
 - weights are randomly initialized
 - bias is 0 initially


In [2]:
def init_params(n_x, n_h, n_y):
    """
    Create the starting parameters of a 2-layer network
    (Linear -> RELU -> Linear -> Sigmoid).

    args:
     n_x -> number of input units
     n_h -> number of hidden units
     n_y -> number of output units
    Returns:
     params -> python dict with keys "W1", "b1", "W2", "b2":
      W1 -> (n_h, n_x) weights, standard-normal samples scaled by 0.01
      b1 -> (n_h, 1) bias, all zeros
      W2 -> (n_y, n_h) weights, standard-normal samples scaled by 0.01
      b2 -> (n_y, 1) bias, all zeros
    """
    scale = 0.01

    # W1 is sampled before W2 so that a seeded global RNG always produces
    # the same pair of matrices.
    hidden_weights = scale * np.random.randn(n_h, n_x)
    output_weights = scale * np.random.randn(n_y, n_h)

    return {
        "W1": hidden_weights,
        "b1": np.zeros((n_h, 1)),
        "W2": output_weights,
        "b2": np.zeros((n_y, 1)),
    }


In [3]:
parameters = init_params(2, 2, 1)
# Print every parameter in layer order.
for key in ("W1", "b1", "W2", "b2"):
    print(key + " = " + str(parameters[key]))

W1 = [[ 0.01624345 -0.00611756]
 [-0.00528172 -0.01072969]]
b1 = [[0.]
 [0.]]
W2 = [[ 0.00865408 -0.02301539]]
b2 = [[0.]]


### 3.2 Initialize params for L-layer network

In [4]:
def init_params_deep(layer_dims):
    """
    Create the starting parameters of an L-layer network.

    Args:
     layer_dims -> sequence with the number of units of every layer,
                   input layer first
    return:
     params -> dict holding, for l = 1 .. len(layer_dims) - 1:
      Wl -> weights of the l-th layer, shape (layer_dims[l], layer_dims[l-1]),
            standard-normal samples scaled by 0.01
      bl -> bias of the l-th layer, shape (layer_dims[l], 1), all zeros

    Note: the global NumPy RNG is re-seeded with 3 on every call, so the
    same layer_dims always yields the same parameters.
    """
    np.random.seed(3)
    params = {}

    # Pair each layer's size with the size of the layer feeding into it.
    size_pairs = zip(layer_dims[1:], layer_dims[:-1])
    for idx, (n_curr, n_prev) in enumerate(size_pairs, start=1):
        params["W{}".format(idx)] = 0.01 * np.random.randn(n_curr, n_prev)
        params["b{}".format(idx)] = np.zeros((n_curr, 1))

    return params


In [5]:
parameters = init_params_deep([5, 4, 3])
# Print every parameter in layer order.
for key in ("W1", "b1", "W2", "b2"):
    print(key + " = " + str(parameters[key]))

W1 = [[ 0.01788628  0.0043651   0.00096497 -0.01863493 -0.00277388]
 [-0.00354759 -0.00082741 -0.00627001 -0.00043818 -0.00477218]
 [-0.01313865  0.00884622  0.00881318  0.01709573  0.00050034]
 [-0.00404677 -0.0054536  -0.01546477  0.00982367 -0.01101068]]
b1 = [[0.]
 [0.]
 [0.]
 [0.]]
W2 = [[-0.01185047 -0.0020565   0.01486148  0.00236716]
 [-0.01023785 -0.00712993  0.00625245 -0.00160513]
 [-0.00768836 -0.00230031  0.00745056  0.01976111]]
b2 = [[0.]
 [0.]
 [0.]]


## 4 - Forward Propagation Module

The function order is as follows:
- linear
- linear -> Activation
- ( linear -> ReLU ) $*$ (L - 1) -> Linear -> sigmoid 


### 4.1 - Linear Forward

building linear part of the forward propagation which is as follows
$$Z^{[l]} = W^{[l]}A^{[l-1]} +b^{[l]}\tag{1}$$


In [6]:
def linear_forward(A, W, b):
    """
    Linear part of one layer's forward step: Z = W.A + b.

    Args:
     A -> activations of the previous layer, shape (size of previous layer, number of examples)
     W -> weights of the current layer, shape (size of current layer, size of previous layer)
     b -> bias vector, shape (size of current layer, 1)
    Return:
     Z -> pre-activation values, shape (size of current layer, number of examples)
     cache -> tuple (A, W, b), kept for the backward pass
    """
    pre_activation = np.dot(W, A) + b
    return pre_activation, (A, W, b)

In [7]:
# Fetch the sample inputs supplied by the test-case helper.
A, W, b = linear_forward_test_case()

# Run the linear step and print the resulting pre-activation values.
Z, linear_cache = linear_forward(A, W, b)
print("Z = " + str(Z))

Z = [[ 3.26295337 -1.23429987]]


### 4.2 Linear Activation forward
- sigmoid
- ReLU

In [8]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Forward step of one LINEAR -> ACTIVATION layer.

    Args:
     A_prev -> activations from previous layer: (size of prev. layer, number of examples)
     W -> weights of current layer, array of shape(size of current layer, size of previous layer)
     b -> bias vector, array of shape(size of current layer, 1)
     activation -> "sigmoid" or "relu" (case-insensitive)
    Returns:
     A -> output of the activation function, shape (W.shape[0], A_prev.shape[1])
     cache -> tuple (linear_cache, activation_cache): the cache returned by
              linear_forward and the second value returned by the activation helper
    Raises:
     ValueError -> if activation is neither "sigmoid" nor "relu"
    """
    kind = activation.lower()
    # Reject unknown names up front; otherwise A would never be assigned and
    # the failure would surface later as a confusing UnboundLocalError.
    if kind not in ("sigmoid", "relu"):
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got {!r}".format(activation)
        )

    Z, linear_cache = linear_forward(A_prev, W, b)
    if kind == "sigmoid":
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    # Sanity check: the activation must not change the shape of Z.
    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)
    return A, cache

In [9]:
A_prev, W, b = linear_activation_forward_test_case()

# Push the same inputs through both supported activations, sigmoid first.
for activation_name, label in (("sigmoid", "With sigmoid"), ("relu", "With ReLU")):
    A, linear_activation_cache = linear_activation_forward(A_prev, W, b, activation=activation_name)
    print(label + ": A = " + str(A))

With sigmoid: A = [[0.96890023 0.11013289]]
With ReLU: A = [[3.43896131 0.        ]]


### 4.3 - L layer model

For even more convenience when implementing the $L$-layer Neural Net, you will need a function that replicates the previous one (`linear_activation_forward` with RELU) $L-1$ times, then follows that with one `linear_activation_forward` with SIGMOID.

<img src="images/model_architecture_kiank.png" style="width:600px;height:300px;">
<caption><center> <b>Figure 2</b> : <b>[LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID</b> model</center></caption><br>

$A^{[L]} = \sigma(Z^{[L]}) = \sigma(W^{[L]} A^{[L-1]} + b^{[L]})$

In [10]:
def L_model_forward(X, params):
    """
    Forward pass of the [LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID model.

    Args:
     X -> data, array of shape (input size, number of examples)
     params -> output of init_params_deep(): "W1", "b1", ..., "WL", "bL"

    Return:
     AL -> activation of the last (sigmoid) layer, shape (1, number of examples)
     caches -> list with one cache per layer, in layer order:
      - the caches of the L-1 relu layers, then the cache of the sigmoid layer
    """
    num_layers = len(params) // 2  # every layer contributes one W and one b
    caches = []
    current = X

    # Hidden layers 1 .. L-1 all use relu.
    for layer in range(1, num_layers):
        weights = params["W" + str(layer)]
        bias = params["b" + str(layer)]
        current, layer_cache = linear_activation_forward(current, weights, bias, activation='relu')
        caches.append(layer_cache)

    # Output layer L uses sigmoid.
    AL, output_cache = linear_activation_forward(
        current,
        params["W" + str(num_layers)],
        params["b" + str(num_layers)],
        activation="sigmoid",
    )
    caches.append(output_cache)

    assert AL.shape == (1, X.shape[1])

    return AL, caches

In [11]:
# Sample input and parameters supplied by the test-case helper.
X, parameters = L_model_forward_test_case()
AL, caches = L_model_forward(X, parameters)
print("AL = " + str(AL))
# L_model_forward appends one cache per layer, so this is the layer count.
print("Length of caches list = " + str(len(caches)))

AL = [[0.17007265 0.2524272 ]]
Length of caches list = 2


## 5 - Cost Function

$$-\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)) \tag{2}$$

In [12]:
def compute_cost(AL, Y):
    """
    Cross-entropy cost of equation (2).

    Args:
     AL -> predicted probabilities, shape (1, number of examples)
     Y -> true "label" vector, shape (1, number of examples)

    Return:
     cost -> cross-entropy cost, squeezed to a 0-dimensional value
    """
    m = Y.shape[1]

    # Per-example log-likelihood, split into the y = 1 and y = 0 terms.
    positive_term = np.multiply(Y, np.log(AL))
    negative_term = np.multiply((1 - Y), np.log(1 - AL))

    cost = np.squeeze((-1. / m) * np.sum(positive_term + negative_term))
    assert cost.shape == ()

    return cost

In [13]:
# The helper returns (Y, AL) while compute_cost takes (AL, Y): mind the order.
Y, AL = compute_cost_test_case()

print("cost = " + str(compute_cost(AL, Y)))

cost = 0.41493159961539694


## 6 - Backward propagation module

Just like with forward propagation, you will implement helper functions for backpropagation. Remember that back propagation is used to calculate the gradient of the loss function with respect to the parameters. 

**Reminder**:

<img src="images/backprop_kiank.png" style="width:650px;height:250px;">
<caption><center> <b>Figure 3</b> : Forward and Backward propagation for <b>LINEAR->RELU->LINEAR->SIGMOID</b> <br> The purple blocks represent the forward propagation, and the red blocks represent the backward propagation. </center></caption>

### 6.1 Linear Backward


For layer $l$, the linear part is: $Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}$ (followed by an activation).

Suppose you have already calculated the derivative $dZ^{[l]} = \frac{\partial \mathcal{L} }{\partial Z^{[l]}}$. You want to get $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$.

<img src="images/linearback_kiank.png" style="width:250px;height:300px;">
<caption><center> <b>Figure 4</b> </center></caption>

The three outputs $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$ are computed using the input $dZ^{[l]}$. Here are the formulas you need:
$$ dW^{[l]} = \frac{\partial \mathcal{L} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T} \tag{3}$$
$$ db^{[l]} = \frac{\partial \mathcal{L} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}\tag{4}$$
$$ dA^{[l-1]} = \frac{\partial \mathcal{L} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]} \tag{5}$$


In [14]:
def linear_backward(dZ, cache):
    """
    Backward step for the linear part of one layer (equations 3-5).

    dZ -> gradient of the cost w.r.t. the linear output of the current layer
    cache -> tuple (A_prev, W, b) stored during the forward pass of this layer
    return:
    dA_prev -> gradient of the cost w.r.t. the activation of the previous layer (l-1), same shape as A_prev
    dW -> gradient of the cost w.r.t. W (current layer l), same shape as W
    db -> gradient of the cost w.r.t. b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    # Averaging factor 1/m, with m taken from the second axis of A_prev.
    inv_m = 1. / A_prev.shape[1]

    grad_W = inv_m * np.dot(dZ, A_prev.T)
    grad_b = inv_m * np.sum(dZ, axis=1, keepdims=True)
    grad_A_prev = np.dot(W.T, dZ)

    # Every gradient must have the shape of the quantity it differentiates.
    for gradient, reference in ((grad_A_prev, A_prev), (grad_W, W), (grad_b, b)):
        assert gradient.shape == reference.shape

    return grad_A_prev, grad_W, grad_b

In [15]:
# Set up some test inputs
dZ, linear_cache = linear_backward_test_case()

dA_prev, dW, db = linear_backward(dZ, linear_cache)
# Print the three gradients in the order they are returned.
for label, gradient in (("dA_prev", dA_prev), ("dW", dW), ("db", db)):
    print(label + " = " + str(gradient))

dA_prev = [[ 0.51822968 -0.19517421]
 [-0.40506361  0.15255393]
 [ 2.37496825 -0.89445391]]
dW = [[-0.10076895  1.40685096  1.64992505]]
db = [[0.50629448]]


### 6.2 Linear Activation Backward

In [16]:
def linear_activation_backward(dA, cache, activation):
    """
    Backward step of one LINEAR -> ACTIVATION layer.

    Args:
     dA -> post activation gradient for current layer l
     cache -> tuple of values (linear_cache, activation_cache) for back propagation
     activation -> "sigmoid" or "relu" (case-insensitive)
    Return:
     dA_prev -> Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
     dW -> Gradient of the cost with respect to W (current layer l), same shape as W
     db -> Gradient of the cost with respect to b (current layer l), same shape as b
    Raises:
     ValueError -> if activation is neither "sigmoid" nor "relu"
    """
    kind = activation.lower()
    # Reject unknown names up front; otherwise dA_prev/dW/db would never be
    # assigned and the return would fail with an UnboundLocalError.
    if kind not in ("relu", "sigmoid"):
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got {!r}".format(activation)
        )

    linear_cache, activation_cache = cache

    # Only the activation derivative differs between the two cases.
    if kind == "relu":
        dZ = relu_backward(dA, activation_cache)
    else:
        dZ = sigmoid_backward(dA, activation_cache)

    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db


In [17]:
AL, linear_activation_cache = linear_activation_backward_test_case()

# Backpropagate the same inputs through both activations, sigmoid first.
for kind in ("sigmoid", "relu"):
    dA_prev, dW, db = linear_activation_backward(AL, linear_activation_cache, activation=kind)
    print(kind + ":")
    print("dA_prev = " + str(dA_prev))
    print("dW = " + str(dW))
    # The sigmoid section ends with an extra newline to separate it from relu.
    print("db = " + str(db) + ("\n" if kind == "sigmoid" else ""))

sigmoid:
dA_prev = [[ 0.11017994  0.01105339]
 [ 0.09466817  0.00949723]
 [-0.05743092 -0.00576154]]
dW = [[ 0.10266786  0.09778551 -0.01968084]]
db = [[-0.05729622]]

relu:
dA_prev = [[ 0.44090989  0.        ]
 [ 0.37883606  0.        ]
 [-0.2298228   0.        ]]
dW = [[ 0.44513824  0.37371418 -0.10478989]]
db = [[-0.20837892]]


### 6.3 - L-Model Backward

Now you will implement the backward function for the whole network. Recall that when you implemented the `L_model_forward` function, at each iteration, you stored a cache which contains (A_prev, W, b, and Z). In the back propagation module, you will use those variables to compute the gradients. Therefore, in the `L_model_backward` function, you will iterate through all the layers backward, starting from layer $L$. On each step, you will use the cached values for layer $l$ to backpropagate through layer $l$. Figure 5 below shows the backward pass.


<img src="images/mn_backward.png" style="width:450px;height:300px;">
<caption><center> <b>Figure 5</b> : Backward pass </center></caption>

**Initializing backpropagation**:
To backpropagate through this network, we know that the output is, 
$A^{[L]} = \sigma(Z^{[L]})$. Your code thus needs to compute `dAL` $= \frac{\partial \mathcal{L}}{\partial A^{[L]}}$.
To do so, use this formula (derived using calculus which you don't need in-depth knowledge of):
```python
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL)) # derivative of cost with respect to AL
```

You can then use this post-activation gradient `dAL` to keep going backward. As seen in Figure 5, you can now feed in `dAL` into the LINEAR->SIGMOID backward function you implemented (which will use the cached values stored by the L_model_forward function). After that, you will have to use a `for` loop to iterate through all the other layers using the LINEAR->RELU backward function. You should store each dA, dW, and db in the grads dictionary. To do so, use this formula : 

$$grads["dW" + str(l)] = dW^{[l]}\tag{15} $$

For example, for $l=3$ this would store $dW^{[l]}$ in `grads["dW3"]`.

**Exercise**: Implement backpropagation for the *[LINEAR->RELU] $\times$ (L-1) -> LINEAR -> SIGMOID* model.

In [18]:
def L_model_backward(AL, Y, caches):
    """
    Backward pass of the [LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID model.

    Args:
     AL -> output of the forward pass (activation of the sigmoid layer)
     Y -> true "label" vector; it is reshaped to AL's shape
     caches -> list with one cache per layer in layer order, each a tuple
               (linear_cache, activation_cache); the last entry belongs to
               the sigmoid layer, all earlier ones to relu layers
    Return:
     grads -> dict with, for every layer l = 1 .. L (L = len(caches)):
      grads["dW" + str(l)] -> gradient of the cost w.r.t. W of layer l
      grads["db" + str(l)] -> gradient of the cost w.r.t. b of layer l
      grads["dA" + str(l)] -> the dA_prev obtained while backpropagating
                              through layer l, i.e. the gradient w.r.t. the
                              activation that FEEDS layer l (not its output)
    """
    grads = {}
    L = len(caches)
    Y = Y.reshape(AL.shape)

    # Derivative of the cross-entropy cost with respect to AL.
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Output layer: LINEAR -> SIGMOID.
    dA_prev, dW, db = linear_activation_backward(dAL, caches[-1], activation="sigmoid")
    grads["dA" + str(L)] = dA_prev
    grads["dW" + str(L)] = dW
    grads["db" + str(L)] = db

    # Hidden layers L-1 .. 1: LINEAR -> RELU. The upstream gradient of
    # layer l was stored under "dA" + str(l + 1) by the step before.
    for layer in range(L - 1, 0, -1):
        upstream = grads["dA" + str(layer + 1)]
        dA_prev, dW, db = linear_activation_backward(upstream, caches[layer - 1], activation="relu")
        grads["dA" + str(layer)] = dA_prev
        grads["dW" + str(layer)] = dW
        grads["db" + str(layer)] = db
    return grads

In [19]:
AL, Y_assess, caches = L_model_backward_test_case()
grads = L_model_backward(AL, Y_assess, caches)
# Show the gradients stored for the first layer.
for key in ("dW1", "db1", "dA1"):
    print(key + " = " + str(grads[key]))

dW1 = [[0.41010002 0.07807203 0.13798444 0.10502167]
 [0.         0.         0.         0.        ]
 [0.05283652 0.01005865 0.01777766 0.0135308 ]]
db1 = [[-0.22007063]
 [ 0.        ]
 [-0.02835349]]
dA1 = [[ 0.          0.52257901]
 [ 0.         -0.3269206 ]
 [ 0.         -0.32070404]
 [ 0.         -0.74079187]]


### 6.4 Update Params

$$ W^{[l]} = W^{[l]} - \alpha \text{ } dW^{[l]} \tag{16}$$
$$ b^{[l]} = b^{[l]} - \alpha \text{ } db^{[l]} \tag{17}$$

where $\alpha$ is the learning rate. After computing the updated parameters, store them in the parameters dictionary. 

In [20]:
# Keep the helper's whole return value in one variable; a later cell unpacks
# the same call into (parameters, grads).
x = update_parameters_test_case()

({'W1': array([[-0.41675785, -0.05626683, -2.1361961 ,  1.64027081],
         [-1.79343559, -0.84174737,  0.50288142, -1.24528809],
         [-1.05795222, -0.90900761,  0.55145404,  2.29220801]]),
  'b1': array([[ 0.04153939],
         [-1.11792545],
         [ 0.53905832]]),
  'W2': array([[-0.5961597 , -0.0191305 ,  1.17500122]]),
  'b2': array([[-0.74787095]])},
 {'dW1': array([[ 1.78862847,  0.43650985,  0.09649747, -1.8634927 ],
         [-0.2773882 , -0.35475898, -0.08274148, -0.62700068],
         [-0.04381817, -0.47721803, -1.31386475,  0.88462238]]),
  'db1': array([[0.88131804],
         [1.70957306],
         [0.05003364]]),
  'dW2': array([[-0.40467741, -0.54535995, -1.54647732]]),
  'db2': array([[0.98236743]])})

In [24]:
def update_params(params, grads, learning_rate):
    """
    Apply one gradient-descent step (equations 16 and 17).

    Args:
     params -> dict with "W1", "b1", ..., "WL", "bL"
     grads -> dict with "dW1", "db1", ..., "dWL", "dbL"
     learning_rate -> step size alpha
    Return:
     params -> the same dict object, whose entries now hold the updated
               arrays (the previous arrays themselves are not modified)
    """
    num_layers = len(params) // 2  # every layer contributes one W and one b
    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            params[key] = params[key] - learning_rate * grads["d" + key]
    return params

In [25]:
parameters, grads = update_parameters_test_case()
parameters = update_params(parameters, grads, 0.1)

# Print the updated parameters in layer order.
for key in ("W1", "b1", "W2", "b2"):
    print(key + " = " + str(parameters[key]))

W1 = [[-0.59562069 -0.09991781 -2.14584584  1.82662008]
 [-1.76569676 -0.80627147  0.51115557 -1.18258802]
 [-1.0535704  -0.86128581  0.68284052  2.20374577]]
b1 = [[-0.04659241]
 [-1.28888275]
 [ 0.53405496]]
W2 = [[-0.55569196  0.0354055   1.32964895]]
b2 = [[-0.84610769]]
