In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Matplotlib defaults applied to every figure drawn later in this notebook.
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest' # nearest-neighbour: no smoothing between image pixels
plt.rcParams['image.cmap'] = 'gray' # default colormap for images

%load_ext autoreload
%autoreload 2

# Seed NumPy's global RNG. Note that initialize_parameters and
# initialize_parameters_deep re-seed it themselves (with 2 and 3).
np.random.seed(1)

In [9]:
def sigmoid(z):
    """Apply the logistic function 1 / (1 + exp(-z)) element-wise.

    Returns:
        A pair ``(s, z)``: the activation and the untouched input ``z``,
        which callers in this notebook keep as the activation cache.
    """
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator, z

In [22]:
def relu(z):
    """Apply the rectified linear unit max(0, z) element-wise.

    Args:
        z: pre-activation value(s); anything ``np.maximum`` accepts
           (ndarray, nested list, or a plain scalar).

    Returns:
        A pair ``(r, z)``: the activation and the untouched input ``z``,
        which callers in this notebook keep as the activation cache.
    """
    # The previous version also allocated an unused zeros array via z.shape,
    # which wasted memory and rejected inputs that have no ``.shape``.
    r = np.maximum(0.0, z)
    return r, z

### 3.1 2-layer network

In [2]:
def initialize_parameters(n_x, n_h, n_y):
    """Create the parameters of a 2-layer network.

    Re-seeds NumPy's global RNG with 2, so the result is the same on every
    call. Weights are standard-normal draws scaled by 0.01; biases are zero.

    Args:
        n_x: number of columns of W1.
        n_h: number of rows of W1 and b1, and columns of W2.
        n_y: number of rows of W2 and b2.

    Returns:
        dict with keys "W1" (n_h, n_x), "b1" (n_h, 1),
        "W2" (n_y, n_h) and "b2" (n_y, 1).
    """
    np.random.seed(2)
    scale = 0.01

    # Draw W1 before W2 so the random stream is consumed in the same order.
    hidden_weights = np.random.randn(n_h, n_x) * scale
    output_weights = np.random.randn(n_y, n_h) * scale

    return {
        "W1": hidden_weights,
        "b1": np.zeros((n_h, 1)),
        "W2": output_weights,
        "b2": np.zeros((n_y, 1)),
    }

### 3.2 L-layer network

In [5]:
def initialize_parameters_deep(layer_dims):
    """Create the parameters of an L-layer network.

    Re-seeds NumPy's global RNG with 3, so the result is the same on every
    call. Weights are standard-normal draws scaled by 0.01; biases are zero.

    Args:
        layer_dims: sequence of layer sizes; index 0 is the input size.

    Returns:
        dict with, for every l in 1..len(layer_dims)-1,
        "W<l>" of shape (layer_dims[l], layer_dims[l-1]) and
        "b<l>" of shape (layer_dims[l], 1).
    """
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)

    # ``range`` (not ``xrange``) so the cell runs on Python 3 as well as 2.
    for l in range(1, L):
        parameters['W'+str(l)] = np.random.randn(layer_dims[l],
                                                 layer_dims[l-1]
                                                ) * 0.01
        parameters['b'+str(l)] = np.zeros((layer_dims[l], 1))

    return parameters

In [6]:
# Smoke check: build a 5 -> 4 -> 3 network and show its initial parameters.
parameters = initialize_parameters_deep([5, 4, 3])
print("W1 = {}".format(parameters["W1"]))
print("b1 = {}".format(parameters["b1"]))
print("W2 = {}".format(parameters["W2"]))
print("b2 = {}".format(parameters["b2"]))

W1 = [[ 0.01788628  0.0043651   0.00096497 -0.01863493 -0.00277388]
 [-0.00354759 -0.00082741 -0.00627001 -0.00043818 -0.00477218]
 [-0.01313865  0.00884622  0.00881318  0.01709573  0.00050034]
 [-0.00404677 -0.0054536  -0.01546477  0.00982367 -0.01101068]]
b1 = [[0.]
 [0.]
 [0.]
 [0.]]
W2 = [[-0.01185047 -0.0020565   0.01486148  0.00236716]
 [-0.01023785 -0.00712993  0.00625245 -0.00160513]
 [-0.00768836 -0.00230031  0.00745056  0.01976111]]
b2 = [[0.]
 [0.]
 [0.]]


## 4 Forward propagation module
### 4.1 Linear Forward

In [7]:
def linear_forward(A, W, b):
    """Compute the affine step Z = W . A + b of one layer.

    Returns:
        A pair ``(Z, cache)`` where ``cache`` is the tuple ``(A, W, b)`` of
        the inputs, kept for the backward pass.
    """
    pre_activation = np.dot(W, A) + b
    return pre_activation, (A, W, b)

### 4.2 Linear-Activation Forward

In [13]:
def linear_activation_forward(A_prev, W, b, activation):
    """Run one LINEAR -> ACTIVATION layer forward.

    Args:
        A_prev: activations fed into this layer.
        W: weight matrix of this layer.
        b: bias of this layer.
        activation: either 'sigmoid' or 'relu'.

    Returns:
        A pair ``(A, cache)`` where ``A`` is the layer output and ``cache``
        is ``(linear_cache, activation_cache)`` as produced by
        ``linear_forward`` and the chosen activation function.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    # Validate first: previously an unknown name fell through both ``if``s
    # and surfaced as an UnboundLocalError on ``linear_cache``.
    if activation not in ('sigmoid', 'relu'):
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got %r" % (activation,))

    # The affine step is identical for both activations, so do it once.
    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    cache = (linear_cache, activation_cache)
    return A, cache

In [25]:
# Hard-coded inputs standing in for the unavailable helper call
# `A_prev, W, b = linear_activation_forward_test_case()`.
A_prev = np.array([
    [-0.41675785, -0.05626683],
    [-2.1361961, 1.64027081],
    [-1.79343559, -0.84174737],
])
W = np.array([[0.50288142, -1.24528809, -1.05795222]])
b = np.array([[-0.90900761]])

# Same inputs through both supported activations.
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "sigmoid")
print("With sigmoid: A = {}".format(A))

A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "relu")
print("With ReLU: A = {}".format(A))

With sigmoid: A = [[0.96890023 0.11013289]]
With ReLU: A = [[3.43896134 0.        ]]


### 4.3 L-Layer Model

In [28]:
def L_model_forward(X, parameters):
    """Forward pass: (LINEAR -> RELU) * (L-1), then LINEAR -> SIGMOID.

    Args:
        X: input fed to the first layer.
        parameters: dict holding "W1", "b1", ..., "WL", "bL"; the layer
            count L is taken as half the number of entries.

    Returns:
        A pair ``(AL, caches)``: the output of the final sigmoid layer and
        the list of per-layer caches from ``linear_activation_forward``,
        ordered from layer 1 to layer L.
    """
    caches = []
    A = X
    # Floor division keeps L an int on Python 3 (``/`` would yield a float,
    # which ``range`` rejects and which would produce keys like "W2.0").
    L = len(parameters) // 2

    # Hidden layers 1..L-1 use ReLU. ``range`` replaces the Python-2-only
    # ``xrange``.
    for l in range(1, L):
        A_prev = A
        W, b = parameters['W'+str(l)], parameters['b'+str(l)]
        A, cache = linear_activation_forward(A_prev, W, b,
                                             activation='relu')
        caches.append(cache)

    # The output layer L uses sigmoid.
    W, b = parameters['W'+str(L)], parameters['b'+str(L)]
    AL, cache = linear_activation_forward(A, W, b,
                                          activation='sigmoid')
    caches.append(cache)

    return AL, caches

## 5 Cost function
$$-\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)) \tag{7}$$

In [29]:
def compute_cost(AL, Y):
    """Cross-entropy cost -1/m * sum(Y*log(AL) + (1-Y)*log(1-AL)).

    Args:
        AL: predictions; assumed to be probabilities in [0, 1] (they come
            from a sigmoid layer in this notebook).
        Y: labels, same shape as AL; ``Y.shape[1]`` is used as the number
            of examples m.

    Returns:
        The cost with all length-1 axes removed by ``np.squeeze``.
    """
    m = Y.shape[1]

    # Keep predictions strictly inside (0, 1). Without this a saturated
    # output makes log() return -inf, so a perfectly predicted example
    # evaluates 0 * -inf = NaN and poisons the whole sum.
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)

    cost = -1.0/m * np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL))
    cost = np.squeeze(cost)
    return cost

## 6 Backward propagation module
### 6.1 Linear backward

In [31]:
def linear_backward(dZ, cache):
    """Backward pass through the affine step of one layer.

    Args:
        dZ: gradient of the cost with respect to this layer's linear output.
        cache: the ``(A_prev, W, b)`` tuple stored by the forward pass.

    Returns:
        ``(dA_prev, dW, db)``: gradients with respect to the previous
        activations, the weights and the bias. ``dW`` and ``db`` are
        averaged over the ``A_prev.shape[1]`` examples.
    """
    prev_activations, weights, _ = cache
    inv_m = 1.0 / prev_activations.shape[1]

    grad_prev = np.dot(weights.T, dZ)
    grad_w = inv_m * np.dot(dZ, prev_activations.T)
    grad_b = inv_m * np.sum(dZ, axis=1, keepdims=True)

    return grad_prev, grad_w, grad_b

### 6.2 Linear-Activation backward

In [33]:
def linear_activation_backward(dA, cache, activation):
    """Run one LINEAR -> ACTIVATION layer backward.

    Args:
        dA: gradient of the cost with respect to this layer's output.
        cache: ``(linear_cache, activation_cache)`` stored by the forward
            pass of the same layer.
        activation: either 'relu' or 'sigmoid'.

    Returns:
        ``(dA_prev, dW, db)`` as computed by ``linear_backward``.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.

    NOTE(review): ``relu_backward`` and ``sigmoid_backward`` are not defined
    in the visible part of this notebook; they must be in scope before this
    function is called - TODO confirm where they come from.
    """
    # Validate first: previously an unknown name fell through both ``if``s
    # and surfaced as an UnboundLocalError on ``dA_prev``.
    if activation not in ('relu', 'sigmoid'):
        raise ValueError(
            "activation must be 'relu' or 'sigmoid', got %r" % (activation,))

    linear_cache, activation_cache = cache

    if activation == 'relu':
        dZ = relu_backward(dA, activation_cache)
    else:
        dZ = sigmoid_backward(dA, activation_cache)

    # The affine backward step is identical for both activations.
    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db

### 6.3 L-Model Backward

In [44]:
def L_model_backward(AL, Y, caches):
    """Backward pass for (LINEAR -> RELU) * (L-1), then LINEAR -> SIGMOID.

    Args:
        AL: output of the forward pass.
        Y: labels; reshaped to ``AL.shape`` before use.
        caches: list of per-layer caches from the forward pass, layer 1
            first; its length is taken as the layer count L.

    Returns:
        dict ``grads`` with, for l in 1..L, "dW<l>" and "db<l>", and for
        l in 0..L-1, "dA<l>" (the gradient flowing into layer l+1's input).

    Note:
        ``dAL`` divides by ``AL`` and ``1 - AL``; entries of ``AL`` that are
        exactly 0 or 1 therefore produce inf/NaN gradients.
    """
    grads = {}
    L = len(caches)
    Y = Y.reshape(AL.shape)

    # Derivative of the cross-entropy cost with respect to AL.
    dAL = - (Y/AL - (1-Y)/(1-AL))

    # Output layer (sigmoid).
    dA, dW, db = linear_activation_backward(dAL, caches[L-1],
                                            activation='sigmoid')
    grads["dA" + str(L-1)] = dA
    grads["dW" + str(L)] = dW
    grads["db" + str(L)] = db

    # Hidden layers (ReLU), from layer L-1 down to layer 1. ``range``
    # replaces the Python-2-only ``xrange``.
    for l in reversed(range(L-1)):
        dA, dW, db = linear_activation_backward(dA, caches[l],
                                                activation='relu')
        grads['dA'+str(l)] = dA
        grads['dW'+str(l+1)] = dW
        grads['db'+str(l+1)] = db

    return grads

In [46]:
# AL, Y_assess, caches = L_model_backward_test_case()
# AL = np.array(
#     [[ 1.78862847,  0.43650985]]
# )

# Y_assess = np.array(
#     [[1, 0]]
# )

# caches = np.array(

# )


# grads = L_model_backward(AL, Y_assess, caches)
# print_grads(grads)

### 6.4 Update Parameters

In [47]:
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every layer's W and b.

    Args:
        parameters: dict with "W1", "b1", ..., "WL", "bL". Its entries are
            rebound to the updated arrays (the dict itself is modified).
        grads: dict with the matching "dW<l>" and "db<l>" gradients.
        learning_rate: step size multiplying each gradient.

    Returns:
        The same ``parameters`` dict, after the update.
    """
    num_layers = len(parameters) // 2

    for layer in range(1, num_layers + 1):
        w_key, b_key = "W%d" % layer, "b%d" % layer

        # Read everything for this layer before writing anything back.
        weights, bias = parameters[w_key], parameters[b_key]
        grad_w, grad_b = grads["d" + w_key], grads["d" + b_key]

        # Build new arrays rather than updating in place, so the old
        # arrays are left untouched for anyone still holding them.
        parameters[w_key] = weights - learning_rate * grad_w
        parameters[b_key] = bias - learning_rate * grad_b

    return parameters