In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [1]:
def sigmoid(Z):
    """
    Apply the logistic sigmoid element-wise to a numpy array.

    Arguments:
    Z -- numpy array of any shape (pre-activation values)

    Returns:
    A -- 1 / (1 + exp(-Z)), same shape as Z
    cache -- the very same Z object, handed back so the backward pass can reuse it
    """

    denominator = 1 + np.exp(-Z)
    A = 1 / denominator

    return A, Z

def relu(Z):
    """
    Apply the rectified linear unit, max(0, z), element-wise.

    Arguments:
    Z -- numpy array holding the output of the linear layer, of any shape

    Returns:
    A -- post-activation values, same shape as Z
    cache -- the input Z itself (not a dictionary); kept for the backward pass
    """

    activated = np.maximum(0, Z)

    # Sanity check: the activation must not change the shape.
    assert activated.shape == Z.shape

    return activated, Z

def relu_backward(dA, cache):
    """
    Backward pass through a single RELU unit.

    Arguments:
    dA -- gradient of the cost with respect to the RELU output, of any shape
    cache -- the pre-activation array Z saved by the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z (a new array; dA is not modified)
    """

    Z = cache

    # Work on a copy so the caller's dA stays untouched.
    grad = np.array(dA, copy=True)

    # RELU is flat wherever Z <= 0, so no gradient flows through those entries.
    blocked = Z <= 0
    grad[blocked] = 0

    assert grad.shape == Z.shape

    return grad

def sigmoid_backward(dA, cache):
    """
    Backward pass through a single SIGMOID unit.

    Arguments:
    dA -- gradient of the cost with respect to the sigmoid output, of any shape
    cache -- the pre-activation array Z saved by the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z, i.e. dA * s * (1 - s)
          where s = sigmoid(Z)
    """

    Z = cache

    # Recompute the activation from Z; only Z was cached in the forward pass.
    s = 1 / (1 + np.exp(-Z))

    dZ = dA * s
    dZ = dZ * (1 - s)

    assert dZ.shape == Z.shape

    return dZ


In [2]:
def initialize_parameters(n_x,n_h,n_y):
    """
    Create the parameters of a 2-layer network (one hidden layer).

    Argument:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer
    Returns:
    parameters -- python dictionary containing your parameters:
    W1 -- weight matrix of shape (n_h, n_x)
    b1 -- bias vector of shape (n_h, 1)
    W2 -- weight matrix of shape (n_y, n_h)
    b2 -- bias vector of shape (n_y, 1)
    """
    # Small random weights break symmetry between units; biases start at zero.
    W1 = np.random.randn(n_h, n_x) * 0.01
    # np.zeros takes the shape as ONE tuple argument; a second positional
    # argument would be interpreted as the dtype.
    b1 = np.zeros((n_h, 1))
    # The output layer consumes the hidden layer's activations, so its
    # fan-in is n_h (not n_x).
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))

    assert (W1.shape == (n_h, n_x))
    assert (b1.shape == (n_h, 1))
    assert (W2.shape == (n_y, n_h))
    assert (b2.shape == (n_y, 1))

    parameters = {
        "W1": W1,
        "b1": b1,
        "W2": W2,
        "b2": b2
    }
    return parameters




In [7]:
def initialize_parameters_deep(layer_dims):
    """
    Create the parameters of an L-layer network.

    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer
                  in our network (index 0 is the input layer)
    Returns:
    parameters -- python dictionary containing your parameters
                  "W1", "b1", ..., "WL", "bL":
    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
    bl -- bias vector of shape (layer_dims[l], 1)
    """
    # Fixed seed: every call reseeds numpy's global generator and therefore
    # returns the same weights for the same layer_dims.
    np.random.seed(3)

    parameters = {}
    num_entries = len(layer_dims)  # input layer + every trainable layer

    for idx in range(1, num_entries):
        fan_out, fan_in = layer_dims[idx], layer_dims[idx - 1]
        w_key = f"W{idx}"
        b_key = f"b{idx}"

        parameters[w_key] = np.random.randn(fan_out, fan_in) * 0.01
        parameters[b_key] = np.zeros((fan_out, 1))

        assert parameters[w_key].shape == (fan_out, fan_in)
        assert parameters[b_key].shape == (fan_out, 1)

    return parameters

In [9]:
def linear_forwar(A,W,b):
    """
    Compute the linear (affine) part of one layer's forward pass: Z = W A + b.

    Arguments:
    A -- activations from previous layer (or input data):
         (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape
         (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    Returns:
    Z -- the input of the activation function, also called the
         pre-activation parameter
    cache -- the tuple (A, W, b), stored for computing the backward pass
             efficiently
    """
    weighted_input = W @ A
    Z = weighted_input + b  # b broadcasts across the example axis

    expected_shape = (W.shape[0], A.shape[1])
    assert Z.shape == expected_shape

    return Z, (A, W, b)
        

In [11]:
def sigmoid(Z):
    """
    Sigmoid activation used by linear_activation_forward.

    This definition replaces any earlier `sigmoid` in the notebook, so it
    must honour the same contract: it is called as `sigmoid(Z)` and its
    result is unpacked into `(A, activation_cache)`. A stray `self`
    parameter or a bare array return value breaks that call.

    Arguments:
    Z -- numpy array of any shape

    Returns:
    A -- output of sigmoid(Z), same shape as Z
    cache -- Z itself, useful during backpropagation
    """
    A = 1 / (1 + np.exp(-Z))
    cache = Z
    return A, cache

def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer
    Arguments:
    A_prev -- activations from previous layer (or input data):
              (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape
         (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a
                  text string: "sigmoid" or "relu"
    Returns:
    A -- the output of the activation function, also called the
         post-activation value
    cache -- the tuple (linear_cache, activation_cache), stored for
             computing the backward pass efficiently
    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Pick the activation first so that an unsupported name fails with a
    # clear message instead of an UnboundLocalError further down.
    # Both helpers are expected to return an (A, activation_cache) pair.
    if activation == "sigmoid":
        activation_fn = sigmoid
    elif activation == "relu":
        activation_fn = relu
    else:
        raise ValueError(
            'activation must be "sigmoid" or "relu", got %r' % (activation,)
        )

    # The linear step is identical for both activations.
    Z, linear_cache = linear_forwar(A_prev, W, b)
    A, activation_cache = activation_fn(Z)

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)
    return A, cache

In [15]:
def L_model_forward(X, parameters):
    """
    Implement forward propagation for the
    [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation
    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters_deep(): a dictionary with
                  one "W<l>" and one "b<l>" entry per layer
    Returns:
    AL -- last post-activation value
    caches -- list of caches containing:
              every cache of the "relu" layers (there are L-1 of them,
              indexed from 0 to L-2)
              the cache of the final "sigmoid" layer (there is one,
              indexed L-1)
    """
    caches = []
    A = X
    # Each layer contributes two entries (W and b), so halve the dict size
    # to get the number of layers.
    L = len(parameters) // 2

    # Hidden layers 1 .. L-1 use RELU.
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, parameters['W' + str(l)],
                                             parameters['b' + str(l)], activation="relu")
        caches.append(cache)

    # Output layer L uses SIGMOID; its own cache (not the last hidden
    # layer's) must be appended.
    AL, cache = linear_activation_forward(A, parameters['W' + str(L)],
                                          parameters['b' + str(L)], activation="sigmoid")
    caches.append(cache)

    # The asserted shape implies a single output unit.
    assert (AL.shape == (1, X.shape[1]))
    return AL, caches

In [17]:
def compute_cost(AL, Y):
    """
    Compute the binary cross-entropy cost
        -(1/m) * sum(Y * log(AL) + (1 - Y) * log(1 - AL)).
    Arguments:
    AL -- probability vector corresponding to your label predictions,
          shape (1, number of examples)
    Y -- true "label" vector (for example: containing 0 if non-cat, 1
         if cat), shape (1, number of examples)
    Returns:
    cost -- cross-entropy cost (0-dimensional numpy value)
    """
    m = Y.shape[1]

    # A saturated prediction (AL exactly 0 or 1) would feed log(0) into the
    # formula and turn the cost into inf/nan. Keep AL one machine epsilon
    # away from both ends; ordinary probabilities are unaffected.
    if np.issubdtype(AL.dtype, np.floating):
        eps = np.finfo(AL.dtype).eps
    else:
        eps = np.finfo(float).eps
    AL_safe = np.clip(AL, eps, 1 - eps)

    # (1, m) @ (m, 1) -> (1, 1): the dot products sum over the examples.
    cost = -(Y @ np.log(AL_safe).T + (1 - Y) @ np.log(1 - AL_safe).T) / m
    cost = np.squeeze(cost)  # collapse the (1, 1) result to a scalar
    assert cost.shape == ()
    return cost
    

In [18]:
def linear_backward(dZ, cache):
    """
    Backward pass through the linear part of a single layer (layer l).
    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of
          current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward
             propagation in the current layer
    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of
               the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l),
          same shape as W
    db -- Gradient of the cost with respect to b (current layer l),
          same shape as b
    """
    A_prev, W, b = cache
    num_examples = A_prev.shape[1]

    # Gradient flowing back to the previous layer's activations.
    dA_prev = W.T @ dZ

    # Parameter gradients are averaged over the examples.
    dW = (dZ @ A_prev.T) / num_examples
    db = np.sum(dZ, axis=1, keepdims=True) / num_examples

    # Each gradient must match the shape of the quantity it belongs to.
    assert dA_prev.shape == A_prev.shape
    assert dW.shape == W.shape
    assert db.shape == b.shape

    return dA_prev, dW, db

In [28]:
def relu_backward(dA,activation_cache):
    """
    Implement the backward propagation for a single RELU unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    activation_cache -- the pre-activation array Z saved by the forward pass

    Returns:
    dZ -- Gradient of the cost with respect to Z (a new array; dA is not
          modified)
    """
    Z = activation_cache
    dZ = np.array(dA, copy=True)  # copy so the caller's dA stays untouched

    # RELU has zero slope wherever Z <= 0, so no gradient passes there.
    dZ[Z <= 0] = 0

    assert (dZ.shape == Z.shape)

    return dZ

def linear_activation_backward(dA,cache,activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.
    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) we store
             for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a
                  text string: "sigmoid" or "relu"
    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of
               the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l),
          same shape as W
    db -- Gradient of the cost with respect to b (current layer l),
          same shape as b
    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """

    linear_cache, activation_cache = cache

    # Undo the activation first: dA -> dZ.
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        # Fail loudly instead of hitting an UnboundLocalError at the return.
        raise ValueError(
            'activation must be "sigmoid" or "relu", got %r' % (activation,)
        )

    # The linear step is the same for every activation.
    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev,dW,db

In [25]:
def L_model_backward(AL,Y,caches):
    """
    Implement the backward propagation for the
    [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group
    Arguments:
    AL -- probability vector, output of the forward propagation
          (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
              every cache of linear_activation_forward() with "relu"
              (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
              the cache of linear_activation_forward() with "sigmoid"
              (it's caches[L-1])
    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ...
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ...
             Note on indexing: grads["dA" + str(l)] stores the dA_prev
             returned for layer l, i.e. the gradient with respect to the
             activations fed INTO layer l.
    """

    grads = {}
    L = len(caches)  # number of layers
    Y = Y.reshape(AL.shape)  # make Y line up element-wise with AL

    # Derivative of the cross-entropy cost with respect to AL.
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Output layer (SIGMOID -> LINEAR).
    current_cache = caches[L-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, "sigmoid")

    # Hidden layers (RELU -> LINEAR), walked from layer L-1 down to layer 1.
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        # grads["dA" + str(l+2)] is the gradient flowing into this layer's
        # output; it was stored by the layer processed just before.
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l+2)], current_cache, "relu")
        grads["dA" + str(l+1)] = dA_prev_temp
        grads["dW" + str(l+1)] = dW_temp
        grads["db" + str(l+1)] = db_temp

    # Return only after EVERY layer has been processed (also covers L == 1,
    # where the loop body never runs).
    return grads

In [27]:
def update_parameters(parameters, grads, learning_rate):
    """
    Apply one gradient-descent step to every layer's parameters.
    Arguments:
    parameters -- python dictionary containing your parameters
                  ("W1", "b1", ..., "WL", "bL")
    grads -- python dictionary containing your gradients, output of
             L_model_backward ("dW1", "db1", ..., "dWL", "dbL")
    learning_rate -- step size multiplied into each gradient
    Returns:
    parameters -- the same dictionary object, with each entry rebound to
                  its updated value:
                  parameters["W" + str(l)] = ...
                  parameters["b" + str(l)] = ...
    """
    num_layers = len(parameters) // 2  # one W and one b per layer

    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            # Rebind to a fresh array rather than updating in place, so
            # arrays referenced elsewhere are left unchanged.
            parameters[key] = parameters[key] - learning_rate * grads["d" + key]

    return parameters