In [None]:
import numpy as np

sigmoid forward

In [None]:
def sigmoid(Z):
    """Apply the logistic sigmoid element-wise.

    Returns a tuple ``(A, Z)``: the activation ``1 / (1 + exp(-Z))`` and the
    untouched input ``Z``, handed back so it can be cached for the backward
    pass.
    """
    activation = 1 / (1 + np.exp(-Z))
    return activation, Z

relu forward

In [None]:
def relu(Z):
    """Apply the rectified linear unit element-wise.

    Returns a tuple ``(A, Z)``: ``max(0, Z)`` and the untouched input ``Z``,
    handed back so it can be cached for the backward pass.
    """
    activation = np.maximum(0, Z)
    return activation, Z

sigmoid backward

In [None]:
def sigmoid_backward(dA, cache):
    """Back-propagate a gradient through a sigmoid activation.

    ``cache`` is the pre-activation ``Z`` saved during the forward pass.
    Returns ``dZ = dA * s * (1 - s)`` with ``s = sigmoid(Z)``.
    """
    pre_activation = cache

    # The sigmoid output is recomputed from Z rather than read from a cache.
    sig = 1 / (1 + np.exp(-pre_activation))
    grad = dA * sig * (1 - sig)

    assert grad.shape == pre_activation.shape
    return grad

relu backward

In [None]:
def relu_backward(dA, Z):
    """Back-propagate a gradient through a ReLU activation.

    ``Z`` is the pre-activation saved during the forward pass. The result is
    a copy of ``dA`` with entries zeroed wherever ``Z <= 0``; ``dA`` itself
    is left unmodified.
    """
    grad = np.array(dA, copy=True)

    # ReLU is flat for non-positive inputs, so no gradient flows there.
    inactive = Z <= 0
    grad[inactive] = 0

    assert grad.shape == Z.shape
    return grad


**Initializing parameters (Weights and biases)**

In [None]:
def initialize_parameters(layers, seed=42, scale=0.01):
    """Create the weight and bias arrays for every layer of the network.

    Args:
        layers: Sequence of layer sizes; ``layers[0]`` is the input size.
        seed: Value passed to ``np.random.seed`` before drawing weights.
            Defaults to 42 (the previously hard-coded value). Pass ``None``
            to leave the global NumPy random state untouched.
        scale: Multiplier applied to the standard-normal weight draws.
            Defaults to 0.01 (the previously hard-coded value).

    Returns:
        Dict with keys ``'W1'..'W{L-1}'`` of shape
        ``(layers[l], layers[l-1])`` and ``'b1'..'b{L-1}'`` of shape
        ``(layers[l], 1)`` filled with zeros, where ``L = len(layers)``.
    """
    if seed is not None:
        # NOTE: this reseeds NumPy's *global* generator, as the original did.
        np.random.seed(seed)

    parameters = {}
    for idx in range(1, len(layers)):
        fan_out, fan_in = layers[idx], layers[idx - 1]
        parameters['W' + str(idx)] = np.random.randn(fan_out, fan_in) * scale
        parameters['b' + str(idx)] = np.zeros((fan_out, 1))

    return parameters
    


**Forward Propagation**

**1. linear_forward**

In [None]:
def linear_forward(A_prev, W, b):
    """Compute the affine part of a layer: ``Z = W . A_prev + b``.

    Returns ``(Z, cache)`` where ``cache`` is the tuple ``(A_prev, W, b)``
    kept for the backward pass. ``Z`` is asserted to have shape
    ``(W.shape[0], A_prev.shape[1])``.
    """
    pre_activation = np.dot(W, A_prev) + b

    expected_shape = (W.shape[0], A_prev.shape[1])
    assert pre_activation.shape == expected_shape

    return pre_activation, (A_prev, W, b)

**2. linear_forward_activation**

In [None]:
def linear_forward_activation(A_prev, W, b, activation):
    """Run one full layer forward: affine step followed by an activation.

    Args:
        A_prev: Activations fed into this layer.
        W: Weight matrix of this layer.
        b: Bias vector of this layer.
        activation: Either ``'relu'`` or ``'sigmoid'``.

    Returns:
        ``(A, cache)`` where ``A`` is the layer output and ``cache`` is
        ``(linear_cache, act_cache)`` as produced by ``linear_forward`` and
        the chosen activation function.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
            (Previously this surfaced as an UnboundLocalError.)
    """
    if activation == 'relu':
        activation_fn = relu
    elif activation == 'sigmoid':
        activation_fn = sigmoid
    else:
        raise ValueError(
            "activation must be 'relu' or 'sigmoid', got %r" % (activation,)
        )

    # The affine step is the same for every activation, so do it once.
    Z, linear_cache = linear_forward(A_prev, W, b)
    A, act_cache = activation_fn(Z)

    assert A.shape == Z.shape

    return A, (linear_cache, act_cache)



**3. L_forward**

In [None]:
def L_forward(X, parameters):
    """Run the forward pass through every layer of the network.

    All layers except the last use ReLU; the last layer uses sigmoid. The
    number of layers is taken as ``len(parameters) // 2`` (one ``W`` and one
    ``b`` entry per layer).

    Returns ``(AL, caches)``: the output of the final layer, asserted to have
    shape ``(1, X.shape[1])``, and the per-layer caches in layer order.
    """
    num_layers = len(parameters) // 2
    caches = []
    activations = X

    # Hidden layers: 1 .. num_layers - 1, all ReLU.
    for idx in range(1, num_layers):
        activations, layer_cache = linear_forward_activation(
            activations,
            parameters['W' + str(idx)],
            parameters['b' + str(idx)],
            'relu',
        )
        caches.append(layer_cache)

    # Output layer: sigmoid.
    AL, output_cache = linear_forward_activation(
        activations,
        parameters['W' + str(num_layers)],
        parameters['b' + str(num_layers)],
        'sigmoid',
    )
    caches.append(output_cache)

    assert AL.shape == (1, X.shape[1])

    return AL, caches



**Compute cost**

In [None]:
def cost(AL, Y):
    """Compute the mean binary cross-entropy between predictions and labels.

    Args:
        AL: Predicted probabilities, same shape as ``Y``.
        Y: Ground-truth labels; ``Y.shape[1]`` is used as the number of
            examples to average over.

    Returns:
        The cost as a zero-dimensional value (``shape == ()``).

    Predictions are converted to float64 and clipped to
    ``[1e-15, 1 - 1e-15]`` before taking logs. Without this, a prediction
    that saturates to exactly 0 or 1 yields ``0 * log(0) = nan`` (even when
    the prediction is correct) or an infinite cost.
    """
    m = Y.shape[1]

    eps = 1e-15
    probs = np.clip(np.asarray(AL, dtype=float), eps, 1 - eps)

    log_likelihood = (
        np.multiply(Y, np.log(probs))
        + np.multiply(1 - Y, np.log(1 - probs))
    )
    total = np.squeeze((-1 / m) * np.sum(log_likelihood))
    assert total.shape == ()

    return total

**Backward Propagation**

**1.linear_backward**

In [None]:
def linear_backward(dZ, cache):
    """Back-propagate through the affine part of a layer.

    ``cache`` is the ``(A_prev, W, b)`` tuple saved by the forward pass.
    Returns ``(dA_prev, dW, db)``; ``dW`` and ``db`` are averaged over
    ``A_prev.shape[1]`` examples, and each gradient is asserted to match the
    shape of the quantity it corresponds to.
    """
    A_prev, W, b = cache
    num_examples = A_prev.shape[1]
    inv_m = 1 / num_examples

    dA_prev = np.dot(W.T, dZ)
    dW = inv_m * np.dot(dZ, A_prev.T)
    db = inv_m * np.sum(dZ, axis=1, keepdims=True)

    assert dW.shape == W.shape
    assert db.shape == b.shape
    assert dA_prev.shape == A_prev.shape

    return dA_prev, dW, db

**2.linear_activation_backward**

In [None]:
def linear_activation_backward(dA, cache, activation):
    """Back-propagate through one full layer (activation, then affine step).

    Args:
        dA: Gradient of the cost with respect to this layer's output.
        cache: ``(linear_cache, act_cache)`` tuple saved by the forward pass.
        activation: Either ``'relu'`` or ``'sigmoid'``.

    Returns:
        ``(dA_prev, dW, db)`` as produced by ``linear_backward``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
            (Previously this surfaced as an UnboundLocalError.)
    """
    if activation == 'relu':
        activation_grad = relu_backward
    elif activation == 'sigmoid':
        activation_grad = sigmoid_backward
    else:
        raise ValueError(
            "activation must be 'relu' or 'sigmoid', got %r" % (activation,)
        )

    linear_cache, act_cache = cache

    # Undo the activation first, then the affine step (shared by both cases).
    dZ = activation_grad(dA, act_cache)
    return linear_backward(dZ, linear_cache)

**3.L_backward**

In [None]:
def L_backward(AL, Y, caches):
    """Run the backward pass through every layer of the network.

    Starts from the derivative of the cross-entropy cost with respect to
    ``AL``, back-propagates through the sigmoid output layer, then through
    the remaining layers (all ReLU) in reverse order.

    Returns a dict with ``'dW{l}'`` and ``'db{l}'`` for each layer
    ``l = 1..L`` and ``'dA{l}'`` for ``l = 0..L-1``, where
    ``L = len(caches)``.
    """
    # Derivative of the cross-entropy cost with respect to the output.
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    gradients = {}
    num_layers = len(caches)

    # Output layer (sigmoid).
    dA_prev, dW, db = linear_activation_backward(
        dAL, caches[num_layers - 1], activation='sigmoid'
    )
    gradients['dA' + str(num_layers - 1)] = dA_prev
    gradients['dW' + str(num_layers)] = dW
    gradients['db' + str(num_layers)] = db

    # Hidden layers (ReLU), walking from layer L-1 down to layer 1.
    for layer in range(num_layers - 1, 0, -1):
        upstream = gradients['dA' + str(layer)]
        dA_prev, dW, db = linear_activation_backward(
            upstream, caches[layer - 1], activation='relu'
        )
        gradients['dA' + str(layer - 1)] = dA_prev
        gradients['dW' + str(layer)] = dW
        gradients['db' + str(layer)] = db

    return gradients

**Updating parameters**

In [None]:
def update_params(parameters, gradients, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    Args:
        parameters: Dict holding ``'W{l}'`` and ``'b{l}'`` for each layer;
            the number of layers is taken as ``len(parameters) // 2``.
        gradients: Dict holding the matching ``'dW{l}'`` and ``'db{l}'``.
        learning_rate: Step size multiplied into each gradient.

    Returns:
        The same ``parameters`` dict, with each entry rebound to its updated
        value (the dict is modified in place).
    """
    num_layers = len(parameters) // 2
    for idx in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            key = prefix + str(idx)
            parameters[key] = parameters[key] - learning_rate * gradients['d' + key]

    return parameters

**L_layer nn model**

In [None]:
def L_layer_model(X, Y, layers, learning_rate, iterations, print_cost=False):
    """Train an L-layer network with plain gradient descent.

    Each iteration runs a forward pass, evaluates the cost, runs a backward
    pass and updates the parameters. When ``print_cost`` is truthy the cost
    is printed every 100 iterations.

    Returns the trained parameters dict.
    """
    np.random.seed(42)
    parameters = initialize_parameters(layers)

    # Cost history for the reported iterations; kept locally, not returned.
    costs = []

    for step in range(iterations):
        AL, caches = L_forward(X, parameters)
        current_cost = cost(AL, Y)

        gradients = L_backward(AL, Y, caches)
        parameters = update_params(parameters, gradients, learning_rate)

        if not print_cost or step % 100 != 0:
            continue
        print("Cost after iteration %i: %f" % (step, current_cost))
        costs.append(current_cost)

    return parameters