In [1]:
import numpy as np

**INITIALIZING THE PARAMETERS RANDOMLY**

In [32]:
def initialize_parameters(layer_dimensions):
    """Create the weight matrices and bias vectors for every layer.

    Args:
        layer_dimensions: sequence of layer sizes; index 0 is the input size
            and each following entry is the number of units in that layer.

    Returns:
        dict with keys "W1".."W{L-1}" and "b1".."b{L-1}", where
        L = len(layer_dimensions). "Wi" has shape
        (layer_dimensions[i], layer_dimensions[i - 1]) and is drawn from a
        standard normal scaled by 0.01; "bi" is a zero column vector of shape
        (layer_dimensions[i], 1).
    """
    L = len(layer_dimensions)  # number of entries, input layer included
    parameters = {}
    for i in range(1, L):
        # The layer index must be converted with str(): "W" + i raises TypeError.
        parameters["W" + str(i)] = np.random.randn(layer_dimensions[i], layer_dimensions[i - 1]) * 0.01
        parameters["b" + str(i)] = np.zeros((layer_dimensions[i], 1))
    return parameters

**ACTIVATION FUNCTION**

In [33]:
def ReLU(z):
    """Apply the rectified linear unit element-wise.

    Returns a pair: the activation max(0, z), and `z` itself so the backward
    pass can reuse the pre-activation values.
    """
    rectified = np.maximum(0, z)
    return rectified, z

In [34]:
def relu_backward(dA, cache):
    """Backward pass through a ReLU unit.

    Args:
        dA: gradient with respect to the ReLU output.
        cache: the pre-activation values Z stored during the forward pass.

    Returns:
        A copy of `dA` in which every entry whose Z is <= 0 is zeroed;
        `dA` itself is left untouched.
    """
    grad = np.array(dA, copy=True)
    blocked = cache <= 0  # positions where the ReLU output was flat
    grad[blocked] = 0
    return grad

def sigmoid_backward(dA, cache):
    """Backward pass through a sigmoid unit.

    Args:
        dA: gradient with respect to the sigmoid output.
        cache: the pre-activation values Z stored during the forward pass.

    Returns:
        dA * s * (1 - s), where s is the sigmoid of Z.
    """
    sig = 1 / (1 + np.exp(-cache))
    return dA * sig * (1 - sig)

**FORWARD PROPAGATION**

In [35]:
def linear_forward(activation, weight, bias):
    """Compute the linear part of a layer, Z = W . A + b.

    Args:
        activation: activations from the previous layer (or the inputs),
            with one column per example.
        weight: weight matrix of shape (units, previous units).
        bias: bias column vector of shape (units, 1).

    Returns:
        (Z, cache) where cache is the tuple (activation, weight, bias) kept
        for the backward pass.
    """
    # The weight matrix goes on the left: the weights are stored as
    # (units, previous units) and the backward pass differentiates W . A + b.
    Z = np.dot(weight, activation) + bias
    return Z, (activation, weight, bias)

In [36]:
def activation_forward(previous_activation, weight, bias, function):
    """Run one layer forward: linear step followed by an activation.

    Args:
        previous_activation: activations feeding this layer.
        weight: this layer's weight matrix.
        bias: this layer's bias vector.
        function: "sigmoid" selects the sigmoid; any other value selects ReLU.

    Returns:
        (current_activation, cache) where cache is
        (linear cache, pre-activation Z), as consumed by the backward pass.
    """
    Z, forward_cache = linear_forward(previous_activation, weight, bias)
    if function == "sigmoid":
        # NumPy has no np.sigmoid; compute it directly and cache Z, which is
        # what sigmoid_backward expects.
        current_activation = 1 / (1 + np.exp(-Z))
        activation_cache = Z
    else:
        current_activation, activation_cache = ReLU(Z)
    cache = (forward_cache, activation_cache)
    return current_activation, cache

**FORWARD PROPAGATION MODEL**

In [37]:
def forward_propogation(inputs, parameters):
    """Forward pass through the whole network.

    Layers 1..L-1 use ReLU and the final layer L uses a sigmoid, where
    L = len(parameters) // 2 (one "W" and one "b" entry per layer).

    Args:
        inputs: input data fed to the first layer.
        parameters: dict holding "W1".."WL" and "b1".."bL".

    Returns:
        (AL, caches): the output of the last layer and the list of per-layer
        caches, in layer order (index 0 belongs to layer 1).
    """
    caches = []
    A = inputs
    L = len(parameters) // 2
    # Hidden layers are numbered 1..L-1; layer L is handled separately below.
    for i in range(1, L):
        A, cache = activation_forward(A, parameters['W' + str(i)], parameters['b' + str(i)], "relu")
        caches.append(cache)
    AL, cache = activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], 'sigmoid')
    caches.append(cache)
    return AL, caches

**CROSS ENTROPY COST FUNCTION**

In [38]:
def compute_cost(y_hat, outputs, parameters=None):
    """Binary cross-entropy cost averaged over the examples.

    Args:
        y_hat: predicted probabilities with one column per example.
        outputs: true labels; reshaped to match `y_hat`.
        parameters: unused; accepted (and optional) so both two- and
            three-argument call styles work.

    Returns:
        The cost as a 0-d NumPy value.
    """
    y_hat = np.atleast_2d(y_hat)
    outputs = np.reshape(outputs, y_hat.shape)
    # Examples run along axis 1, matching the m = shape[1] convention used by
    # the backward pass; shape[0] would be 1 for (1, m) labels.
    number_of_examples = y_hat.shape[1]
    log_likelihood = np.multiply(np.log(y_hat), outputs) + np.multiply(1 - outputs, np.log(1 - y_hat))
    cost = (-1 / number_of_examples) * np.sum(log_likelihood)
    return np.squeeze(cost)

**BACKWARD PROPAGATION**

In [39]:
def linear_backward(dZ, cache):
    """Backward pass through the linear part of one layer.

    Args:
        dZ: gradient of the cost with respect to this layer's linear output.
        cache: the (A_prev, W, b) tuple stored by the forward pass.

    Returns:
        (dA_prev, dW, db): gradients with respect to the previous activation,
        the weights and the bias. dW and db are divided by the number of
        columns of A_prev.
    """
    prev_activation, weights, _bias = cache
    example_count = prev_activation.shape[1]
    weight_grad = np.dot(dZ, prev_activation.T) / example_count
    bias_grad = np.sum(dZ, axis=1, keepdims=True) / example_count
    prev_grad = np.dot(weights.T, dZ)
    return prev_grad, weight_grad, bias_grad

In [40]:
def activation_backward(dA, cache, function):
    """Backward pass through one full layer (activation, then linear part).

    Args:
        dA: gradient with respect to this layer's activation output.
        cache: (linear_cache, activation_cache) stored by the forward pass.
        function: "relu" selects the ReLU derivative; any other value
            selects the sigmoid derivative.

    Returns:
        (dA_prev, dW, db) as produced by linear_backward.
    """
    linear_cache, activation_cache = cache
    # The selector is the `function` parameter; the name `activation` was
    # never defined in this scope.
    if function == "relu":
        dZ = relu_backward(dA, activation_cache)
    else:
        dZ = sigmoid_backward(dA, activation_cache)
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db

**BACKWARD PROPAGATION MODEL**

In [41]:
def backward_propogation(AL, Y, caches):
    """Backward pass through the whole network.

    The last layer is differentiated through a sigmoid and every earlier
    layer through a ReLU, mirroring the forward pass.

    Args:
        AL: output of the final (sigmoid) layer.
        Y: true labels; reshaped to the shape of `AL`.
        caches: per-layer (linear_cache, activation_cache) pairs in layer
            order, as returned by the forward pass.

    Returns:
        dict of gradients. "dW{k}" and "db{k}" are the gradients for layer
        k's parameters. "dA{k}" holds the gradient that layer k passes back
        to its input, i.e. to the activation of layer k - 1.
    """
    grads = {}
    L = len(caches)
    Y = Y.reshape(AL.shape)  # Y is the same shape as AL

    # Derivative of the cross-entropy cost with respect to AL.
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Output layer: sigmoid activation.
    linear_cache, activation_cache = caches[-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_backward(
        sigmoid_backward(dAL, activation_cache), linear_cache)

    # Hidden layers, last to first: ReLU activation. Each layer consumes the
    # gradient handed back by the layer above it, not dAL.
    for l in reversed(range(L - 1)):
        linear_cache, activation_cache = caches[l]
        upstream = grads["dA" + str(l + 2)]
        dA_prev_temp, dW_temp, db_temp = linear_backward(
            relu_backward(upstream, activation_cache), linear_cache)
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads

**UPDATING THE PARAMETERS**

In [42]:
def update_parameters(parameters, grads, learning_rate):
    """Take one gradient-descent step on every weight and bias.

    Args:
        parameters: dict with "W1".."WL" and "b1".."bL"; its entries are
            rebound to the updated values.
        grads: dict with the matching "dW1".."dWL" and "db1".."dbL".
        learning_rate: step size multiplying each gradient.

    Returns:
        The same `parameters` dict, after the update.
    """
    layer_count = len(parameters) // 2  # one W and one b per layer
    for layer in range(1, layer_count + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - learning_rate * grads["d" + key]
    return parameters

**TRAINING THE MODEL**

In [46]:
def training_model(layer_dimensions, inputs, outputs, learning_rate, num_iterations):
    """Train the network with plain gradient descent.

    Args:
        layer_dimensions: layer sizes passed to initialize_parameters.
        inputs: training inputs fed to the forward pass.
        outputs: training labels used for the cost and the backward pass.
        learning_rate: step size for update_parameters.
        num_iterations: number of gradient-descent iterations.

    Returns:
        The parameters dict after the final update.
    """
    costs = []  # cost sampled every 100 iterations (kept locally, not returned)
    parameters = initialize_parameters(layer_dimensions)
    for i in range(num_iterations):  # gradient descent
        AL, caches = forward_propogation(inputs, parameters)
        # Pass parameters explicitly so the call matches compute_cost's
        # three-argument signature.
        cost = compute_cost(AL, outputs, parameters)
        grads = backward_propogation(AL, outputs, caches)
        parameters = update_parameters(parameters, grads, learning_rate)
        if i % 100 == 0:
            costs.append(cost)
    return parameters