In [1]:
import numpy as np

**DEFINE THE NEURAL NETWORK STRUCTURE**

In [2]:
def layer_sizes(inputs, outputs, hidden_layer_dimension):
  """Return the unit counts of the network's three layers.

  The input-layer and output-layer sizes are read from the first axis
  (``shape[0]``) of ``inputs`` and ``outputs`` respectively; the hidden-layer
  size is ``hidden_layer_dimension`` passed through unchanged.

  Returns:
    Tuple ``(size_input, size_output, size_hidden)``. Note that the output
    size comes before the hidden size.
  """
  n_in, n_out = inputs.shape[0], outputs.shape[0]
  return n_in, n_out, hidden_layer_dimension

**INITIALIZE PARAMETERS RANDOMLY**

In [None]:
def initialize_parameters(inputs, outputs, hidden_layer_dimension):
  """Create the starting weights and biases for the two-layer network.

  Layer sizes are obtained from ``layer_sizes``. Weights are drawn from a
  standard normal distribution (NumPy's global random state) and scaled by
  0.01; biases start at zero.

  Returns:
    dict with keys ``"W1"`` (size_hidden, size_input), ``"b1"`` (size_hidden, 1),
    ``"W2"`` (size_output, size_hidden) and ``"b2"`` (size_output, 1).
  """
  n_in, n_out, n_hidden = layer_sizes(inputs, outputs, hidden_layer_dimension)

  # W1 is sampled before W2 so the order of draws from the global RNG is fixed.
  hidden_weights = np.random.randn(n_hidden, n_in) * 0.01
  output_weights = np.random.randn(n_out, n_hidden) * 0.01

  return {
      "W1": hidden_weights,
      "b1": np.zeros((n_hidden, 1)),
      "W2": output_weights,
      "b2": np.zeros((n_out, 1)),
  }

**FORWARD PROPAGATION**

In [19]:
def sigmoid(x):
  """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated element-wise by NumPy."""
  denominator = 1 + np.exp(-x)
  return 1 / denominator

In [20]:
def forward_propagation(inputs, parameters):
  """Run one forward pass through the two-layer network.

  The hidden layer applies a tanh activation and the output layer a sigmoid.

  Args:
    inputs: array that is left-multiplied by ``parameters["W1"]``, so its
      first axis must match the second axis of ``W1``.
    parameters: dict holding ``"W1"``, ``"b1"``, ``"W2"`` and ``"b2"``.

  Returns:
    Tuple ``(A2, cache)``: ``A2`` is the sigmoid output (y_hat) and ``cache``
    is a dict with ``"Z1"``, ``"A1"``, ``"Z2"`` and ``"A2"``, kept so the
    intermediate values can be reused during backward propagation.
  """
  # Hidden layer: affine transform of the function's own `inputs`, then tanh.
  Z1 = np.dot(parameters["W1"], inputs) + parameters["b1"]
  A1 = np.tanh(Z1)

  # Output layer: affine transform of the hidden activations, then sigmoid.
  Z2 = np.dot(parameters["W2"], A1) + parameters["b2"]
  A2 = sigmoid(Z2)  # y_hat

  return A2, {"Z1": Z1, "A1": A1, "Z2": Z2, "A2": A2}

**CROSS-ENTROPY COST FUNCTION**

In [10]:
def compute_cost(y_hat, outputs, parameters):
  """Compute the mean binary cross-entropy cost over all examples.

  Args:
    y_hat: predicted probabilities. Values must lie strictly between 0 and 1
      because both ``log(y_hat)`` and ``log(1 - y_hat)`` are evaluated.
    outputs: true labels, same shape as ``y_hat``. Examples are indexed along
      axis 1 (axis 0 is the output-layer size).
    parameters: unused; kept so existing callers keep working.

  Returns:
    The cost as a single scalar value.
  """
  # Examples run along axis 1, so that axis gives the averaging denominator.
  number_of_examples = outputs.shape[1]
  log_likelihood = (np.multiply(np.log(y_hat), outputs)
                    + np.multiply(1 - outputs, np.log(1 - y_hat)))
  # Sum the per-example terms so the result is one number, not an array.
  cost = (-1 / number_of_examples) * np.sum(log_likelihood)
  return np.squeeze(cost)

**BACKWARD PROPAGATION**

In [11]:
def backward_propogation(inputs, outputs, parameters, cache):
  """Compute the gradients of the cost for a tanh-hidden / sigmoid-output network.

  Args:
    inputs: network inputs; examples are indexed along axis 1.
    outputs: true labels, same shape as ``cache["A2"]``.
    parameters: dict of network parameters; only ``"W2"`` is read here.
    cache: dict from the forward pass; ``"A1"`` (hidden activations) and
      ``"A2"`` (output activations) are read.

  Returns:
    dict with keys ``"dW1"``, ``"db1"``, ``"dW2"`` and ``"db2"``.
  """
  # retrieve parameters and cache
  W2 = parameters["W2"]
  A1 = cache["A1"]
  A2 = cache["A2"]

  # number of examples, used to average the gradients
  m = inputs.shape[1]

  # output layer
  dZ2 = A2 - outputs
  dW2 = (1 / m) * np.dot(dZ2, A1.T)
  db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

  # hidden layer: 1 - A1**2 is the derivative of tanh expressed via its output
  dZ1 = np.multiply(np.dot(W2.T, dZ2), 1 - np.power(A1, 2))
  dW1 = (1 / m) * np.dot(dZ1, inputs.T)
  db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

  return {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

**UPDATING THE PARAMETERS**

In [17]:
def update_parameters(parameters, gradients, learning_rate):
    """Take one gradient-descent step and return the updated parameters.

    Each of ``W1``, ``b1``, ``W2`` and ``b2`` is replaced by
    ``value - learning_rate * gradient``, where the gradient is read from the
    matching ``dW1`` / ``db1`` / ``dW2`` / ``db2`` entry of ``gradients``.
    The result is a new dict; no key of ``parameters`` is reassigned.
    """
    # read all current values first, then apply the updates
    w1, b1, w2, b2 = (parameters[key] for key in ("W1", "b1", "W2", "b2"))

    return {
        "W1": w1 - learning_rate * gradients["dW1"],
        "b1": b1 - learning_rate * gradients["db1"],
        "W2": w2 - learning_rate * gradients["dW2"],
        "b2": b2 - learning_rate * gradients["db2"],
    }

**TRAINING THE MODEL**

In [23]:
def train_model(inputs, outputs, hidden_layer_dimension, num_iterations = 10000,
                learning_rate = 0.01, print_cost = False):
    """Train the two-layer network with batch gradient descent.

    Args:
        inputs: training inputs, passed to the forward and backward passes.
        outputs: training labels.
        hidden_layer_dimension: number of units in the hidden layer.
        num_iterations: number of gradient-descent steps to run.
        learning_rate: step size handed to ``update_parameters``. The default
            is a conservative starting point; tune it for the data at hand.
        print_cost: when true, print the cost every 1000 iterations.

    Returns:
        dict of learned parameters (``"W1"``, ``"b1"``, ``"W2"``, ``"b2"``).
    """
    # initialize parameters
    parameters = initialize_parameters(inputs, outputs, hidden_layer_dimension)

    for i in range(num_iterations):
        A2, cache = forward_propagation(inputs, parameters)  # forward propagate

        # The cost is only needed for reporting, so skip it otherwise.
        if print_cost and i % 1000 == 0:
            cost = compute_cost(A2, outputs, parameters)
            print(f"Cost after iteration {i}: {cost}")

        # Argument order follows the definition: (inputs, outputs, parameters, cache).
        grads = backward_propogation(inputs, outputs, parameters, cache)  # backward propagate
        parameters = update_parameters(parameters, grads, learning_rate)  # update parameters

    return parameters

**MAKING PREDICTIONS**

In [24]:
def making_predictions(inputs, parameters):
    """Classify ``inputs`` by thresholding the network output at 0.5.

    Runs ``forward_propagation`` and returns the element-wise comparison
    ``A2 > 0.5``; the cache produced by the forward pass is discarded.
    """
    probabilities, _ = forward_propagation(inputs, parameters)
    return probabilities > 0.5