In [None]:
#Importing numpy as the main library
import numpy as np
# I import main_functions which will have functions such as activation functions and others
from main_functions import *

In [None]:
# Initializing weights randomly by neurons per layer and the number of layers
def init_weights(number_layers, neurons_by_layer):
    """Create randomly initialised weights and biases for every non-input layer.

    Args:
        number_layers: Total number of layers, counting the input layer as
            layer 0. Parameters are created for layers 1 .. number_layers-1.
        neurons_by_layer: Sequence where entry ``l`` is the number of neurons
            of layer ``l``.

    Returns:
        A tuple ``(weights, biases)`` of dictionaries. ``weights["W<l>"]`` has
        shape ``(neurons_by_layer[l], neurons_by_layer[l-1])`` and
        ``biases["b<l>"]`` has shape ``(neurons_by_layer[l], 1)``. All values
        are drawn with ``np.random.rand`` (uniform over [0, 1)).
    """
    weights, biases = {}, {}

    for layer in range(1, number_layers):
        units_here = neurons_by_layer[layer]
        units_before = neurons_by_layer[layer - 1]

        # The weight matrix is drawn before the bias vector so that a seeded
        # generator always yields the same parameters.
        weights["W{}".format(layer)] = np.random.rand(units_here, units_before)
        biases["b{}".format(layer)] = np.random.rand(units_here, 1)

    return weights, biases


In [None]:
# Doing a function that performs a whole forward iteration for the Neural Network
def forward_prop_iteration(X, weights, biases, activation_names, number_layers, neurons_by_layer):
    """Run one forward pass through the network and cache every layer's values.

    Args:
        X: Network input; stored in the activation cache as ``"A0"``.
        weights: Dictionary holding ``"W1"`` .. ``"W<number_layers-1>"``.
        biases: Dictionary holding ``"b1"`` .. ``"b<number_layers-1>"``.
        activation_names: List with one activation name per non-input layer;
            entry ``l-1`` belongs to layer ``l``.
        number_layers: Total number of layers, counting the input as layer 0.
        neurons_by_layer: Not used by this function.

    Returns:
        A tuple ``(Z_cache, A_cache, Y_hat)``. ``Z_cache["Z<l>"]`` is whatever
        ``linear_activation`` returned for layer ``l``, ``A_cache["A<l>"]`` is
        whatever ``forward_activation`` returned for layer ``l``, and ``Y_hat``
        is the cached activation of the last layer.
    """
    activations = {"A0": X}
    pre_activations = {}

    for layer in range(1, number_layers):
        # activation_names has no entry for the input layer, hence the shift.
        layer_activation = activation_names[layer - 1]

        incoming = activations["A{}".format(layer - 1)]
        layer_W = weights["W{}".format(layer)]
        layer_b = biases["b{}".format(layer)]

        # Both helpers come from main_functions. The linear value is kept
        # separately because the backward pass reads it from Z_cache.
        linear_out = linear_activation(layer_W, incoming, layer_b)
        activated_out = forward_activation(layer_W, incoming, layer_b, layer_activation)

        activations["A{}".format(layer)] = activated_out
        pre_activations["Z{}".format(layer)] = linear_out

    # The prediction is simply the activation stored for the final layer.
    prediction = activations["A{}".format(number_layers - 1)]

    return pre_activations, activations, prediction
    

In [None]:
# After having made the forward propagation iteration I calculate the cost of the network and the derivative of
# the cost with respect to A_L

def cost_function_and_first_derivative(Y_hat, Y, name_loss='binary_class'):
    """Return the network cost together with the gradient that seeds backprop.

    Args:
        Y_hat: Output of the last layer of the network.
        Y: Target values.
        name_loss: Loss name forwarded to ``cost_function``.

    Returns:
        A tuple ``(cost, dA_L)`` where ``cost`` is the result of
        ``cost_function(Y_hat, Y, name_loss)`` and ``dA_L`` is the result of
        ``derivative_cost_logistic(Y_hat, Y)``.

    NOTE(review): the derivative is always taken from
    ``derivative_cost_logistic``, regardless of ``name_loss``; confirm this is
    intended if other losses are ever passed in.
    """
    # Both helpers come from main_functions; the cost is evaluated first.
    return cost_function(Y_hat, Y, name_loss), derivative_cost_logistic(Y_hat, Y)

In [None]:
def back_prop_iteration(dA_L, Z_cache, A_cache, weights, biases, activation_names,\
                              number_layers, neurons_by_layer):
    """Run one backward pass and collect the gradients of all parameters.

    Args:
        dA_L: Gradient of the cost with respect to the last layer's activation.
        Z_cache: Dictionary with the ``"Z<l>"`` values stored by the forward pass.
        A_cache: Dictionary with the ``"A<l>"`` values stored by the forward pass.
        weights: Dictionary holding ``"W1"`` .. ``"W<number_layers-1>"``.
        biases: Not used by this function.
        activation_names: List with one activation name per non-input layer.
        number_layers: Total number of layers, counting the input as layer 0.
        neurons_by_layer: Only its length is checked against ``number_layers``.

    Returns:
        A tuple ``(dW_cache, db_cache)`` of dictionaries keyed ``"dW<l>"`` and
        ``"db<l>"`` holding what ``backward_iteration`` returned for layer ``l``.

    Raises:
        AssertionError: If ``len(activation_names) != number_layers - 1`` or
            ``len(neurons_by_layer) != number_layers``.
    """
    assert ((number_layers - 1) == len(activation_names)) and (number_layers == len(neurons_by_layer)), (
        "The number of layers must be equal to the length of neurons_by_layer and the length of activation_names"
        "    must be equal to the number of layers minus 1."
    )

    weight_grads = {}
    bias_grads = {}

    # Gradient flowing into the layer currently being processed.
    upstream = dA_L

    # Walk from the last layer (index number_layers-1) down to layer 1.
    for layer in reversed(range(1, number_layers)):
        # activation_names has no entry for the input layer, hence the shift.
        layer_activation = activation_names[layer - 1]

        linear_out = Z_cache["Z{}".format(layer)]
        incoming = A_cache["A{}".format(layer - 1)]
        layer_W = weights["W{}".format(layer)]

        # backward_iteration comes from main_functions; its dZ result is not
        # needed here, only the parameter gradients and the gradient to pass on.
        _, grad_W, grad_b, downstream = backward_iteration(
            upstream, linear_out, layer_activation, incoming, layer_W
        )

        # The gradient w.r.t. the previous activation feeds the next step.
        upstream = downstream

        weight_grads["dW{}".format(layer)] = grad_W
        bias_grads["db{}".format(layer)] = grad_b

    return weight_grads, bias_grads


Since I assume that the network ends with a sigmoid activation, I will use the following loss function (binary cross-entropy):

$ L(\hat{y}, y) = -\left(y\log(\hat{y}) + (1-y)\log(1-\hat{y})\right)$

$ Cost = J(w, b) = \frac{1}{m}\sum_{i=1}^{m} L(\hat{y}^{(i)}, y^{(i)})$

$$ da^{[L]} = \frac{\partial L}{\partial a^{[L]}} = -\frac{y}{\hat{y}} + \frac{1-y}{1-\hat{y}} $$
$$ \text{where } \hat{y} \equiv a^{[L]} \text{ is the activation of the last layer.}$$

In [None]:
def update_parameters(weights, biases, dW_cache, db_cache, number_layers, learning_rate=0.01):
    """Apply one gradient-descent step to every weight and bias of the network.

    The inputs are not modified: new dictionaries are built and each entry is
    the result of ``parameter - learning_rate * gradient``.

    Args:
        weights: Dictionary holding ``"W1"`` .. ``"W<number_layers-1>"``.
        biases: Dictionary holding ``"b1"`` .. ``"b<number_layers-1>"``.
        dW_cache: Dictionary holding ``"dW1"`` .. ``"dW<number_layers-1>"``.
        db_cache: Dictionary holding ``"db1"`` .. ``"db<number_layers-1>"``.
        number_layers: Total number of layers, counting the input as layer 0.
        learning_rate: Step size multiplying each gradient.

    Returns:
        A tuple ``(weights_updated, biases_updated)`` with the same keys as
        ``weights`` and ``biases``.

    Raises:
        AssertionError: If any of the four dictionaries does not contain
            exactly ``number_layers - 1`` entries.
    """
    # Each dictionary is checked on its own. Chaining the lengths with `and`
    # would only compare the last one, because `and` returns an operand rather
    # than a boolean.
    expected_entries = number_layers - 1
    assert all(len(cache) == expected_entries for cache in (weights, biases, dW_cache, db_cache)), \
    "The length of weights, biases, dW_cache and db_cache must be equal to the number of layers -1"

    weights_updated = {}
    biases_updated = {}

    # Layers 1 .. number_layers-1 carry parameters; layer 0 is the input.
    for l in range(1, number_layers):
        weights_updated["W" + str(l)] = weights["W" + str(l)] - learning_rate*dW_cache["dW" + str(l)]
        biases_updated["b" + str(l)] = biases["b" + str(l)] - learning_rate*db_cache["db" + str(l)]

    return weights_updated, biases_updated


In [None]:
def training_network(X, Y, neurons_by_layer, number_layers, activation_names, name_loss='binary_class',\
                     learning_rate=0.01, number_iterations=50, print_every=10):
    """Train the network with plain gradient descent for a fixed number of iterations.

    Each iteration runs a forward pass, evaluates the cost and its derivative
    with respect to the last activation, runs a backward pass and updates the
    parameters.

    Args:
        X: Network input.
        Y: Target values.
        neurons_by_layer: Sequence with the number of neurons of every layer,
            input layer included.
        number_layers: Total number of layers; must equal
            ``len(neurons_by_layer)``. It is passed redundantly on purpose, as
            a sanity check of the intended architecture.
        activation_names: List with one activation name per non-input layer.
        name_loss: Loss name forwarded to ``cost_function_and_first_derivative``.
        learning_rate: Step size used by ``update_parameters``.
        number_iterations: Number of gradient-descent iterations to run.
        print_every: The cost is printed on every iteration whose index is a
            multiple of this value. A falsy value (``0`` or ``None``) disables
            printing.

    Returns:
        A tuple ``(weights, biases, cost)``. ``cost`` is the value computed in
        the last iteration, i.e. before that iteration's parameter update, or
        ``None`` when ``number_iterations`` is 0.

    Raises:
        AssertionError: If ``len(neurons_by_layer) != number_layers``.
    """
    assert len(neurons_by_layer) == number_layers, (
        "The number of layers must be the same as the length of the "
        "    list of neurons_by_layer"
    )

    # Random starting point for the parameters of the network.
    weights, biases = init_weights(number_layers, neurons_by_layer)

    # Defined up front so the return statement is valid even when the loop
    # body never runs (number_iterations == 0).
    cost = None

    for iteration in range(number_iterations):

        # Forward pass: caches of every layer plus the network output.
        Z_cache, A_cache, Y_hat = forward_prop_iteration(X, weights, biases, activation_names,\
                                      number_layers, neurons_by_layer)

        # Cost at this iteration and the derivative dA_L that starts backprop.
        cost, dA_L = cost_function_and_first_derivative(Y_hat, Y, name_loss)

        # Report progress on every `print_every`-th iteration.
        if print_every and iteration % print_every == 0:
            print('Cost in iteration ' + str(iteration) + ": " + str(cost))

        # Backward pass: derivatives of the weights and biases.
        dW_cache, db_cache = back_prop_iteration(dA_L, Z_cache, A_cache, weights, biases, activation_names,\
                              number_layers, neurons_by_layer)

        # Gradient-descent step; the new parameters feed the next iteration.
        weights, biases = update_parameters(weights, biases, dW_cache, db_cache, number_layers,\
                                            learning_rate)

    return weights, biases, cost


In [None]:
# Draft cell: runs each training step once by hand (initialisation, forward pass,
# cost, backward pass, parameter update) on random data.
# The seed is fixed so that the random numbers are the same on every run.
np.random.seed(2)
# Random input of shape (5, 12), scaled by 10.
# Presumably rows are features and columns are examples - TODO confirm against main_functions.
X = np.random.randn(5,12)*10
# One entry per layer, input layer first; the first entry matches X.shape[0].
neurons_by_layer = [5, 4, 4, 3, 1]
number_layers = len(neurons_by_layer)
# One activation per non-input layer (number_layers - 1 entries).
activation_names = ['relu', 'relu', 'relu', 'sigmoid']

weights, biases = init_weights(number_layers, neurons_by_layer)

Z_cache, A_cache, Y_hat = forward_prop_iteration(X, weights, biases, activation_names, number_layers, neurons_by_layer)
# Random 0/1 labels with the same shape as the network output.
n_hat, m_hat = Y_hat.shape
Y = np.random.randint(low=0, high=2, size=(n_hat, m_hat))


cost, dA_L = cost_function_and_first_derivative(Y_hat, Y, name_loss='binary_class')

dW_cache, db_cache = back_prop_iteration(dA_L, Z_cache, A_cache, weights, biases,\
                                        activation_names, number_layers, neurons_by_layer)

# The updated parameters go into new names; weights and biases are left as they were.
weights_updated, biases_updated = update_parameters(weights, biases, dW_cache, db_cache, number_layers, learning_rate=0.01)


In [None]:
# Rebuilds the random inputs and the architecture; this overwrites the X, Y,
# neurons_by_layer, number_layers and activation_names defined by earlier cells.
# The seed is fixed so that the random numbers are the same on every run.
np.random.seed(2)
# Random input of shape (5, 12), scaled by 10.
X = np.random.randn(5,12)*10
n, m = X.shape

# One entry per layer, input layer first.
neurons_by_layer = [5, 4, 4, 3, 1]
# Hard-coded here; it must stay equal to len(neurons_by_layer).
number_layers = 5
# One activation per non-input layer (number_layers - 1 entries).
activation_names = ['relu', 'relu', 'relu', 'sigmoid']
# Random 0/1 labels of shape (1, m), where m is the second dimension of X.
Y = np.random.randint(low=0, high=2, size=(1,m))
