# Task 2

Implement basic backward pass in MLP. Perform forward and backward propagation through your network and check your gradients.
This time, the forward pass is implemented for you. Note the matrix notation: every batch has the shape $[m,nX,1]$, where $m$ is the batch size (number of samples) and $nX$ is the length of one sample vector.

In [5]:
# Import
import numpy as np

## Activations

Implement the derivatives of the standard activation functions (ReLU, Sigmoid) that are used in this task.

In [6]:
#------------------------------------------------------------------------------
#   ActivationFunction class
#------------------------------------------------------------------------------
class ActivationFunction:
    """Common base type for all activation functions.

    Subclasses override ``__call__`` with the real function. The version
    defined here is only a placeholder and yields ``None``.
    """

    def __init__(self):
        """Nothing to set up: the base class keeps no state."""

    def __call__(self, z):
        """Placeholder: ignore ``z`` and return ``None``."""
        return None

#------------------------------------------------------------------------------
#   LinearActivationFunction class
#------------------------------------------------------------------------------
class LinearActivationFunction(ActivationFunction):
    """Identity activation: f(z) = z."""

    def __call__(self, z):
        """Return the pre-activation ``z`` untouched."""
        activation = z
        return activation

    def derivation(self, z):
        """Return the slope of the identity function, the scalar ``1``."""
        slope = 1
        return slope

#------------------------------------------------------------------------------
#   RELUActivationFunction class
#------------------------------------------------------------------------------
class RELUActivationFunction(ActivationFunction):
    """Rectified linear unit: f(z) = max(z, 0), applied element-wise."""

    def __call__(self, z):
        """Return ``z`` with every negative entry replaced by 0."""
        return np.maximum(z, 0)

    def derivation(self, z):
        """Return the element-wise derivative of ReLU evaluated at ``z``.

        The result has the same shape as ``z`` and contains 1.0 where
        ``z >= 0`` and 0.0 elsewhere (the slope at exactly 0 is taken as 1).
        """
        # Compare the actual argument ``z`` with a vectorised ufunc: this
        # works for scalars and arrays alike, whereas ``int(...)`` only
        # accepts a single value.
        return np.greater_equal(z, 0).astype(float)

#------------------------------------------------------------------------------
#   SigmoidActivationFunction class
#------------------------------------------------------------------------------
class SigmoidActivationFunction(ActivationFunction):
    """Logistic sigmoid: f(z) = 1 / (1 + exp(-z)), applied element-wise."""

    def __call__(self, z):
        """Return the sigmoid of ``z``."""
        return 1.0/(1.0+np.exp(-z))

    def derivation(self, z):
        """Return the element-wise derivative f(z) * (1 - f(z))."""
        # Evaluate the sigmoid once and reuse it for both factors.
        a = self(z)
        return np.multiply(a, 1 - a)
    
# Activation mapping
    
# Registry of the supported activations: maps the name accepted by
# CreateActivationFunction to the class that implements it.
# NOTE(review): "FUCTIONS" is a typo for "FUNCTIONS"; the spelling is kept
# because CreateActivationFunction looks the registry up under this name.
MAP_ACTIVATION_FUCTIONS = {
    "linear": LinearActivationFunction,
    "relu": RELUActivationFunction,
    "sigmoid": SigmoidActivationFunction
}

def CreateActivationFunction(kind):
    """Build a new activation object for the registry key ``kind``.

    Raises:
        ValueError: ``kind`` is not a key of ``MAP_ACTIVATION_FUCTIONS``.
    """
    factory = MAP_ACTIVATION_FUCTIONS.get(kind)
    if factory is None:
        # Unknown name: report both the offending key and a readable message.
        raise ValueError(kind, "Unknown activation function {0}".format(kind))
    return factory()

## Layer

This is the main class which can hold different types of layers and provides us with standard tasks like forward propagation. Implement backward functions for defined classes.

nUnits - number of neuron units in your layer

prevLayer - previous layer (need it to know the shape of it to create appropriate number of weights for you to use in current layer)

In [7]:
#------------------------------------------------------------------------------
#   Layer class
#------------------------------------------------------------------------------
class Layer:
    """Base class shared by every layer type.

    Keeps the layer's display name, its activation function object and a
    ``shape`` tuple that stays ``(0, 0)`` until a subclass sets it in
    ``initialize``.
    """

    def __init__(self, act="linear", name="layer"):
        """Store ``name`` and build the activation selected by ``act``."""
        self.shape = 0, 0
        self.activation = CreateActivationFunction(kind=act)
        self.name = name

    def initialize(self, prevLayer):
        """Hook for subclasses; the base version does nothing."""
        return None

    def forward(self, x):
        """Hook for subclasses; the base version returns ``None``."""
        return None

#------------------------------------------------------------------------------
#   InputLayer class
#------------------------------------------------------------------------------
class InputLayer(Layer):
    """Pass-through layer that feeds the raw samples into the network.

    It has no weights. ``shape`` is ``(nUnits, 1)`` so that the next layer
    can size its weight matrix from ``shape[0]``.
    """

    def __init__(self, nUnits, name="Input"):
        """Create an input layer expecting ``nUnits`` features per sample."""
        super().__init__(act="linear", name=name)
        self.nUnits = nUnits
        # Filled by forward(); None until the first forward pass.
        self.a = None

    def initialize(self, prevLayer):
        """Set ``shape``; ``prevLayer`` is ignored (there is none)."""
        self.shape = (self.nUnits, 1)

    def forward(self, x):
        """Return ``x`` unchanged and remember it as this layer's activation.

        The input is stored in ``self.a`` because the backward pass of the
        following layer needs the activation of its predecessor, and
        ``Model.backward`` reads it as ``lPrev.a``.
        """
        self.a = x
        return x

    def backward(self, X, aPrev=None):
        """Nothing to propagate: the input layer has no parameters.

        ``aPrev`` is accepted (and ignored) so the method can be called with
        the same arguments as ``DenseLayer.backward``.
        """
        return None
    
#------------------------------------------------------------------------------
#   Basic Dense Layer class
#------------------------------------------------------------------------------
class DenseLayer(Layer):
    """Fully connected layer computing ``a = activation(W x + b)``.

    Attributes:
        W: weight matrix of shape ``(nUnits, prev_nUnits)``.
        b: bias column of shape ``(nUnits, 1)``.
        z, a: pre-activation and activation cached by the last ``forward``.
        dW, db: gradients of the batch-averaged loss with respect to ``W``
            and ``b``, stored by the last ``backward``.
    """

    def __init__(self, nUnits, act="linear", name="Dense"):
        """Create a layer of ``nUnits`` neurons using activation ``act``."""
        super().__init__(act, name=name)
        self.nUnits = nUnits
        # Parameters are allocated in initialize(), once the size of the
        # previous layer is known.
        self.W = None
        self.b = None
        # Gradients are produced by backward().
        self.dW = None
        self.db = None

    def initialize(self, prevLayer):
        """Allocate ``W`` (standard-normal samples) and ``b`` (zeros)."""
        prev_nUnits, _ = prevLayer.shape
        self.shape = (self.nUnits, prev_nUnits)

        self.W = np.random.randn(self.nUnits, prev_nUnits)
        self.b = np.zeros((self.nUnits, 1), dtype=float)

    def forward(self, X):
        """Compute and cache ``z = W X + b`` and ``a = activation(z)``.

        ``np.matmul`` broadcasts ``W`` over a leading batch axis, so ``X``
        may be batch-first ``[m, nX, 1]`` or column-stacked ``[nX, m]``.
        """
        print("Forward of", self.name)
        self.z = np.matmul(self.W, X) + self.b         # Z = W*x + b
        self.a = self.activation(self.z)               # a = activation(Z)

        return self.a

    def backward(self, da, aPrev):
        """Back-propagate ``da`` through this layer.

        Args:
            da: dLoss/da for this layer's activation, same layout as the
                ``a`` returned by ``forward``.
            aPrev: activation of the previous layer, i.e. the ``X`` that was
                passed to ``forward``.

        Returns:
            dLoss/d(aPrev), in the same layout as ``aPrev``.

        Side effects:
            Stores ``self.dW`` (shape of ``W``) and ``self.db`` (shape of
            ``b``), both averaged over the batch.
        """
        print("Backward of", self.name)

        # Chain rule through the activation, using the z cached by forward().
        dz = np.multiply(da, self.activation.derivation(self.z))

        if dz.ndim == 3:
            # Batch-first layout [m, n, 1]: form one outer product
            # dz_i * aPrev_i^T per sample, then average over the batch axis.
            m = dz.shape[0]
            per_sample_dW = np.matmul(dz, np.swapaxes(aPrev, -1, -2))
            self.dW = np.sum(per_sample_dW, axis=0) / m
            self.db = np.sum(dz, axis=0) / m
        else:
            # Column-stacked layout [n, m]: samples are the columns.
            m = dz.shape[1]
            self.dW = np.matmul(dz, aPrev.T) / m
            self.db = np.sum(dz, axis=1, keepdims=True) / m

        # W.T broadcasts over the batch axis exactly as W does in forward().
        daPrev = np.matmul(self.W.T, dz)

        return daPrev
    

## Loss Functions

Implement two standard loss functions (Binary Cross Entropy and Mean Squared Error), which you will/can use in your implementation of MLP backward pass.

In [44]:
#------------------------------------------------------------------------------
#   LossFunction class
#------------------------------------------------------------------------------
class LossFunction:
    """Common base type for loss functions.

    Subclasses provide the loss value through ``__call__`` and its gradient
    with respect to the prediction through ``derivation``. Both methods are
    placeholders here and return ``None``.
    """

    def __init__(self):
        """Nothing to set up: the base class keeps no state."""

    def __call__(self, A, Y):
        """Placeholder: ignore the arguments and return ``None``."""
        return None

    def derivation(self, A, Y):
        """Placeholder: ignore the arguments and return ``None``."""
        return None


#------------------------------------------------------------------------------
#   BinaryCrossEntropyLossFunction class
#------------------------------------------------------------------------------
class BinaryCrossEntropyLossFunction(LossFunction):
    """Element-wise binary cross entropy between prediction A and label Y.

    loss   = -(Y * log(A) + (1 - Y) * log(1 - A))
    dloss  = -Y / A + (1 - Y) / (1 - A)
    """

    # Predictions are clipped into [EPSILON, 1 - EPSILON] before use so that
    # log(0) and division by zero cannot occur when A saturates at 0 or 1.
    EPSILON = 1e-12

    def _clip(self, A):
        """Return ``A`` as a float array kept strictly inside (0, 1)."""
        return np.clip(np.asarray(A, dtype=float),
                       self.EPSILON, 1.0 - self.EPSILON)

    def __call__(self, A, Y):
        """Return the per-element loss, with the same shape as ``A``."""
        A = self._clip(A)
        return -(np.multiply(Y, np.log(A)) + np.multiply(1 - Y, np.log(1 - A)))

    def derivation(self, A, Y):
        """Return dLoss/dA per element, with the same shape as ``A``."""
        A = self._clip(A)
        return -np.divide(Y, A) + np.divide(1 - Y, 1 - A)
        
    
class MeanSquaredErrorLossFunction(LossFunction):
    """Element-wise squared error between prediction A and target Y."""

    def __call__(self, A, Y):
        """Return ``(A - Y)^2`` per element."""
        return np.square(np.subtract(A, Y))

    def derivation(self, A, Y):
        """Return dLoss/dA = ``2 * (A - Y)`` per element."""
        return np.multiply(2, np.subtract(A, Y))


# Registry of the supported losses: maps the name accepted by
# CreateLossFunction to the class that implements it.
MAP_LOSS_FUNCTIONS = {
    "bce": BinaryCrossEntropyLossFunction,
    "mse": MeanSquaredErrorLossFunction
}

def CreateLossFunction(kind):
    """Build a new loss object for the registry key ``kind``.

    Raises:
        ValueError: ``kind`` is not a key of ``MAP_LOSS_FUNCTIONS``.
    """
    factory = MAP_LOSS_FUNCTIONS.get(kind)
    if factory is None:
        # Unknown name: report both the offending key and a readable message.
        raise ValueError(kind, "Unknown loss function {0}".format(kind))
    return factory()

## Model class

This is the basic class which holds all of your layers and encapsulates the functionality to predict results from your input: once you have created your model and initialized all the layers, a prediction is a forward pass through all of them.

Implement backpropagation.

In [58]:
#------------------------------------------------------------------------------
#   Model class
#------------------------------------------------------------------------------
class Model:
    """Sequential container of layers with forward and backward passes.

    Layers are applied in the order they were added. The first layer is
    treated as the input layer: it takes part in the forward pass but its
    ``backward`` is never called.
    """

    def __init__(self, lossName):
        """Create an empty model using the loss registered as ``lossName``."""
        self.layers = []
        # Initialize loss function
        self.loss_fn = CreateLossFunction(lossName)
        # Input fed to each layer during the last forward pass; backward()
        # needs these values as "activation of the previous layer".
        self._layer_inputs = []

    def addLayer(self, layer):
        """Append ``layer`` to the end of the network."""
        self.layers.append(layer)

    def initialize(self):
        """Initialize all layers in order, handing each its predecessor."""
        prevLayer = None
        for layer in self.layers:
            layer.initialize(prevLayer)
            prevLayer = layer

    def forward(self, X):
        """Feed ``X`` through every layer and return the final activation."""
        A = X
        layer_inputs = []
        for layer in self.layers:
            layer_inputs.append(A)
            A = layer.forward(A)
        self._layer_inputs = layer_inputs

        return A

    def backward(self, dLoss):
        """Propagate ``dLoss`` (dLoss/dA of the output) back through the net.

        Every layer except the first receives the incoming gradient together
        with the input it saw during the last ``forward`` call.

        Returns:
            The gradient returned by the first non-input layer, i.e. the
            gradient with respect to the network input.

        Raises:
            RuntimeError: ``forward`` has not been run for the current set
                of layers.
        """
        layer_inputs = getattr(self, "_layer_inputs", [])
        if len(layer_inputs) != len(self.layers):
            raise RuntimeError("forward() must be called before backward()")

        da = dLoss
        # Walk from the last layer down to index 1; index 0 is the input
        # layer and has nothing to update.
        for layer, aPrev in zip(self.layers[:0:-1], layer_inputs[:0:-1]):
            da = layer.backward(da, aPrev)

        return da

    def compute_loss(self, A, Y):
        """Return the loss of prediction ``A`` against target ``Y``.

        The value is whatever the loss function returns (for the losses in
        this file: one entry per element of ``A``, not reduced).
        """
        return self.loss_fn(A, Y)

    def derive_loss(self, A, Y):
        """Return dLoss/dA evaluated at prediction ``A`` and target ``Y``."""
        # The derivative is taken at the prediction itself, not at the loss
        # value.
        return self.loss_fn.derivation(A, Y)

### Main Processing Cell

 1. Initialize dataset. 
 2. Declare a simple model (at least 4 layers) with ReLU on the hidden layers and sigmoid on the output layer.
 3. Perform forward pass through the network. 
 4. Compute loss.
 5. Derive loss.
 6. Perform backward pass.
 7. Celebrate and scroll lower.

In [61]:
# Main processing
from dataset import dataset_Circles
# Task A:

# Step 1: build the dataset.
# NOTE(review): dataset_Circles is imported from the project's `dataset`
# module; the recorded output below suggests X and Y are batch-first with
# 16 samples -- confirm against that module.
X, Y = dataset_Circles(n=16, radius=0.7, noise=0.0)


# Step 2: declare the network -- a 2-unit input layer, three ReLU hidden
# layers (6, 3 and 2 units) and a single sigmoid output unit, trained with
# binary cross entropy.
model = Model(lossName = "bce")
model.addLayer( InputLayer(nUnits=2, name="input_layer"))
model.addLayer( DenseLayer(nUnits=6, act="relu", name="1st_Layer"))
model.addLayer( DenseLayer(nUnits=3, act="relu", name="2nd_Layer"))
model.addLayer( DenseLayer(nUnits=2, act="relu", name="3rd_Layer"))
model.addLayer( DenseLayer(nUnits=1, act="sigmoid", name="4th_Layer"))

# Allocates the weights; they are drawn with np.random.randn and no seed is
# set, so every run starts from different values.
model.initialize()

# Step 3: forward pass through all layers.
A  = model.forward(X)

# Step 5: derivative of the loss.
# NOTE: despite its name, `loss` holds the result of derive_loss, not the
# loss value; step 4 (model.compute_loss) is not executed in this cell.
loss = model.derive_loss(A,Y)
# Last expression of the cell: the notebook displays the shape.
loss.shape
# Step 6 (backward pass) is still disabled:
# model.backward(loss)

Forward of 1st_Layer
Forward of 2nd_Layer
Forward of 3rd_Layer
Forward of 4th_Layer


(16, 1, 1)

**How does gradient checking work?**

As in 1) and 2), you want to compare "gradapprox" to the gradient computed by backpropagation. The formula is still:

$$ \frac{\partial J}{\partial \theta} = \lim_{\varepsilon \to 0} \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2 \varepsilon} \tag{1}$$

However, $\theta$ is not a scalar anymore. It is a dictionary called "parameters". We implemented a function "`dictionary_to_vector()`" for you. It converts the "parameters" dictionary into a vector called "values", obtained by reshaping all parameters (W1, b1, W2, b2, W3, b3) into vectors and concatenating them.

The inverse function is "`vector_to_dictionary`" which outputs back the "parameters" dictionary.


We have also converted the "gradients" dictionary into a vector "grad" using gradients_to_vector(). You don't need to worry about that.


Here is pseudo-code that will help you implement the gradient check.

For each i in num_parameters:
- To compute `J_plus[i]`:
    1. Set $\theta^{+}$ to `np.copy(parameters_values)`
    2. Set $\theta^{+}_i$ to $\theta^{+}_i + \varepsilon$
    3. Calculate $J^{+}_i$ using `forward_propagation_n(x, y, vector_to_dictionary(`$\theta^{+}$ `))`.
- To compute `J_minus[i]`: do the same thing with $\theta^{-}$
- Compute $gradapprox[i] = \frac{J^{+}_i - J^{-}_i}{2 \varepsilon}$

Thus, you get a vector gradapprox, where gradapprox[i] is an approximation of the gradient with respect to `parameter_values[i]`. You can now compare this gradapprox vector to the gradients vector from backpropagation. Just like for the 1D case (Steps 1', 2', 3'), compute: 
$$ difference = \frac {\| grad - gradapprox \|_2}{\| grad \|_2 + \| gradapprox \|_2 } \tag{3}$$


**The code will be added later** but soon enough ;)

In [118]:
# GRADED FUNCTION: gradient_check_n



## Verification cell

 8. Verify your solution by gradient checking.
 9. Start crying.
 10. Repeat until correct ;)

In [40]:
def gradient_check_n(network, X, Y, epsilon = 1e-7):
    """
    Compare the weight gradients stored on the layers with numerical ones.

    For every entry of every layer's weight matrix ``W`` the cost is
    evaluated at ``W + epsilon`` and ``W - epsilon`` and the centred
    difference is compared with the matching entry of ``layer.dW``. The cost
    is the mean of ``network.compute_loss`` over all of its elements, which
    matches gradients that were averaged over the batch.

    The backward pass must already have been run for the same ``X`` and
    ``Y`` so that ``layer.dW`` is populated. Only ``W`` is checked, not the
    biases. Every weight is restored to its original value, but each forward
    pass overwrites whatever the layers cache, so caches reflect the last
    perturbed pass afterwards.

    Arguments:
    network -- model exposing ``layers``, ``forward(X)`` and ``compute_loss(A, Y)``
    X -- input batch passed to ``network.forward``
    Y -- labels passed to ``network.compute_loss``
    epsilon -- tiny shift applied to each weight in the centred difference

    Returns:
    difference -- ||grad - gradapprox|| / (||grad|| + ||gradapprox||);
                  0.0 when both norms are zero
    """

    def cost():
        # Reduce the loss to one scalar so it can be differenced.
        return np.mean(network.compute_loss(network.forward(X), Y))

    # Set-up variables
    gradapprox = []
    grad_backward = []

    for layer in network.layers:
        weights = getattr(layer, "W", None)
        if weights is None:
            # Layers without weights (the input layer) have nothing to check.
            continue

        rows, cols = weights.shape
        for r in range(rows):
            for c in range(cols):
                origin_W = weights[r, c]
                try:
                    # J(theta + epsilon)
                    weights[r, c] = origin_W + epsilon
                    J_plus = cost()

                    # J(theta - epsilon)
                    weights[r, c] = origin_W - epsilon
                    J_minus = cost()
                finally:
                    # Always undo the perturbation, otherwise every checked
                    # weight would stay shifted by -epsilon.
                    weights[r, c] = origin_W

                gradapprox.append((J_plus - J_minus) / (2 * epsilon))
                grad_backward.append(layer.dW[r][c])

    # Compare gradapprox to backward propagation gradients by computing difference.
    gradapprox = np.reshape(gradapprox, (-1, 1))
    grad_backward = np.reshape(grad_backward, (-1, 1))

    numerator = np.linalg.norm(grad_backward - gradapprox)
    denominator = np.linalg.norm(grad_backward) + np.linalg.norm(gradapprox)
    # Both gradients identically zero means they agree; avoid 0/0.
    difference = numerator / denominator if denominator > 0 else 0.0

    if difference > 2e-7:
        print ("\033[91m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print ("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference