# Problem 1, Parameter initialization
This notebook aims to visualize the importance of parameter initialization. The initialization step can be critical to the model's ultimate performance, and it requires the right method.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils

# Task 0 - Create a simple Neural Network

This task is not about building the neural network itself, so a small framework for creating neural networks is provided below.

In [None]:
class Layer:
    """Common base for every building block of a network.

    Subclasses are expected to override ``forward`` and ``backward``; the
    versions defined here do nothing and return ``None``.
    """

    def __init__(self):
        # Placeholders for the most recent input/output of the layer.
        self.input, self.output = None, None

    def forward(self, input):
        """Compute the layer output for ``input`` (no-op in the base class)."""
        return None

    def backward(self, output_gradient, learning_rate):
        """Propagate ``output_gradient`` backwards (no-op in the base class)."""
        return None

In [None]:
class Dense(Layer):
    """Fully connected layer computing ``weights @ input + bias``.

    Parameters
    ----------
    input_size, output_size : int
        Number of inputs and outputs of the layer.
    init_weights : callable
        Called as ``init_weights(output_size, input_size)``; its result is
        stored as the weight matrix.
    init_bias : callable
        Called as ``init_bias(output_size)``; its result is stored as the bias.
    """

    def __init__(self, input_size, output_size, init_weights, init_bias):
        # Make sure the attributes declared by Layer exist on every instance.
        super().__init__()
        self.weights = init_weights(output_size, input_size)
        self.bias = init_bias(output_size)

    def forward(self, input):
        """Store ``input`` for the backward pass and return the affine output."""
        self.input = input
        return np.dot(self.weights, self.input) + self.bias

    def backward(self, output_gradient, learning_rate):
        """Apply one gradient-descent step and return the input gradient.

        ``output_gradient`` is the gradient of the loss with respect to this
        layer's output; the returned value is the gradient with respect to
        the input that was passed to the last ``forward`` call.
        """
        weights_gradient = np.dot(output_gradient, self.input.T)
        # The input gradient must be computed with the weights that produced
        # the forward output, i.e. BEFORE they are modified below.
        input_gradient = np.dot(self.weights.T, output_gradient)
        self.weights -= learning_rate * weights_gradient
        self.bias -= learning_rate * output_gradient
        return input_gradient

In [None]:
class Activation(Layer):
    """Parameter-free layer built from a function and its derivative."""

    def __init__(self, activation, activation_prime):
        # Both callables are applied to the layer input.
        self.activation_prime = activation_prime
        self.activation = activation

    def forward(self, input):
        """Remember ``input`` and return the activation applied to it."""
        self.input = input
        activated = self.activation(input)
        return activated

    def backward(self, output_gradient, learning_rate):
        """Scale ``output_gradient`` element-wise by the derivative at the stored input.

        ``learning_rate`` is accepted for interface compatibility and unused.
        """
        local_slope = self.activation_prime(self.input)
        return np.multiply(output_gradient, local_slope)

In [None]:
class Linear(Activation):
    """Identity activation: the input is passed through unchanged."""

    def __init__(self):
        def identity(x):
            return x

        def identity_prime(x):
            # Constant slope; a scalar is enough because it is multiplied
            # element-wise with the incoming gradient.
            return 1

        super().__init__(identity, identity_prime)

class Tanh(Activation):
    """Hyperbolic-tangent activation."""

    def __init__(self):
        def tanh(x):
            return np.tanh(x)

        def tanh_prime(x):
            # d/dx tanh(x) = 1 - tanh(x)^2
            squashed = np.tanh(x)
            return 1 - squashed**2

        super().__init__(tanh, tanh_prime)

class Sigmoid(Activation):
    """Logistic-sigmoid activation."""

    def __init__(self):
        # sigma(x) = 1 / (1 + e^(-x))
        logistic = lambda x: 1 / (1 + np.exp(-x))

        def logistic_prime(x):
            # sigma'(x) = sigma(x) * (1 - sigma(x))
            value = logistic(x)
            return value * (1 - value)

        super().__init__(logistic, logistic_prime)

In [None]:
def mse(y_true, y_pred):
    """Return the mean of the squared differences between target and prediction."""
    residual = y_true - y_pred
    squared = np.power(residual, 2)
    return np.mean(squared)

def mse_prime(y_true, y_pred):
    """Return the gradient of ``mse`` with respect to ``y_pred``."""
    element_count = np.size(y_true)
    doubled_error = 2 * (y_pred - y_true)
    return doubled_error / element_count

# Task 1

### Given
- The `output_size` and `input_size` of a layer, together with example implementations: `uniform_init`, `too_large_init` and `too_small_init`
- Xavier-initialization equation: $W^{[l]} \sim \mathcal{N}(\mu=0,\sigma^2 = \frac{1}{n^{[l-1]}})$ where $n^{[l-1]}$ is the number of neurons in layer $l - 1$, and the biases as zero: $b^{[l]} = 0$.

### Find
- Implement the initialization methods for zeros, ones and xavier

In [None]:
def uniform_init(output_size, input_size = 1):
    """Return an ``(output_size, input_size)`` array drawn uniformly from [-1, 1)."""
    shape = (output_size, input_size)
    return np.random.uniform(low=-1, high=1, size=shape)

def too_small_init(output_size, input_size = 1):
    """Return an ``(output_size, input_size)`` array with every entry equal to 0.5."""
    shape = (output_size, input_size)
    return np.full(shape, 0.5)

def too_large_init(output_size, input_size = 1):
    """Return an ``(output_size, input_size)`` array with every entry equal to 1.5."""
    shape = (output_size, input_size)
    return np.full(shape, 1.5)

def zeros_init(output_size, input_size = 1):
    """Return an ``(output_size, input_size)`` float array filled with zeros.

    With the default ``input_size`` the result is a column vector, which is
    the form ``Dense`` requests for its bias (``init_bias(output_size)``).
    """
    return np.zeros((output_size, input_size))

def ones_init(output_size, input_size = 1):
    """Return an ``(output_size, input_size)`` float array filled with ones."""
    return np.ones((output_size, input_size))

def xavier_init(output_size, input_size = 1):
    """Return Xavier-initialized weights of shape ``(output_size, input_size)``.

    Entries are drawn from a normal distribution with mean 0 and variance
    ``1 / input_size``, where ``input_size`` is the number of neurons in the
    previous layer.
    """
    # np.random.normal takes the standard deviation, not the variance.
    std = np.sqrt(1.0 / input_size)
    return np.random.normal(loc=0.0, scale=std, size=(output_size, input_size))

### Training the neural network on XOR-dataset

In [None]:
# XOR truth table. Every sample is a (2, 1) column vector and every target a
# (1, 1) array, so they can be fed to the layers one at a time.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).reshape(4, 2, 1)
Y = np.array([[0], [1], [1], [0]]).reshape(4, 1, 1)

# Task 2 - Impact of parameter initialization

### Given
- Methods for initializing parameters that are *zero, too small, too large, uniform or xavier*
- XOR-dataset
- Function for training a neural network
- Function for visualizing the loss
- A NN with the following architecture:

| Layer | Output Shape | Activation function |
|:-----:|:------------:|---------------------|
| Input |    (2, 2)    |         Tanh        |
| Dense |    (2, 2)    |         Tanh        |
| Dense |    (2, 2)    |         Tanh        |
| Dense |    (2, 2)    |         Tanh        |
| Dense |    (2, 2)    |         Tanh        |
| Dense |    (2, 2)    |         Tanh        |
| Dense |    (2, 2)    |         Tanh        |
| Dense |    (2, 2)    |         Tanh        |
| Dense |    (2, 1)    |       Sigmoid       |

### Find
- The impact of this initialization by training 5 different deep neural networks, initialized with the different methods, for 20 000 epochs.
- Visualize the loss

In [None]:
# Hyperparameters read by train() at call time.
EPOCHS = 20_000             # passes over the XOR dataset
LEARNING_RATE = 1e-2        # step size handed to every layer's backward()
VISUALIZE_INTERVAL = 100    # print the error every this many epochs

def create_deep_network(weight_init_method, bias_init_method):
    """Build the deep network described in the Task 2 table.

    The network is eight ``Dense(2, 2)`` layers, each followed by ``Tanh``,
    and a final ``Dense(2, 1)`` layer followed by ``Sigmoid``. Every Dense
    layer is initialized with the same pair of initializer callables.

    Returns the layers as a list, in forward order.
    """
    hidden_blocks = 8  # the "Input" row plus the seven (2, 2) Dense rows of the table

    network = []
    for _ in range(hidden_blocks):
        network.append(Dense(2, 2, weight_init_method, bias_init_method))
        network.append(Tanh())

    # Output block: a single unit so the prediction matches the (1, 1) targets.
    network.append(Dense(2, 1, weight_init_method, bias_init_method))
    network.append(Sigmoid())
    return network

# One network per weight-initialization method (relies on the initializers
# from Task 1). The biases start at zero everywhere so that the weight
# initialization is the only thing that differs between the networks.
network_zero_weights = create_deep_network(zeros_init, zeros_init)
network_large_weights = create_deep_network(too_large_init, zeros_init)
network_small_weights = create_deep_network(too_small_init, zeros_init)
network_uniform_weights = create_deep_network(uniform_init, zeros_init)
network_xavier_weights = create_deep_network(xavier_init, zeros_init)

In [None]:
def train(network):
    """Train ``network`` on the XOR samples, one sample at a time.

    Reads the module-level ``X``, ``Y``, ``EPOCHS``, ``LEARNING_RATE`` and
    ``VISUALIZE_INTERVAL`` when called. ``network`` is a list of layers; each
    layer's ``backward`` is invoked with ``LEARNING_RATE`` after every sample.

    Returns a list holding the mean MSE over the dataset for every epoch.
    """

    def fit_sample(sample, target):
        # Forward pass
        signal = sample
        for layer in network:
            signal = layer.forward(signal)

        # Error is measured before this sample's update is applied
        sample_error = mse(target, signal)

        # Backward pass
        grad = mse_prime(target, signal)
        for layer in reversed(network):
            grad = layer.backward(grad, LEARNING_RATE)
        return sample_error

    error_history = []
    for epoch in range(EPOCHS):
        error = sum(fit_sample(sample, target) for sample, target in zip(X, Y))
        error /= len(X)
        error_history.append(error)

        if epoch % VISUALIZE_INTERVAL == 0:
            print(f"Epoch {epoch}, error: {error}")
    return error_history

# Train every network and keep its per-epoch error for the loss plot.
error_history_zero_weights = train(network_zero_weights)
error_history_large_weights = train(network_large_weights)
error_history_small_weights = train(network_small_weights)
error_history_uniform_weights = train(network_uniform_weights)
error_history_xavier_weights = train(network_xavier_weights)

From the plot below we can see that the initialization step has a great impact on whether the neural network converges or not. This is especially prominent because the network we're training is deep, which increases the risk of exploding or vanishing gradients.

In [None]:
sns.set_style("darkgrid")

# Legend label -> per-epoch error history, in plotting order.
loss_curves = {
    "Zero weights": error_history_zero_weights,
    "Large weights": error_history_large_weights,
    "Small weights": error_history_small_weights,
    "Uniform weights": error_history_uniform_weights,
    "Xavier weights": error_history_xavier_weights,
}

fig, ax = plt.subplots()
for curve_label, curve in loss_curves.items():
    ax.plot(curve, label=curve_label)
ax.set_xlabel("Epoch")
ax.set_ylabel("Cost")
ax.legend()
plt.show()

# Task 3 - Perks of Xavier initialization

There are generally two rules of thumb when initializing parameters in a neural network.
- The mean of the activations should be zero: $E[a^{[l-1]}] = E[a^{[l]}] = 0$
- The variance of the activations should stay the same across every layer: $Var(a^{[l-1]}) = Var(a^{[l]})$

### Given
- The above two rules of thumb
- The XOR-dataset
- A NN with the following architecture:

| Layer | Output Shape | Activation function |
|:-----:|:------------:|---------------------|
| Input |    (2, 100)    |         Tanh        |
| Dense |    (100, 100)    |         Tanh        |
| Dense |    (100, 100)    |         Tanh        |
| Dense |    (100, 100)    |         Tanh        |
| Dense |    (100, 1)    |       Sigmoid       |

### Find
- The histograms of each layer's activations, sampled at every 1000th point during training, when initialized with Xavier and with Uniform



In [None]:
# NOTE: LEARNING_RATE, EPOCHS and VISUALIZE_INTERVAL are the same globals that
# train() reads, so re-running train() after this cell uses these values.
LEARNING_RATE = 0.01
MOMENTUM = 0.9  # not referenced by the training loop in this notebook
EPOCHS = 10000
VISUALIZE_INTERVAL = 100

# Relies on the initializers from Task 1. Swap WEIGHT_INIT for uniform_init
# and re-run the cells below to produce the comparison histograms.
WEIGHT_INIT = xavier_init
BIAS_INIT = zeros_init

# Architecture from the Task 3 table: 2 -> 100 -> 100 -> 100 -> 100 -> 1.
network = [
    Dense(2, 100, WEIGHT_INIT, BIAS_INIT),
    Tanh(),
    Dense(100, 100, WEIGHT_INIT, BIAS_INIT),
    Tanh(),
    Dense(100, 100, WEIGHT_INIT, BIAS_INIT),
    Tanh(),
    Dense(100, 100, WEIGHT_INIT, BIAS_INIT),
    Tanh(),
    Dense(100, 1, WEIGHT_INIT, BIAS_INIT),
    Sigmoid(),
]

In [None]:
# Dense-layer number -> list of recorded outputs, one entry per sample per epoch.
activations = {}

for epoch in range(EPOCHS):
    error = 0
    for x, y in zip(X, Y):
        # Forward pass, recording what every Dense layer emits (the value
        # handed to the activation layer that follows it).
        output = x
        for index, layer in enumerate(network):
            output = layer.forward(output)
            if type(layer) is Dense:
                # Dense layers sit at even positions: 0, 2, 4, ... -> 0, 1, 2, ...
                key = index // 2
                activations.setdefault(key, []).append(output)

        # Accumulate the error of this sample
        error += mse(y, output)

        # Backward pass
        grad = mse_prime(y, output)
        for layer in reversed(network):
            grad = layer.backward(grad, LEARNING_RATE)

    error /= len(X)
    if epoch % VISUALIZE_INTERVAL == 0:
        print(f"Epoch {epoch}, error: {error}")

In [None]:
# The training loop stores one record per training sample, so the records of
# consecutive epochs are len(X) entries apart. Convert the epoch number to a
# record index instead of striding over raw records (which would select every
# 250th epoch rather than every 1000th with four samples per epoch).
records_per_epoch = len(X)
recorded_epochs = len(activations[0]) // records_per_epoch

for epoch_number in range(0, recorded_epochs, 1000):
    # First sample of the selected epoch.
    record_index = epoch_number * records_per_epoch

    # The last Dense layer is left out, as in the layer range used so far.
    activation_data = [
        activations[layer_index][record_index]
        for layer_index in range(len(activations) - 1)
    ]

    utils.sns_plot_histograms(activation_data, num_bins=50, figsize=(16, 2))