# Initialization
A well-chosen initialization can:
- Speed up the convergence of gradient descent
- Increase the odds of gradient descent converging to a lower training (and generalization) error 

In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
from init_utils import sigmoid, relu, compute_loss, forward_propagation, backward_propagation, update_parameters, predict, predict_dec, load_dataset, plot_decision_boundary

# Some defaults for this notebook
%matplotlib inline
# Default figure size as (width, height) in inches for every plot in this notebook
plt.rcParams['figure.figsize'] = (7.0, 4.0)
# Image display defaults: no smoothing between pixels, grayscale colormap
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

In [2]:
# Load the train/test inputs and labels used by every experiment below.
# NOTE(review): load_dataset comes from init_utils; the original note describes the data as
# blue/red dots in circles -- confirm array shapes and label encoding in that module.
train_X, train_Y, test_X, test_Y = load_dataset()

# There are three ways to initialize the parameters
    1.) Initialize with zeros
    2.) Initialize with random values
    3.) He Initialization -> This initializes the weights to random values scaled according to a paper by He et al., 2015

In [3]:
# 1.) Initialize with Zeros
def initialize_parameters_zeros(layers_dims):
    """Build an all-zero parameter set for a fully connected network.

    Arguments:
    layers_dims -- sequence of layer sizes, input layer first

    Returns:
    dict with, for every layer k = 1 .. len(layers_dims) - 1:
        'W<k>' -- zero array of shape (layers_dims[k], layers_dims[k-1])
        'b<k>' -- zero array of shape (layers_dims[k], 1)
    """
    zero_params = {}
    # Walk consecutive (previous, current) layer sizes; layer numbering starts at 1.
    size_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for idx, (n_prev, n_curr) in enumerate(size_pairs, start=1):
        zero_params['W' + str(idx)] = np.zeros((n_curr, n_prev))
        zero_params['b' + str(idx)] = np.zeros((n_curr, 1))
    return zero_params

In [None]:
# Quick look at the zero-initialized parameters of a small 4-3-2-1 network.
params = initialize_parameters_zeros([4, 3, 2, 1])
print('Zero Initialization : ', params)

In [5]:
# 2.) Random Initialization
def initialize_parameters_random(layers_dims):
    """Draw large standard-normal weights (scaled by 10) and zero biases.

    Arguments:
    layers_dims -- sequence of layer sizes, input layer first

    Returns:
    dict with, for every layer k = 1 .. len(layers_dims) - 1:
        'W<k>' -- array of shape (layers_dims[k], layers_dims[k-1]), randn samples times 10
        'b<k>' -- zero array of shape (layers_dims[k], 1)
    """
    # Re-seeding on every call makes repeated calls return identical weights.
    np.random.seed(3)
    weight_scale = 10
    random_params = {}
    for idx in range(1, len(layers_dims)):
        n_curr, n_prev = layers_dims[idx], layers_dims[idx - 1]
        random_params['W' + str(idx)] = weight_scale * np.random.randn(n_curr, n_prev)
        random_params['b' + str(idx)] = np.zeros((n_curr, 1))
    return random_params

In [None]:
# Quick look at the large-random parameters of a small 3-2-1 network.
params = initialize_parameters_random([3, 2, 1])
print('Random Initialization : ', params)

In [7]:
# 3.) He Initialization
def initialize_parameters_he(layers_dims):
    """He initialization: standard-normal weights scaled by sqrt(2 / fan_in), zero biases.

    Arguments:
    layers_dims -- sequence of layer sizes, input layer first

    Returns:
    params -- dict with, for every layer l = 1 .. len(layers_dims) - 1:
        'W<l>' -- array of shape (layers_dims[l], layers_dims[l-1]),
                  randn samples times sqrt(2 / layers_dims[l-1])
        'b<l>' -- zero array of shape (layers_dims[l], 1)
    """
    # Re-seeding on every call makes repeated calls return identical weights.
    np.random.seed(3)
    params = {}
    L = len(layers_dims)
    for l in range(1, L):
        fan_in = layers_dims[l-1]
        params['W' + str(l)] = np.random.randn(layers_dims[l], fan_in) * np.sqrt(2. / fan_in)
        # Biases stay at zero; multiplying zeros by the He factor would be a no-op.
        params['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return params

In [None]:
# Quick look at the He-initialized parameters of a small 2-4-1 network.
params = initialize_parameters_he([2, 4, 1])
print('He Initialization : ', params)

In [9]:
# Now we need an NN model to test all three initialization methods
def NN_model(X, Y, lr = 0.01, n_iter = 10000, pc = False, init = 'zeros'):
    """Train a network with layer sizes [X.shape[0], 10, 5, 1] and plot its cost curve.

    Arguments:
    X -- input data; X.shape[0] is used as the input layer size
    Y -- labels, passed to compute_loss and backward_propagation
    lr -- learning rate handed to update_parameters
    n_iter -- number of gradient descent iterations
    pc -- if True, print the cost every 1000 iterations
    init -- initialization scheme: 'zeros', 'random' or 'he'

    Returns:
    params -- parameters after the last update step

    Raises:
    ValueError -- if init is not one of the supported schemes
    """
    costs = []
    layers_dims = [X.shape[0], 10, 5, 1]

    # Initialization; reject unknown schemes here instead of failing later on an unbound name
    if init == 'zeros':
        params = initialize_parameters_zeros(layers_dims)
    elif init == 'random':
        params = initialize_parameters_random(layers_dims)
    elif init == 'he':
        params = initialize_parameters_he(layers_dims)
    else:
        raise ValueError("init must be 'zeros', 'random' or 'he', got {!r}".format(init))

    # Loop for gradient descent
    for i in range(0, n_iter):
        a3, cache = forward_propagation(X, params)
        cost = compute_loss(a3, Y)
        grads = backward_propagation(X, Y, cache)
        params = update_parameters(params, grads, lr)

        # Record the cost every 1000 iterations so the plot is populated even when pc is False
        if i % 1000 == 0:
            costs.append(cost)
            if pc:
                print("Cost after iteration {}: {}".format(i, cost))

    # Plot the loss (one point per 1000 iterations)
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (per thousands)')
    plt.title("Learning rate =" + str(lr))
    plt.show()

    return params



In [None]:
# For Zero Initialization
# Train with all-zero starting parameters; pc = True prints the cost every 1000 iterations.
params = NN_model(train_X, train_Y, lr = 0.01, n_iter = 15000, pc = True, init = "zeros")
# NOTE(review): predict comes from init_utils; presumably it reports the accuracy itself,
# since its return value is only stored here -- confirm.
print ("On the train set:")
predictions_train = predict(train_X, train_Y, params)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, params)

# Title and axis limits are applied to the current axes before plot_decision_boundary is called.
plt.title("Model with Zeros initialization")
axes = plt.gca()
axes.set_xlim([-1.5,1.5])
axes.set_ylim([-1.5,1.5])
# The lambda looks up `params` when it is called, so it uses whatever that name is bound to then.
# NOTE(review): x.T suggests the plotting helper passes samples as rows -- confirm in init_utils.
plot_decision_boundary(lambda x: predict_dec(params, x.T), train_X, train_Y)

In [None]:
# For Random Initialization
# Train with large random starting weights; pc = True prints the cost every 1000 iterations.
params = NN_model(train_X, train_Y, lr = 0.01, n_iter = 15000, pc = True, init = "random")
# NOTE(review): predict comes from init_utils; presumably it reports the accuracy itself,
# since its return value is only stored here -- confirm.
print ("On the train set:")
predictions_train = predict(train_X, train_Y, params)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, params)

# Title and axis limits are applied to the current axes before plot_decision_boundary is called.
plt.title("Model with Random initialization")
axes = plt.gca()
axes.set_xlim([-1.5,1.5])
axes.set_ylim([-1.5,1.5])
# The lambda looks up `params` when it is called, so it uses whatever that name is bound to then.
# NOTE(review): x.T suggests the plotting helper passes samples as rows -- confirm in init_utils.
plot_decision_boundary(lambda x: predict_dec(params, x.T), train_X, train_Y)

In [None]:
# For He Initialization
# Train with He-scaled starting weights; pc = True prints the cost every 1000 iterations.
params = NN_model(train_X, train_Y, lr = 0.01, n_iter = 15000, pc = True, init = "he")
# NOTE(review): predict comes from init_utils; presumably it reports the accuracy itself,
# since its return value is only stored here -- confirm.
print ("On the train set:")
predictions_train = predict(train_X, train_Y, params)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, params)

# Title and axis limits are applied to the current axes before plot_decision_boundary is called.
plt.title("Model with He initialization")
axes = plt.gca()
axes.set_xlim([-1.5,1.5])
axes.set_ylim([-1.5,1.5])
# The lambda looks up `params` when it is called, so it uses whatever that name is bound to then.
# NOTE(review): x.T suggests the plotting helper passes samples as rows -- confirm in init_utils.
plot_decision_boundary(lambda x: predict_dec(params, x.T), train_X, train_Y)

## Conclusion


You have seen three different types of initializations. For the same number of iterations and same hyperparameters the comparison is:

<table> 
    <tr>
        <td>
        Model
        </td>
        <td>
        Train accuracy
        </td>
        <td>
        Problem/Comment
        </td>
    </tr>
    <tr>
        <td>
            3-layer NN with zeros initialization
        </td>
        <td>
            50%
        </td>
        <td>
            fails to break symmetry
        </td>
    </tr>
    <tr>
        <td>
        3-layer NN with large random initialization
        </td>
        <td>
        83%
        </td>
        <td>
        too large weights 
        </td>
    </tr>
    <tr>
        <td>
        3-layer NN with He initialization
        </td>
        <td>
        99%
        </td>
        <td>
        recommended method
        </td>
    </tr>
</table> 