# Advanced Optimization Methods

    Having a good optimization algorithm can be the difference between waiting days vs. just a few hours to get a good result

In [3]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets

from test_data_generator import *
from opt_utils import *

### 1.) Gradient Descent

In [4]:
# Update Params with Gradient Descent
def update_params_with_gd(params, grads, lr):
    L = len(params) // 2

    for l in range(L):
        params['W' + str(l+1)] = params['W' + str(l+1)] - lr * grads['dW' + str(l+1)]
        params['b' + str(l+1)] = params['b' + str(l+1)] - lr * grads['db' + str(l+1)]
    return params

In [5]:
params, grads, learning_rate = update_parameters_with_gd_test_case()

params = update_params_with_gd(params, grads, learning_rate)
print("W1 =\n" + str(params["W1"]))
print("b1 =\n" + str(params["b1"]))
print("W2 =\n" + str(params["W2"]))
print("b2 =\n" + str(params["b2"]))

W1 =
[[ 1.63535156 -0.62320365 -0.53718766]
 [-1.07799357  0.85639907 -2.29470142]]
b1 =
[[ 1.74604067]
 [-0.75184921]]
W2 =
[[ 0.32171798 -0.25467393  1.46902454]
 [-2.05617317 -0.31554548 -0.3756023 ]
 [ 1.1404819  -1.09976462 -0.1612551 ]]
b2 =
[[-0.88020257]
 [ 0.02561572]
 [ 0.57539477]]


## 2.) Random Mini Batches

In [6]:
# Random mini batches
def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
    np.random.seed(seed)
    m = X.shape[1]
    mini_batches = []

    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]
    num_complete_minibatches = math.floor(m/mini_batch_size)
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k*mini_batch_size : (k+1)*mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k*mini_batch_size : (k+1)*mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)



    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, int(m / mini_batch_size)*mini_batch_size : ]
        mini_batch_Y = shuffled_Y[:, int(m / mini_batch_size)*mini_batch_size : ]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches

In [7]:
X_assess, Y_assess, mini_batch_size = random_mini_batches_test_case()
mini_batches = random_mini_batches(X_assess, Y_assess, mini_batch_size)
print("Minibatches length: ", mini_batches)
print ("shape of the 1st mini_batch_X: " + str(mini_batches[0][0].shape))
print ("shape of the 2nd mini_batch_X: " + str(mini_batches[1][0].shape))
print ("shape of the 3rd mini_batch_X: " + str(mini_batches[2][0].shape))
print ("shape of the 1st mini_batch_Y: " + str(mini_batches[0][1].shape))
print ("shape of the 2nd mini_batch_Y: " + str(mini_batches[1][1].shape)) 
print ("shape of the 3rd mini_batch_Y: " + str(mini_batches[2][1].shape))
print ("mini batch sanity check: " + str(mini_batches[0][0][0][0:3]))

Minibatches length:  [(array([[ 0.90085595, -0.7612069 ,  0.2344157 , ...,  0.12015895,
         0.86888616, -0.60392063],
       [ 2.52832571, -0.10015523, -0.61736206, ...,  0.60231928,
        -0.18657899,  1.39984394],
       [-0.13597733, -0.8805776 , -0.10679399, ..., -0.10999149,
         0.80884436,  0.35016716],
       ...,
       [-0.43984921,  0.28742058, -0.6785365 , ...,  0.76544547,
        -0.71181932,  0.1410268 ],
       [-1.63271355,  0.07176604,  0.34552524, ..., -0.94969794,
        -0.66824325, -1.22559617],
       [-0.12821597,  0.19427479, -1.6059598 , ..., -2.34856141,
        -2.08924821, -0.73948309]]), array([[ True, False,  True,  True,  True, False,  True,  True,  True,
        False,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False, False, False,  True,  True,  True,  True,
        False,  True,  True, False,  True,  True,  True,  True,  True,
         True,  True, False,  True, False,  True,  True, False,  True,
        

## 3.) Momentum

In [None]:
# Initialize velocity vdW and vdb with the same shape as dW and db with np.zeros
def initialize_velocity(params):
    
    L = len(params) // 2
    v = {}

    for l in range(L):
        v['dW' + str(l+1)] = np.zeros((params['W' + str(l+1)].shape[0], params['W' + str(l+1)].shape[1]))
        v['db' + str(l+1)] = np.zeros((params['b' + str(l+1)].shape[0], params['b' + str(l+1)].shape[1]))

    return v

In [11]:
params = initialize_velocity_test_case()

v = initialize_velocity(params)
print("v[\"dW1\"] =\n" + str(v["dW1"]))
print("v[\"db1\"] =\n" + str(v["db1"]))
print("v[\"dW2\"] =\n" + str(v["dW2"]))
print("v[\"db2\"] =\n" + str(v["db2"]))

v["dW1"] =
[[0. 0. 0.]
 [0. 0. 0.]]
v["db1"] =
[[0.]
 [0.]]
v["dW2"] =
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
v["db2"] =
[[0.]
 [0.]
 [0.]]


In [None]:
# Update params with momentum
def update_params_with_momentum(params, grads, v, beta, lr):

    L = len(params) // 2

    for l in range(L):
        # Comput velocity
        v['dW' + str(l+1)] = beta*v['dW' + str(l+1)] + (1 - beta) * grads['dW' + str(l+1)]
        v['db' + str(l+1)] = beta*v['db' + str(l+1)] + (1 - beta) * grads['db' + str(l+1)]
        # Update parameters
        params['W' + str(l+1)] = params['W' + str(l+1)] - lr * v['dW' + str(l+1)]
        params['b' + str(l+1)] = params['b' + str(l+1)] - lr * v['db' + str(l+1)]

    return params, v