In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets

from opt_utils_v1a import load_params_and_grads, initialize_parameters, forward_propagation, backward_propagation
from opt_utils_v1a import compute_cost, predict, predict_dec, plot_decision_boundary, load_dataset
from copy import deepcopy

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
# Draw images without pixel interpolation and default to a grayscale colormap.
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Load the autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to local helper modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
def upgrade_parameters_with_gd(parameters , grads, learning_rate ):
    """
    Apply one plain gradient-descent step to every layer's weights and biases.

    Arguments:
    parameters -- dict holding "W1", "b1", ..., "WL", "bL" (two entries per layer)
    grads -- dict holding the matching gradients "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size multiplied into each gradient

    Returns:
    parameters -- the same dict, with each entry replaced by its updated value
    """
    # Each layer contributes a W and a b entry, so the layer count is half
    # the dict size; iterating up to len(parameters) would ask for layers
    # that do not exist and raise KeyError.
    num_layers = len(parameters) // 2

    for layer in range(1, num_layers + 1):
        w_key = "W" + str(layer)
        b_key = "b" + str(layer)
        parameters[w_key] = parameters[w_key] - learning_rate * grads["d" + w_key]
        parameters[b_key] = parameters[b_key] - learning_rate * grads["d" + b_key]

    return parameters

In [4]:
def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
    """
    Shuffle the examples of X and Y with one shared permutation and split
    them into consecutive mini-batches.

    Arguments:
    X -- input data indexed as X[:, i] for example i (examples are columns)
    Y -- labels indexed as Y[:, i] for example i; any number of label rows
    mini_batch_size -- number of examples per mini-batch, must be >= 1
    seed -- passed to np.random.seed so the shuffle is reproducible

    Returns:
    mini_batches -- list of (mini_batch_X, mini_batch_Y) tuples; every batch
                    has mini_batch_size columns except possibly the last,
                    which holds the remaining examples

    Raises:
    ValueError -- if mini_batch_size is smaller than 1
    """
    if mini_batch_size < 1:
        raise ValueError("mini_batch_size must be a positive integer")

    # NOTE: this reseeds NumPy's global random generator as a side effect.
    np.random.seed(seed)
    m = X.shape[1]

    # Shuffle X and Y with the same column permutation so pairs stay aligned.
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    # Keep however many label rows Y has instead of forcing a single row.
    shuffled_Y = Y[:, permutation].reshape((Y.shape[0], m))

    # A stepped range covers the full batches and the shorter trailing batch
    # alike, because slicing past the end simply stops at column m.
    mini_batches = []
    for start in range(0, m, mini_batch_size):
        stop = start + mini_batch_size
        mini_batches.append((shuffled_X[:, start:stop], shuffled_Y[:, start:stop]))

    return mini_batches

### Momentum

In [6]:
def initialize_velocity(parameters):
    """
    Create the zero-initialised velocity dict used by momentum updates.

    Arguments:
    parameters -- dict holding "W1", "b1", ..., "WL", "bL" (two entries per layer)

    Returns:
    v -- dict with keys "dW1", "db1", ..., "dWL", "dbL"; each value is an
         array of zeros shaped like the corresponding parameter
    """
    # Two entries (W and b) per layer; using len(parameters) directly would
    # look up layers that do not exist and raise KeyError.
    num_layers = len(parameters) // 2
    v = {}

    for layer in range(1, num_layers + 1):
        suffix = str(layer)
        v['dW' + suffix] = np.zeros_like(parameters['W' + suffix])
        v['db' + suffix] = np.zeros_like(parameters['b' + suffix])

    return v

In [7]:
def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    """
    Apply one gradient-descent-with-momentum step to every layer.

    For each parameter the velocity is updated as an exponentially weighted
    average of gradients, v = beta * v + (1 - beta) * grad, and the parameter
    then moves by learning_rate * v.

    Arguments:
    parameters -- dict holding "W1", "b1", ..., "WL", "bL" (two entries per layer)
    grads -- dict holding the gradients "dW1", "db1", ..., "dWL", "dbL"
    v -- dict holding the current velocities under the same keys as grads
    beta -- momentum coefficient weighting the previous velocity
    learning_rate -- step size multiplied into the velocity

    Returns:
    parameters -- the same dict, with updated values
    v -- the same dict, with updated velocities
    """
    # Two entries (W and b) per layer; using len(parameters) directly would
    # look up layers that do not exist and raise KeyError.
    num_layers = len(parameters) // 2

    for layer in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            param_key = prefix + str(layer)
            grad_key = 'd' + param_key

            v[grad_key] = (beta * v[grad_key]) + (1 - beta) * grads[grad_key]
            parameters[param_key] = parameters[param_key] - (learning_rate * v[grad_key])

    return parameters, v

### Adam

In [8]:
def initialize_adam(parameters):
    """
    Create the zero-initialised first- and second-moment dicts used by Adam.

    Arguments:
    parameters -- dict holding "W1", "b1", ..., "WL", "bL" (two entries per layer)

    Returns:
    v -- dict with keys "dW1", "db1", ..., "dWL", "dbL"; zeros shaped like
         the corresponding parameters (running average of gradients)
    s -- dict with the same keys and shapes (running average of squared
         gradients)
    """
    # Two entries (W and b) per layer; using len(parameters) directly would
    # look up layers that do not exist and raise KeyError.
    num_layers = len(parameters) // 2
    v = {}
    s = {}

    for layer in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            param = parameters[prefix + str(layer)]
            # Key must be 'dW' (capital W): update_parameters_with_adam reads
            # v['dW' + str(l)], so a lowercase 'dw' key would never be found.
            grad_key = 'd' + prefix + str(layer)
            v[grad_key] = np.zeros_like(param)
            s[grad_key] = np.zeros_like(param)

    return v, s

In [9]:
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01,
                                beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8):
    """
    Apply one Adam step to every layer's weights and biases.

    Arguments:
    parameters -- dict holding "W1", "b1", ..., "WL", "bL" (two entries per layer)
    grads -- dict holding the gradients "dW1", "db1", ..., "dWL", "dbL"
    v -- dict of running gradient averages (first moment), same keys as grads
    s -- dict of running squared-gradient averages (second moment), same keys
    t -- step counter used as the exponent in the bias correction
    learning_rate -- step size
    beta1 -- decay rate for the first-moment average
    beta2 -- decay rate for the second-moment average
    epsilon -- small constant added to the denominator of the update

    Returns:
    parameters -- the same dict, with updated values
    v -- the same dict, with updated first moments
    s -- the same dict, with updated second moments
    v_corrected -- new dict of bias-corrected first moments
    s_corrected -- new dict of bias-corrected second moments
    """
    num_layers = len(parameters) // 2
    v_corrected = {}
    s_corrected = {}

    for layer in range(1, num_layers + 1):
        # W and b follow exactly the same recipe, so handle them in turn.
        for prefix in ('W', 'b'):
            param_key = prefix + str(layer)
            grad_key = 'd' + param_key
            gradient = grads[grad_key]

            # First moment: moving average of the gradient, then bias-corrected.
            v[grad_key] = beta1 * v[grad_key] + (1 - beta1) * gradient
            v_corrected[grad_key] = v[grad_key] / (1 - beta1**t)

            # Second moment: moving average of the squared gradient, then bias-corrected.
            s[grad_key] = beta2 * s[grad_key] + (1 - beta2) * np.square(gradient)
            s_corrected[grad_key] = s[grad_key] / (1 - beta2**t)

            # Scale the corrected first moment by the root of the second moment.
            denominator = np.sqrt(s_corrected[grad_key]) + epsilon
            step = learning_rate * v_corrected[grad_key] / denominator
            parameters[param_key] = parameters[param_key] - step

    return parameters, v, s, v_corrected, s_corrected