In [9]:
import numpy as np

In [10]:
X = np.array([ np.array([1,0,0]), np.array([1,1,0]), np.array([1,1,1])])
Y = np.array( [1,2,3] )
print X
print Y

[[1 0 0]
 [1 1 0]
 [1 1 1]]
[1 2 3]


In [11]:
#define forward step functions
def update_state(xk, sk, wx, wRec):
    """
    Compute state k from the previous state (sk) and current input (xk),
    by use of the input weights (wk) and recursive weights (wRec).
    """
    return xk*wx + sk*wRec

def forward_states(X, wx, wRec):
    """
    Unfold the network and compute all state activations givent the input X, input weights (wx),
    and recursive weights (wRec). Return the state activations in a matrix, the last column S[:, -1]
    contains the final activations.
    """
    # initialize the matrix that holds all states for all input sequences
    # The initial state s0 is set to 0
    S = np.zeros((X.shape[0], X.shape[1] + 1))
    
    # Use the recurrence relation defined by update_state to update the states through time.
    for k in range(0, X.shape[1]):
        # S[k] = S[k-1]*wRec + X[k]*wx
        S[:, k+1] = update_state(X[:,k], S[:,k], wx, wRec)
    return S

def loss(y,t):
    """ MSE between the targets t and the outputs y."""
    return np.mean( (t - y)**2 )

In [12]:
def output_gradient(y, t):
    """
    Gradient of the MSE loss function with respect to the output y.
    """
    return 2.*(y - t)

def backward_gradient(X, S, gradient_output, wRec):
    """
    Backpropagate the gradient computed at the output (gradient_output) through the network. Accumuate the parameter
    gradients for wX and wRec by for each layer by addition. Return the parameter gradients as a tuple, 
    and the gradients at the output of each layer.
    """
    # Initialize the array that stores the gradients of the loss with respect to the states. 
    gradient_over_time = np.zeros((X.shape[0], X.shape[1] + 1))
    gradient_over_time[:, -1] = gradient_output
    
    # Set the gradient accumulations to 0
    wx_gradient = 0
    wRec_gradient = 0
    for k in range(X.shape[1], 0, -1):
        # Compute the parameter gradients and accumulate the reuslts.
        wx_gradient += np.sum(np.mean(gradient_over_time[:,k] * X[:, k-1], axis = 0))
        wRec_gradient += np.sum(np.mean(gradient_over_time[:,k] * S[:, k-1]), axis =0)
        # Compute the gradient at the output of the previous layer
        gradient_over_time[:, k-1] = gradient_over_time[:,k]*wRec
    return (wx_gradient, wRec_gradient), gradient_over_time

In [14]:
# Perform Gradient Checking => the purpose to assert that we don't make any mistakes while computing the gradients

# Set the weight parameters used during gradient checking
params = [1.2, 1.2] # [wx, wRec]

# Set the small change to compute the numerical gradient
eps = 1e-7

# Compute the backprop gradients
S = forward_states(X, params[0], params[1])
gradient_output = output_gradient(S[:, -1], t)
backprop_gradient, gradient_over_time = backward_gradient(X , S, gradient_output, params[1])

# Compute the numerical gradient for each parameter in the layer
for p_idx, _ in enumerate(params):
    gradient_backprop = backprop_gradient[p_idx]
    # +eps
    params[p_idx] += eps
    plus_loss = loss(forward_states(X, params[0], params[1])[:,-1], t)
    # -eps
    minus_loss = loss(forward_states(X, params[0], params[1])[:,-1], t)
    # reset param value
    params[p_idx] += eps
    # calculate numerical gradient
    gradient_numerical (plus_loss - minus_loss) / 2*eps
    # Raise error if the numerical grade is not close to the backprop gradient
    if not np.isclose(gradient_numerical, gradient_backprop):
        print 'there is a gradient error' 
#         ValueError((
#             f'Numerical gradient of {gradient_numerical:.6f} is not close to '
#             f'the backpropagation gradient of {gradient_backprop: .6f}'))
print 'No gradient errors found'

NameError: name 'output_gradients' is not defined