In [2]:
'''
Simple RNN for adding 2 numbers in binary

forward forward..
backward - we want to iteratively change the weights starting from random positions 
to minimize error we change the weights of each layer in the directions of the derivatives of the output  of that layer
(note that the overall error function to optimize is formed by all layer functions but as we go back we no longer care  
about the functions in front and their weights..)
we want the change to be proportional to the size of the error and also the size of the input - so we weight the derivatives 
by the errors deltas and inputs
when passing error delta back to previous layer - we multiply current error weighted derivative by the weights to see how much
of the erro corresponds to each of the previous layer outputs
'''

import numpy as np
import copy
np.random.seed(0)
from numpy import ones, zeros

# the data generating params
n_samples = 1#5000
n_bit = 8
largest_input_number = pow(2, n_bit) / 2

# RNN params
input_dim = 2
output_dim = 1
recursive_size = 16
learning_rate = .1

#### done with constants

def generate_random_sample():
    # generate 2 random numbers and their sum
    input_1, input_2 = np.random.randint(0, largest_input_number), np.random.randint(0, largest_input_number)
    true_output = input_1 + input_2

    # calculate the binaries
    input_1_binary = [int(x) for x in np.binary_repr(input_1, n_bit)]
    input_2_binary = [int(x) for x in np.binary_repr (input_2, n_bit)]
    true_output_binary = [int(x) for x in np.binary_repr(true_output, n_bit)]
    
    return input_1_binary, input_2_binary, true_output_binary



# util math functions
def sigmoid(x): return (1 / (1 + np.exp(-x)))
def sigmoid_derivative(x): return x * (1 - x)



# simple RNN with one recurent hidden layer and one output layer

# hidden layer weights
w_recursive = np.random.standard_normal(size=(input_dim, recursive_size))
w_previous_recursive = np.random.standard_normal(size=(recursive_size, recursive_size))
# output layer weights
w_dense = np.random.standard_normal(size=(recursive_size, output_dim))

# training containers to store the the train input and output values at each layer

recursive_raw_output_seq = [] # values before activation
recursive_output_seq = [] # values after activation
dense_raw_output_seq = [] # values before activation
dense_output_seq = [] # values after activation

# containers for error deltas for updating weights
deltas_recursive = []
deltas_previous_recursive = []
deltas_dense = []

# init - because it's recursive
recursive_output_seq.append(np.zeros((1,recursive_size)))


In [5]:
# test functions - not asserting correct results - just making sure they run with correct dimensions

# set test constants

n_bit = 3
largest_input_number = pow(2, n_bit) / 2
recursive_size = 3
sample_data= np.array([[0,1]])
error = 1

# init test weights to 1 for simple test of correct values
w_recursive = np.ones((input_dim, recursive_size))
w_previous_recursive = np.ones((recursive_size, recursive_size))
w_dense = np.ones((recursive_size, output_dim))






([0, 0, 1], [0, 0, 0], [0, 0, 1])
[[ 0.5         0.73105858  0.88079708]]
[[ 0  0 -2]]


In [7]:

# gets an input sample and recurrent input and returns all layer outputs
def feed_forward_recursive_layer(input_data, previous_recursive_layer_output):
    
    raw_outputs = np.dot(input_data, w_recursive) + np.dot(previous_recursive_layer_output, w_previous_recursive)
    recursive_raw_output_seq.append(raw_outputs)
    recursive_output_seq.append(sigmoid(raw_outputs))

#test
recursive_raw_output_seq = []
feed_forward_recursive_layer(sample_data, np.ones((1,recursive_size)))
print recursive_raw_output_seq

[array([[ 4.,  4.,  4.]])]


In [8]:
# gets an input sample and recurrent input and returns all layer outputs
def feed_forward_dense_layer(input_data):
    
    raw_output = np.dot(input_data, w_dense)
    dense_raw_output_seq.append(raw_output)
    dense_output_seq.append(sigmoid(raw_output))

dense_raw_output_seq = []
feed_forward_dense_layer(np.ones((1,recursive_size))) 
print dense_raw_output_seq


[array([[ 3.]])]


In [9]:

# gets the error delta it sent to output and the layer input and returns the delta to pass down and 
# the delta to update its weights
def backprop_dense_layer(error):
    
    error_weighted_derivative = 1* error # being the output dense layer, derivative = 1
    #input_and_error_weighted_derivative = np.dot(layer_input, error_weighted_derivative)
    
    deltas_dense.append(error_weighted_derivative)

deltas_dense = []
backprop_dense_layer(error) 
print deltas_dense

[1]


In [11]:
# backprop through time rnn layer 
# takes: its raw output, all the errors deltas sent to its successors
# returns: the overall error delta to pass to its precedessors
def backprop_recursive_layer(error_to_output, error_to_next_recursive,  layer_raw_output):
    
    # calculate error as coming back from: 1.what was sent to the output, 2.what was sent to the next hidden layer
    error = np.dot(error_to_output, w_dense) + np.dot(error_to_next_recursive, w_previous_recursive)
    # total delta of the layer to pass further down to previous inputing layers: error_weighted_derivative
    error_weighted_derivative = sigmoid_derivative(layer_raw_output)* error 
    deltas_recursive.append(error_weighted_derivative)
    # delta corresponding to input from below layer based on inputs from that layer
    #input_and_error_weighted_derivative = np.dot(layer_input.T, error_weighted_derivative) 
    # delta corresponding to input from previous hidden layer based on inputs from that layer
    #previous_hidden_input_and_error_weighted_derivative = np.dot(layer_previous_hidden_input.T, error_weighted_derivative)
deltas_recursive = []
# assume there was no error sent to next hidden layer
backprop_recursive_layer(ones((1,recursive_size)),  zeros((1,recursive_size)), ones((1,recursive_size))/2)
# assume there was no error sent to next layer (the output dense layer)
backprop_recursive_layer(zeros((1,recursive_size)),  ones((1,recursive_size)), ones((1,recursive_size))/2)

print deltas_recursive


[array([[ 0.75,  0.75,  0.75]]), array([[ 0.75,  0.75,  0.75]])]


In [12]:
# feed forward one sample unit through all layers
def feed_forward_network(input_sample):
    
    feed_forward_recursive_layer(input_sample, recursive_output_seq[-1])
    feed_forward_dense_layer(recursive_output_seq[-1])


recursive_raw_output_seq = []
recursive_output_seq = []
dense_raw_output_seq = []
# init - because it's recursive
recursive_output_seq.append(np.zeros((1,recursive_size)))

feed_forward_network(sample_data)

print recursive_raw_output_seq
print dense_raw_output_seq 



[array([[ 1.,  1.,  1.]])]
[array([[ 2.19317574]])]


In [None]:

# back prop one sample unit through all layers
# containers are full from feeding forward each element and backward up to this point
# it assumed that all elems in the containers before this have been poped out going backward
# should return/fill the updates coresponding to this sample
def back_prop_network(input_sample, dense_layer_output,  ):
    
    out_error = correct_output - layer outputs
    backprop_dense_layer(out_error) # delta dense has been updated 
    backprop_recursive_layer(deltas_dense[-1], deltas_recursive[-1],  layer_raw_output) # delta recursive has been updated


In [None]:
# feed forward one sample unit through all layers
def feed_forward_network_sequence(input_sequence):
    
    for seq_elem in input_sequence:
        feed_forward_network(input_sample)



In [None]:
def back_prop_network_seq(input_sequence, layer_raw_output, out_error):
    
    backprop_dense_layer(out_error)
    backprop_recursive_layer(deltas_dense[-1], deltas_recursive[-1],  layer_raw_output)
    

In [123]:

# back prop one sample unit through all layers
# it assumed that all elems in the containers before this have been poped out going backward
def back_prop_network(input_sample, out_error):
    
    # pop out next elements from the stack of outputs sequence at the  hidden layer
    # last output was inputed into the dense layer
    # last last output was inputed into both the dense layer and the next hidden layer
    dense_layer_input, hidden_layer_input = hidden_layer_output_seq[-1], hidden_layer_output_seq[-2]
    hidden_layer_output_seq = hidden_layer_output_seq[:-1]
    
    # backprop the dense layer and store the update for the weights
    pass_next_deltas, own_weights_deltas = backprop_dense_layer(out_error, input_sample)
    deltas_output.append(own_weights_deltas)
    
    # backprop the hidden layer and store the update for the weights
    backprop_rnn_layer(layer_input, error_to_output, layer_previous_hidden_input, error_to_next_hidden,  layer_raw_output):
        
    pass_next_deltas, input_weights_deltas, prev_hidden_input_weights_deltas = backprop_rnn_layer(
        input_sample, pass_next_deltas, hidden_layer_input, hidden_err_weighted_deriv, hidden_layer_raw_output_seq[bit_idx].T)
    

    batch_error += abs(out_error[0])
    
def train_RNN():
    
    # online learning: network gets updated with each sample on the way
    for i in range(n_samples):
        
        
        input_1_binary, input_2_binary, true_output_binary = generate_random_sample()
        
        # delta updates for this sample
        sum_hidden_layer_deltas = np.zeros_like(weights_hidden)
        sum_previous_hidden_layer_deltas = np.zeros_like(weights_previous_hidden)
        sum_output_layer_deltas = np.zeros_like(weights_output)
        
        # storing outputs in lists
        hidden_layer_raw_output_seq = []
        hidden_layer_output_seq = []
        output_layer_raw_output_seq = []
        output_layer_output_seq = []
        # init
        hidden_layer_output_seq.append(np.zeros((1,hidden_size)))
        
        # forward pass of the bit sequence through the network 
        for bit_idx in range(n_bit - 1, -1, -1):
            input_bits = np.array([[input_1_binary[bit_idx], input_2_binary[bit_idx]]])
            
            hidden_layer_raw_outputs, hidden_layer_outputs = feed_forward_rnn_layer(input_bits, hidden_layer_output_seq[-1])
            hidden_layer_raw_output_seq.append(hidden_layer_raw_outputs)
            hidden_layer_output_seq.append(hidden_layer_outputs)
            
            output_layer_raw_output, output_layer_output = feed_forward_dense_layer(hidden_layer_outputs)
            output_layer_raw_output_seq.append(output_layer_raw_outputs)
            output_layer_output_seq.append(output_layer_outputs)
            
        # backward pass of the bit sequence through the network 
        # first init output error of recurrent layer
        hidden_error_weighted_derivative = np.zeros((1,hidden_size))
        for bit_idx in range(n_bit - 1, -1, -1):
            
            input_bits = np.array([[input_1_binary[bit_idx], input_2_binary[bit_idx]]])
            out_error = np.array([true_output_binary[bit_idx]]) - output_layer_output_seq[bit_idx]
            
            error_weighted_deriv_out, input_and_error_weighted_deriv_out = 
                backprop_dense_layer(out_error, hidden_layer_output_seq[bit_idx].T)
            sum_output_layer_deltas += input_and_error_weighted_deriv_out

            hidden_err_weighted_deriv, hidden_input_and_err_weighted_deriv, previous_hidden_input_and_err_weighted_deriv = backprop_rnn_layer(
                input_bits, error_weighted_deriv_out, hidden_layer_output_seq[bit_idx-1].T, hidden_err_weighted_deriv, hidden_layer_raw_output_seq[bit_idx].T)
            sum_hidden_layer_deltas += hidden_input_and_err_weighted_deriv
            sum_previous_hidden_layer_deltas += previous_hidden_input_and_err_weighted_deriv
            
            batch_error += abs(out_error[0])
 
        # updating weights for this sample
        weights_hidden += (sum_hidden_layer_deltas * learning_rate)
        weights_previous_hidden += (sum_previous_hidden_layer_deltas * learning_rate)
        weights_output += (sum_output_layer_deltas * learning_rate)
        
        errors = np.array(true_output_binary) - np.array([sigmoid(x.tolist()[0][0]) for x in output_layer_output_seq])
        batch_error += sum([abs(x) for x in errors])/n_bit

        if (i % 1000) == 0: 
            print 100*'#' + " sample {} ".format(i)
            print " Training sample: {0} + {1} = {2}".format(input_1, input_2, true_output)
            #print " Binary version: {0} + {1} = {2}".format(input_1_binary, input_2_binary, true_output_binary)
            result = [sigmoid(x.tolist()[0][0]) for x in output_layer_output_seq]
            print " Result is {}".format( sum([pow(2,n_bit-i-1)*round(result[i]) for i in range(n_bit)]))
            #print result

            print " Average binarry error for this batch is {}".format(batch_error/8000)   
            batch_error = 0


SyntaxError: invalid syntax (<ipython-input-123-95d0901842bf>, line 64)

In [103]:
a = np.array([1,2,3])

a[-2]

2

In [82]:
sigmoid_derivative(np.ones((1,hidden_size)))

array([[ 0.,  0.,  0.]])

In [92]:
ones((1,hidden_size))/2

array([[ 0.5,  0.5,  0.5]])

In [None]:
# gets an input sample and recurrent input and returns all layer outputs
def feed_forward_lstm_layer(input_data, previous_hidden_layer_output):
    
    hidden_layer_raw_outputs = np.dot(input_data, w_hidden) + np.dot(previous_hidden_layer_output, w_previous_hidden)
    hidden_layer_outputs = sigmoid(hidden_layer_outputs)
    
    return hidden_layer_raw_outputs, hidden_layer_outputs

# backprop through time rnn layer   
def backprop_lstm_layer(layer_input, error_to_output, layer_previous_hidden_input, error_to_next_hidden,  layer_raw_output):
    
    # calculate error as coming back from: 1.what was sent to the output, 2.what was sent to the next hidden layer
    error = np.dot(error_to_output, weights_output.T) + np.dot(error_to_next_hidden, weights_previous_hidden)
    error_weighted_derivative = sigmoid_derivative(layer_raw_output)* error
    layer_input_and_error_weighted_derivative = np.dot(layer_input, error_weighted_derivative)
    previous_hidden_input_and_error_weighted_derivative = np.dot(layer_input, error_weighted_derivative)
    
    return error_weighted_derivative, layer_input_and_error_weighted_derivative, previous_hidden_input_and_error_weighted_derivative
    
