In [77]:
import numpy as np
import copy

n_samples = 2
np.random.seed(1)


input_dim = 2
output_dim = 1
n_bit = 3
hidden_size = 3
learning_rate = .1

largest_input_number = pow(2, n_bit) / 2
weights_hidden1 = np.random.standard_normal(size=(input_dim, hidden_size))
weights_previous_hidden1 = np.random.standard_normal(size=(hidden_size, hidden_size))
weights_output1 = np.random.standard_normal(size=(hidden_size, output_dim))

samples = [(np.random.randint(0, largest_input_number), np.random.randint(0, largest_input_number))]
samples.append((np.random.randint(0, largest_input_number), np.random.randint(0, largest_input_number)))
samples


[(1, 1), (1, 3)]

In [212]:
'''
Simple RNN for adding 2 numbers in binary
'''
weights_hidden = weights_hidden1.copy()
weights_previous_hidden = weights_previous_hidden1.copy()
weights_output = weights_output1.copy()


def sigmoid(x): return (1 / (1 + np.exp(-x)))
def sigmoid_derivative(x): return x * (1 - x)

batch_error = 0

# online learning: network gets updated with each sample on the way
for i in range(n_samples):

    # generate 2 random numbers and their sum
    input_1, input_2 = samples[i]
    true_output = input_1 + input_2
 
    print 100*'#' + " sample {} ".format(i)
    print " Training sample: {0} + {1} = {2}".format(input_1, input_2, true_output)

    batch_error = 0
    
    # calculate the binaries
    input_1_binary, input_2_binary, true_output_binary = [int(x) for x in np.binary_repr(input_1, n_bit)], [int(x) for x
                                in np.binary_repr(input_2, n_bit)], [int(x) for x in np.binary_repr(true_output, n_bit)]

    # we'll append the outputs at each layer on the way..
    hidden_layer_output_seq = []
    hidden_layer_output_seq.append(np.zeros((1,hidden_size)))
    output_layer_output_seq = []

    # forward pass of the bit sequence through the network and accumulating the errors at each bit position
    for bit_idx in range(n_bit - 1, -1, -1):
        
        input_bits = np.array([[input_1_binary[bit_idx], input_2_binary[bit_idx]]])
        hidden_layer_outputs = sigmoid(np.dot(input_bits, weights_hidden) + np.dot(hidden_layer_output_seq[-1], weights_previous_hidden))
        #print "hidden layer outputs"
        #print hidden_layer_outputs
        #print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
        #print input_bits
        #print weights_hidden
        #print hidden_layer_output_seq[-1]
        #print weights_previous_hidden
        output_layer_output = sigmoid(np.dot(hidden_layer_outputs, weights_output))
        #print "output layer outputs"
        #print output_layer_output
        #print "-------------------------"

        hidden_layer_output_seq.append(copy.deepcopy(hidden_layer_outputs))
        output_layer_output_seq.append(copy.deepcopy(output_layer_output))
        
    
    previous_hidden_layer_error_weighted_derivative = np.zeros((1,hidden_size))
    # append one more zero array for going backwards
    
    # sum of the derivative of the outputs at the corresponding layers weighted by the errors, for each pair of input bits
    sum_hidden_layer_updates = np.zeros_like(weights_hidden)
    sum_previous_hidden_layer_updates = np.zeros_like(weights_previous_hidden)
    sum_output_layer_updates = np.zeros_like(weights_output)

    # rolling back from the last bit to the first
    hidden_layer_output_seq.reverse()
    #hidden_layer_output_seq = hidden_layer_output_seq[1:]
    output_layer_output_seq.reverse()
    #print hidden_layer_output_seq
    for bit_idx in range(n_bit):
               
        # take output error at this position -> size(output_dim)
        output_error = np.array([true_output_binary[bit_idx]]) - output_layer_output_seq[bit_idx]
        print "output layer: {}".format(output_layer_output_seq[bit_idx])
        # calculate output derivative weighted by the output errors -> size(output_dim)
        output_error_weighted_derivative = sigmoid_derivative(output_layer_output_seq[bit_idx])* output_error
        
        # sum the output_error_weighted_derivative for each element in the sequence weighted by the size of inputs int this layer -> (hidden_size, output_dim)
        sum_output_layer_updates += np.dot(hidden_layer_output_seq[bit_idx].T, output_error_weighted_derivative)
        #rint "out delta"
        #rint np.dot(hidden_layer_output_seq[bit_idx].T, output_error_weighted_derivative).T
        # calculate hidden error as coming from: 1.what was sent to the output, 2.what was sent to the next hidden layer
        #  -> (output_dim)* (hidden_size, output_dim) + (hidden_size)*(hidden_size, hidden_size) = (hidden_size)
        hidden_error = np.dot(output_error_weighted_derivative, weights_output.T) + np.dot(previous_hidden_layer_error_weighted_derivative, 
                                                                                           weights_previous_hidden)

        # calculate hidden outputs derivatives weighted by hidden errors ->(hidden_size) * (hidden_size) = (hidden_size)
        #print np.dot(output_error_weighted_derivative, weights_output.T)
        #print np.dot(previous_hidden_layer_error_weighted_derivative, weights_previous_hidden)
        hidden_error_weighted_derivative = sigmoid_derivative(hidden_layer_output_seq[bit_idx])* hidden_error
        # print hidden_error_weighted_derivative.shape
        
        # sum the output_error_weighted_derivative for each element in the sequence, weighted by the size of the inputs -> (input_dim, hidden_size)
        sum_hidden_layer_updates += np.dot(np.array([[input_1_binary[bit_idx], input_2_binary[bit_idx]]]).T, hidden_error_weighted_derivative)
        #rint "hidden delta"
        #rint np.dot(np.array([[input_1_binary[bit_idx], input_2_binary[bit_idx]]]).T, hidden_error_weighted_derivative)
        # sum the hidden_error_weighted_derivative for each element in the sequence, weighted by the size of the inputs -> (hidden_size, hidden_size)
        sum_previous_hidden_layer_updates += np.dot(hidden_layer_output_seq[bit_idx + 1].T, hidden_error_weighted_derivative)
        #print "prev hidden output"
        #print hidden_layer_output_seq[bit_idx + 1]
        #print "prev hidden delta"
        #print np.dot(hidden_layer_output_seq[bit_idx + 1].T, hidden_error_weighted_derivative)
        # propagating the hidden layer error back to
        previous_hidden_layer_error_weighted_derivative = hidden_error_weighted_derivative
        
        # just accumulating error for printing
        batch_error += abs(output_error[0])
        #print "*"*120
    # updating weights for this sample
    print (sum_output_layer_updates * learning_rate).T
    print (sum_hidden_layer_updates * learning_rate)
    print (sum_previous_hidden_layer_updates * learning_rate)
    weights_hidden += (sum_hidden_layer_updates * learning_rate)
    weights_previous_hidden += (sum_previous_hidden_layer_updates * learning_rate)
    weights_output += (sum_output_layer_updates * learning_rate)
    
    errors = np.array(true_output_binary) - np.array([x.tolist()[0][0] for x in output_layer_output_seq])
    batch_error += sum([abs(x) for x in errors])/n_bit


#################################################################################################### sample 0 
 Training sample: 1 + 1 = 2
output layer: [[ 0.23025928]]
output layer: [[ 0.24095597]]
output layer: [[ 0.30072392]]
[[ 0.00299557  0.00224778  0.0022617 ]]
[[ 0.00068787  0.00074533  0.00018759]
 [ 0.00068787  0.00074533  0.00018759]]
[[ -1.16328202e-03  -3.56168577e-04  -8.98100720e-04]
 [ -1.08546660e-03  -3.26680035e-04  -8.46291755e-04]
 [  9.61923361e-05   8.27670345e-06   1.05176880e-04]]
#################################################################################################### sample 1 
 Training sample: 1 + 3 = 4
output layer: [[ 0.2684822]]
output layer: [[ 0.33647413]]
output layer: [[ 0.30132626]]
[[ 0.00176992  0.00042192  0.00222843]]
[[ 0.00183742  0.00038005  0.00023555]
 [ 0.0025249   0.00109878  0.00039585]]
[[ -1.24767834e-03   2.06178448e-04  -8.42943141e-04]
 [ -2.36315560e-03  -3.43271436e-06  -1.45260695e-03]
 [ -1.02679193e-04   1.91382406e-0

In [216]:


from numpy import ones, zeros, zeros_like, log, clip

######################################### THE DATA #########################################


n_samples = 2
print_every = 1
n_bit = 3
largest_input_number = pow(2, n_bit)/2

def generate_random_sample((input_1, input_2)):
    # generate 2 random numbers and their sum
    
    true_output = input_1 + input_2

    print " Training sample: {0} + {1} = {2}".format(input_1, input_2, true_output)

    # calculate the binaries
    input_1_binary = [int(x) for x in np.binary_repr(input_1, n_bit)]
    input_2_binary = [int(x) for x in np.binary_repr(input_2, n_bit)]
    true_output_binary = [int(x) for x in np.binary_repr(true_output, n_bit)]

    return list(reversed(input_1_binary)), list(reversed(input_2_binary)), list(reversed(true_output_binary))

############################################# THE RNN #############################################

# RNN params
input_dim = 2
output_dim = 1
recursive_size = hidden_size


# RNN weights
# simple RNN with one recurent hidden layer and one output layer


weights = { # hidden layer weights
           "recursive": weights_hidden1.copy(),
           "previous_recursive": weights_previous_hidden1.copy(),
           "recursive_bias": zeros((1, recursive_size)),
            # output layer weights
           "dense":weights_output1.copy(),
           "dense_bias": zeros((1,output_dim)),
            # the associated metrics with this set of weights' values
            "log_loss":0
          }

# RNN Functions

# first thing first - what do we measure?
def logloss(target, predicted, eps=1e-15): return log(1-clip(predicted, eps, 1-eps))*(target-1) - log(clip(predicted, eps, 1-eps))*target
# compute the loss for a sequence of target and predicted values
def compute_loss_seq(targets, predicted):
    assert len(targets) == len(predicted)
    return np.mean([logloss(x[0], x[1]) for x in np.stack([targets, predicted], 1)])

# util math functions
def sigmoid(x):
    return (1 / (1 + np.exp(-x)))
def sigmoid_derivative(x): return x * (1 - x)


# gets an input sample and recurrent input and returns all layer outputs

def feed_forward_recursive_layer(inputs, weights):  # input_data, previous_recursive_layer_output):

    raw_outputs = np.dot(inputs["from_previous"], weights["recursive"]) + np.dot(
        inputs["from_recursive"], weights["previous_recursive"]) + weights["recursive_bias"]
    
    #print "**************************"
    #print inputs["from_previous"]
    #print weights["recursive"]
    return {"activation": sigmoid(raw_outputs)}

# gets an input sample and recurrent input and returns all layer outputs
def feed_forward_dense_layer(inputs, weights):
    #print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
    #print inputs["from_previous"]
    #print weights["dense"]
    raw_output = np.dot(inputs["from_previous"], weights["dense"]) + weights["dense_bias"]

    return {"activation": sigmoid(raw_output)}

# feed forward one sample unit through all layers
def feed_forward_network(inputs, weights):
    recursive_layer_outputs = feed_forward_recursive_layer(inputs, weights)
    dense_layer_outputs = feed_forward_dense_layer({"from_previous": recursive_layer_outputs["activation"]}, weights)

    return {"from_dense": dense_layer_outputs, "from_recursive": recursive_layer_outputs}


# feeds forward a sequence of samples..
def feed_forward_network_sequence(inputs_seq, weights):
    all_samples_output_seq = [{"from_recursive": {"activation":zeros((1, recursive_size))}}]
    for input_unit in inputs_seq:
        input_unit["from_recursive"] = all_samples_output_seq[-1]["from_recursive"]["activation"]
        all_samples_output_seq.append(feed_forward_network(input_unit, weights))

    return all_samples_output_seq[1:]

# gets the error delta it sent to output and the layer input and returns the delta to pass down and
# the delta to update its weights
def backprop_dense_layer(inputs, outputs, errors, weights):
    # delta at this layer
    total_delta = sigmoid_derivative(outputs["activation"]) * errors["to_output"]  
    input_w_delta = np.dot(inputs["from_previous"].T, total_delta)

    return {"total_delta": total_delta, "input_w_delta": input_w_delta}

# backprop through time rnn layer
# takes: its raw output, all the errors deltas sent to its successors
# returns: the overall error delta to pass to its precedessors and the deltas to update its own weights
def backprop_recursive_layer(inputs, outputs, errors,
                             weights):  # error_to_output, error_to_next_recursive,  layer_raw_output):
    
    # calculate error as coming back from: 1.what was sent to the output, 2.what was sent to the next hidden layer
    error = np.dot(errors["to_output"], weights["dense"].T) + np.dot(errors["to_next_recursive"],weights["previous_recursive"])
    #print "errors to  output:"
    #print np.dot(errors["to_output"], weights["dense"].T)
    #print "errors to next recursive:"
    #print np.dot(errors["to_next_recursive"],weights["previous_recursive"])
    
    
    # total delta of the layer to pass further down to previous inputing layers: error_weighted_derivative of output
    total_delta = sigmoid_derivative(outputs["activation"]) * error
    #print "total delta to pass down to next recursive"
    #print total_delta
    # delta corresponding to input from below layer based on inputs from that layer
    input_w_delta = np.dot(inputs["from_previous"].T, total_delta)
    # delta corresponding to input from previous hidden layer based on inputs from that layer
    recursive_w_delta = np.dot(inputs["from_recursive"].T, total_delta)
    
    return {"total_delta": total_delta, "recursive_w_delta": recursive_w_delta, "input_w_delta": input_w_delta}

# back prop one sample unit through all layers
# because it's recursive it takes possible deltas from successor samples feeded forward, just as the feed forward takes recursive
# outputs from previous samples
# should return/fill the updates coresponding to this sample
def back_prop_network(inputs, all_layer_outputs, target, next_sample_deltas, weights):
    
    inputs_dense = {"from_previous": all_layer_outputs["from_recursive"]["activation"]}
    outputs_dense = all_layer_outputs["from_dense"]
    errors_dense = {"to_output": target - all_layer_outputs["from_dense"]["activation"]}
    dense_deltas = backprop_dense_layer(inputs_dense, outputs_dense, errors_dense, weights)

    inputs_recursive = inputs
    outputs_recursive = all_layer_outputs["from_recursive"]
    errors_recursive = {"to_output": dense_deltas["total_delta"],
                        "to_next_recursive": next_sample_deltas["recursive_deltas"]["total_delta"]}
    recursive_deltas = backprop_recursive_layer(inputs_recursive, outputs_recursive, errors_recursive, weights)
    
    '''
    print "inputs"
    print inputs
    print "output"
    
    print "all deltas " + "-"*20
    print dense_deltas['input_w_delta'].T
    print recursive_deltas['input_w_delta']
    print recursive_deltas['recursive_w_delta']
    
    '''
    #print all_layer_outputs["from_dense"]
    #print "-"*20


    return {"dense_deltas": dense_deltas, "recursive_deltas": recursive_deltas}

# back propagates a sequence of samples - we don't pass delta from previous sequence here
def back_prop_network_sequence(inputs_seq, outputs_seq, target_seq, weights):
    # dense deltas are not going to be used so no init is needed
    init_recursive_deltas = {"total_delta": zeros((1, recursive_size)),
                             "recursive_w_delta": zeros_like(weights["previous_recursive"]),
                             "input_w_delta": zeros_like(weights["recursive"])}
    init_dense_deltas = {"total_delta": 0, "input_w_delta": zeros_like(weights["dense"])}
    all_deltas_seq = [{"dense_deltas": init_dense_deltas, "recursive_deltas": init_recursive_deltas}]

    for i in range(1, len(inputs_seq)+1):
        #print "output: {}".format(outputs_seq[-i]["from_dense"])
        deltas = back_prop_network(inputs_seq[-i], outputs_seq[-i], target_seq[-i], all_deltas_seq[-1], weights)
        all_deltas_seq.append(deltas.copy())


    # compute loss for the whole sequence
    weights["log_loss"] += compute_loss_seq(target_seq, [x['from_dense']['activation'][0][0] for x in outputs_seq])

    return all_deltas_seq[1:]


# update weights with a seq  of deltas coresponding to a sequence of inputs
# also compute the log loss of the previous set of weights
def update_network_weights(all_deltas_seq, weights):
    for all_deltas in all_deltas_seq:
        
        #print learning_rate * all_deltas["recursive_deltas"]["recursive_w_delta"] 
        #print learning_rate * np.clip(all_deltas["recursive_deltas"]["input_w_delta"], -100, 100)
        #print learning_rate * np.clip(all_deltas["recursive_deltas"]["recursive_w_delta"], -100, 100)

        weights["recursive"] += learning_rate * np.clip(all_deltas["recursive_deltas"]["input_w_delta"], -100, 100)

        #weights["recursive_bias"] += learning_rate * np.clip(all_deltas["recursive_deltas"]["total_delta"], -100, 100)
        weights["dense"] += learning_rate * np.clip(all_deltas["dense_deltas"]["input_w_delta"], -100, 100)
        #weights["dense_bias"] -= learning_rate * np.clip(all_deltas["dense_deltas"]["total_delta"], -10, 10)
        weights["previous_recursive"] += learning_rate * all_deltas["recursive_deltas"]["recursive_w_delta"] 


def train_net(weights):
    
    for i in range(n_samples):

        input_1_binary, input_2_binary, target_binary = generate_random_sample(samples[i])


        input_seq = [{"from_previous": np.array([x]), "from_recursive": zeros((1, recursive_size))} for x in
                     zip(input_1_binary, input_2_binary)]

        net_outputs = feed_forward_network_sequence(input_seq, weights)
        #for k in range(len(net_outputs)):
            #print net_outputs[k]['from_recursive']
            #print net_outputs[k]['from_dense']

        net_deltas = back_prop_network_sequence(input_seq, net_outputs, target_binary, weights)
        #for delta net_deltas
        update_network_weights( net_deltas, weights)
            

        if i % print_every ==0:
            print "*"*120
            print logloss
    
train_net(weights)


 Training sample: 1 + 1 = 2
************************************************************************************************************************
<function logloss at 0x7f7985a8e938>
 Training sample: 1 + 3 = 4
************************************************************************************************************************
<function logloss at 0x7f7985a8e938>


In [None]:

# change 1 - plus  delta updates
# change 2 - no bias
# change 3 - no clip
