# RNNs
Playing with RNNs

In [3]:
# If using one or multiple GPUs
import os
os.environ["CUDA_VISIBLE_DEVICES"]=""

In [4]:
import tensorflow as tf
import numpy as np

## Custom RNN cells
I'll first create my own basic RNN cells to test my understanding of how they work.

### Basic RNN cell, naive input
This is a simple RNN cell that simply applies linear transformations to the state and input at time t, adds them together, and applies the `tanh` non-linearity (as depicted in [Chris Olah's blog post](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)):

![image](./rnn/SimpleRNN.png)

In [18]:
class RNN_cell:
    def __init__(self, x, num_hidden, num_output, activation='tanh'):
        # Get input shape
        input_shape = x.get_shape().as_list()
        batch_size = input_shape[0]
        input_size = input_shape[1] # assume flattened
        
        # Set weight matrices
        self.W_xh = tf.Variable(tf.truncated_normal([input_size, num_hidden]), name='W_xh')
        self.W_hh = tf.Variable(tf.truncated_normal([num_hidden, num_hidden]), name='W_hh')
        self.W_hy = tf.Variable(tf.truncated_normal([num_hidden, num_output]), name='W_hy')
        
        # Define hidden state and input
        self.state = tf.placeholder(tf.float32, shape=[batch_size, num_hidden], name='hidden_state')
        self.zero_state = np.zeros([batch_size, num_hidden])
        self.current_state = self.zero_state
        
        # Define computations
        self.x = x
        self.h1 = tf.matmul(self.x, self.W_xh) # input transform
        self.h2 = tf.matmul(self.state, self.W_hh) # hidden transform
        self.s = tf.tanh(tf.add(self.h1, self.h2)) # update state
        self.y = tf.matmul(self.s, self.W_hy)
        
        # tf session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
    
    def step(self, x):
        # Compute output and new state
        y, new_state = self.sess.run([self.y, self.s], 
                                     feed_dict={self.state: self.current_state, self.x: x})
        # Update current state
        self.current_state = new_state
        
        return y
    
    def out(self, X):
        self.current_state = self.zero_state
        y = []
        for x_t in X:
            y.append(self.step(x_t))
        
        return y

In [19]:
tf.reset_default_graph()

batch_size = 2
input_size = 4
x = tf.placeholder(tf.float32, shape=[batch_size, input_size])
num_hidden = 3
num_output = 1
rnn = RNN_cell(x, num_hidden, num_output)

In [20]:
for _ in range(5):
    x = [[0, 0, 0, 0], [1, 1, 1, 1]]
    y = rnn.step(x)
    print(y)
    print(rnn.current_state)
    print()

[[ 0.        ]
 [-0.72981167]]
[[ 0.          0.          0.        ]
 [ 0.5832566  -0.2486003   0.43648246]]

[[ 0.        ]
 [ 0.18836218]]
[[ 0.          0.          0.        ]
 [ 0.51001525  0.59857416  0.27309674]]

[[ 0.        ]
 [ 0.30364376]]
[[ 0.          0.          0.        ]
 [-0.24466427  0.20748833 -0.41599485]]

[[ 0.        ]
 [-1.24082625]]
[[ 0.          0.          0.        ]
 [ 0.59718233 -0.70386559  0.4279916 ]]

[[ 0.        ]
 [ 0.13348949]]
[[ 0.          0.          0.        ]
 [ 0.77958661  0.69025028  0.59718585]]



In [21]:
# Time major axis
X = np.ones([5, batch_size, input_size])
X[:, 0, :] = 0
y = rnn.out(X)
[print(y_t) for y_t in y]
print(rnn.current_state)

[[ 0.        ]
 [-0.72981167]]
[[ 0.        ]
 [ 0.18836218]]
[[ 0.        ]
 [ 0.30364376]]
[[ 0.        ]
 [-1.24082625]]
[[ 0.        ]
 [ 0.13348949]]
[[ 0.          0.          0.        ]
 [ 0.77958661  0.69025028  0.59718585]]


### Basic RNN cell, sequential input
Rather than passing inputs from individual time steps with shape `[t, batch_size, ...]` through the RNN multiple times, we can instead pass input that has sequences stacked along the batch dimension, i.e. with shape `[t * batch_size, ...]` of the form $\{x^1_1, ..., x^1_n, x^2_1, ..., x^2_n, x^3_1, ...\}$ and then let the RNN cell take care of reshaping into time sequences.

In [25]:
# Sample parameters
batch_size = 3
trace_length = 2

# shape = [batch_size * trace_length, ...] in form shown above
A = np.ones([batch_size * trace_length, 3, 3])
for i in range(2):
    A[i::2, ...] = i
print("A")
print(A) # each batch contains a 3x3 matrix of zeros followed by 3x3 matrix of ones
print()

# reshape to [batch_size, trace_]
B = np.reshape(A, [3, 2, 3, 3])
for i in range(B.shape[0]):
    print("B[%d]" % i)
    print(B[i, 0]) # should print 3x3 matrix of zeros
    print(B[i, 1]) # should print 3x3 matrix of ones

A
[[[ 0.  0.  0.]
  [ 0.  0.  0.]
  [ 0.  0.  0.]]

 [[ 1.  1.  1.]
  [ 1.  1.  1.]
  [ 1.  1.  1.]]

 [[ 0.  0.  0.]
  [ 0.  0.  0.]
  [ 0.  0.  0.]]

 [[ 1.  1.  1.]
  [ 1.  1.  1.]
  [ 1.  1.  1.]]

 [[ 0.  0.  0.]
  [ 0.  0.  0.]
  [ 0.  0.  0.]]

 [[ 1.  1.  1.]
  [ 1.  1.  1.]
  [ 1.  1.  1.]]]

B[0]
[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]
B[1]
[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]
B[2]
[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]


In [37]:
class RNN_cell:
    def __init__(self, x, num_hidden, num_output, activation='tanh'): 
        # Get input shape
        input_shape = x.get_shape().as_list()
        batch_size = input_shape[0]
        trace_length = input_shape[1]
        input_size = input_shape[2] # assume flattened
        self.x = tf.placeholder(tf.float32, shape=[batch_size, input_size]) # input placeholder (or input layer)
        
        # Set weight matrices
        self.W_xh = tf.Variable(tf.truncated_normal([input_size, num_hidden]), name='W_xh')
        self.W_hh = tf.Variable(tf.truncated_normal([num_hidden, num_hidden]), name='W_hh')
        self.W_hy = tf.Variable(tf.truncated_normal([num_hidden, num_output]), name='W_hy')
        
        # Define hidden state and input
        self.state = tf.placeholder(tf.float32, shape=[batch_size, num_hidden], name='hidden_state')
        self.zero_state = np.zeros([batch_size, num_hidden])
        self.current_state = self.zero_state
        
        # Set up main computational graph
        self.h1 = tf.matmul(self.x, self.W_xh) # input transform
        self.h2 = tf.matmul(self.state, self.W_hh) # hidden transform
        self.s = tf.tanh(tf.add(self.h1, self.h2)) # update state
        self.y = tf.matmul(self.s, self.W_hy) # output at time t
        
        # tf session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
    
    def step(self, x):
        # shape(x) = [batch_size, input_size] at time t
        # Compute output and new state
        y, new_state = self.sess.run([self.y, self.s], 
                                     feed_dict={self.state: self.current_state, self.x: x})
        # Update current state
        self.current_state = new_state
        
        return y
    
    def out(self, X):
        # Reset state to zero state
        self.current_state = self.zero_state
        
        # Pass batch input x_t for each time step t
        y = []
        for i in range(X.shape[1]):
            y.append(self.step(X[:, i, ...]))
        
        return y

In [42]:
# Set up RNN cell
tf.reset_default_graph()
batch_size = 2
trace_length = 5
input_size = 4
x = tf.placeholder(tf.float32, shape=[batch_size, trace_length, input_size])
num_hidden = 3
num_output = 1
rnn = RNN_cell(x, num_hidden, num_output)

# Same test run
x = np.random.random([batch_size, trace_length, input_size]) 
y = rnn.out(x)
print(y)
print(rnn.current_state)

[array([[ 1.11198759],
       [ 0.88573521]], dtype=float32), array([[ 1.48039114],
       [ 1.31830537]], dtype=float32), array([[ 1.42966747],
       [ 1.2085216 ]], dtype=float32), array([[ 1.28310418],
       [ 1.6828239 ]], dtype=float32), array([[ 1.30481863],
       [ 1.18696034]], dtype=float32)]
[[ 0.29764178  0.47977298  0.99106669]
 [ 0.02432045  0.49872658  0.99757266]]


## LSTM cell
Now let's try to build an LSTM cell, following this graphical depiction from [Chris Olah's blog post](http://colah.github.io/posts/2015-08-Understanding-LSTMs/):

![image](./rnn/LSTM.png)

A small note: we will be concatenating the input and hidden states rather than simply adding them to be consistent with the notation in this post.

In [4]:
class LSTM_cell:
    def __init__(self, x, num_hidden, activation='tanh'): 
        # Get input shape
        input_shape = x.get_shape().as_list()
        batch_size = input_shape[0]
        trace_length = input_shape[1]
        input_size = input_shape[2] # assume flattened
        self.x = tf.placeholder(tf.float32, 
                                shape=[batch_size, input_size],
                                name='x_t') # input placeholder (or input layer)
        
        # Zero state
        self.zero_state = np.zeros([batch_size, num_hidden])

        # Cell state
        self.cell_state = tf.placeholder(tf.float32,
                                         shape=[batch_size, num_hidden],
                                         name='cell_state')
        self.current_cell_state = self.zero_state

        # Hidden state
        self.hidden_state = tf.placeholder(tf.float32, 
                                           shape=[batch_size, num_hidden], 
                                           name='hidden_state')
        self.current_hidden_state = self.zero_state

        # Forget gate
        with tf.name_scope("forget_gate"):
            self.W_f = tf.Variable(tf.truncated_normal([num_hidden + input_size, num_hidden]),
                                   name='W_f')
            self.b_f = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name="b_f")
            self.f_t = tf.matmul(tf.concat([self.hidden_state, self.x], 1), self.W_f) + self.b_f
            self.f_t = tf.sigmoid(self.f_t, name='f_t')
        
        # Input gate
        with tf.name_scope("input_gate"):
            self.W_i = tf.Variable(tf.truncated_normal([num_hidden + input_size, num_hidden]),
                                   name='W_i')
            self.b_i = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name="b_i")
            self.i_t = tf.matmul(tf.concat([self.hidden_state, self.x], 1), self.W_i) + self.b_i
            self.i_t = tf.sigmoid(self.i_t, name='i_t')
        
        
        # New cell state candidate values
        with tf.name_scope("candidate_values"):
            self.W_c = tf.Variable(tf.truncated_normal([num_hidden + input_size, num_hidden]),
                                   name='W_c')
            self.b_c = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name="b_c")
            self.c_t_ = tf.matmul(tf.concat([self.hidden_state, self.x], 1), self.W_c) + self.b_c
            self.c_t_ = tf.tanh(self.c_t_, name='c_t_')
        
        # Combine forget and input gates to update cell state
        with tf.name_scope("update_cell_state"):
            self.c_t_f = tf.multiply(self.cell_state, self.f_t, name='c_t_f') 
            self.c_t_i = tf.multiply(self.c_t_, self.i_t, name='c_t_i') 
            self.c_t = tf.add(self.c_t_f, self.c_t_i, name='c_t')
        
        # Output gate
        with tf.name_scope("output_gate"):
            self.W_o = tf.Variable(tf.truncated_normal([num_hidden +  input_size, num_hidden]),
                                   name='W_o')
            self.b_o = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name='b_o')
            self.o_t = tf.matmul(tf.concat([self.hidden_state, self.x], 1), self.W_o) + self.b_o
            self.o_t = tf.sigmoid(self.o_t, name='o_t')
        
        # Gate tanh(cell state) with output gate to update hidden state (output)
        with tf.name_scope("update_hidden_state"):
            self.h_t = tf.multiply(tf.tanh(self.c_t), self.o_t, name='h_t')
        
        # tf session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
    
    def step(self, x):
        # shape(x) = [batch_size, input_size] at time t
        # Compute output and new state
        h, c = self.sess.run([self.h_t, self.c_t], 
                             feed_dict={self.x: x,
                                        self.hidden_state: self.current_hidden_state, 
                                        self.cell_state: self.current_cell_state})
        # Update current states
        self.current_hidden_state = h
        self.current_cell_state = c
        
        return h
    
    def out(self, X):
        # Reset state to zero state
        self.current_hidden_state = self.zero_state
        self.current_cell_state = self.zero_state
        
        # Pass batch input x_t for each time step t
        H = []
        for i in range(X.shape[1]):
            H.append(self.step(X[:, i, ...]))
        
        return H

In [6]:
# Set up RNN cell
tf.reset_default_graph()
batch_size = 2
trace_length = 5
input_size = 4
x = tf.placeholder(tf.float32, shape=[batch_size, trace_length, input_size])
num_hidden = 3
rnn = LSTM_cell(x, num_hidden)

# Same test run
x = np.random.random([batch_size, trace_length, input_size]) 
h = rnn.out(x)
print("hidden states:")
for h_t in h: print(h_t)
print("\ncurrent hidden state:")
print(rnn.current_hidden_state)
print("\ncurrent cell state:")
print(rnn.current_cell_state)

hidden states:
[[ 0.36249086  0.03932692  0.03911257]
 [ 0.20759627  0.2687242  -0.01895547]]
[[ 0.10290282 -0.02434576 -0.03873252]
 [ 0.14209694  0.2538662  -0.0651268 ]]
[[ 0.23239931 -0.05251402  0.01512763]
 [ 0.23350267  0.22753493 -0.03943847]]
[[ 0.17305562 -0.07541554 -0.06053226]
 [ 0.33283624  0.10467365  0.08489635]]
[[ 0.39625847  0.02193981  0.01964775]
 [ 0.18227693  0.02154228  0.03121651]]

current hidden state:
[[ 0.39625847  0.02193981  0.01964775]
 [ 0.18227693  0.02154228  0.03121651]]

current cell state:
[[ 0.51729691  0.02303421  0.05977021]
 [ 0.26779944  0.02440291  0.07869279]]


### Visualizing tf graph
Note: this may only work in Chrome.

In [12]:
# Credit: https://stackoverflow.com/questions/38189119/simple-way-to-visualize-a-tensorflow-graph-in-jupyter/38192374#38192374
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [8]:
show_graph(tf.get_default_graph().as_graph_def())

### Unrolling the network
The graph above looks good, but instead of iterating through a `for` loop to compute each time step, let's unroll the network to compute the output at all time steps at once.

In [17]:
class LSTM_layer: 
    def __init__(self, x, num_hidden, activation='tanh'):
        # Get input shape
        input_shape = x.get_shape().as_list()
        self.batch_size = input_shape[0]
        trace_length = input_shape[1]
        self.input_size = input_shape[2] # assume flattened
        self.x = x
        x_series = tf.unstack(x, axis=1)
        
        # Zero state
        self.zero_state = np.zeros([batch_size, num_hidden])
        self.init_hidden_state = tf.placeholder(tf.float32,
                                                shape=[batch_size, num_hidden],
                                                name='init_hidden_state')
        self.init_cell_state = tf.placeholder(tf.float32,
                                              shape=[batch_size, num_hidden],
                                              name='init_cell_state')
        
        # Create shared parameters
        with tf.name_scope("params"):
            self.W_f = tf.Variable(tf.truncated_normal([num_hidden + input_size, num_hidden]),
                                   name='W_f')
            self.b_f = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name="b_f")
            self.W_i = tf.Variable(tf.truncated_normal([num_hidden + input_size, num_hidden]),
                                   name='W_i')
            self.b_i = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name="b_i")
            self.W_c = tf.Variable(tf.truncated_normal([num_hidden + input_size, num_hidden]),
                                   name='W_c')
            self.b_c = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name="b_c")
            self.W_o = tf.Variable(tf.truncated_normal([num_hidden +  input_size, num_hidden]),
                                   name='W_o')
            self.b_o = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name='b_o')
        
        # Unroll network by creating (trace_length) LSTM cells
        h_t = self.init_hidden_state
        c_t = self.init_cell_state
        self.outputs = []
        for t, x_t in enumerate(x_series):
            h_t, c_t = self.LSTM_cell(x_t, h_t, c_t, scope="Cell_%d" % t)
            self.outputs.append(h_t)
        
        # tf session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        
    def LSTM_cell(self, x_t, hidden_state, cell_state, scope="Cell"):
        with tf.name_scope(scope):
            # Forget gate
            with tf.name_scope("forget_gate"):
                W_f = self.W_f # vs. tf.identity(self.W_f)
                b_f = self.b_f # vs. tf.identity(self.b_f)
                f_t = tf.matmul(tf.concat([hidden_state, x_t], 1), W_f) + b_f
                f_t = tf.sigmoid(f_t, name='f_t')

            # Input gate
            with tf.name_scope("input_gate"):
                W_i = self.W_i
                b_i = self.b_i
                i_t = tf.matmul(tf.concat([hidden_state, x_t], 1), W_i) + b_i
                i_t = tf.sigmoid(i_t, name='i_t')


            # New cell state candidate values
            with tf.name_scope("candidate_values"):
                W_c = self.W_c
                b_c = self.b_c
                c_t_ = tf.matmul(tf.concat([hidden_state, x_t], 1), W_c) + b_c
                c_t_ = tf.tanh(c_t_, name='c_t_')

            # Combine forget and input gates to update cell state
            with tf.name_scope("update_cell_state"):
                c_t_f = tf.multiply(cell_state, f_t, name='c_t_f') 
                c_t_i = tf.multiply(c_t_, i_t, name='c_t_i') 
                c_t = tf.add(c_t_f, c_t_i, name='c_t')

            # Output gate
            with tf.name_scope("output_gate"):
                W_o = self.W_o
                b_o = self.b_o
                o_t = tf.matmul(tf.concat([hidden_state, x_t], 1), W_o) + b_o
                o_t = tf.sigmoid(o_t, name='o_t')

            # Gate tanh(cell state) with output gate to update hidden state (output)
            with tf.name_scope("update_hidden_state"):
                h_t = tf.multiply(tf.tanh(c_t), o_t, name='h_t')

            return h_t, c_t
    
    def out(self, X):
        feed_dict = {self.x: X,
                     self.init_hidden_state: self.zero_state,
                     self.init_cell_state: self.zero_state}
        return self.sess.run(self.outputs, feed_dict=feed_dict)

In [18]:
# Set up RNN cell
tf.reset_default_graph()
batch_size = 2
trace_length = 5
input_size = 4
x = tf.placeholder(tf.float32, 
                   shape=[batch_size, trace_length, input_size],
                   name='input_series')
num_hidden = 3
rnn = LSTM_layer(x, num_hidden)

# Same test run
x = np.random.random([batch_size, trace_length, input_size]) 
h = rnn.out(x)
print("hidden states:")
for h_t in h: print(h_t)

hidden states:
[[-0.07845152 -0.0468897  -0.12943874]
 [ 0.02393308  0.01062461  0.01333745]]
[[-0.18784374 -0.04632721 -0.15332571]
 [ 0.13834111 -0.04746186 -0.08977975]]
[[-0.30638924 -0.14702731 -0.2577512 ]
 [ 0.10506167 -0.11854228 -0.17134102]]
[[-0.38088018 -0.01861375 -0.27257231]
 [ 0.19053105 -0.08923865 -0.26499707]]
[[-0.40826714  0.09289505 -0.21621533]
 [ 0.17739724 -0.09497022 -0.2219989 ]]


In [19]:
show_graph(tf.get_default_graph().as_graph_def())

### Implementing backprop
The previous networks were only defined for the forward passes. In order to train them, we need to implement some form of backpropagation through time (BPTT). Rather than backpropagating entire sequences, which can be thousands to tens of thousands steps, backpropagation is often "cut off" after some specified length in a process termed "truncated backpropagation". Say you have a sequence of length $n$ with a truncated backpropagation length of $m$. Every $k_1$ timesteps, truncated backprogation performs BPTT for $k_2$ timesteps. There are two basic approaches:

1) Set $k_1=k_2$. That is, if BPTT is truncated to 10 timesteps backward, then BPTT is only performed every 10 timesteps. In other words, for a sequence $\{x_1, \ldots, x_n\}$, the network learns from sequences $\{x_1, \ldots, x_m\}, \{x_{m+1}, \ldots, x_{2m}\}, \ldots, \{x_{n-m}, \ldots, x_n\}$. This is the approach that TensorFlow uses.

2) Set $k_1=1$. That is, perform BPTT every timestep, regardless of BPTT length. In other words, for a sequence $\{x_1, \ldots, x_n\}$, the network learns from sequences $\{x_1, \ldots, x_m\}, \{x_2, \ldots, x_{m+1}\}, \ldots, \{x_{n-m}, \ldots, x_n\}$.



In [54]:
# Let's define the class to build the graph, not perform computations
class LSTM_layer: 
    def __init__(self, x, num_hidden, activation='tanh'):
        # Get input shape
        input_shape = x.get_shape().as_list()
        self.batch_size = input_shape[0]
        trace_length = input_shape[1]
        self.input_size = input_shape[2] # assume flattened
        x_series = tf.unstack(x, axis=1)
        
        # Initial states
        self.init_hidden_state = tf.placeholder(tf.float32,
                                                shape=[batch_size, num_hidden],
                                                name='init_hidden_state')
        self.init_cell_state = tf.placeholder(tf.float32,
                                              shape=[batch_size, num_hidden],
                                              name='init_cell_state')
        
        # Create shared parameters
        with tf.name_scope("params"):
            self.W_f = tf.Variable(tf.truncated_normal([num_hidden + input_size, num_hidden]),
                                   name='W_f')
            self.b_f = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name="b_f")
            self.W_i = tf.Variable(tf.truncated_normal([num_hidden + input_size, num_hidden]),
                                   name='W_i')
            self.b_i = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name="b_i")
            self.W_c = tf.Variable(tf.truncated_normal([num_hidden + input_size, num_hidden]),
                                   name='W_c')
            self.b_c = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name="b_c")
            self.W_o = tf.Variable(tf.truncated_normal([num_hidden +  input_size, num_hidden]),
                                   name='W_o')
            self.b_o = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_hidden]),
                                   name='b_o')
        
        # Unroll network by creating (trace_length) LSTM cells
        h_t = self.init_hidden_state
        c_t = self.init_cell_state
        self.hidden_states = []
        self.cell_states = []
        for t, x_t in enumerate(x_series):
            h_t, c_t = self.LSTM_cell(x_t, h_t, c_t, scope="Cell_%d" % t)
            self.hidden_states.append(h_t)
            self.cell_states.append(c_t)
        
    def LSTM_cell(self, x_t, hidden_state, cell_state, scope="Cell"):
        with tf.name_scope(scope):
            # Forget gate
            with tf.name_scope("forget_gate"):
                W_f = self.W_f # vs. tf.identity(self.W_f)
                b_f = self.b_f # vs. tf.identity(self.b_f)
                f_t = tf.matmul(tf.concat([hidden_state, x_t], 1), W_f) + b_f
                f_t = tf.sigmoid(f_t, name='f_t')

            # Input gate
            with tf.name_scope("input_gate"):
                W_i = self.W_i
                b_i = self.b_i
                i_t = tf.matmul(tf.concat([hidden_state, x_t], 1), W_i) + b_i
                i_t = tf.sigmoid(i_t, name='i_t')


            # New cell state candidate values
            with tf.name_scope("candidate_values"):
                W_c = self.W_c
                b_c = self.b_c
                c_t_ = tf.matmul(tf.concat([hidden_state, x_t], 1), W_c) + b_c
                c_t_ = tf.tanh(c_t_, name='c_t_')

            # Combine forget and input gates to update cell state
            with tf.name_scope("update_cell_state"):
                c_t_f = tf.multiply(cell_state, f_t, name='c_t_f') 
                c_t_i = tf.multiply(c_t_, i_t, name='c_t_i') 
                c_t = tf.add(c_t_f, c_t_i, name='c_t')

            # Output gate
            with tf.name_scope("output_gate"):
                W_o = self.W_o
                b_o = self.b_o
                o_t = tf.matmul(tf.concat([hidden_state, x_t], 1), W_o) + b_o
                o_t = tf.sigmoid(o_t, name='o_t')

            # Gate tanh(cell state) with output gate to update hidden state (output)
            with tf.name_scope("update_hidden_state"):
                h_t = tf.multiply(tf.tanh(c_t), o_t, name='h_t')

            return h_t, c_t

In [55]:
# Set up input
tf.reset_default_graph()
batch_size = 1
trace_length = 3
input_size = 4
x = tf.placeholder(tf.float32, 
                   shape=[batch_size, trace_length, input_size],
                   name='input_series')

# Add RNN cell
num_hidden = 3
rnn = LSTM_layer(x, num_hidden)

# Add simple softmax output layer
preds = []
num_output = 2
W_p = tf.Variable(tf.truncated_normal([num_hidden, num_output]), name='W_p')
b_p = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_output]), name='b_p')
with tf.name_scope("preds"):
    for h_t in rnn.hidden_states:
        preds.append(tf.nn.softmax(tf.matmul(h_t, W_p) + b_p))

# Add loss function
y = tf.placeholder(tf.float32, shape=[batch_size, trace_length, num_output], 
                         name='y')
y_series = tf.unstack(y, axis=1)
losses = []
with tf.name_scope("losses"):
    for i, [pred_t, y_t] in enumerate(zip(preds, y_series)):
        with tf.name_scope("loss_%d" % i):
            losses.append(tf.reduce_sum(-y_t * tf.log(pred_t)))
    total_loss = tf.reduce_mean(losses, name='total_loss')

# Add optimizer
optimizer = tf.train.RMSPropOptimizer(0.1)
train_step = optimizer.minimize(total_loss)
    
# Grab gradients for interest
    
    
# Add session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Sample backpropagation step
x_batch = np.random.random([batch_size, trace_length, input_size])
y_batch = np.random.random([batch_size, trace_length, num_output])
y_batch[:, :, 1] = 1 - y_batch[:, :, 0]
y_batch = (y_batch > 0.5).astype(int) # creates one-hot vectors
zero_state = np.zeros([batch_size, num_hidden])
feed_dict = {x: x_batch,
             rnn.init_hidden_state: zero_state,
             rnn.init_cell_state: zero_state,
             y: y_batch}
preds_batch, losses_batch, _ = sess.run([preds, losses, train_step], 
                                        feed_dict=feed_dict)
print("preds:")
for p in preds_batch: print(p)

print("labels:")
for y_t in y_batch: print(y_t)
    
print("losses:")
for l in losses_batch: print(l)

preds:
[[ 0.54066342  0.45933658]]
[[ 0.53943419  0.46056589]]
[[ 0.54027724  0.4597227 ]]
labels:
[[0 1]
 [0 1]
 [1 0]]
losses:
0.777972
0.775299
0.615673


Let's verify the gradients that TensorFlow is calculating.

In [None]:
# Try when you dare

Now let's try it on some toy data set. Let's use the Echo-RNN set from [this blog post](https://medium.com/@erikhallstrm/hello-world-rnn-83cd7105b767). 

In [52]:
def generate_data(total_series_length, echo_step, batch_size, trace_length):
    x = np.array(np.random.choice(2, total_series_length, p=[0.5, 0.5]))
    y = np.roll(x, echo_step)
    y[0:echo_step] = 0

    x = x.reshape((batch_size, ))  # The first index changing slowest, subseries as rows
    y = y.reshape((batch_size, -1))

    return (x, y)

In [60]:
# Hyperparameters
num_epochs = 100
total_series_length = 50000
trace_length = 15
input_size = 1
num_hidden = 4
num_classes = 2
echo_step = 3
batch_size = 5
num_batches = total_series_length // batch_size // trace_length

# Set up input
tf.reset_default_graph()
x = tf.placeholder(tf.float32, 
                   shape=[batch_size, trace_length, input_size],
                   name='input_series')

# Add RNN cell
num_hidden = 3
rnn = LSTM_layer(x, num_hidden)

# Add simple softmax output layer
preds = []
num_output = 2
W_p = tf.Variable(tf.truncated_normal([num_hidden, num_output]), name='W_p')
b_p = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1, num_output]), name='b_p')
with tf.name_scope("preds"):
    for h_t in rnn.hidden_states:
        preds.append(tf.nn.softmax(tf.matmul(h_t, W_p) + b_p))

# Add loss function
y = tf.placeholder(tf.float32, shape=[batch_size, trace_length, num_output], 
                         name='y')
y_series = tf.unstack(y, axis=1)
losses = []
with tf.name_scope("losses"):
    for i, [pred_t, y_t] in enumerate(zip(preds, y_series)):
        with tf.name_scope("loss_%d" % i):
            losses.append(tf.reduce_sum(-y_t * tf.log(pred_t)))
    total_loss = tf.reduce_mean(losses, name='total_loss')

# Add optimizer
optimizer = tf.train.RMSPropOptimizer(0.1)
train_step = optimizer.minimize(total_loss)
    
# Grab gradients for interest
    
    
# Add session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Training
current_hidden_state = np.zeros([batch_size, num_hidden])
current_cell_state = np.zeros([batch_size, num_hidden])
for epoch in range(num_epochs):
    # Get data
    x_batch = np.array(np.random.choice(2, total_series_length, p=[0.5, 0.5]))
    y_batch = np.roll(x_batch, echo_step)
    y_batch[0:echo_step] = 0
    x_batch = np.reshape(x_batch, [batch_size, trace_length])
    y_batch = y_batch.reshape([batch_size, trace_length])

    # Train step
    feed_dict = {x: x_batch,
                 rnn.init_hidden_state: current_hidden_state,
                 rnn.init_cell_state: current_cell_state,
                 y: y_batch}
    current_hidden_state, current_cell_state, total_loss_, _ \
        = sess.run([rnn.hidden_states[-1], rnn.cell_states[-1], total_loss, train_step], 
                   feed_dict=feed_dict)                         

    print("Epoch: %d, loss: %.2f" % (total_loss_))

ValueError: cannot reshape array of size 50000 into shape (5,15)