[YouTube Video](https://www.youtube.com/watch?v=ElmBrKyMXxs)

[GitHub Code](https://github.com/llSourcell/seq2seq_model_live/blob/master/2-seq2seq-advanced.ipynb)

In [1]:
import numpy as np      # Matrix Math
import tensorflow as tf # ML

## ETL Stuff
These functions come from Siraj's helpers.py file.  I've copied them explicitly into this notebook for quick reference later on in life! 

### Make Data Batches
This formats the data so our neural network can use it.

In [2]:
def batch(inputs, max_sequence_length=None):
    """
    Args:
        inputs:
            list of sentences (integer lists)
        max_sequence_length:
            integer specifying how large should `max_time` dimension be.
            If None, maximum sequence length would be used
    
    Outputs:
        inputs_time_major:
            input sentences transformed into time-major matrix 
            (shape [max_time, batch_size]) padded with 0s
        sequence_lengths:
            batch-sized list of integers specifying amount of active 
            time steps in each input sequence
    """
    
    sequence_lengths = [len(seq) for seq in inputs]
    batch_size = len(inputs)
    
    if max_sequence_length is None:
        max_sequence_length = max(sequence_lengths)
    
    inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32) # == PAD
    
    for i, seq in enumerate(inputs):
        for j, element in enumerate(seq):
            inputs_batch_major[i, j] = element

    # [batch_size, max_time] -> [max_time, batch_size]
    inputs_time_major = inputs_batch_major.swapaxes(0, 1)

    return inputs_time_major, sequence_lengths


### Generate Random Sequences

In [3]:
def random_sequences(length_from, length_to,
                     vocab_lower, vocab_upper,
                     batch_size):
    """ Generates batches of random integer sequences,
        sequence length in [length_from, length_to],
        vocabulary in [vocab_lower, vocab_upper]
    """
    if length_from > length_to:
            raise ValueError('length_from > length_to')

    def random_length():
        if length_from == length_to:
            return length_from
        return np.random.randint(length_from, length_to + 1)
    
    while True:
        yield [
            np.random.randint(low=vocab_lower,
                              high=vocab_upper,
                              size=random_length()).tolist()
            for _ in range(batch_size)
        ]

In [4]:
# Reset the Graph Stack
tf.reset_default_graph()
sess = tf.InteractiveSession()



## Sequence Padding
We want to treat sequences of varying lengths, but our RNNs expect sequences of equal length.  To remedy this, we zero-pad sequences up to a predefined length (and, in some cases, truncate sequences to this length).  To further help our model, we define an end-of-sentence (EOS) token.

In [30]:
PAD=0
EOS=1
vocab_size = 10           # number of unique words
input_embedding_size = 20 # character length

encoder_hidden_units = 20
decoder_hidden_units = encoder_hidden_units*2
# Usually we make the number decoder and encoder units the same. 
# Here, we want the output value to be similar to the input, but we want it to change 
#   a little bit. This can be achieved by having more hidden units in the decoder.


In [6]:
# Placeholders (gateways for data into our computation graph)
encoder_inputs = tf.placeholder(shape=(None,None), dtype=tf.int32, name="encoder_inputs")
encoder_inputs_length = tf.placeholder(shape=None, dtype=tf.int32, name="encoder_inputs_length")
decoder_targets = tf.placeholder(shape=(None,None), dtype=tf.int32, name="decoder_targets")



In [34]:
# Embeddings
# Randomly initialize an embedding matrix that can take the input sequence of integers
#   and transform it into a sequence of embedded vectors that can then be fed into our 
#   encoder.
embeddings = tf.Variable(tf.random_uniform([vocab_size, input_embedding_size], -1.0, 1.0), tf.float32)
encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)

In [11]:
tf.__version__

'1.1.0'

In [13]:
# Encoder
#  Note that a lot of things have moved around in TensorFlow.  For example, Siraj is using
#  TensorFlow version xxx, in which case LSTMCell is found in tf.python.ops.rnn_cell.  I am
#  using TensorFlow version 1.0.0, where LSTMCell is found in tf.contrib.rnn.
if tf.__version__ in ('1.0.0', '1.1.0'): from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple
if tf.__version__=='0.1.0': from tensorflow.python.ops.rnn_cell import LSTMCell, LSTMStateTuple

In [40]:
# above we specified that encoder_hidden_units=20, which here translates to defining
#   an "Encoder Cell" of 20 LSTM cells
encoder_cell = LSTMCell(encoder_hidden_units, reuse=True)
#encoder_cell_bw = LSTMCell(encoder_hidden_units, reuse=None)

# Here we create a dynamic, bidirectional RNN.
#
# Dynamic, Bidirectional..: 
# The difference between a static and dynamic RNN is that a static RNN only takes
#   the past into account. This constraint makes sense for predicting future elements
#   in a sequence, such as stock prices, etc.  However, in terms of translating a corpus
#   of text in one language to another, it makes no sense to restrict oneself like that 
#   (unless the objective is to predict unknown, upcoming words/sentences.)
#
# Bidirectional:
#  
# Siraj Says:
# Seriously, if you can afford the additional computation time, then it is almost always
#   better to use a dynamic, bidirectional RNN over a static RNN. For almost any pattern,
#   knowing about the future and the past helps understand the pattern, and will help
#   the NLP task.

((encoder_fw_outputs, encoder_bw_outputs), 
 (encoder_fw_final_state, encoder_bw_final_state)) = (
    tf.nn.bidirectional_dynamic_rnn(
        cell_fw = encoder_cell,
        cell_bw = encoder_cell,
        inputs=encoder_inputs_embedded,
        sequence_length=encoder_inputs_length,
        dtype=tf.float32, 
        time_major=True
    )
)

ValueError: Attempt to reuse RNNCell <tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl.LSTMCell object at 0x7f4c1a3a4c18> with a different variable scope than its first use.  First use of cell was with scope 'bidirectional_rnn/fw/lstm_cell', this attempt is with scope 'bidirectional_rnn/bw/lstm_cell'.  Please create a new instance of the cell if you would like it to use a different set of weights.  If before you were using: MultiRNNCell([LSTMCell(...)] * num_layers), change to: MultiRNNCell([LSTMCell(...) for _ in range(num_layers)]).  If before you were using the same cell instance as both the forward and reverse cell of a bidirectional RNN, simply create two instances (one for forward, one for reverse).  In May 2017, we will start transitioning this cell's behavior to use existing stored weights, if any, when it is called with scope=None (which can lead to silent model degradation, so this error will remain until then.)

With Siraj's original code:
ValueError: Attempt to have a second RNNCell use the weights of a variable scope that already has weights: 'bidirectional_rnn/fw/lstm_cell'; and the cell was not constructed as LSTMCell(..., reuse=True).  To share the weights of an RNNCell, simply reuse it in your second calculation, or create a new one with the argument reuse=True.



encoder_cell = LSTMCell(encoder_hidden_units)
  --> encoder_cell = LSTMCell(encoder_hidden_units, reuse=True)

ValueError: Attempt to reuse RNNCell <tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl.LSTMCell object at 0x7f4c1afc2a90> with a different variable scope than its first use.  First use of cell was with scope 'bidirectional_rnn/fw/lstm_cell', this attempt is with scope 'bidirectional_rnn/bw/lstm_cell'.  Please create a new instance of the cell if you would like it to use a different set of weights.  If before you were using: MultiRNNCell([LSTMCell(...)] * num_layers), change to: MultiRNNCell([LSTMCell(...) for _ in range(num_layers)]).  If before you were using the same cell instance as both the forward and reverse cell of a bidirectional RNN, simply create two instances (one for forward, one for reverse).  In May 2017, we will start transitioning this cell's behavior to use existing stored weights, if any, when it is called with scope=None (which can lead to silent model degradation, so this error will remain until then.)


encoder_cell = LSTMCell(encoder_hidden_units, reuse=True)
  --> encoder_cell1 = LSTMCell(encoder_hidden_units)
  --> encoder_cell2 = LSTMCell(encoder_hidden_units)
  
ValueError: Attempt to have a second RNNCell use the weights of a variable scope that already has weights: 'bidirectional_rnn/fw/lstm_cell'; and the cell was not constructed as LSTMCell(..., reuse=True).  To share the weights of an RNNCell, simply reuse it in your second calculation, or create a new one with the argument reuse=True.



In [None]:
# The Bidirectional Step
#...encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2)
encoder_final_state_c = tf.concat((encoder_fw_final_state.c, encoder_bw_final_state.c), 1)
encoder_final_state_h = tf.concat((encoder_fw_final_state.h, encoder_bw_final_state.h), 1)

# The TF tuple used by LSTM Cells for state_size, zero_state, and output_state
encoder_final_state = LSTMStateTuple(
    c=encoder_final_state_c,
    h=encoder_final_state_h
)

In [None]:
# Defining the Decoder
#   We are going to feed the decoder in batches.
#   The decoder has twice the number of hidden units as the encoder.
decoder_cell = LSTMCell(decoder_hidden_units)
encoder_max_time, batch_size = tf.unstack(tf.shape(encoder_inputs))
decoder_lengths = encoder_inputs_length+3  # bigger due to EOS tokens


In [None]:
# Output Projection
#  -- define weights and biases for the decoder
#  -- we did not have to do this manually for the encoder b/c the dynamic RNN cell
#     we created for the encoder did it for us
#  -- note: we will defining "soft attention" manually
W = tf.Variable(tf.random_uniform(decoder_hidden_units, vocab_size, -1, 1, dtype=tf.float32))
b = tf.Variable(tf.zeros([vocab_size]), dtype=tf.float32)

In [None]:
# Create padded decoder inputs from the word embeddings
assert EOS==1 and PAD==0
eos_time_slice = tf.ones([batch_size], dtype=tf.int32, name='EOS')
pad_time_slice = tf.zeros([batch_size], dtype=tf.int32, name='PAD')
# add the embedded vectors for EOS and PAD ...
eos_step_embedded = tf.nn.embedding_lookup(embeddings, eos_time_slice)
pad_step_embedded = tf.nn.embedding_lookup(embeddings, pad_time_slice)


In [None]:
# Implement Attention
# Two functions that we have to implement...

def loop_fn_initial():
    initial_elements_finished = (0 >= decoder_lengths) # all False @ initial step
    # EOS
    initial_input = eos_step_embedded
    # last time steps cell state
    initial_cell_state = encoder_final_state
    initial_cell_output = None
    initial_loop_state = None
    return (initial_elements_finished, initial_input, 
            initial_cell_state, initial_cell_output, initial_loop_state)



In [None]:
# attention mechanism
def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
    def get_next_input():
        # I left off 44mins into video
        # https://www.youtube.com/watch?v=ElmBrKyMXxs