# Implementing the RNN in Tensorflow on PTB dataset

[Prashant Brahmbhatt](https://www.github.com/hashbanger)

____
[Reference](https://github.com/adventuresinML)
_____

### The imports

In [33]:
import os
import sys
import collections
import numpy as np
import datetime as dt
import tensorflow as tf

In [2]:
# True when the notebook is executed by a Python 3 interpreter.
Py3 = sys.version_info.major == 3

## Preparing the data

In [3]:
def _read_words(filename):
    '''Input : Filename
    Return : list of the whitespace-separated tokens of the file

    Every newline is replaced by the literal marker <eos> before the text
    is split, so the marker only becomes a token of its own when the
    newline is surrounded by whitespace in the file.'''
    with tf.gfile.GFile(filename, "r") as f:
        text = f.read()
    if not Py3:
        # The codec name used to be misspelled as 'uft-8', which made this
        # branch raise LookupError instead of decoding the text.
        text = text.decode('utf-8')
    return text.replace('\n', '<eos>').split()

In [4]:
def _build_vocab(filename):
    '''Input : Filename
    Return : Dictionary of word with their IDs as values

    IDs are assigned in order of decreasing frequency, so the most common
    word gets ID 0. An empty file yields an empty dictionary.'''
    data = _read_words(filename)
    counter = collections.Counter(data)
    # Sort by descending count; ties are broken alphabetically so the
    # resulting IDs are deterministic from run to run.
    count_pairs = sorted(counter.items(), key = lambda x:(-x[1], x[0]))

    # enumerate() replaces the former `words, _ = list(zip(*count_pairs))`,
    # which raised ValueError when the file contained no words at all.
    word_to_id = {word: idx for idx, (word, _) in enumerate(count_pairs)}

    return word_to_id

In [5]:
def _file_to_word_ids(filename, word_to_id):
    ''' Input : filename, dictionary of words with their IDs
    Returns : list with the ID of every word of the file, in file order'''
    ids = []
    for token in _read_words(filename):
        # Words that are missing from the dictionary are skipped rather
        # than mapped to a placeholder ID.
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids

These functions first split the given text file into separate words and sentence based characters (i.e. end-of-sentence <eos>). Then, each unique word is identified and assigned a unique integer.  
Finally, the original text file is converted into a list of these unique integers, where each word is substituted with its new integer identifier.  
This allows the text data to be consumed in the neural network.

Example:

In [6]:
_read_words('sample_text.txt')

['Batman',
 'has',
 'been',
 'Gothams',
 'protector',
 'for',
 'decades,',
 'CEO',
 'of',
 'Wayne',
 'Enterprises,',
 'Patriarch',
 'of',
 'the',
 'Bat',
 'Family',
 'and',
 'veteran',
 'member',
 'of',
 'the',
 'Justice',
 'League.',
 'He',
 'is',
 'a',
 'superhero',
 'co-created',
 'by',
 'artist',
 'Bob',
 'Kane',
 'and',
 'writer',
 'Bill',
 'Finger',
 'and',
 'published',
 'by',
 'DC',
 'Comics.',
 'The',
 'character',
 'made',
 'his',
 'first',
 'appearance',
 'in',
 'Detective',
 'Comics.',
 'Batman',
 'is',
 'the',
 'secret',
 'identity',
 'of',
 'Bruce',
 'Wayne.',
 'Witnessing',
 'the',
 'murder',
 'of',
 'his',
 'parents',
 'as',
 'a',
 'child',
 'leads',
 'him',
 'to',
 'train',
 'himself',
 'to',
 'physical',
 'and',
 'intellectual',
 'perfection',
 'and',
 'don',
 'a',
 'bat-themed',
 'costume',
 'in',
 'order',
 'to',
 'fight',
 'crime.',
 'Batman',
 'operates',
 'in',
 'Gotham',
 'City,',
 'assisted',
 'by',
 'various',
 'supporting',
 'characters',
 'including',
 'his'

In [7]:
_build_vocab('sample_text.txt')

{'Alfred': 14,
 'Bat': 15,
 'Batman': 6,
 'Bill': 16,
 'Bob': 17,
 'Bruce': 18,
 'CEO': 19,
 'City,': 20,
 'Comics.': 9,
 'DC': 21,
 'Detective': 22,
 'Enterprises,': 23,
 'Family': 24,
 'Finger': 25,
 'Gotham': 26,
 'Gothams': 27,
 'He': 28,
 'Justice': 29,
 'Kane': 30,
 'League.': 31,
 'Patriarch': 32,
 'Pennyworth,': 33,
 'Robin': 34,
 'The': 35,
 'Unlike': 36,
 'Wayne': 37,
 'Wayne.': 38,
 'Witnessing': 39,
 'a': 7,
 'an': 40,
 'and': 0,
 'any': 41,
 'appearance': 42,
 'artist': 43,
 'as': 44,
 'assisted': 45,
 'assortment': 46,
 'bat-themed': 47,
 'been': 48,
 'butler': 49,
 'by': 5,
 'character': 50,
 'characters': 51,
 "characters'": 52,
 'child': 53,
 'co-created': 54,
 'costume': 55,
 'crime.': 10,
 'decades,': 56,
 'detective': 57,
 'does': 58,
 'don': 59,
 'fight': 60,
 'fights': 61,
 'film': 62,
 'first': 63,
 'for': 64,
 'has': 65,
 'he': 11,
 'him': 66,
 'himself': 67,
 'his': 2,
 'identity': 68,
 'in': 3,
 'including': 69,
 'influenced': 70,
 'intellect,': 71,
 'intellec

In [8]:
_file_to_word_ids('sample_text.txt', _build_vocab('sample_text.txt'))

[6,
 65,
 48,
 27,
 88,
 64,
 56,
 19,
 1,
 37,
 23,
 32,
 1,
 4,
 15,
 24,
 0,
 105,
 78,
 1,
 4,
 29,
 31,
 28,
 12,
 7,
 97,
 54,
 5,
 43,
 17,
 30,
 0,
 109,
 16,
 25,
 0,
 90,
 5,
 21,
 9,
 35,
 50,
 75,
 2,
 63,
 42,
 3,
 22,
 9,
 6,
 12,
 4,
 94,
 68,
 1,
 18,
 38,
 39,
 4,
 80,
 1,
 2,
 85,
 44,
 7,
 53,
 74,
 66,
 8,
 102,
 67,
 8,
 13,
 0,
 72,
 86,
 0,
 59,
 7,
 47,
 55,
 3,
 84,
 8,
 60,
 10,
 6,
 83,
 3,
 26,
 20,
 45,
 5,
 104,
 100,
 51,
 69,
 2,
 95,
 34,
 0,
 2,
 49,
 14,
 33,
 0,
 61,
 40,
 46,
 1,
 106,
 70,
 5,
 4,
 52,
 92,
 3,
 62,
 0,
 91,
 76,
 36,
 79,
 98,
 11,
 58,
 81,
 87,
 41,
 99,
 11,
 77,
 103,
 1,
 71,
 57,
 96,
 93,
 0,
 101,
 108,
 13,
 89,
 0,
 73,
 3,
 2,
 107,
 82,
 10]

In [9]:
def load_data():
    '''Load the three PTB splits found under the module-level data_path.

    The vocabulary is built from the training split only and is then used
    to convert all three splits to lists of word IDs. A short preview of
    the training data, the whole vocabulary and its size are printed.

    Return : train_data, valid_data, test_data (lists of word IDs),
             vocabulary (number of distinct words),
             reversed_dictionary (word ID -> word)'''
    #getting the data paths
    train_path = os.path.join(data_path, 'ptb.train.txt')
    valid_path = os.path.join(data_path, 'ptb.valid.txt')
    test_path = os.path.join(data_path, 'ptb.test.txt')

    #build the complete vocabulary, then convert text data to list of integer
    # The helper is named _build_vocab; the previous call to build_vocab
    # raised NameError.
    word_to_id = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    # Inverse mapping, used to turn IDs back into words.
    reversed_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))

    print(train_data[:5])
    print(word_to_id)
    print(vocabulary)
    print(" ".join([reversed_dictionary[x] for x in train_data[:10]]))

    return train_data, valid_data, test_data, vocabulary, reversed_dictionary

Here we set up the directory paths for the train, validation and test datasets, and `_build_vocab()` is invoked on the training data to create a dictionary that has each word as a key, and a unique integer as the associated value.  
The reverse dictionary will allow us to go the other direction – from a unique integer identifier to the corresponding word.  
This will be used later when we are reconstructing the outputs of our LSTM network back into plain English sentences.

In [10]:
# Directory holding the ptb.*.txt files. A raw string keeps the Windows
# backslashes literal instead of relying on invalid escape sequences such
# as '\D' or '\W' being passed through unchanged.
data_path = r'E:\Desarrollador\Workspace\Tensorflow_And_Neural_Networks\Tensorflow\Recurrent Neural Networks\simple-examples\data'

## Input Data Pipeline

In Tensorflow, the use of a feed dictionary to supply data to the model during training, while common in tutorials, is not efficient. It is more efficient to use Tensorflow queues and threading. The Dataset API is another option, but we won't be using it here.  

We define a function batch_producer, which extracts batches of x, y training data – the x batch is formatted as the time stepped text data. The y batch is the same data, except delayed one time step. It looks like:  

    x = “A girl walked into a bar, and she”
    y = “girl walked into a bar, and she said”

but these will be batches of integers rather than text data with size (batch_size, num_steps).

In [11]:
def batch_producer(raw_data, batch_size, num_steps):
    '''Input : raw_data (sequence of word IDs), batch_size, num_steps
    Return : x, y tensors, both with static shape (batch_size, num_steps)

    y holds the same window of IDs as x shifted one position to the right.
    The window index is dequeued from an unshuffled range_input_producer,
    so a different window is produced each time the tensors are evaluated.'''
    raw_data = tf.convert_to_tensor(raw_data, name = 'raw_data', dtype = tf.int32)

    # Lay the IDs out as batch_size rows, dropping the tail that does not
    # fill a complete row.
    total_ids = tf.size(raw_data)
    row_len = total_ids // batch_size
    usable_ids = raw_data[0: batch_size * row_len]
    grid = tf.reshape(usable_ids, [batch_size , row_len])

    # One column is held back because y needs the ID that follows each
    # window.
    windows_per_epoch = (row_len - 1) // num_steps
    i = tf.train.range_input_producer(windows_per_epoch, shuffle = False).dequeue()

    first_col = i * num_steps
    last_col = (i + 1) * num_steps

    x = grid[:, first_col:last_col]
    x.set_shape([batch_size, num_steps])

    y = grid[:, first_col + 1:last_col + 1]
    y.set_shape([batch_size, num_steps])

    return x, y

Here first we have converted the raw text data into an int32 tensor.  
The length of the full data set is calculated and stored in data_len and this is then divided by the batch size in an integer division (//) to get the number of full batches of data available within the dataset.   
The next line reshapes the raw_data tensor (restricted in size to the number of full batches of data, i.e. elements 0 to **batch_size * batch_len**) into a **(batch_size, batch_len)** shape.   
Then sets the number of iterations in each epoch – this is set so that all the training data is passed through the algorithm in each epoch.  
This is what occurs here – the number of batches in the data **(batch_len)** is integer divided by the number of time steps – this gives the number of time-step-sized batches that are available to be iterated through in a single epoch.

The next line sets up an input range producer queue – this is a simple queue which allows the asynchronous and threaded extraction of data batches from a pre-existing dataset. Basically, each time more data is required in the training of the model, a new integer is extracted between 0 and epoch_size – this is then used in the following lines to extract a batch of data asynchronously from the data tensor. With the shuffle argument set to False, this integer simply cycles from 0 to epoch_size and then resets back at 0 to repeat.

To produce the x, y batches of data, data slices are extracted from the data tensor based on the dequeued integer i.  
It is easier to imagine a dummy dataset of integers from 1 to 20 – [1, 2, 3, 4, 5, 6, …, 19, 20]. We set the batch size to 3, and the number of steps to 2. The variables *batch_len* and *epoch_size* will therefore be equal to 6 and 2, respectively (20 // 3 = 6 and (6 − 1) // 2 = 2); the last two integers are dropped because they do not fill a complete row. The dummy reshaped data will look like:

$$\begin{bmatrix} 
1 & 2 & 3 & 4 & 5 & 6 \\ 
7 & 8 & 9 & 10 & 11 & 12 \\ 
13 & 14 & 15 & 16 & 17 & 18 \\ 
\end{bmatrix}$$

For the first data batch extraction, i = 0, therefore the extracted x for our dummy dataset will be data[:, 0:2]  
$$\begin{bmatrix} 
1 & 2\\ 
7 & 8\\ 
13 & 14\\ 
\end{bmatrix}$$  
The extracted y will be data[:, 1:3]:  
$$\begin{bmatrix} 
2 & 3\\ 
8 & 9\\ 
14 & 15\\ 
\end{bmatrix}$$  
  
Each row of the extracted x and y tensors will be an individual sample of length *num_steps* and the *number of rows* is the batch size. By organizing the data in this fashion, it is straight-forward to extract batch data while still maintaining the correct sentence sequence within each data sample.

## Building the Model

Building a class, firstly, we pass this object important input data information such as *batch size*, the number of recurrent time steps and finally the raw data file we wish to extract batch data from.  
Our previous *batch_producer* function, when called, will return our input data batch x and the associated time step + 1 target data batch, y.

In [12]:
class Input(object):
    '''Bundles the batching parameters with the x / y batch tensors.

    Attributes : batch_size, num_steps - stored as given
                 epoch_size - number of batches in one pass over data
                 input_data, targets - tensors returned by batch_producer'''
    def __init__(self, batch_size, num_steps, data):
        self.batch_size, self.num_steps = batch_size, num_steps
        # Same arithmetic as batch_producer, evaluated in plain Python so
        # the training loop knows how many steps one epoch has.
        ids_per_row = len(data) // batch_size
        self.epoch_size = (ids_per_row - 1) // num_steps
        batch_x, batch_y = batch_producer(data, batch_size, num_steps)
        self.input_data = batch_x
        self.targets = batch_y

In [13]:
#Creating the main model 
class Model(object):
    '''Stacked-LSTM language model built on top of an Input object.

    Creating an instance adds the embedding lookup, the LSTM stack, the
    softmax projection, the sequence loss and the accuracy operations to
    the default graph. When is_training is true the clipped-gradient
    optimizer and the learning-rate update operations are added as well.

    Input : input       - object exposing batch_size, num_steps, input_data
                          and targets (the name shadows the builtin but is
                          kept so existing callers keep working)
            is_training - add dropout and the optimizer operations when true
            hidden_size - width of the embedding vectors and of each LSTM layer
            vocab_size  - number of distinct word IDs
            num_layers  - number of stacked LSTM layers
            dropout     - value handed to tf.nn.dropout and to the
                          DropoutWrapper as output_keep_prob; values >= 1
                          disable dropout
            init_scale  - embedding and softmax variables are drawn
                          uniformly from [-init_scale, init_scale)
    '''

    def __init__(self, input, is_training, hidden_size, vocab_size, num_layers
                 , dropout = 0.5, init_scale = 0.05):
        self.is_training = is_training
        self.input_obj = input
        self.batch_size = input.batch_size
        self.num_steps = input.num_steps
        # Previously never assigned although it is read below and by the
        # training / testing code (m.hidden_size) -> AttributeError.
        self.hidden_size = hidden_size

        #------------------Creating Word Embeddings------------------------
        with tf.device("/cpu:0"):
            # The comma after the shape list was missing, which turned the
            # call into `list - float` and raised TypeError.
            embedding = tf.Variable(
                tf.random_uniform([vocab_size, self.hidden_size], -init_scale, init_scale))
            inputs = tf.nn.embedding_lookup(embedding, self.input_obj.input_data)

        #---------------------adding dropout on the inputs-----------------
        if is_training and dropout < 1:
            inputs = tf.nn.dropout(inputs, dropout)

        #------------setup the state stage / extraction--------------------
        # One slice per layer; inside a slice, index 0 and index 1 are the
        # two tensors that make up that layer's LSTMStateTuple.
        self.init_state = tf.placeholder(
            tf.float32, [num_layers, 2, self.batch_size, self.hidden_size])

        state_per_layer_list = tf.unstack(self.init_state, axis = 0)
        rnn_tupled_state = tuple([tf.contrib.rnn.LSTMStateTuple(state_per_layer_list[idx][0]
                                                                , state_per_layer_list[idx][1])
                                 for idx in range(num_layers)])

        #------------Creating the LSTM cells to be unrolled----------------
        def make_cell():
            # Build one LSTM layer, wrapped in dropout while training.
            layer = tf.contrib.rnn.LSTMCell(hidden_size, forget_bias = 1.0)
            if is_training and dropout < 1:
                layer = tf.contrib.rnn.DropoutWrapper(layer, output_keep_prob = dropout)
            return layer

        # Every layer gets its own cell object; the previous code put the
        # same instance into every slot of the list, so the layers were not
        # independent. The stack is also used when num_layers == 1 so that
        # the state is always a tuple with one LSTMStateTuple per layer,
        # matching rnn_tupled_state and the init_state placeholder.
        cell = tf.contrib.rnn.MultiRNNCell([make_cell() for _ in range(num_layers)]
                                           , state_is_tuple = True)

        #----------Creating Dynamic RNN object-----------------------------
        output, self.state = tf.nn.dynamic_rnn(cell, inputs, dtype= tf.float32
                                               , initial_state= rnn_tupled_state)

        #----------Creating the softmax, loss and optimizer operations-----
        # Flatten so that a single matrix multiply projects every time step.
        output = tf.reshape(output, [-1, hidden_size])

        #---------setting up softmax weights and biases--------------------
        softmax_w = tf.Variable(tf.random_uniform([hidden_size, vocab_size], -init_scale, init_scale))
        softmax_b = tf.Variable(tf.random_uniform([vocab_size], -init_scale, init_scale))
        logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)

        #Reshape logits to be a 3-D tensor for sequence loss
        logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size])

        #--------- Use the contrib sequence loss and average over the batches
        loss = tf.contrib.seq2seq.sequence_loss(logits,  self.input_obj.targets
                                                , tf.ones([self.batch_size, self.num_steps], dtype = tf.float32)
                                                , average_across_timesteps = False
                                                , average_across_batch = True)
        # update the cost
        self.cost = tf.reduce_sum(loss)

        #---------get the prediction accuracy------------------------------
        self.softmax_out = tf.nn.softmax(tf.reshape(logits, [-1, vocab_size]))
        self.predict = tf.cast(tf.argmax(self.softmax_out, axis = 1), tf.int32)
        correct_prediction = tf.equal(self.predict, tf.reshape(self.input_obj.targets, [-1]))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        #------constructing the optimizers operations----------------------
        if not is_training:
            return
        self.learning_rate = tf.Variable(0.0, trainable= False)

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), 5)
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars)
                                                  , global_step = tf.contrib.framework.get_or_create_global_step())

        #updating the learning rate
        self.new_lr = tf.placeholder(tf.float32, shape=[])
        self.lr_update = tf.assign(self.learning_rate, self.new_lr)

    def assign_lr(self, session, lr_value):
        '''Set the learning rate variable to lr_value inside session.

        Only usable on a model created with is_training true, because
        lr_update and new_lr are not created otherwise.'''
        session.run(self.lr_update, feed_dict={self.new_lr: lr_value})
        


* Now creating the main model.   
The first part of initialization is pretty self-explanatory, with the input data information and batch producer operation found in input_obj. Another important input is the boolean is_training – this allows the model instance to be created either as a model setup for training, or alternatively setup for validation or testing only.  

    class Model(object):
        def __init__(self, input, is_training, hidden_size, vocab_size, num_layers
                     , dropout = 0.5, init_scale = 0.05):
            self.is_training = is_training
            self.input_obj = input
            self.batch_size = input.batch_size
            self.num_steps = input.num_steps


* Creating the word embeddings.   
Word embedding creates meaningful vectors to represent each word. First, we initialize the embedding variable with size **(vocab_size, hidden_size)** which creates the “lookup table” where each row represents a word in the dataset, and the set of columns is the embedding vector. In this case, our embedding vector length is set equal to the size of our LSTM hidden layer.  
The next line performs a lookup action on the embedding tensor, where each word in the input data set is matched with a row in the embedding tensor, with the matched embedding vector being returned within inputs.  
In this model, the embedding *layer / vectors* will be learned during the model training – however, if we so desired, we could also pre-learn embedding vectors using another model and upload these into our models.

    with tf.device("/cpu:0"):
            embedding = tf.Variable(tf.random_uniform([vocab_size, self.hidden_size], -init_scale, init_scale))
            inputs = tf.nn.embedding_lookup(embedding, self.input_obj.input_data)

* The next step adds a drop-out wrapper to the input data – this helps prevent overfitting by continually changing the structure of the network connections  

        if is_training and dropout < 1:
            inputs = tf.nn.dropout(inputs, dropout)

* The next step is to setup the initial state TensorFlow placeholder.  
This placeholder will be loaded with the initial state of the LSTM cells for each training batch. At the beginning of each training epoch, the input data will reset to the beginning of the text data set, so we want to reset the state variables to zero. However, during the multiple training batches executed in each epoch, we want to load the final state variables from the previous training batch into our LSTM cells for the current training batch. This keeps a certain continuity of state in our model, as we are progressing linearly through our text data set.  
The second argument to the placeholder function is the size of the variable – (num_layers, 2, batch_size, hidden_size) and requires some explanation. If we consider an individual LSTM cell, for each training sample it processes it has two other inputs – the previous output from the cell ($h_{t-1}$) and the previous state variable ($s_{t-1}$). These two inputs, h and s, are what is required to load the full state data into an LSTM cell. Remember also that h and s for each sample are actually vectors with the size equal to the hidden layer size. Therefore, for all the samples in the batch, for a single LSTM cell we have state data required of shape (2, batch_size, hidden_size). Finally, if we have stacked LSTM cell layers, we need state variables for each layer – num_layers. This gives the final shape of the state variables: (num_layers, 2, batch_size, hidden_size).  


        self.init_state = tf.placeholder(tf.float32, [num_layers, 2, self.batch_size, self.hidden_size])  
    

- The next two steps involve setting up this state data variable in the format required to feed it into the TensorFlow LSTM data structure:  


        state_per_layer_list = tf.unstack(self.init_state, axis=0)
        rnn_tuple_state = tuple(
                    [tf.contrib.rnn.LSTMStateTuple(state_per_layer_list[idx][0],  state_per_layer_list[idx][1])
                 for idx in range(num_layers)])

The TensorFlow LSTM cell can accept the state as a tuple if a flag is set to True. The tf.unstack command creates a number of tensors, each of shape (2, batch_size, hidden_size), from the init_state tensor, one for each stacked LSTM layer (num_layers). These tensors are then loaded into a specific TensorFlow data structure, LSTMStateTuple, which is the format required for input into the LSTM cells.

- Next we create an LSTM cell which will be “unrolled” over the number of time steps. Following this, we apply a drop-out wrapper to again protect against overfitting. Notice that we set the forget bias values to be equal to 1.0, which helps guard against repeated low forget gate outputs causing vanishing gradients  

        # create an LSTM cell to be unrolled
        cell = tf.contrib.rnn.LSTMCell(hidden_size, forget_bias=1.0)
        # add a dropout wrapper if training
        if is_training and dropout < 1:
            cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)

- If we include many layers of stacked LSTM cells in the model, we need to use another TensorFlow object called MultiRNNCell which performs the requisite cell stacking / layering. We tell MultiRNNCell to expect the state variables in the form of a LSTMStateTuple by setting the flag state_is_tuple to True.   
  
        if num_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell([cell for _ in range(num_layers)], state_is_tuple=True)

- The final step in creating the LSTM network structure is to create a dynamic RNN object in TensorFlow. This object will dynamically perform the unrolling of the LSTM cell over each time step.

        output, self.state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32, initial_state=rnn_tuple_state)
        
    The dynamic_rnn object takes our defined LSTM cell as the first argument, and the embedding vector tensor inputs as the second argument. The final argument, initial_state is where we load our time-step zero state variables, that we created earlier, into the unrolled LSTM network.

    This operation creates two outputs, the first is the output from all the unrolled LSTM cells, and will have a shape of (batch_size, num_steps, hidden_size). This data will be flattened in the next step to feed into a softmax classification layer. The second output, state, is the (s, h) state tuple taken from the final time step of the LSTM cells. This state operation / tuple will be extracted during each batch training operation to be used as inputs (via init_state) into the next training batch

#### Creating the softmax, loss and optimizer operations 
- Next we have to flatten the outputs so that we can feed them into our proposed softmax classification layer. We can use the -1 notation to reshape our output tensor, with the second axis set to be equal to the hidden layer size:

        # reshape to (batch_size * num_steps, hidden_size)
        output = tf.reshape(output, [-1, hidden_size])

- Next we setup our softmax weight variables and the standard xw+b operation:

        softmax_w = tf.Variable(tf.random_uniform([hidden_size, vocab_size], -init_scale, init_scale))
        softmax_b = tf.Variable(tf.random_uniform([vocab_size], -init_scale, init_scale))
        logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)

    Note that the logits operation is simply the output of our tensor multiplication – we haven’t yet added the softmax operation – this will occur in the loss calculations below (and also in our ancillary accuracy calculations).

- Following this, we have to setup our loss or cost function which will be used to train our LSTM network. In this case, we will use the specialized TensorFlow sequence to sequence loss function. This loss function allows one to calculate (a potentially) weighted cross entropy loss over a sequence of values.  
    The first argument to this loss function is the logits argument, which requires tensors with the shape (batch_size, num_steps, vocab_size) – so we’ll need to reshape our logits tensor. The second argument to the loss function is the targets tensor which has a shape (batch_size, num_steps) with each value being an integer (which corresponds to a unique word in our case) – in other words, this tensor contains the true values of the word sequence that we want our LSTM network to predict. The third important argument is the weights tensor, of shape (batch_size, num_steps), which allows you to weight different samples or time steps with respect to the loss i.e. you might want the loss to favor the latter time steps rather than the earlier ones. No weighting is applied in this model, so a tensor of ones is passed to this argument.

    There are two more important arguments for this function – average_across_timesteps and average_across_batch. If average_across_timesteps is set to True, the cost is averaged across the time dimension; if average_across_batch is True, the cost is averaged across the batch dimension. In this case we are favoring the latter option.

    Finally, we produce the cost operation which reduces the loss to a single scalar value – we could also do something similar by setting average_across_timesteps to True.

        # Reshape logits to be a 3-D tensor for sequence loss
        logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size])

        # Use the contrib sequence loss and average over the batches
        loss = tf.contrib.seq2seq.sequence_loss(
                    logits,
                    self.input_obj.targets,
                    tf.ones([self.batch_size, self.num_steps], dtype=tf.float32),
                    average_across_timesteps=False,
                    average_across_batch=True)
        # Update the cost
        self.cost = tf.reduce_sum(loss)



In the next few steps, we set up some operations to calculate the accuracy of predictions over the batch samples:


- First we apply a softmax operation to get the predicted probabilities of each word for each output of the LSTM network. We then make the network predictions equal to those words with the highest softmax probability by using the argmax function. These predictions are then compared to the actual target words and then averaged to get the accuracy.

        # get the prediction accuracy
        self.softmax_out = tf.nn.softmax(tf.reshape(logits, [-1, vocab_size]))
        self.predict = tf.cast(tf.argmax(self.softmax_out, axis=1), tf.int32)
        correct_prediction = tf.equal(self.predict, tf.reshape(self.input_obj.targets, [-1]))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


- Now we move onto constructing the optimization operations – in this case we aren’t using a simple “out of the box” optimizer – rather we are doing a few manipulations to improve results:
    First off, if the model has been created for predictions, validations or testing only, these operations do not need to be created. The first step if the model is being used for training, is to create a learning rate variable. This will be used so that we can decrease the learning rate during training – this improves the final outcome of the model.

    Next we wish to clip the size of the gradients in our network during back-propagation – this is recommended in recurrent neural networks to improve outcomes. Clipping values of between 1 and 5 are commonly used. Finally, we create the optimizer operation, using the learning_rate variable, and apply the clipped gradients. Then a gradient descent step is performed – assigning this operation to train_op. This operation, train_op, will be called for each training batch.


        if not is_training:
           return
        self.learning_rate = tf.Variable(0.0, trainable=False)

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), 5)
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(
                    zip(grads, tvars),
                    global_step=tf.contrib.framework.get_or_create_global_step())

- The final two lines of the model creation involve the updating of the learning_rate:
    First, a placeholder is created which will be input via the feed_dict argument when running the training, new_lr. This new learning rate is then assigned to learning_rate via a tf.assign operation. This operation, lr_update, will be run at the beginning of each epoch.

        self.new_lr = tf.placeholder(tf.float32, shape=[])
        self.lr_update = tf.assign(self.learning_rate, self.new_lr)



## Training the Model

In [37]:
def train(train_data, vocabulary, num_layers, num_epochs, batch_size, model_save_name
          , learning_rate = 1.0, max_lr_epoch = 10, lr_decay = 0.93, print_iter = 50
          , num_steps = 35):
    '''Train the LSTM model and write checkpoints below data_path.

    Input : train_data      - list of word IDs
            vocabulary      - number of distinct word IDs
            num_layers      - number of stacked LSTM layers
            num_epochs      - number of passes over train_data
            batch_size      - rows per batch
            model_save_name - checkpoint file prefix inside data_path
            learning_rate   - rate used until the decay starts
            max_lr_epoch    - the rate is multiplied by lr_decay once per
                              epoch from this epoch onwards
            lr_decay        - per-epoch decay factor
            print_iter      - progress is printed every print_iter steps
            num_steps       - unrolled time steps per batch (the function
                              used to read an undefined name num_steps)
    Return : None. A checkpoint is saved after every epoch and a
             "<model_save_name>-final" checkpoint at the end.'''
    hidden_size = 650

    #setting up data models
    training_input = Input(batch_size= batch_size, num_steps= num_steps, data= train_data)
    m = Model(training_input, is_training= True, hidden_size= hidden_size
              , vocab_size= vocabulary, num_layers= num_layers)
    init_op = tf.global_variables_initializer()
    checkpoint_prefix = os.path.join(data_path, model_save_name)

    # tf.Session has to be called; `with tf.Session as sess` used the class
    # object itself as the context manager and failed.
    with tf.Session() as sess:
        #start threads
        sess.run([init_op])
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord= coord)
        saver = tf.train.Saver()
        try:
            for epoch in range(num_epochs):
                new_lr_decay = lr_decay ** max(epoch + 1 - max_lr_epoch, 0.0)
                # Run the update op directly so this function only needs the
                # lr_update / new_lr attributes of the model.
                sess.run(m.lr_update, feed_dict = {m.new_lr: learning_rate * new_lr_decay})
                # Every epoch starts from a zero LSTM state; afterwards the
                # final state of each batch seeds the next batch.
                current_state = np.zeros((num_layers, 2, batch_size, hidden_size))
                curr_time = dt.datetime.now()
                for step in range(training_input.epoch_size):

                    if step % print_iter != 0:
                        cost, _, current_state = sess.run([m.cost, m.train_op, m.state]
                                                          , feed_dict = {m.init_state: current_state })
                    else:
                        # total_seconds() keeps the fractional part that
                        # .seconds threw away.
                        elapsed = dt.datetime.now() - curr_time
                        seconds = elapsed.total_seconds() / print_iter
                        curr_time = dt.datetime.now()
                        cost, _, current_state, acc = sess.run([m.cost, m.train_op, m.state, m.accuracy]
                                                               , feed_dict = {m.init_state: current_state})
                        print("Epoch {}, Step {}, Cost: {:.3f}, accuracy: {:.3f}, Seconds per step: {:.3f}"
                              .format(epoch, step, cost, acc, seconds))

                #save a model checkpoint
                saver.save(sess, checkpoint_prefix, global_step= epoch)
            #do a final save (must happen while the session is still open)
            saver.save(sess, checkpoint_prefix + '-final')
        finally:
            #close threads, also when training is interrupted by an error
            coord.request_stop()
            coord.join(threads)
    

The training function will take as input the training data, along with various model parameters (batch sizes, number of steps etc.).  

- First we create an Input object instance and a Model object instance, passing in the necessary parameters. Because the TensorFlow graph is being created during the initialization of these objects, the TensorFlow global variable initializer operation can only be properly run after the creation of these instances.

        def train(train_data, vocabulary, num_layers, num_epochs, batch_size, model_save_name,
                  learning_rate=1.0, max_lr_epoch=10, lr_decay=0.93):
            # setup data and models
            training_input = Input(batch_size=batch_size, num_steps=35, data=train_data)
            m = Model(training_input, is_training=True, hidden_size=650, vocab_size=vocabulary,
                      num_layers=num_layers)
            init_op = tf.global_variables_initializer()


- Next we start the session, and run the variable initializer operation. Because we are using queuing in the Input object, we also need to create a thread coordinator and start the running of the threads (for more information, see this tutorial). If you skip this step, or put it before the creation of training_input, your program will hang. Finally, a saver instance is created as we want to store model training checkpoints and the final trained model.

        orig_decay = lr_decay
        with tf.Session() as sess:
            # start threads
            sess.run([init_op])
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)
            saver = tf.train.Saver()


- Next, the epochal training loop is entered into.  
    The first step in every epoch is to calculate the learning rate decay factor, which gradually decreases after max_lr_epoch number of epochs has been reached. This learning rate decay factor, new_lr_decay, is multiplied by the learning rate and assigned to the model by calling the Model method assign_lr. This method looks like:

        def assign_lr(self, session, lr_value):
            session.run(self.lr_update, feed_dict={self.new_lr: lr_value})
    
    As can be observed, this function simply runs the lr_update operation.

    The next step is to create a zeroed initial state tensor for our LSTM model – we assign this zeroed tensor to the variable current_state. Then each training operation is looped through within our specified epoch size. Every iteration we run the following operations: m.train_op and m.state. The train_op operation, as previously shown, calculates the clipped gradients of the model and takes a batched step to minimize the cost. The state operation returns the state of the final unrolled LSTM cell which we will require to input as the state for the next training batch – note that it replaces the contents of the current_state variable. This current_state variable is inserted into the m.init_state placeholder via the feed_dict.


        for epoch in range(num_epochs):
            new_lr_decay = orig_decay ** max(epoch + 1 - max_lr_epoch, 0.0)
            m.assign_lr(sess, learning_rate * new_lr_decay)
            current_state = np.zeros((num_layers, 2, batch_size, m.hidden_size))
            for step in range(training_input.epoch_size):
                if step % 50 != 0:
                    cost, _, current_state = sess.run([m.cost, m.train_op, m.state],
                                                                     feed_dict={m.init_state: current_state})
                else:
                    cost, _, current_state, acc = sess.run([m.cost, m.train_op, m.state, m.accuracy],
                                                              feed_dict={m.init_state: current_state})
                    print("Epoch {}, Step {}, cost: {:.3f}, accuracy: {:.3f}".format(epoch, step, cost, acc))
            # save a model checkpoint
            saver.save(sess, data_path + '\\' + model_save_name, global_step=epoch)
        # do a final save
        saver.save(sess, data_path + '\\' + model_save_name + '-final')
        # close threads
        coord.request_stop()
        coord.join(threads)





_________

Every 50 iterations we also extract the current cost of the model in training, as well as the accuracy against the current training batch, to provide printed feedback during training. The outputs look like this:

    Epoch 9, Step 1850, cost: 96.185, accuracy: 0.198
    Epoch 9, Step 1900, cost: 94.755, accuracy: 0.235

Finally, at the end of each epoch, we use the saver object to save a model checkpoint, and finally at the end of the training a final save of the state of the model is performed.

The expected cost and accuracy progress through the epochs depends on the multitude of parameters supplied to the models and also the results of the random initialization of the variables. Training time is also dependent on whether you are using only CPUs, or whether you are using GPUs too.

## Testing the Model

In [38]:
def test(model_path, test_data, reversed_dictionary):
    '''Input : path of the checkpoint to restore, test data, dictionary mapping word IDs to words  
    Returns : None; prints one true-vs-predicted word sequence and the average accuracy

    The model is rebuilt with batch_size=20, num_steps=35, hidden_size=650 and
    num_layers=2 and the checkpoint at model_path is restored into it.
    The vocabulary size is taken from the module-level `vocabulary`.'''
    test_input = Input(batch_size=20, num_steps=35, data=test_data)
    m = Model(test_input, is_training=False, hidden_size=650, vocab_size=vocabulary,
              num_layers=2)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # start threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            # zeroed initial state; the leading 2 corresponds to num_layers=2 above
            current_state = np.zeros((2, 2, m.batch_size, m.hidden_size))
            # restore the trained model
            saver.restore(sess, model_path)
            # get an average accuracy over num_acc_batches
            num_acc_batches = 30
            # batch whose targets and predictions are printed as words
            check_batch_idx = 25
            # batches before this index only warm up the state and are not averaged
            acc_check_thresh = 5
            accuracy = 0
            for batch in range(num_acc_batches):
                if batch == check_batch_idx:
                    true_vals, pred, current_state, acc = sess.run([m.input_obj.targets, m.predict, m.state, m.accuracy],
                                                                   feed_dict={m.init_state: current_state})
                    # NOTE(review): presumably the first num_steps entries of pred line up
                    # with the first row of targets - confirm against Model.predict
                    pred_string = [reversed_dictionary[x] for x in pred[:m.num_steps]]
                    true_vals_string = [reversed_dictionary[x] for x in true_vals[0]]
                    print("True values (1st line) vs predicted values (2nd line):")
                    print(" ".join(true_vals_string))
                    print(" ".join(pred_string))
                else:
                    acc, current_state = sess.run([m.accuracy, m.state], feed_dict={m.init_state: current_state})
                if batch >= acc_check_thresh:
                    accuracy += acc
            # divide by the number of batches that were actually accumulated
            print("Average accuracy: {:.3f}".format(accuracy / (num_acc_batches-acc_check_thresh)))
        finally:
            # close threads, even when restoring or evaluating raised
            coord.request_stop()
            coord.join(threads)


We start by creating Input and Model instances that match the ones used for training. It is important that key parameters match the training model, such as the hidden size, number of steps, batch size etc. We are going to load our saved model variables into the computational graph created by the test Model instance, and if the dimensions don’t match, TensorFlow will throw an error.

Next we create a tf.train.Saver() operation – this will load all our saved model variables into our test model when we run the line saver.restore(sess, model_path). After dealing with all of the threads and creating a zeroed state variable, we set up some variables which relate to how we are going to assess the accuracy and look at some specific instances of predicted strings. Because we have to “warm up” the model by feeding it some data to get good state variables, we only measure the accuracy after a certain number of batches, i.e. acc_check_thresh.

When the batch number is equal to check_batch_idx the code runs the m.predict operation to extract the predictions for that particular batch of data. The predictions for the first sequence of the batch are passed through the reverse dictionary to convert them back to actual words (along with the corresponding target words), and both are printed so that the predictions can be compared with what should have been predicted.

Using the trained model, we can see the following output:

    True values (1st line) vs predicted values (2nd line):
    stock market is headed many traders were afraid to trust stock prices quoted on the big board <eos> the futures halt was even <unk> by big board floor traders <eos> it <unk> things up said
    market market is n’t for traders say willing to buy the prices <eos> <eos> the big board <eos> the dow market is a worse <eos> the board traders traders <eos> the ‘s the to to
    Average accuracy: 0.283

The accuracy isn’t fantastic, but you can see the network is matching the “gist” of the sentence i.e. not producing all of the exact words but matching the general subject matter.

In [None]:
# Driver: load the data, then either train a new model (run_opt == 1)
# or restore a saved checkpoint and evaluate it on the test data.
# A data directory given on the command line overrides the default data_path.
if args.data_path:
    data_path = args.data_path
train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data()
if args.run_opt == 1:
    train(train_data, vocabulary, num_layers=2, num_epochs=60, batch_size=20,
          model_save_name='two-layer-lstm-medium-config-60-epoch-0p93-lr-decay-10-max-lr')
else:
    # Build the path from the resolved data_path so it also works when no
    # directory was given on the command line. The '\\' separator is kept
    # as-is so the path has the same form as before.
    trained_model = data_path + "\\two-layer-lstm-medium-config-60-epoch-0p93-lr-decay-10-max-lr-38"
    # Evaluate only in this branch: trained_model is not defined after training.
    test(trained_model, test_data, reversed_dictionary)

### de nada!