In [1]:
import time
import inspect
import numpy as np
import tensorflow as tf
import reader
import os
import sys
import collections

This is an example for building a benchmark LSTM model on the Penn Tree Bank dataset. 

The dataset is downloaded [here](http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz)

The model trained here is the one described in http://arxiv.org/abs/1409.2329.

Run these lines in your shell to download and extract the data:

```bash
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
tar xvf simple-examples.tgz
```

In [2]:
def _read_words(filename):
    """
    This function opens a file and returns a list of the token strings
    """
    with tf.gfile.GFile(filename, "r") as f:
        if sys.version_info[0] >= 3:
            return f.read().replace("\n", "<eos>").split()
        else:
            return f.read().decode("utf-8").replace("\n", "<eos>").split()

In [3]:
def _build_vocab(filename):
    """
    This function returns a dictionary of words and their counts to a word id
    """
    data = _read_words(filename)
    
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    
    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    return word_to_id

In [4]:
def _file_to_word_ids(filename, word_to_id):
    """
    THis function reads in a datafile and returns a list of the word ids
    """
    data = _read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

In [5]:
def ptb_raw_data(data_path=None):
    """
    This function loads in the raw PTB data from the directory specified by data_path
    
    Reads a PTB text file, converts strings to integer ids, and performs mini-batching of the inputs.
    
    The PTB dataset comes from this url:
    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
    
    This function returns a tuple of (train_data, valid_data, test_data, vocabulary)
    """
    
    train_path = os.path.join( data_path, "ptb.train.txt")
    valid_path = os.path.join( data_path, "ptb.valid.txt")
    test_path = os.path.join( data_path, "ptb.test.txt")
    
    word_to_id = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    
    vocabulary = len(word_to_id)
    
    return train_data, valid_data, test_data, vocabulary

In [6]:
def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """
    This iterates over the raw PTB data.

    It chunks raw_data into batches of examples and returns Tensors that are generated from each batch

    Args:
        raw_data: one of the raw data outputs from ptb_raw_data
        batch_size: int, the batch size
        num_steps: int, the number of unrolls
        name: the name of this operation (optional)

    Returns:
        A pair of Tensors, each shaped like [batch_size, num_steps].
        The second element of the tuple is the same data time-shifted to the right by one.
    """
    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        raw_data = tf.convert_to_tensor(
            raw_data, name="raw_data", dtype=tf.int32)
        data_len = tf.size(raw_data)
        batch_len = data_len // batch_size
        data = tf.reshape(
            raw_data[0: batch_size * batch_len], [batch_size, batch_len])

        epoch_size = (batch_len - 1) // num_steps
        assertion = tf.assert_positive(epoch_size, \
            message="epoch_size == 0; decrease batch_size or num_steps")
        with tf.control_dependencies([assertion]):
            epoch_size = tf.identity(epoch_size, name="epoch_size")
        
        i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
        x = tf.strided_slice(data, [0, i * num_steps], [batch_size, (i + 1) * num_steps])
        x.set_shape([batch_size, num_steps])
        y = tf.strided_slice(data, [0, i * num_steps + 1], [batch_size, (i + 1) * num_steps + 1])
        y.set_shape([batch_size, num_steps])
        return x, y

In [7]:
class PTBInput(object):
    """
    This class holds the input data
    """
    def __init__(self, config, data, name=None):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
        self.input_data, self.targets = ptb_producer( data, batch_size, num_steps, name=name )

In [8]:
class PTBModel(object):
    """
    This class defines the model we will be implementing
    """

    def __init__(self, is_training, config, input_):
        self.input = input_

        batch_size = input_.batch_size
        num_steps = input_.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        def lstm_cell():
            if 'reuse' in inspect.getargspec(tf.contrib.rnn.BasicLSTMCell.__init__).args:
                return tf.contrib.rnn.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True,\
                                                    reuse=tf.get_variable_scope().reuse)
            else:
                return tf.contrib.rnn.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True)

        attn_cell = lstm_cell
        if is_training and config.keep_prob < 1:
            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(lstm_cell(), output_keep_prob=config.keep_prob)
        cell = tf.contrib.rnn.MultiRNNCell([attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size], dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                    (cell_output, state) = cell(inputs[:, time_step, :], state)
                    outputs.append(cell_output)
        output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=tf.float32)
        logits = tf.matmul(output, softmax_w) + softmax_b

        # now reshape logits to be a 3-D tensor for sequence loss
        logits = tf.reshape(logits, [batch_size, num_steps, vocab_size])

        loss = tf.contrib.seq2seq.sequence_loss(logits, input_.targets, tf.ones([batch_size, num_steps],
                                                                                dtype=tf.float32),
                                                average_across_timesteps=False,
                                                average_across_batch=True)

        # update the cost variables
        self.cost = cost = tf.reduce_sum(loss)
        self.final_state = state

        if not is_training:
            return

        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(cost, tvars), config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                  global_step=tf.contrib.framework.get_or_create_global_step())
        self.new_lr = tf.placeholder(
            tf.float32, shape=[], name="new_learning_rate")
        self.lr_update = tf.assign(self.lr, self.new_lr)

    def assign_lr(self, session, lr_value):
        session.run(self.lr_update, feed_dict={self.new_lr: lr_value})

In [9]:
class SmallConfig(object):
    """
    A small class containing the config
    """
    init_scale = 0.1
    learning_rate = 1.0
    max_grad_norm = 5
    num_layers = 2
    num_steps = 20
    hidden_size = 200
    max_epoch = 4
    max_max_epoch = 13
    keep_prob = 1.0
    lr_decay = 0.5
    batch_size = 20
    vocab_size = 10000

In [10]:
def run_epoch(session, model, eval_op=None, verbose=False):
    """
    This function runs the model on the given data
    """
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = session.run(model.initial_state)

    fetches = {"cost": model.cost, "final_state": model.final_state}
    if eval_op is not None:
        fetches["eval_op"] = eval_op

    for step in range(model.input.epoch_size):
        feed_dict = {}
        for i, (c, h) in enumerate(model.initial_state):
            feed_dict[c] = state[i].c
            feed_dict[h] = state[i].h
        vals = session.run(fetches, feed_dict)
        cost = vals["cost"]
        state = vals["final_state"]

        costs += cost
        iters += model.input.num_steps

        if verbose and step % (model.input.epoch_size // 10) == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" % (step * 1.0 / model.input.epoch_size,
                                                             np.exp(
                                                                 costs/iters),
                                                             (iters * model.input.batch_size/(time.time()-start_time))))
    return np.exp(costs/iters)

In [11]:
flags = tf.flags
logging = tf.logging

flags.DEFINE_string("save_path", None, "Model output directory.")
FLAGS = flags.FLAGS

Now we can train the model

In [12]:
raw_data = ptb_raw_data("data/penn_treebank_dataset/data/")
train_data, valid_data, test_data, _ = raw_data

In [13]:
config = SmallConfig()
eval_config = SmallConfig()
eval_config.batch_size = 1
eval_config.num_steps = 1

In [14]:
with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
    
    with tf.name_scope("Train"):
        train_input = PTBInput( config=config, data=train_data, name="TrainInput" )
        
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            m = PTBModel( is_training=True, config=config, input_=train_input )
        
        tf.summary.scalar("Training Loss", m.cost)
        tf.summary.scalar("Learning Rate", m.lr)
    
    with tf.name_scope("Valid"):
        valid_input = PTBInput( config=config, data=valid_data, name="ValidInput" )
        
        with tf.variable_scope("Model", reuse=tf.AUTO_REUSE, initializer=initializer):
            mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
        
        tf.summary.scalar("Validation Loss", mvalid.cost)
    
    with tf.name_scope("Test"):
        test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
        with tf.variable_scope("Model", reuse=tf.AUTO_REUSE, initializer=initializer):
            mtest = PTBModel(is_training=False, config=eval_config, input_=test_input)
    
    sv = tf.train.Supervisor()
    with sv.managed_session() as session:
        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
            m.assign_lr( session, config.learning_rate * lr_decay )
            print("Epoch: {0:d} Learning Rate: {1:.3f}".format(i+1, session.run(m.lr)))
            
            train_perplexity = run_epoch(session, m, eval_op=m.train_op, verbose=True)
            print("Epoch: {0:d} Train Perplexity: {1:.3f}".format(i+1,train_perplexity))
            
            valid_perplexity = run_epoch(session, mvalid)
            print("Epoch: {0:d} Valid Perplexity: {1:.3f}".format(i+1, valid_perplexity))
            
        test_perplexity = run_epoch(session, mtest)
        print("Test Perplexity: {0:.3f}".format(test_perplexity))

W0806 16:37:01.804246 140273971513152 deprecation.py:323] From <ipython-input-6-781990815435>:31: range_input_producer (from tensorflow.python.training.input) is deprecated and will be removed in a future version.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.range(limit).shuffle(limit).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
W0806 16:37:01.807046 140273971513152 deprecation.py:323] From /home/joseph/miniconda3/lib/python3.6/site-packages/tensorflow/python/training/input.py:320: input_producer (from tensorflow.python.training.input) is deprecated and will be removed in a future version.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(input_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
W0806 16:37:01.807782 140273971513152 deprecation.py:

ValueError: in converted code:
    relative to /home/joseph/miniconda3/lib/python3.6/site-packages/tensorflow/python:

    ops/rnn_cell_impl.py:1719 call *
        cur_inp, new_state = cell(cur_inp, cur_state)
    ops/rnn_cell_impl.py:385 __call__
        self, inputs, state, scope=scope, *args, **kwargs)
    layers/base.py:537 __call__
        outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
    keras/engine/base_layer.py:591 __call__
        self._maybe_build(inputs)
    keras/engine/base_layer.py:1881 _maybe_build
        self.build(input_shapes)
    keras/utils/tf_utils.py:295 wrapper
        output_shape = fn(instance, input_shape)
    ops/rnn_cell_impl.py:734 build
        shape=[input_depth + h_depth, 4 * self._num_units])
    keras/engine/base_layer.py:1484 add_variable
        return self.add_weight(*args, **kwargs)
    layers/base.py:450 add_weight
        **kwargs)
    keras/engine/base_layer.py:384 add_weight
        aggregation=aggregation)
    training/tracking/base.py:663 _add_variable_with_custom_getter
        **kwargs_for_getter)
    ops/variable_scope.py:1496 get_variable
        aggregation=aggregation)
    ops/variable_scope.py:1239 get_variable
        aggregation=aggregation)
    ops/variable_scope.py:545 get_variable
        return custom_getter(**custom_getter_kwargs)
    ops/rnn_cell_impl.py:251 _rnn_get_variable
        variable = getter(*args, **kwargs)
    ops/variable_scope.py:514 _true_getter
        aggregation=aggregation)
    ops/variable_scope.py:882 _get_single_variable
        "reuse=tf.AUTO_REUSE in VarScope?" % name)

    ValueError: Variable Model/RNN/multi_rnn_cell/cell_0/basic_lstm_cell/kernel does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=tf.AUTO_REUSE in VarScope?
