# Imports

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time

import numpy as np
import tensorflow as tf

from tensorflow.models.rnn import rnn_cell
from tensorflow.models.rnn.seq2seq import sequence_loss_by_example

# parses the dataset
# wget https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/models/rnn/ptb/reader.py
import reader

# Dataset

In [2]:
train_data, valid_data, test_data, _ = reader.ptb_raw_data("ptb")

# GFMultiRNNCell

In [3]:
class GFMultiRNNCell(rnn_cell.RNNCell):
    def __init__(self, cells):
        if not cells:
            raise ValueError("Must specify at least one cell for GFMultiRNNCell.")

        for i in xrange(len(cells) - 1):
            if cells[i + 1].input_size != cells[i].output_size:
                raise ValueError("In GFMultiRNNCell, the input size of each next"
                                " cell must match the output size of the previous one."
                                " Mismatched output size in cell %d." % i)

        self._cells = cells

    @property
    def input_size(self):
        return self._cells[0].input_size

    @property
    def output_size(self):
        return self._cells[-1].output_size

    @property
    def state_size(self):
        return sum([cell.state_size for cell in self._cells])

    def __call__(self, inputs, hs_prev, state, scope=None):
        with tf.variable_scope(scope or type(self).__name__): # "GFMultiRNNCell"
            cur_state_pos = 0
            cur_inp = inputs
            new_states = []
            new_hs = []
            for i, cell in enumerate(self._cells):
                with tf.variable_scope("Cell%d" % i):
                    cur_state = tf.slice(state, [0, cur_state_pos], [-1, cell.state_size])
                    cur_state_pos += cell.state_size
                    cur_inp, new_state = cell(cur_inp, hs_prev, cur_state)
                    new_states.append(new_state)
                    new_hs.append(cur_inp)
        return cur_inp, tf.concat(1, new_hs), tf.concat(1, new_states)

# GFLSTMCell

In [4]:
class GFLSTMCell(rnn_cell.RNNCell):
    def __init__(self, num_layers, num_blocks):
        self._num_layers = num_layers
        self._num_blocks = num_blocks

    @property
    def input_size(self):
        return self._num_blocks

    @property
    def output_size(self):
        return self._num_blocks

    @property
    def state_size(self):
        return 2 * self._num_blocks

    def __call__(self, inputs, hs_prev, state, scope=None):
        with tf.variable_scope(scope or type(self).__name__):
            initializer = tf.random_uniform_initializer(-0.1, 0.1)

            def get_variable(name, shape):
                return tf.get_variable(name, shape, initializer=initializer, dtype=inputs.dtype)

            c_prev, h_prev = tf.split(1, 2, state)

            W_c = get_variable("W_c", [self.input_size, self.output_size])
            W_i = get_variable("W_i", [self.input_size, self.output_size])
            W_f = get_variable("W_f", [self.input_size, self.output_size])
            W_o = get_variable("W_o", [self.input_size, self.output_size])

            U_c = get_variable("U_c", [self.output_size, self.output_size])
            U_i = get_variable("U_i", [self.output_size, self.output_size])
            U_f = get_variable("U_f", [self.output_size, self.output_size])
            U_o = get_variable("U_o", [self.output_size, self.output_size])

            W_g = get_variable("W_g", [self.output_size, self.output_size])
            U_g = get_variable("U_g", [self.output_size, self.output_size])

            print(inputs.get_shape(), W_g.get_shape())
            t1_ = tf.matmul(inputs, W_g)
            print(hs_prev.get_shape(), U_g.get_shape())
            t2_ = tf.matmul(tf.transpose(hs_prev), U_g)
            g = tf.sigmoid(tf.reduce_sum(t1_) + tf.reduce_sum(t2_))
            gf = g * tf.matmul(h_prev, U_c)

            # c_next = tf.tanh(tf.matmul(inputs, W_c) + tf.matmul(h_prev, U_c)) # original
            c_next = tf.tanh(tf.matmul(inputs, W_c) + tf.reduce_sum(gf))

            i = tf.sigmoid(tf.matmul(inputs, W_i) + tf.matmul(h_prev, U_i))
            f = tf.sigmoid(tf.matmul(inputs, W_f) + tf.matmul(h_prev, U_f))
            c = tf.mul(f, c_prev) + tf.mul(i, c_next)
            o = tf.sigmoid(tf.matmul(inputs, W_o) + tf.matmul(h_prev, U_o))
            h = tf.mul(tf.tanh(c), o)

            return h, tf.concat(1, [c, h])

# Model

In [5]:
class PTBModel(object):
    def __init__(self, is_training, config):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps], name="input_data")
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps], name="targets")

        lstm_cell = GFLSTMCell(config.num_layers, size)
        cell = GFMultiRNNCell([lstm_cell] * config.num_layers)
        self.initial_state = cell.zero_state(batch_size, tf.float32)
        self.initial_activations = tf.zeros([size, config.num_layers])
        
        # initializer used for reusable variable initializer (see `get_variable`)
        initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size], initializer=initializer)
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        states = []
        outputs = []
        state = self.initial_state
        activations = self.initial_activations

        with tf.variable_scope("RNN", initializer=initializer):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()

                x = inputs[:,time_step,:]
                (h, activations, state) = cell(x, activations, state)

                states.append(state)
                outputs.append(h)

        self.final_state = states[-1]

        output = tf.reshape(tf.concat(1, outputs), [-1, size])
        w = tf.get_variable("softmax_w",
                                    [size, vocab_size],
                                    initializer=initializer)
        b = tf.get_variable("softmax_b", [vocab_size], initializer=initializer)

        logits = tf.nn.xw_plus_b(output, w, b) # compute logits for loss
        targets = tf.reshape(self.targets, [-1]) # reshape our target outputs
        weights = tf.ones([batch_size * num_steps]) # used to scale the loss average

        # computes loss and performs softmax on our fully-connected output layer
        loss = sequence_loss_by_example([logits], [targets], [weights], vocab_size)
        self.cost = cost = tf.div(tf.reduce_sum(loss), batch_size, name="cost")

        if is_training:
            # setup learning rate variable to decay
            self.lr = tf.Variable(1.0, trainable=False)

            # define training operation and clip the gradients
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), config.max_grad_norm)
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars), name="train")
        else:
            # if this model isn't for training (i.e. testing/validation) then we don't do anything here
            self.train_op = tf.no_op()

# Epoch

In [6]:
def run_epoch(sess, model, data, verbose=False):
    epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
    start_time = time.time()

    # accumulated counts
    costs = 0.0
    iters = 0

    # initial RNN state
    state = model.initial_state.eval()

    for step, (x, y) in enumerate(reader.ptb_iterator(data, model.batch_size, model.num_steps)):
        cost, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed_dict={
            model.input_data: x,
            model.targets: y,
            model.initial_state: state
        })
        costs += cost
        iters += model.num_steps

        perplexity = np.exp(costs / iters)

        if verbose and step % 10 == 0:
            progress = (step / epoch_size) * 100
            wps = iters * model.batch_size / (time.time() - start_time)
            print("%.1f%% Perplexity: %.3f (Cost: %.3f) Speed: %.0f wps" % (progress, perplexity, cost, wps))

    return (costs / iters), perplexity

# Config

In [7]:
class Config(object):
    batch_size = 20
    num_steps = 35 # number of unrolled time steps
    hidden_size = 450 # number of blocks in an LSTM cell
    vocab_size = 10000
    max_grad_norm = 5 # maximum gradient for clipping
    init_scale = 0.05 # scale between -0.1 and 0.1 for all random initialization
    keep_prob = 0.5 # dropout probability
    num_layers = 2 # number of LSTM layers
    learning_rate = 1.0
    lr_decay = 0.8
    lr_decay_epoch_offset = 6 # don't decay until after the Nth epoch

In [8]:
# default settings for training
train_config = Config()

# our evaluation runs (validation and testing), use a batch size and time step of one
eval_config = Config()
eval_config.batch_size = 1
eval_config.num_steps = 1

# number of epochs to perform over the training data
num_epochs = 39

# Program

In [9]:
with tf.Graph().as_default(), tf.Session() as sess:
    # define our training model
    with tf.variable_scope("model", reuse=None):
        train_model = PTBModel(is_training=True, config=train_config)

    # we create a separate model for validation and testing to alter the batch size and time steps
    # reuse=True reuses variables from the previously defined `train_model`
    with tf.variable_scope("model", reuse=True):
        valid_model = PTBModel(is_training=False, config=train_config)
        test_model = PTBModel(is_training=False, config=eval_config)

    sess.run(tf.initialize_all_variables())

    # tf.train.write_graph(sess.graph_def, 'models/', 'gf_rnn.pb', as_text=False)

    train_costs = []
    train_perps = []
    valid_costs = []
    valid_perps = []

    for i in range(num_epochs):
        lr_decay = train_config.lr_decay ** max(i - train_config.lr_decay_epoch_offset, 0.0)
        sess.run(tf.assign(train_model.lr, train_config.learning_rate * lr_decay))
        print("Epoch: %d Learning Rate: %.3f" % (i + 1, sess.run(train_model.lr)))

        # run training pass
        train_cost, train_perp = run_epoch(sess, train_model, train_data, verbose=True)
        print("Epoch: %i Training Perplexity: %.3f (Cost: %.3f)" % (i + 1, train_perp, train_cost))
        train_costs.append(train_cost)
        train_perps.append(train_perp)

        # run validation pass
        valid_cost, valid_perplexity = run_epoch(sess, valid_model, valid_data)
        print("Epoch: %i Validation Perplexity: %.3f (Cost: %.3f)" % (i + 1, valid_perp, valid_cost))
        valid_costs.append(valid_cost)
        valid_perps.append(valid_perp)

    # run test pass
    test_cost, test_perp = run_epoch(sess, test_model, test_data)
    print("Test Perplexity: %.3f (Cost: %.3f)" % (test_perp, test_cost))

TensorShape([Dimension(20), Dimension(450)]) TensorShape([Dimension(450), Dimension(450)])
TensorShape([Dimension(450), Dimension(2)]) TensorShape([Dimension(450), Dimension(450)])
TensorShape([Dimension(20), Dimension(450)]) TensorShape([Dimension(450), Dimension(450)])
TensorShape([Dimension(450), Dimension(2)]) TensorShape([Dimension(450), Dimension(450)])
TensorShape([Dimension(20), Dimension(450)]) TensorShape([Dimension(450), Dimension(450)])
TensorShape([Dimension(20), Dimension(900)]) TensorShape([Dimension(450), Dimension(450)])


ValueError: Dimensions Dimension(20) and Dimension(450) are not compatible