# A TensorFlow Seq2Seq 

for compitability with tensorflow v1.2 the code is adapted from the code 
https://github.com/JayParks/tf-seq2seq/blob/master/seq2seq_model.py

The model contains the following modules 
- Sequence to sequence model
- Encoder is multilayer GRU
- Decoder is a multilayer GRU

Until this part the model is only trainable in the next ver



To make the RNN-encoder-decoder code easy to understand we assume the following 

- all additional characters such as `<unk> <rare> <pad>` are added to the vocabulary 
- vocabulary is created offline 
- The inputs to the decoder are NOT preprocessed beforehand to start with  `<s>` and  `<\s>`
- `<s>` and  `<\s>` are stored in config.DECODER_START_TOKEN_ID and config.DECODER_END_TOKEN_ID and added on the fly in the model


## Configurations

In [1]:
class config():
    NUM_LAYERS = 3
    HIDDEN_SIZE = 256
    BATCH_SIZE = 64

    LR = 0.5
    MAX_GRAD_NORM = 5.0
    ATTENTION_SIZE = 30
    NUM_SAMPLES = 512
    
    ENC_VOCAB = 300
    DEC_VOCAB = 900

    DECODER_START_TOKEN_ID = 2
    DECODER_END_TOKEN_ID = 3

## Creating Model

In [2]:
from __future__ import print_function
import time
import math

In [3]:
# tensorflow imports
import tensorflow as tf
from tensorflow.python.layers.core import Dense

In [4]:
class Seq2Seq():

    def __init__(self, mode='training'):
        print('Initializing new seq 2 seq model')

        assert mode in ['training', 'evaluation', 'inference']
        self.mode = mode

        self.__create_placeholders()
        self.__create_encoder()
        self.__create_decoder()
        self.__create_loss()
        self.__create_optimizer()

    def __create_placeholders(self):

        # encoder_inputs : size [batch_size, max_step_size]
        self.encoder_inputs = tf.placeholder(tf.int32, shape=[None, None], name="encoder_inputs")
        
        # encoder_inputs_length: [batch_size]
        self.encoder_inputs_length = tf.placeholder(dtype=tf.int32, shape=(None,), name='encoder_inputs_length')

        self.batch_size = tf.shape(self.encoder_inputs)[0]

        ## Decoder placeholders:
        ## these are the raw inputs to the decoder:
        self.decoder_inputs = tf.placeholder(tf.int32, shape=[None, None], name="decoder_inputs")
        self.decoder_inputs_length = tf.placeholder(dtype=tf.int32, shape=(None,), name='decoder_inputs_length')

        # for training we add <s> start tag for the input of the decoder and </s> end tag for the decoder target
        # as shown in figure https://www.tensorflow.org/images/basic_seq2seq.png

        starttokens = tf.ones([self.batch_size, 1], dtype=tf.int32) * config.DECODER_START_TOKEN_ID
        endtokens = tf.ones([self.batch_size, 1], dtype=tf.int32) * config.DECODER_END_TOKEN_ID

        self.decoder_inputs_train = tf.concat([starttokens, self.decoder_inputs], axis=1)
        self.decoder_targets_train = tf.concat([self.decoder_inputs, endtokens], axis=1)

        # decoder_inputs_length_train: [batch_size]
        # both input and target to the decoder are of the same length
        self.decoder_inputs_length_train = self.decoder_inputs_length + 1
        self.decoder_targets_length_train = self.decoder_inputs_length + 1
        # calculating max_decoder_length
        self.decoder_max_length = tf.reduce_max(self.decoder_inputs_length_train)


        # global step
        self.global_step = tf.Variable(0, trainable=False, name='global_step')



    def __build_sample_softmax(self):
        
        if 0 < config.NUM_SAMPLES < config.DEC_VOCAB:

            w = tf.get_variable("proj_w", shape=[config.HIDDEN_SIZE, config.DEC_VOCAB])
            b = tf.get_variable("proj_b", shape=[config.DEC_VOCAB])
            self.output_projection = (w, b)

        def sampled_loss(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(tf.transpose(w), b, inputs, labels,
                                              config.NUM_SAMPLES, config.DEC_VOCAB)

        self.softmax_loss_function = sampled_loss

    def __create_encoder(self):
        print('building encoder ...')
        start = time.time()

        with tf.variable_scope('encoder'):

            # Create Embeddings Weights
            self.encoder_embeddings = tf.get_variable("encoder_embeddings",
                                                      shape=[config.ENC_VOCAB, config.HIDDEN_SIZE],
                                                      initializer=self.__helper__initializer(),
                                                      dtype=tf.float32
                                                      )
            # embedding the encoder inputs
            encoder_inputs_embedded = tf.nn.embedding_lookup(self.encoder_embeddings, self.encoder_inputs)

            # changing the dimensionality of embedded inputs into hidden size
            encoder_input_layer = Dense(config.HIDDEN_SIZE, dtype=tf.float32, name='encoder_input_projection')
            self.encoder_inputs_embedded = encoder_input_layer(encoder_inputs_embedded)

            # create encoder cell
            gru = tf.nn.rnn_cell.GRUCell(config.HIDDEN_SIZE)
            self.encoder_cell = tf.nn.rnn_cell.MultiRNNCell([gru] * config.NUM_LAYERS)

            # Encode input sequences into context vectors:
            # encoder_outputs: [batch_size, max_time_step, cell_output_size]
            # encoder_state: [batch_size, cell_output_size]
            
            self.encoder_outputs, self.encoder_last_state = tf.nn.dynamic_rnn(
                cell=self.encoder_cell,
                inputs=self.encoder_inputs_embedded,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32
            )

        print('Building encoder in: ', time.time() - start, ' secs')

    def __create_decoder(self):
        print("building decoder and attention ..")
        start = time.time()

        with tf.variable_scope('decoder'):

            # input and output layers to the decoder
            decoder_input_layer = Dense(config.HIDDEN_SIZE, dtype=tf.float32, name='decoder_input_projection')
            decoder_output_layer = Dense(config.DEC_VOCAB, name="decoder_output_projection")
            self.decoder_initial_state = self.encoder_last_state

            # creating decoder embedding weights
            self.decoder_embeddings = tf.get_variable("decoder_embeddings",
                                                      shape=[config.DEC_VOCAB, config.HIDDEN_SIZE],
                                                      initializer=self.__helper__initializer(),
                                                      dtype=tf.float32
                                                      )

            # create decoder cell:
            gru = tf.nn.rnn_cell.GRUCell(config.HIDDEN_SIZE)
            self.decoder_cell_list = [gru] * config.NUM_LAYERS

            self.decoder_cell = tf.nn.rnn_cell.MultiRNNCell(self.decoder_cell_list)

            # compose the decoder
            if self.mode == 'training':

                # changing inputs to embeddings and then through the input projection
                # decoder_inputs_embedded: [batch_size, max_time_step + 1, embedding_size]
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(params=self.decoder_embeddings, ids=self.decoder_inputs_train)
                self.decoder_inputs_embedded = decoder_input_layer(self.decoder_inputs_embedded)

                # Helper to feed inputs to the training:

                self.training_helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs=self.decoder_inputs_embedded,
                    sequence_length=self.decoder_inputs_length_train,
                    name='training_helper')

                # self.decoder_initial_state = [state for state in self.encoder_last_state]
                # self.decoder_initial_state[-1] = self.decoder_cell_list[-1].zero_state(batch_size=tf.shape(self.encoder_inputs)[0], dtype=tf.float32)
                self.decoder_initial_state = self.encoder_last_state

                # Build the decoder
                self.decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=self.training_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=decoder_output_layer)

                self.decoder_outputs, self.decoder_last_state, self.decoder_outputs_length_decode = tf.contrib.seq2seq.dynamic_decode(
                    decoder=self.decoder,
                    impute_finished=True,
                    maximum_iterations=self.decoder_max_length
                )

        print('Building decoder in: ', time.time() - start, ' secs')

    def __create_loss(self):

        print('Creating loss...')
        start = time.time()

        self.decoder_logits = tf.identity(self.decoder_outputs.rnn_output, name="decoder_logits")
        self.decoder_pred = tf.argmax(self.decoder_logits, axis=-1, name="decoder_pred")

        # masking the sequence in order to calculate the error according to the calculated
        mask = tf.sequence_mask(self.decoder_inputs_length_train, maxlen=self.decoder_max_length, dtype=tf.float32, name="masks")

        # Control loss dimensions with `average_across_timesteps` and `average_across_batch`
        self.loss = tf.contrib.seq2seq.sequence_loss(logits=self.decoder_logits,
                                         targets=self.decoder_targets_train,
                                         average_across_timesteps=True,
                                         average_across_batch=True,
                                         weights=mask,
                                         name="batch_loss")

        print('Building loss in: ', time.time() - start, ' secs')


    def __create_optimizer(self):
        print('creating optimizer...')
        start = time.time()

        self.opt = tf.train.RMSPropOptimizer(learning_rate=config.LR)

        # normalize the gradients of a parameter vector when its L2 norm exceeds a certain threshold according to
        trainable_params = tf.trainable_variables()
        
        # calculate gradients of the loss given all the trainable parameters
        gradients = tf.gradients(self.loss, trainable_params)
        
        # Gradient clipping
        # new_gradients = gradients * threshold / l2_norm(gradients)
        clip_gradients, _ = tf.clip_by_global_norm(gradients, config.MAX_GRAD_NORM)

        self.updates = self.opt.apply_gradients(zip(clip_gradients, trainable_params), global_step=self.global_step)


        print('Building optimizer in: ', time.time() - start, ' secs')


    def __helper__initializer(self):
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = tf.random_uniform_initializer(-sqrt3, sqrt3, dtype=tf.float32)
        return initializer

    def train(self, sess,encoder_inputs, encoder_inputs_length, decoder_inputs, decoder_inputs_lengths):

        feed_dict = {
            self.encoder_inputs: encoder_inputs,
            self.encoder_inputs_length: encoder_inputs_length,
            self.decoder_inputs: decoder_inputs,
            self.decoder_inputs_length: decoder_inputs_lengths
        }
        _, loss = sess.run([self.updates, self.loss], feed_dict=feed_dict)

        return loss

In [5]:
model = Seq2Seq()

Initializing new seq 2 seq model
building encoder ...
Building encoder in:  0.166300773621  secs
building decoder and attention ..
Building decoder in:  1.03757119179  secs
Creating loss...
Building loss in:  0.0138518810272  secs
creating optimizer...
Building optimizer in:  1.06437301636  secs


In [8]:
import numpy as np
from collections import deque


encoder_lengths = [5, 8, 8, 5, 10] * 100
decoder_lengths = [5, 8, 8, 5, 10] * 100

encoder_inputs = np.zeros((500,10))

for c, i in enumerate(encoder_lengths):
    encoder_inputs[c,:i] = np.random.randint(0, 300, i)

decoder_inputs = np.zeros_like(encoder_inputs)

for c, r in enumerate(encoder_inputs):
    tmp = deque(r)
    tmp.rotate(1)
    tmp1 = tmp
    tmp.rotate(1)
    tmp2 = tmp
    tmp.rotate(1)
    tmp3 = tmp
    decoder_inputs[c] = np.sum([tmp1, tmp2, tmp3], axis=0)


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    encoder_inputs = np.array(encoder_inputs)
    encoder_lengths = np.array(encoder_lengths)
    decoder_inputs = np.array(decoder_inputs)
    decoder_lengths = np.array(decoder_lengths)


    EPOCHS = 100

    for i in range(EPOCHS):
        loss = model.train(sess, encoder_inputs, encoder_lengths, decoder_inputs, decoder_lengths)
        print(loss)

6.80885
6.74696
6.68086
6.60276
6.49979
6.34733
6.11445
5.97212
6.10463
5.60023
5.66922
6.5992
6.21638
5.85056
5.34537
5.25792
5.22079
5.70348
7.3468
6.12712
5.69715


KeyboardInterrupt: 