### Load Data

In [1]:
import numpy as np
import time


source_path = 'data/letters_source.txt'
target_path = 'data/letters_target.txt'

with open(source_path, 'r') as f:
    source_sentences = f.read()
    
with open(target_path, 'r') as f:
    target_sentences = f.read()

In [2]:
source_sentences[:50].split('\n')

['bsaqq',
 'npy',
 'lbwuj',
 'bqv',
 'kial',
 'tddam',
 'edxpjpg',
 'nspv',
 'huloz',
 '']

In [3]:
target_sentences[:50].split('\n')

['abqqs',
 'npy',
 'bjluw',
 'bqv',
 'aikl',
 'addmt',
 'degjppx',
 'npsv',
 'hlouz',
 '']

### Preprocess

In [4]:
print( ['<PAD>', '<UNK>', '<GO>', '<EOS>'])

['<PAD>', '<UNK>', '<GO>', '<EOS>']


In [5]:
def extract_character_vocab(data):
    special_words = ['<PAD>', '<UNK>', '<GO>',  '<EOS>']

    set_words = set([character for line in data.split('\n') for character in line])
    int_to_vocab = {word_i: word for word_i, word in enumerate(special_words + list(set_words))}
    vocab_to_int = {word: word_i for word_i, word in int_to_vocab.items()}

    return int_to_vocab, vocab_to_int

# Build int2letter and letter2int dicts
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_sentences)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_sentences)
print(source_sentences[:10])
print(source_int_to_letter)
print(source_letter_to_int)
# print(source_letter_to_int)
# Convert characters to ids
source_letter_ids = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>']) for letter in line] for line in source_sentences.split('\n')]
target_letter_ids = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>']) for letter in line] + [target_letter_to_int['<EOS>']] for line in target_sentences.split('\n')] 

print("Example source sequence")
print(source_letter_ids[:3])
print("\n")
print("Example target sequence")
print(target_letter_ids[:3])

bsaqq
npy

{0: '<PAD>', 1: '<UNK>', 2: '<GO>', 3: '<EOS>', 4: 'e', 5: 'h', 6: 'k', 7: 'v', 8: 'p', 9: 'f', 10: 'c', 11: 't', 12: 'd', 13: 'm', 14: 'g', 15: 'b', 16: 'r', 17: 'y', 18: 's', 19: 'z', 20: 'l', 21: 'q', 22: 'n', 23: 'j', 24: 'i', 25: 'x', 26: 'w', 27: 'u', 28: 'o', 29: 'a'}
{'<PAD>': 0, '<UNK>': 1, '<GO>': 2, '<EOS>': 3, 'e': 4, 'h': 5, 'k': 6, 'v': 7, 'p': 8, 'f': 9, 'c': 10, 't': 11, 'd': 12, 'm': 13, 'g': 14, 'b': 15, 'r': 16, 'y': 17, 's': 18, 'z': 19, 'l': 20, 'q': 21, 'n': 22, 'j': 23, 'i': 24, 'x': 25, 'w': 26, 'u': 27, 'o': 28, 'a': 29}
Example source sequence
[[15, 18, 29, 21, 21], [22, 8, 17], [20, 15, 26, 27, 23]]


Example target sequence
[[29, 15, 21, 21, 18, 3], [22, 8, 17, 3], [15, 23, 20, 27, 26, 3]]




This is the final shape we need them to be in. We can now proceed to building the model.

### Build Model
#### Check the Version of TensorFlow

In [6]:
from distutils.version import LooseVersion
import tensorflow as tf
from tensorflow.python.layers.core import Dense


# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use TensorFlow version 1.1 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

  from ._conv import register_converters as _register_converters


TensorFlow Version: 1.10.0


#### Hyperparameters

In [7]:
# Number of Epochs
epochs = 60
# Batch Size
batch_size = 128
# RNN Size
rnn_size = 50
# Number of Layers
num_layers = 2
# Embedding Size
encoding_embedding_size = 15
decoding_embedding_size = 15
# Learning Rate
learning_rate = 0.001

#### Input

In [8]:
def get_model_inputs():
    input_data = tf.placeholder(dtype=tf.int32,
                                shape=[None, None],
                                name='input')
    targets = tf.placeholder(dtype=tf.int32,
                             shape=[None, None],
                             name='targets')
    lr = tf.placeholder(dtype=tf.float32,
                        shape=None,
                        name='learning_rate')
    target_sequence_length = tf.placeholder(tf.int32,
                                            shape=[None,],
                                            name='target_sequence_length')
    
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_length')
    source_sequence_length = tf.placeholder(dtype=tf.int32,
                                            shape=[None,],
                                            name='source_sequence_length')
    
    return input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length

In [8]:
def get_model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')

    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')
    
    return input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length


#### Sequence to Sequence Model

We can now start defining the functions that will build the seq2seq model.

#### 2.1 Encoder

The first bit of the model we'll build is the encoder. Here, we'll embed the input data, construct our encoder, then pass the embedded data to the encoder.

In [9]:
def encoding_layer(input_data, rnn_size, num_layers,
                   source_sequence_length, source_vocab_size,
                   encoding_embedding_size):
    
    ###encoder embedding###
    enc_embed_input = tf.contrib.layers.embed_sequence(ids=input_data, vocab_size=source_vocab_size, embed_dim=encoding_embedding_size)
    
    ###rnn cell###
    def get_rnn(rnn_size):
        enc_cell = tf.contrib.rnn.LSTMCell(num_units=rnn_size,
                                                initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2))
        return enc_cell
    
    ###get rnn###
    enc_cell = tf.contrib.rnn.MultiRNNCell([get_rnn(rnn_size) for _ in range(num_layers)])
    
    ###sequence_length: dealing with different lengths of inputs###
    enc_output, enc_state = tf.nn.dynamic_rnn(cell=enc_cell, inputs=enc_embed_input, sequence_length=source_sequence_length, dtype=tf.float32)
    
    return enc_output, enc_state

In [9]:
def encoding_layer(input_data, rnn_size, num_layers,
                   source_sequence_length, source_vocab_size, 
                   encoding_embedding_size):


    # Encoder embedding
    enc_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoding_embedding_size)

    # RNN cell
    def make_cell(rnn_size):
        enc_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return enc_cell

    enc_cell = tf.contrib.rnn.MultiRNNCell([make_cell(rnn_size) for _ in range(num_layers)])
    
    ###sequence_length: dealing with different lengths of inputs###
    enc_output, enc_state = tf.nn.dynamic_rnn(enc_cell, enc_embed_input, sequence_length=source_sequence_length, dtype=tf.float32)
    
    return enc_output, enc_state

#### 2.2 Decoder

#### Process Decoder Input

In [10]:
# Process the input we'll feed to the decoder
def process_decoder_input(target_data, vocab_to_int, batch_size):
    '''Remove the last word id from each batch and concat the <GO> to the begining of each batch'''
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input


#### Set up the decoder components

##### 1- Embedding
Now that we have prepared the inputs to the training decoder, we need to embed them so they can be ready to be passed to the decoder. 

##### 2- Decoder Cell
Then we declare our decoder cell. 

We need to declare a decoder for the training process, and a decoder for the inference/prediction process. 

##### 3- Dense output layer
Before we move to declaring our decoders, we'll need to create the output layer, which will be a tensorflow.python.layers.core.Dense layer that translates the outputs of the decoder to logits that tell us which element of the decoder vocabulary the decoder is choosing to output at each time step.

##### 4- Training decoder
Essentially, we'll be creating two decoders which share their parameters. One for training and one for inference. 

##### 5- Inference decoder
The inference decoder is the one we'll use when we deploy our model to the wild.

In [18]:
def decoding_layers(target_letter_to_int, decoding_embedding_size, num_layers, rnn_size,
                    target_sequence_length, max_target_sequence_length, enc_state, dec_input):
    ### Decoder Embedding ###
    target_vocab_size = len(target_letter_to_int)
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    ### Construct the decoder cell ###
    def make_cell(rnn_size):
        dec_cell = tf.contrib.rnn.LSTMCell(num_units=rnn_size,
                                                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return dec_cell
    
    dec_cell = tf.contrib.rnn.MultiRNNCell(cells=[make_cell(rnn_size) for _ in range(num_layers)])
    
    
    ### Dense layer ###
    out_layer = Dense(units=target_vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    
    
    ### Training Decoder ###
    with tf.variable_scope("decode"):
        ### Helper for the training process. Used by BasicDecoder to read inputs###
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        
        
        ### Basic decoder ###
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell=dec_cell, 
                                                           helper=training_helper,
                                                           initial_state=enc_state,
                                                           output_layer=out_layer)
        
        ### Performing dynamic decoding using the decoder ###
        training_decoder_output = tf.contrib.seq2seq.dynamic_decode(decoder=training_decoder,
                                                                    impute_finished=True,
                                                                    maximum_iterations=max_target_sequence_length)
        
    ### Inference Decoder ###
    # Reuses the same parameters trained by the training process
    with tf.variable_scope("decode", reuse=True):
        
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32), [batch_size], name='start_tokens')
        
        ### Helper for the inference process ###
        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=dec_embeddings,
                                                                    start_tokens = start_tokens,
                                                                    end_token = target_letter_to_int['<EOS>'])
        
        ### Basic decoder ###
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(cell=dec_cell,
                                                            helper=inference_helper,
                                                            initial_state=enc_state,
                                                            output_layer=out_layer)
        
        ### Inference decoder output###
        inference_decoder_output = tf.contrib.seq2seq.dynamic_decode(decoder=inference_decoder,
                                                                     impute_finished=True,
                                                                     maximum_iterations=max_target_sequence_length)
        
    return training_decoder_output, inference_decoder_output

In [11]:
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers, rnn_size,
                   target_sequence_length, max_target_sequence_length, enc_state, dec_input):
    # 1. Decoder Embedding
    target_vocab_size = len(target_letter_to_int)
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

    # 2. Construct the decoder cell
    def make_cell(rnn_size):
        dec_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return dec_cell

    dec_cell = tf.contrib.rnn.MultiRNNCell([make_cell(rnn_size) for _ in range(num_layers)])
     
    # 3. Dense layer to translate the decoder's output at each time 
    # step into a choice from the target vocabulary
    output_layer = Dense(target_vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))


    # 4. Set up a training decoder and an inference decoder
    # Training Decoder
    with tf.variable_scope("decode"):

        # Helper for the training process. Used by BasicDecoder to read inputs.
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        
        
        # Basic decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                           training_helper,
                                                           enc_state,
                                                           output_layer) 
        
        # Perform dynamic decoding using the decoder
        training_decoder_output = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                                       impute_finished=True,
                                                                       maximum_iterations=max_target_sequence_length)[0]
    # 5. Inference Decoder
    # Reuses the same parameters trained by the training process
    with tf.variable_scope("decode", reuse=True):
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32), [batch_size], name='start_tokens')

        # Helper for the inference process.
        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings,
                                                                start_tokens,
                                                                target_letter_to_int['<EOS>'])

        # Basic decoder
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        enc_state,
                                                        output_layer)
        
        # Perform dynamic decoding using the decoder
        inference_decoder_output = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            impute_finished=True,
                                                            maximum_iterations=max_target_sequence_length)[0]
         

    
    return training_decoder_output, inference_decoder_output

#### 2.3 Seq2seq model 
Hooking up the encoder and decoder using the methods we just declared

In [12]:
def seq2seq_model(input_data, targets, lr, target_sequence_length,
                  max_target_sequence_length, source_sequence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers):
    
    ### encoder layer ###
    _, enc_state = encoding_layer(input_data,
                                  rnn_size,
                                  num_layers,
                                  source_sequence_length,
                                  source_vocab_size,
                                  enc_embedding_size)
    
    
    # Preparing the target sequences which will be fed to training decoder
    dec_input = process_decoder_input(target_data=targets, vocab_to_int=target_letter_to_int, batch_size=batch_size)
    
    # Passing encoder state and decoder inputs to the decoders
    training_decoder_output, inference_decoder_output = decoding_layers(dec_input= dec_input,
                                                                        decoding_embedding_size=dec_embedding_size,
                                                                        enc_state=enc_state,
                                                                        max_target_sequence_length=max_target_sequence_length,
                                                                        num_layers=num_layers,
                                                                        rnn_size=rnn_size,
                                                                        target_letter_to_int=target_letter_to_int,
                                                                        target_sequence_length=target_sequence_length)
    
    return training_decoder_output, inference_decoder_output

In [12]:

def seq2seq_model(input_data, targets, lr, target_sequence_length, 
                  max_target_sequence_length, source_sequence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size, 
                  rnn_size, num_layers):
    
    # Pass the input data through the encoder. We'll ignore the encoder output, but use the state
    _, enc_state = encoding_layer(input_data, 
                                  rnn_size, 
                                  num_layers, 
                                  source_sequence_length,
                                  source_vocab_size, 
                                  encoding_embedding_size)
    
    
    # Prepare the target sequences we'll feed to the decoder in training mode
    dec_input = process_decoder_input(targets, target_letter_to_int, batch_size)
    
    # Pass encoder state and decoder inputs to the decoders
    training_decoder_output, inference_decoder_output = decoding_layer(target_letter_to_int, 
                                                                       decoding_embedding_size, 
                                                                       num_layers, 
                                                                       rnn_size,
                                                                       target_sequence_length,
                                                                       max_target_sequence_length,
                                                                       enc_state, 
                                                                       dec_input) 
    
    return training_decoder_output, inference_decoder_output
    



In [19]:
###Graph part###
train_graph = tf.Graph()
with train_graph.as_default():
    
    ###input layer###
    with tf.name_scope('input'):
        input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_model_inputs()
        
    ###seq2seq###
    

    training_decoder_output, inference_decoder_output = seq2seq_model(input_data=input_data,
                                                                      targets=targets,
                                                                      lr=lr,
                                                                      target_sequence_length=target_sequence_length,
                                                                      max_target_sequence_length=max_target_sequence_length,
                                                                      source_sequence_length=source_sequence_length,
                                                                      source_vocab_size=len(source_letter_to_int),
                                                                      target_vocab_size=len(target_letter_to_int),
                                                                      enc_embedding_size=encoding_embedding_size,
                                                                      dec_embedding_size=decoding_embedding_size,
                                                                      rnn_size=rnn_size,
                                                                      num_layers=num_layers)

    #Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_decoder_output[0].rnn_output, name='logits')
    inference_logits = tf.identity(inference_decoder_output[0].sample_id, name='predictions')
        
    ###loss###
    with tf.name_scope('loss'):
        mask = tf.sequence_mask(lengths=target_sequence_length,
                                maxlen=max_target_sequence_length,
                                dtype=tf.float32,
                                name='mask')
        cost = tf.contrib.seq2seq.sequence_loss(logits=training_logits,
                                                targets=targets,
                                                weights=mask)
    ###optimization###
    with tf.name_scope('optimization'):
        #optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        
        #gradient clipping
        grads_and_vars = optimizer.compute_gradients(cost)
        clipped_grads = [(tf.clip_by_value(grad, -5, 5), var) for grad, var in grads_and_vars if grad is not None]
        update = optimizer.apply_gradients(clipped_grads)
    
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    whole_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)


In [15]:
tf.reset_default_graph()
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
    
    # Load the model inputs    
    input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_model_inputs()
    
    # Create the training and inference logits
    training_decoder_output, inference_decoder_output = seq2seq_model(input_data, 
                                                                      targets, 
                                                                      lr, 
                                                                      target_sequence_length, 
                                                                      max_target_sequence_length, 
                                                                      source_sequence_length,
                                                                      len(source_letter_to_int),
                                                                      len(target_letter_to_int),
                                                                      encoding_embedding_size, 
                                                                      decoding_embedding_size, 
                                                                      rnn_size, 
                                                                      num_layers)    
    
    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    inference_logits = tf.identity(inference_decoder_output.sample_id, name='predictions')
    
    # Create the weights for sequence_loss
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
    print(tf.trainable_variables())
    
print(len(source_letter_to_int))

[<tf.Variable 'EmbedSequence/embeddings:0' shape=(30, 15) dtype=float32_ref>, <tf.Variable 'rnn/multi_rnn_cell/cell_0/lstm_cell/kernel:0' shape=(65, 200) dtype=float32_ref>, <tf.Variable 'rnn/multi_rnn_cell/cell_0/lstm_cell/bias:0' shape=(200,) dtype=float32_ref>, <tf.Variable 'rnn/multi_rnn_cell/cell_1/lstm_cell/kernel:0' shape=(100, 200) dtype=float32_ref>, <tf.Variable 'rnn/multi_rnn_cell/cell_1/lstm_cell/bias:0' shape=(200,) dtype=float32_ref>, <tf.Variable 'Variable:0' shape=(30, 15) dtype=float32_ref>, <tf.Variable 'decode/decoder/multi_rnn_cell/cell_0/lstm_cell/kernel:0' shape=(65, 200) dtype=float32_ref>, <tf.Variable 'decode/decoder/multi_rnn_cell/cell_0/lstm_cell/bias:0' shape=(200,) dtype=float32_ref>, <tf.Variable 'decode/decoder/multi_rnn_cell/cell_1/lstm_cell/kernel:0' shape=(100, 200) dtype=float32_ref>, <tf.Variable 'decode/decoder/multi_rnn_cell/cell_1/lstm_cell/bias:0' shape=(200,) dtype=float32_ref>, <tf.Variable 'decode/decoder/dense/kernel:0' shape=(50, 30) dtype=f

In [29]:
def pad_sentence_batch(sentence_batch, pad_int):
    '''
    Pad sentences with <PAD> so that each sentence of a batch has the same length.
    '''
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [14]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [26]:
def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    idx = 0
    while (idx+batch_size) <= len(sources):
        sources_batch = sources[idx:idx+batch_size]
        targets_batch = targets[idx:idx+batch_size]
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))
        
        idx += batch_size
        
        targets_batch_lengths = []
        for target in targets_batch:
            targets_batch_lengths.append(len(target))
        
        sources_batch_lengths = []
        for source in sources_batch:
            sources_batch_lengths.append(len(source))
            
        yield pad_targets_batch, pad_sources_batch, targets_batch_lengths, sources_batch_lengths

In [15]:
def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    """Batch targets, sources, and the lengths of their sentences together"""
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size
        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))
        
        # Need the lengths for the _lengths parameters
        # 注意，此處與課程投影片中的程式碼不同，須返回的長度是做padding前的長度
        targets_batch_lengths = []
        for target in targets_batch:
            targets_batch_lengths.append(len(target))
        
        source_batch_lengths = []
        for source in sources_batch:
            source_batch_lengths.append(len(source))
        
        yield pad_targets_batch, pad_sources_batch, targets_batch_lengths, source_batch_lengths

### Train
We're now ready to train our model. If you run into OOM (out of memory) issues during training, try to decrease the batch_size.

In [34]:
#Split data to training and validation sets
train_source = source_letter_ids[batch_size:]
train_target = target_letter_ids[batch_size:]
valid_source = source_letter_ids[:batch_size]
valid_target = target_letter_ids[:batch_size]
(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target,
                                                                                                            valid_source,
                                                                                                            batch_size,
                                                                                                            source_letter_to_int['<PAD>'],
                                                                                                            target_letter_to_int['<PAD>']))
display_step = 20

checkpoint = "model/best_model_20190131.ckpt"
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epc in range(1, epochs+1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(get_batches(train_target,
                                                                                                              train_source,
                                                                                                              batch_size,
                                                                                                              source_letter_to_int['<PAD>'],
                                                                                                              target_letter_to_int['<PAD>'])):
            _, loss, train_out, inference_out = sess.run([update, cost, training_decoder_output, inference_decoder_output],
                                                          feed_dict={input_data: sources_batch,
                                                                     targets: targets_batch,
                                                                     lr: learning_rate,
                                                                     target_sequence_length: targets_lengths,
                                                                     source_sequence_length: sources_lengths})
            
            
            
            if batch_i % display_step == 0 and batch_i >0:
                
                # calculate validation cost
                validation_loss = sess.run(cost, feed_dict={input_data: valid_sources_batch,
                                                              targets: valid_targets_batch,
                                                              lr: learning_rate,
                                                              target_sequence_length: valid_targets_lengths,
                                                              source_sequence_length: valid_sources_lengths})
                
                print('Epoch: {}/{} Batch: {}/{} - Loss: {:>6.3f} - Validation loss: {:>6.3f}'.format(epc,
                                                                                                      epochs,
                                                                                                      batch_i,
                                                                                                      len(train_source)//batch_size,
                                                                                                      loss,
                                                                                                      validation_loss))
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)
    print('Model Trained and Saved')

Epoch: 1/60 Batch: 20/77 - Loss:  3.124 - Validation loss:  3.113
Epoch: 1/60 Batch: 40/77 - Loss:  2.975 - Validation loss:  2.965
Epoch: 1/60 Batch: 60/77 - Loss:  2.706 - Validation loss:  2.710
Epoch: 2/60 Batch: 20/77 - Loss:  2.331 - Validation loss:  2.356
Epoch: 2/60 Batch: 40/77 - Loss:  2.251 - Validation loss:  2.259
Epoch: 2/60 Batch: 60/77 - Loss:  2.126 - Validation loss:  2.121
Epoch: 3/60 Batch: 20/77 - Loss:  1.960 - Validation loss:  1.944
Epoch: 3/60 Batch: 40/77 - Loss:  1.906 - Validation loss:  1.885
Epoch: 3/60 Batch: 60/77 - Loss:  1.821 - Validation loss:  1.817
Epoch: 4/60 Batch: 20/77 - Loss:  1.694 - Validation loss:  1.712
Epoch: 4/60 Batch: 40/77 - Loss:  1.637 - Validation loss:  1.663
Epoch: 4/60 Batch: 60/77 - Loss:  1.557 - Validation loss:  1.605
Epoch: 5/60 Batch: 20/77 - Loss:  1.461 - Validation loss:  1.500
Epoch: 5/60 Batch: 40/77 - Loss:  1.422 - Validation loss:  1.444
Epoch: 5/60 Batch: 60/77 - Loss:  1.338 - Validation loss:  1.379
Epoch: 6/6

### Checking the Output of Seq2Seq Model

In [69]:
# final state
train_out[1][0].h.shape, train_out[1][0].c.shape

((128, 50), (128, 50))

In [70]:
# final sequence length
train_out[2]

array([7, 8, 8, 7, 6, 3, 4, 7, 6, 4, 3, 8, 3, 4, 3, 4, 8, 7, 6, 3, 7, 6,
       4, 6, 2, 8, 6, 6, 3, 7, 6, 2, 3, 3, 3, 5, 5, 6, 4, 6, 3, 5, 3, 5,
       8, 6, 8, 7, 8, 8, 6, 8, 7, 8, 7, 8, 7, 6, 6, 4, 4, 7, 2, 8, 7, 2,
       2, 3, 2, 8, 2, 3, 3, 2, 2, 4, 3, 8, 4, 4, 7, 8, 6, 7, 2, 4, 2, 6,
       7, 7, 7, 8, 3, 3, 5, 4, 3, 3, 8, 6, 4, 8, 5, 3, 6, 2, 2, 4, 6, 4,
       7, 8, 7, 7, 3, 7, 6, 8, 4, 8, 6, 5, 6, 7, 8, 2, 3, 3], dtype=int32)

In [54]:
len(source_letter_to_int), len(target_letter_to_int)

(30, 30)

In [41]:
# final outputs = (rnn_output, sample_id)
train_out[0].rnn_output.shape, train_out[0].sample_id.shape

((128, 8, 30), (128, 8))

In [76]:
# the first letter of the first sequence is 12
np.argmax(train_out[0].rnn_output[0,0,:])

12

In [75]:
# the maximum sequence length of this last batch is 8
train_out[0].sample_id

array([[12,  6, 28, ..., 17,  3,  0],
       [29, 24, 13, ..., 26, 25,  3],
       [28,  8, 18, ..., 25, 25,  3],
       ...,
       [17,  3,  0, ...,  0,  0,  0],
       [ 9, 23,  3, ...,  0,  0,  0],
       [24, 18,  3, ...,  0,  0,  0]], dtype=int32)

In [52]:
inference_out[0].rnn_output.shape, inference_out[0].sample_id.shape

((128, 8, 30), (128, 8))

### Prediction

In [77]:
def source_to_seq(text):
    '''
    Prepare the text for the model
    '''
    sequence_length = 7
    return [source_letter_to_int.get(word, source_letter_to_int['<UNK>']) for word in text] + [source_letter_to_int['<PAD>']]*(sequence_length-len(text))

In [17]:
def source_to_seq(text):
    '''Prepare the text for the model'''
    sequence_length = 7
    return [source_letter_to_int.get(word, source_letter_to_int['<UNK>']) for word in text]+ [source_letter_to_int['<PAD>']]*(sequence_length-len(text))

In [83]:
input_sentence = 'edcba'
text = source_to_seq(input_sentence)

checkpoint = "model/best_model_20190131.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    ###Load saved model###
    loader = tf.train.import_meta_graph(checkpoint+'.meta')
    loader.restore(sess, checkpoint)
    
    input_data = loaded_graph.get_tensor_by_name('input/input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('input/source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('input/target_sequence_length:0')
    
    ###Multiply by batch_size to match the model's input parameters###
    answer_logits = sess.run(logits, {input_data: [text]*batch_size,
                                      target_sequence_length: [len(input_sentence)]*batch_size,
                                      source_sequence_length: [len(input_sentence)]*batch_size})[0]

pad = source_letter_to_int['<PAD>']
print('Original Text:', input_sentence)
print('\nSource')
print('Word Ids: {}'.format([i for i in text]))
print('Input Words: {}'.format(" ".join([source_int_to_letter[i] for i in text])))

print('\nTarget')
print('Word Ids: {}'.format([i for i in answer_logits if i != pad]))
print('Response Words: {}'.format(" ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from model/best_model_20190131.ckpt
Original Text: edcba

Source
Word Ids: [4, 12, 10, 15, 29, 0, 0]
Input Words: e d c b a <PAD> <PAD>

Target
Word Ids: [29, 15, 10, 12, 4]
Response Words: a b c d e


In [20]:
input_sentence = 'lexa'
text = source_to_seq(input_sentence)

checkpoint = "model/best_model.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    
    #Multiply by batch_size to match the model's input parameters
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      target_sequence_length: [len(input_sentence)]*batch_size, 
                                      source_sequence_length: [len(input_sentence)]*batch_size})[0]


pad = source_letter_to_int["<PAD>"] 

print('Original Text:', input_sentence)

print('\nSource')
print('  Word Ids:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(" ".join([source_int_to_letter[i] for i in text])))

print('\nTarget')
print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(" ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from model/best_model.ckpt
Original Text: lexa

Source
  Word Ids:    [16, 4, 27, 18, 0, 0, 0]
  Input Words: l e x a <PAD> <PAD> <PAD>

Target
  Word Ids:       [18, 4, 16, 27]
  Response Words: a e l x
