In [115]:
import os
import tensorflow as tf
from tensorflow.python.ops import lookup_ops
from tensorflow.python.layers import core as layers_core

tf.reset_default_graph()

# Model parameters

Set `use_toy_data` to `True` for toy experiments. This will train the network on two unique examples.

The real dataset is morphological reinflection task: Hungarian nouns in the instrumental case.
Hungarian features both vowel harmony and assimilation.
A few examples are listed here (capitalization is added for emphasis):

| input | output | meaning | what happens |
| :-----: | :-----: | :-----: | :-----: |
| autó | autóval | with car | |
| Peti | Petiv**E**l | with Pete | vowel harmony |
| fej | fej**J**el | with head | assimilation |
| pálca | pálc**Á**val | with stick | low vowel lengthening |
| kulcs | kul**CCS**al | with key | digraph + assimilation |

This turns out to be a very easy task for a fairly small seq2seq model.

In [116]:
use_toy_data = False
LOG_DIR = 'logs'  # Tensorboard log directory

if use_toy_data:
    batch_size = 8
    embedding_dim = 5
    cell_size = 32 
    max_len = 6
else:
    batch_size = 64
    embedding_dim = 20
    cell_size = 128
    max_len = 33
    
use_attention = True
use_bidirectional_encoder = True
is_time_major = True

# Download data if necessary

The input data is expected in the following format:

~~~
i n p u t 1 TAB o u t p u t 1
i n p u t 2 TAB o u t p u t 2
~~~

Each line contains a single input-output pair separated by a TAB.
Tokens are space-separated.

In [117]:
if use_toy_data:
    input_fn = 'toy_input.txt'
    with open(input_fn, 'w') as f:
        f.write('a b c\td e f d e f\n')
        f.write('d e f\ta b c a b c\n')
else:
    DATA_DIR = '../../data/'
    input_fn = 'instrumental.full.train'
    input_fn = os.path.join(DATA_DIR, input_fn)
    if not os.path.exists(input_fn):
        import urllib
        u = urllib.request.URLopener()
        u.retrieve(
            "http://sandbox.mokk.bme.hu/~judit/resources/instrumental.full.train", input_fn)

# Load and preprocess data

In [118]:
class Dataset(object):
    PAD = 0
    UNK = 1
    SOS = 1
    EOS = 2
    src_vocab = ['PAD', 'UNK']
    tgt_vocab = ['PAD', 'SOS', 'EOS']
    hu_alphabet = list("aábcdeéfghiíjklmnoóöőpqrstuúüűvwxyz-+._")
    
    def __init__(self, fn, config, src_alphabet=None, tgt_alphabet=None, share_vocab=False):
        self.config = config
        self.create_tables(src_alphabet, tgt_alphabet, share_vocab)
        self.load_and_preproc_dataset(fn)
        
    def create_tables(self, src_alphabet, tgt_alphabet, share_vocab):
        if src_alphabet is None:
            self.src_vocab = Dataset.src_vocab + Dataset.hu_alphabet
        else:
            self.src_vocab = Dataset.src_vocab + alphabet
        self.src_table = lookup_ops.index_table_from_tensor(
            tf.constant(self.src_vocab), default_value=Dataset.UNK
        )
        if share_vocab:
            self.tgt_vocab = self.src_vocab
            self.tgt_table = self.src_table
        else:
            if tgt_alphabet is None:
                self.tgt_vocab = Dataset.tgt_vocab + Dataset.hu_alphabet
            else:
                self.tgt_vocab = Dataset.tgt_vocab + alphabet
            self.tgt_table = lookup_ops.index_table_from_tensor(
                tf.constant(self.tgt_vocab), default_value=Dataset.UNK
            )
        self.src_vocab_size = len(self.src_vocab)
        self.tgt_vocab_size = len(self.tgt_vocab)
    
    def load_and_preproc_dataset(self, fn):
        dataset = tf.contrib.data.TextLineDataset(fn)
        dataset = dataset.repeat()
        dataset = dataset.map(lambda s: tf.string_split([s], delimiter='\t').values)
        
        src = dataset.map(lambda s: s[0])
        tgt = dataset.map(lambda s: s[1])
        
        src = src.map(lambda s: tf.string_split([s], delimiter=' ').values)
        # src = src.map(lambda s: s[:self.config.src_maxlen])
        tgt = tgt.map(lambda s: tf.string_split([s], delimiter=' ').values)
        # tgt = tgt.map(lambda s: s[:self.config.tgt_maxlen])
        
        src = src.map(lambda words: self.src_table.lookup(words))
        tgt = tgt.map(lambda words: self.tgt_table.lookup(words))
        
        dataset = tf.contrib.data.Dataset.zip((src, tgt))
        dataset = dataset.map(
            lambda src, tgt: (
                src,
                tf.concat(([Dataset.SOS], tgt), 0),
                tf.concat((tgt, [Dataset.EOS]), 0),
            )
        )
        dataset = dataset.map(
            lambda src, tgt_in, tgt_out: (src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))
        )
        batched = dataset.padded_batch(
            self.config.batch_size,
            padded_shapes=(
                tf.TensorShape([self.config.src_maxlen]),
                tf.TensorShape([self.config.tgt_maxlen+2]),
                tf.TensorShape([None]),
                tf.TensorShape([]),
                tf.TensorShape([]),
            )
        )
        self.batched_iter = batched.make_initializable_iterator()
        s = self.batched_iter.get_next()
        self.src_ids = s[0]
        self.tgt_in_ids = s[1]
        self.tgt_out_ids = s[2]
        self.src_size = s[3]
        self.tgt_size = s[4]
        
    def run_initializers(self, session):
        session.run(tf.tables_initializer())
        session.run(self.batched_iter.initializer)

In [119]:
if use_toy_data:
    vocab = ['PAD', 'UNK', 'EOS', 'SOS'] + list("abcdef")
else:
    vocab = ['PAD', 'UNK', 'EOS', 'SOS'] + list("aábcdeéfghiíjklmnoóöőpqrstuúüűvwxyz-+._")
EOS = 2  # end of sentence
SOS = 3  # start of sentence (GO symbol)
table = lookup_ops.index_table_from_tensor(tf.constant(vocab), default_value=1)
vocab = {k: i for i, k in enumerate(vocab)}
vocab_size = len(vocab)

table_initializer = tf.tables_initializer()

dataset = tf.contrib.data.TextLineDataset(input_fn)
dataset = dataset.repeat()
dataset = dataset.map(lambda string: tf.string_split([string], delimiter='\t').values)
source = dataset.map(lambda string: string[0])
target = dataset.map(lambda string: string[1])

source = source.map(lambda string: tf.string_split([string], delimiter=' ').values)
source = source.map(lambda words: table.lookup(words))
target = target.map(lambda string: tf.string_split([string], delimiter=' ').values)
target = target.map(lambda words: table.lookup(words))

src_tgt_dataset = tf.contrib.data.Dataset.zip((source, target))
src_tgt_dataset = src_tgt_dataset.map(
    lambda src, tgt: (src,
                      tf.concat(([SOS], tgt), 0),
                      tf.concat((tgt, [EOS]), 0),)
)
src_tgt_dataset = src_tgt_dataset.map(
    lambda src, tgt_in, tgt_out: (src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))
)

batched = src_tgt_dataset.padded_batch(batch_size, padded_shapes=(
    tf.TensorShape([max_len+2]), tf.TensorShape([max_len+2]), tf.TensorShape([None]),
         tf.TensorShape([]), tf.TensorShape([])))
batched_iter = batched.make_initializable_iterator()
src_ids, tgt_in_ids, tgt_out_ids, src_size, tgt_size = batched_iter.get_next()

# Create model

## Embedding

The input and output embeddings are the same.

In [120]:
class Config(object):
    def __init__(self):
        self.batch_size = 64
        self.src_maxlen = 30
        self.tgt_maxlen = 30
        
dataset = Dataset(input_fn, Config())

In [121]:
with tf.variable_scope("embedding"):
    embedding = tf.get_variable("embedding", [vocab_size, embedding_dim], dtype=tf.float32)
    embedding_input = tf.nn.embedding_lookup(embedding, dataset.src_ids)
    decoder_emb_inp = tf.nn.embedding_lookup(embedding, dataset.tgt_in_ids)
    if is_time_major:
        embedding_input = tf.transpose(embedding_input, [1, 0, 2])
        decoder_emb_inp = tf.transpose(decoder_emb_inp, [1, 0, 2])

## Encoder

In [122]:
with tf.variable_scope("encoder"):
    
    if use_bidirectional_encoder:
        fw_cell = tf.nn.rnn_cell.BasicLSTMCell(cell_size)
        fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, input_keep_prob=0.8)
        bw_cell = tf.nn.rnn_cell.BasicLSTMCell(cell_size)
        bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, input_keep_prob=0.8)

        o, e = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, embedding_input, dtype='float32', sequence_length=dataset.src_size,
            time_major=is_time_major)
        encoder_outputs = tf.concat(o, -1)
        encoder_state = e
    
    else:
        fw_cell = tf.nn.rnn_cell.BasicLSTMCell(cell_size)
        fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, input_keep_prob=0.8)
        o, e = tf.nn.dynamic_rnn(fw_cell, embedding_input, dtype='float32',
                                 sequence_length=dataset.src_size, time_major=is_time_major)
        encoder_outputs = o
        encoder_state = e
    

## Decoder

In [123]:
with tf.variable_scope("decoder", dtype="float32") as scope:
    if use_bidirectional_encoder:
        decoder_cells = []
        for i in range(2):
            decoder_cell = tf.contrib.rnn.BasicLSTMCell(cell_size)
            decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell, input_keep_prob=0.8)
            decoder_cells.append(decoder_cell)
        decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)

        if use_attention:
            if is_time_major:
                attention_states = tf.transpose(encoder_outputs, [1, 0, 2])
            else:
                attention_states = encoder_outputs
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                cell_size, attention_states, memory_sequence_length=dataset.src_size,
                scale=True
            )
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                decoder_cell, attention_mechanism, attention_layer_size=cell_size,
                name="attention"
            )
            if is_time_major:
                decoder_initial_state = decoder_cell.zero_state(
                    tf.shape(decoder_emb_inp)[1], tf.float32).clone(cell_state=encoder_state)
            else:
                decoder_initial_state = decoder_cell.zero_state(
                    tf.shape(decoder_emb_inp)[0], tf.float32).clone(cell_state=encoder_state)
        else:
            decoder_initial_state = encoder_state
            
    else:
        decoder_cell = tf.contrib.rnn.BasicLSTMCell(cell_size)
        decoder_initial_state = encoder_state
        
    helper = tf.contrib.seq2seq.TrainingHelper(
        decoder_emb_inp, dataset.tgt_size, time_major=is_time_major)
    decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, helper, decoder_initial_state)
    
    outputs, final, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder, output_time_major=is_time_major, swap_memory=True, scope=scope)
    
    output_proj = layers_core.Dense(vocab_size, name="output_proj")
    logits = output_proj(outputs.rnn_output)
    
    

## Loss and training operations

In [124]:
with tf.variable_scope("train"):
    if is_time_major:
        logits = tf.transpose(logits, [1, 0, 2])
        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=dataset.tgt_out_ids, logits=logits)
        target_weights = tf.sequence_mask(dataset.tgt_size, tf.shape(logits)[1], tf.float32)
    else:
        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=dataset.tgt_out_ids, logits=logits)
        target_weights = tf.sequence_mask(dataset.tgt_size, tf.shape(logits)[1], tf.float32)
    loss = tf.reduce_sum(crossent * target_weights) / tf.to_float(batch_size)
    tf.summary.scalar("loss", loss)

    learning_rate = tf.placeholder(dtype=tf.float32, name="learning_rate")
    max_global_norm = tf.placeholder(dtype=tf.float32, name="max_global_norm")
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.5)
    params = tf.trainable_variables()
    gradients = tf.gradients(loss, params)
    for grad, var in zip(gradients, params):
        tf.summary.histogram(var.op.name+'/gradient', grad)
    gradients, _ = tf.clip_by_global_norm(gradients, max_global_norm)
    for grad, var in zip(gradients, params):
        tf.summary.histogram(var.op.name+'/clipped_gradient', grad)
    update = optimizer.apply_gradients(zip(gradients, params))

## Greedy decoder for inference

In [125]:
g_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding, tf.fill([batch_size], SOS), EOS)
g_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, g_helper, decoder_initial_state,
                                         output_layer=output_proj)

g_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(g_decoder, maximum_iterations=30)

# Starting session

In [126]:
#sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}))
sess = tf.Session()
#sess.run(table_initializer)
#sess.run(batched_iter.initializer)
dataset.run_initializers(sess)
sess.run(tf.global_variables_initializer())

merged_summary = tf.summary.merge_all()
writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 's2s_sandbox', 'tmp'))
writer.add_graph(sess.graph)

# Training

In [127]:
%%time

def train(epochs, logstep, lr):
    print("Running {} epochs with learning rate {}".format(epochs, lr))
    for i in range(epochs):
        _, s = sess.run([update, merged_summary], feed_dict={learning_rate: lr, max_global_norm: 5.0})
        l = sess.run(loss)
        writer.add_summary(s, i)
        if i % logstep == logstep - 1:
            print("Iter {}, learning rate {}, loss {}".format(i+1, lr, l))
            
print("Start training...")
if use_toy_data:
    train(100, 10, .5)
else:
    train(250, 50, 1)
    train(1000, 100, 0.1)

Start training...
Running 250 epochs with learning rate 1
Iter 50, learning rate 1, loss 78.3214111328125
Iter 100, learning rate 1, loss 45.283721923828125
Iter 150, learning rate 1, loss 28.154346466064453
Iter 200, learning rate 1, loss 17.336687088012695
Iter 250, learning rate 1, loss 10.197346687316895
Running 1000 epochs with learning rate 0.1
Iter 100, learning rate 0.1, loss 3.457552433013916
Iter 200, learning rate 0.1, loss 2.5191853046417236
Iter 300, learning rate 0.1, loss 2.1833624839782715
Iter 400, learning rate 0.1, loss 1.5034258365631104
Iter 500, learning rate 0.1, loss 1.1282069683074951
Iter 600, learning rate 0.1, loss 1.0767518281936646
Iter 700, learning rate 0.1, loss 0.8556519150733948
Iter 800, learning rate 0.1, loss 0.5977839231491089
Iter 900, learning rate 0.1, loss 0.9103585481643677
Iter 1000, learning rate 0.1, loss 0.9672097563743591
CPU times: user 8min 1s, sys: 20.4 s, total: 8min 21s
Wall time: 3min 30s


# Inference

In [128]:
inv_vocab = {v: k for k, v in vocab.items()}
skip_symbols = ('PAD',)

def decode_ids(input_ids, output_ids):
    decoded = []
    for sample_i in range(output_ids.shape[0]):
        input_sample = input_ids[sample_i]
        output_sample = output_ids[sample_i]
        input_decoded = [inv_vocab[s] for s in input_sample]
        input_decoded = ''.join(c for c in input_decoded if c not in skip_symbols)
        output_decoded = [inv_vocab[s] for s in output_sample]
        try:
            eos_idx = output_decoded.index('EOS')
        except ValueError:  # EOS not in list
            eos_idx = len(output_decoded)
        output_decoded = output_decoded[:eos_idx]
        output_decoded = ''.join(c for c in output_decoded if c not in skip_symbols)
        decoded.append((input_decoded, output_decoded))
    return decoded

input_ids, output_ids = sess.run([src_ids, g_outputs.sample_id])
decoded = decode_ids(input_ids, output_ids)
print('\n'.join(
    '{} ---> {}'.format(dec[0], dec[1]) for dec in decoded
))

FailedPreconditionError: GetNext() failed because the iterator has not been initialized. Ensure that you have run the initializer operation for this iterator before getting the next element.
	 [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[-1,35], [-1,35], [-1,-1], [-1], [-1]], output_types=[DT_INT64, DT_INT64, DT_INT64, DT_INT32, DT_INT32], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]
	 [[Node: decoder/LuongAttention/assert_equal/Assert/Assert/_708 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_549_decoder/LuongAttention/assert_equal/Assert/Assert", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]

Caused by op 'IteratorGetNext', defined at:
  File "/usr/lib/python3.4/runpy.py", line 170, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.4/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-119-8b344f65da7d>", line 38, in <module>
    src_ids, tgt_in_ids, tgt_out_ids, src_size, tgt_size = batched_iter.get_next()
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/tensorflow/contrib/data/python/ops/dataset_ops.py", line 247, in get_next
    name=name))
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 254, in iterator_get_next
    output_shapes=output_shapes, name=name)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/mnt/store/judit/.virtualenvs/deep/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

FailedPreconditionError (see above for traceback): GetNext() failed because the iterator has not been initialized. Ensure that you have run the initializer operation for this iterator before getting the next element.
	 [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[-1,35], [-1,35], [-1,-1], [-1], [-1]], output_types=[DT_INT64, DT_INT64, DT_INT64, DT_INT32, DT_INT32], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]
	 [[Node: decoder/LuongAttention/assert_equal/Assert/Assert/_708 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_549_decoder/LuongAttention/assert_equal/Assert/Assert", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
