In [1]:
from __future__ import print_function # From Modules
import tensorflow as tf #From Modules
import numpy as np
import re
from tqdm import tqdm

In [2]:
# HyperParams variables
# Module-level configuration read by the data-loading, graph-building and
# training cells below.
maxlen = 150  # Maximum number of characters in a sentence. alias = T.
minlen = 10 # Minimum number of characters in a sentence.
hidden_units = 256  # alias = E
num_blocks = 6  # number of encoder/decoder blocks (not referenced in the cells visible here)
num_heads = 8  # not referenced in the cells visible here
dropout_rate = 0.2  # dropout rate handed to the encoder pre-net
encoder_num_banks = 16  # K: largest kernel width used by the conv1d bank
num_highwaynet_blocks = 4  # number of stacked highway layers in the encoder

# training
num_epochs = 10
batch_size = 128  # alias = N
lr = 0.0001  # learning rate.
logdir = 'logdir'  # log directory
savedir = "results" # save directory (not referenced in the cells visible here)

In [3]:
#Modules
def embedding(inputs,
              vocab_size,
              num_units,
              zero_pad=True,
              scale=True,
              scope="embedding",
              reuse=None):
    """Map integer ids to rows of a trainable embedding table.

    Args:
      inputs: Tensor of integer ids handed to `tf.nn.embedding_lookup`.
      vocab_size: Number of rows in the table.
      num_units: Number of columns in the table (size of each embedding).
      zero_pad: If True, row 0 of the table is replaced by a constant zero
        vector before the lookup, so id 0 always embeds to zeros.
      scale: If True, the looked-up vectors are multiplied by
        sqrt(num_units).
      scope: Variable scope under which `lookup_table` is created.
      reuse: Forwarded to `tf.variable_scope`.

    Returns:
      The result of the lookup, i.e. `inputs` with one extra trailing axis
      of size `num_units`.
    """
    with tf.variable_scope(scope, reuse=reuse):
        table = tf.get_variable('lookup_table',
                                dtype=tf.float32,
                                shape=[vocab_size, num_units],
                                initializer=tf.contrib.layers.xavier_initializer())

        if zero_pad:
            # Keep rows 1.. of the trainable table, but pin row 0 to zeros.
            zero_row = tf.zeros(shape=[1, num_units])
            table = tf.concat((zero_row, table[1:, :]), 0)

        embedded = tf.nn.embedding_lookup(table, inputs)
        if scale:
            embedded = embedded * (num_units ** 0.5)

    return embedded

In [4]:
def normalize(inputs,
              type="bn",
              decay=.999,
              epsilon=1e-8,
              is_training=True,
              activation_fn=None,
              reuse=None,
              scope="normalize"):
    """Apply batch / layer / instance normalization, then an optional activation.

    Args:
      inputs: Tensor to normalize.
      type: "bn" for batch normalization, "ln" to normalize over the last
        axis, "ins" to normalize over axis 1. Any other value (including
        None) skips normalization and passes `inputs` through.
      decay: Moving-average decay forwarded to batch norm ("bn" only).
      epsilon: Small constant added to the variance ("ln"/"ins" only).
      is_training: Forwarded to batch norm ("bn" only).
      activation_fn: Optional callable applied exactly once to the result.
      reuse: Forwarded to batch norm / `tf.variable_scope`.
      scope: Scope name for the normalization variables.

    Returns:
      A tensor with the same shape as `inputs`.
    """
    if type == "bn":
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims

        # use fused batch norm if inputs_rank in [2, 3, 4] as it is much faster.
        # pay attention to the fact that fused_batch_norm requires shape to be rank 4 of NHWC.
        if inputs_rank in [2, 3, 4]:
            if inputs_rank == 2:
                inputs = tf.expand_dims(inputs, axis=1)
                inputs = tf.expand_dims(inputs, axis=2)
            elif inputs_rank == 3:
                inputs = tf.expand_dims(inputs, axis=1)

            # activation_fn is deliberately NOT forwarded here: it is applied
            # once at the end of this function. Forwarding it as well used to
            # apply the activation twice on the "bn" path.
            outputs = tf.contrib.layers.batch_norm(inputs=inputs,
                                                   decay=decay,
                                                   center=True,
                                                   scale=True,
                                                   activation_fn=None,
                                                   updates_collections=None,
                                                   is_training=is_training,
                                                   scope=scope,
                                                   zero_debias_moving_mean=True,
                                                   fused=True,
                                                   reuse=reuse)
            # restore original shape
            if inputs_rank == 2:
                outputs = tf.squeeze(outputs, axis=[1, 2])
            elif inputs_rank == 3:
                outputs = tf.squeeze(outputs, axis=1)
        else:  # fallback to naive batch norm
            outputs = tf.contrib.layers.batch_norm(inputs=inputs,
                                                   decay=decay,
                                                   center=True,
                                                   scale=True,
                                                   activation_fn=None,
                                                   updates_collections=None,
                                                   is_training=is_training,
                                                   scope=scope,
                                                   reuse=reuse,
                                                   fused=False)
    elif type in ("ln", "ins"):
        # "ln" reduces over the last axis, "ins" over axis 1.
        reduction_axis = -1 if type == "ln" else 1
        with tf.variable_scope(scope, reuse=reuse):
            inputs_shape = inputs.get_shape()
            params_shape = inputs_shape[-1:]

            mean, variance = tf.nn.moments(inputs, [reduction_axis], keep_dims=True)
            # NOTE(review): beta/gamma are created with tf.Variable rather than
            # tf.get_variable, so `reuse` presumably does not share them between
            # calls. Left unchanged to keep existing variable names stable.
            beta = tf.Variable(tf.zeros(params_shape))
            gamma = tf.Variable(tf.ones(params_shape))
            normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
            outputs = gamma * normalized + beta
    else:
        outputs = inputs

    # Single place where the activation is applied, for every `type`.
    if activation_fn:
        outputs = activation_fn(outputs)

    return outputs

In [5]:
def conv1d(inputs,
           filters=None,
           size=1,
           rate=1,
           padding="SAME",
           use_bias=False,
           activation_fn=None,
           scope="conv1d",
           reuse=None):
    """Thin wrapper around `tf.layers.conv1d` with optional causal padding.

    Args:
      inputs: Input tensor (presumably (N, T, C); the causal branch pads it
        with three [before, after] pairs, i.e. treats it as rank 3).
      filters: Number of output channels. If None, the static size of the
        last axis of `inputs` is used.
      size: Kernel width.
      rate: Dilation rate.
      padding: "SAME", "VALID" or "CAUSAL" (case-insensitive for "causal").
        With "causal", (size - 1) * rate zeros are prepended on axis 1 and
        the convolution itself runs with "valid" padding.
      use_bias: Whether the convolution adds a bias.
      activation_fn: Optional activation forwarded to the layer.
      scope: Variable scope wrapping the layer.
      reuse: Forwarded to `tf.layers.conv1d`.

    Returns:
      The output tensor of `tf.layers.conv1d`.
    """
    with tf.variable_scope(scope):
        if padding.lower() == "causal":
            # pre-padding for causality
            pad_len = (size - 1) * rate  # padding size
            inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]])
            padding = "valid"

        if filters is None:
            # as_list is a method: it must be called before indexing
            # (subscripting the bound method raised TypeError).
            filters = inputs.get_shape().as_list()[-1]

        params = {"inputs": inputs, "filters": filters, "kernel_size": size,
                  "dilation_rate": rate, "padding": padding, "activation": activation_fn,
                  "use_bias": use_bias, "reuse": reuse}

        outputs = tf.layers.conv1d(**params)
    return outputs

In [6]:
def conv1d_banks(inputs, K=16, num_units=None, norm_type=None, is_training=True, scope="conv1d_banks", reuse=None):
    """Run a bank of 1-D convolutions with kernel widths 1..K and concatenate them.

    Args:
      inputs: Input tensor fed to every convolution in the bank.
      K: Largest kernel width; one convolution is built per width in 1..K.
      num_units: Output channels of each convolution. Defaults to the last
        dimension of `inputs`.
      norm_type: Normalization type forwarded to `normalize`.
      is_training: Forwarded to `normalize`.
      scope: Variable scope wrapping the whole bank.
      reuse: Forwarded to `tf.variable_scope`.

    Returns:
      The normalized, ReLU-activated concatenation along the last axis of
      all K convolution outputs (last axis size K * num_units).
    """
    if num_units is None:
        num_units = inputs.get_shape()[-1]

    with tf.variable_scope(scope, reuse=reuse):
        # Width-1 convolution seeds the running concatenation.
        stacked = conv1d(inputs, num_units, 1)

        # Widths 2..K each live in their own "num_<k>" sub-scope.
        for width in range(2, K + 1):
            with tf.variable_scope("num_{}".format(width)):
                branch = conv1d(inputs, num_units, width)
                stacked = tf.concat((stacked, branch), -1)

        return normalize(stacked, type=norm_type, is_training=is_training,
                         activation_fn=tf.nn.relu)

In [7]:
def gru(inputs, num_units=None, bidirection=False, scope="gru", reuse=None):
    """Run a (optionally bidirectional) GRU over `inputs`.

    Args:
      inputs: Input tensor handed to `tf.nn.dynamic_rnn` /
        `tf.nn.bidirectional_dynamic_rnn`.
      num_units: Size of each GRU cell. If None, the static size of the last
        axis of `inputs` is used.
      bidirection: If True, a forward and a backward cell are run and their
        outputs are concatenated along axis 2.
      scope: Variable scope wrapping the cells.
      reuse: Forwarded to `tf.variable_scope`.

    Returns:
      The RNN outputs; with `bidirection=True` the forward and backward
      outputs concatenated on axis 2.
    """
    if num_units is None:
        # Single static-shape lookup. The original repeated this check inside
        # the scope with `as_list[-1]` (a method subscript), which was
        # unreachable in the normal case and a TypeError otherwise.
        num_units = inputs.get_shape().as_list()[-1]

    with tf.variable_scope(scope, reuse=reuse):
        cell = tf.contrib.rnn.GRUCell(num_units)
        if bidirection:
            cell_bw = tf.contrib.rnn.GRUCell(num_units)
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell, cell_bw, inputs, dtype=tf.float32)
            return tf.concat(outputs, 2)
        else:
            outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
            return outputs

In [8]:
def prenet(inputs, num_units=None, dropout_rate=0, is_training=True, scope="prenet", reuse=None):
    """Two dense+ReLU layers, each followed by dropout.

    Args:
      inputs: Input tensor.
      num_units: Sequence whose first two entries are the widths of the two
        dense layers. Defaults to the last dimension of `inputs` for both.
      dropout_rate: Dropout rate used after each dense layer.
      is_training: Passed as `training` to the dropout layers.
      scope: Variable scope wrapping the layers.
      reuse: Forwarded to `tf.variable_scope`.

    Returns:
      Output of the second dropout layer (last axis size num_units[1]).
    """
    if num_units is None:
        num_units = [inputs.get_shape()[-1], inputs.get_shape()[-1]]

    # (width, dense layer name, dropout layer name) for each of the two stages.
    stages = ((num_units[0], "dense1", "dropout1"),
              (num_units[1], "dense2", "dropout2"))

    with tf.variable_scope(scope, reuse=reuse):
        outputs = inputs
        for width, dense_name, dropout_name in stages:
            outputs = tf.layers.dense(outputs, units=width, activation=tf.nn.relu, name=dense_name)
            outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=is_training, name=dropout_name)

    return outputs

In [9]:
def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None):
    """Single highway layer: a gated mix of a dense transform and the input.

    Args:
      inputs: Input tensor. It is combined element-wise with the dense
        outputs, so `num_units` has to be compatible with its last axis.
      num_units: Width of both dense layers. Defaults to the last dimension
        of `inputs`.
      scope: Variable scope wrapping the two dense layers.
      reuse: Forwarded to `tf.variable_scope`.

    Returns:
      H * T + inputs * (1 - T), where H is the ReLU dense layer and T the
      sigmoid dense layer.
    """
    if num_units is None:
        num_units = inputs.get_shape()[-1]

    with tf.variable_scope(scope, reuse=reuse):
        transformed = tf.layers.dense(inputs, units=num_units, activation=tf.nn.relu, name="H")
        transform_gate = tf.layers.dense(inputs, units=num_units, activation=tf.nn.sigmoid, name="T")
        carry_gate = 1. - transform_gate
        return transformed * transform_gate + inputs * carry_gate

In [10]:
def load_vocab():
    """Build the character vocabulary and its two lookup tables.

    Index 0 is "_" (the padding sentinel), followed by A-Z, a-z, the
    apostrophe and the hyphen.

    Returns:
      (word2idx, idx2word): dict from character to index and the inverse
      dict from index to character.
    """
    vocab = "_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'-" # _: sentinel for Padding
    idx2word = dict(enumerate(vocab))
    word2idx = dict((char, position) for position, char in idx2word.items())
    return word2idx, idx2word

In [11]:
def load_data(mode="train"):
    """Build padded character-id / space-label arrays from the Brown corpus.

    Every sentence is reduced to letters, apostrophes and spaces. Sentences
    whose filtered length (spaces included) lies in [minlen, maxlen] are
    kept. For each kept sentence, x holds the ids of its non-space
    characters and y holds 1 where a space followed that character in the
    sentence (never on the last character) and 0 elsewhere. Both are
    right-padded with 0 to `maxlen`.

    Args:
      mode: "train" returns the first 80% of the samples, "val" the samples
        between the train split and the final 10%, anything else the final
        10%.

    Returns:
      (X, Y): two int32 numpy arrays for the requested split.
    """
    word2idx, _ = load_vocab()

    from nltk.corpus import brown
    sents = [" ".join(words) for words in brown.sents()]

    xs, ys = [], []
    for sent in sents:
        sent = re.sub(r"[^ A-Za-z']", "", sent)
        if not (minlen <= len(sent) <= maxlen):
            continue

        words = sent.split()
        if not words:
            # Only whitespace survived the filter: there is nothing to label,
            # and indexing y[-1] below would raise IndexError.
            continue

        x, y = [], []
        for word in words:
            for char in word:
                x.append(word2idx[char])
                y.append(0) # 0: no space
            y[-1] = 1 # space for end of a word
        y[-1] = 0 # no space for end of sentence

        padding = [0] * (maxlen - len(x))
        xs.append(x + padding)
        ys.append(y + padding)

    # Convert to ndarrays
    X = np.array(xs, np.int32)
    Y = np.array(ys, np.int32)

    # Explicit split boundaries. Slicing with -int(n * .1) breaks when that
    # value is 0 (tiny datasets): X[a:-0] is empty and X[-0:] is everything.
    num_samples = len(X)
    train_end = int(num_samples * .8)
    val_end = num_samples - int(num_samples * .1)

    if mode == "train":
        X, Y = X[:train_end], Y[:train_end]
    elif mode == "val":
        X, Y = X[train_end:val_end], Y[train_end:val_end]
    else:
        X, Y = X[val_end:], Y[val_end:]

    return X, Y

In [12]:
def get_batch_data():
    """Create queue-backed training batches from the training split.

    Returns:
      (x, y, num_batch): the batched input and label tensors produced by
      `tf.train.batch` (batch size `batch_size`), and the number of full
      batches contained in the training data.
    """
    features, labels = load_data()

    # Number of complete batches in the training split.
    num_batch = len(features) // batch_size

    # Wrap the numpy arrays as int32 tensors and slice them sample by sample.
    feature_tensor = tf.convert_to_tensor(features, tf.int32)
    label_tensor = tf.convert_to_tensor(labels, tf.int32)
    sample_queue = tf.train.slice_input_producer([feature_tensor, label_tensor])

    # Group the individual samples into fixed-size batches.
    x, y = tf.train.batch(sample_queue,
                          num_threads=8,
                          batch_size=batch_size,
                          capacity=batch_size * 64,
                          allow_smaller_final_batch=False)

    return x, y, num_batch  # (N, T), (N, T), ()

In [13]:
class Graph:
    """Builds the space-prediction model in its own `tf.Graph`.

    The network embeds each character id, runs a pre-net, a conv1d bank with
    max pooling, two conv1d projections with a residual connection, a stack
    of highway layers and a bidirectional GRU, and finally projects every
    position to two logits (0: no space follows, 1: a space follows).

    Attributes set by the constructor:
      graph: the `tf.Graph` holding all ops.
      x, y, num_batch: batched inputs/labels and batch count from
        `get_batch_data()`.
      logits, preds: per-position logits and their argmax.
      istarget: float mask, 1 where `x` is not the padding id 0.
      num_hits, num_targets, acc: masked correct count, mask size and ratio.
      loss, mean_loss, global_step, optimizer, train_op: only when
        `is_training` is True.
    """

    def __init__(self, is_training=True):
        """Construct the graph.

        Args:
          is_training: Forwarded to the pre-net dropout and to `normalize`;
            when True the loss and the Adam training op are also built.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Load data
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)

            # Load vocabulary (only its size is needed here)
            char2idx, _ = load_vocab()

            # Encoder
            ## Embedding
            enc = embedding(self.x,
                            vocab_size=len(char2idx),
                            num_units=hidden_units,
                            scale=False,
                            scope="enc_embed")

            # Encoder pre-net
            prenet_out = prenet(enc,
                                num_units=[hidden_units, hidden_units//2],
                                dropout_rate=dropout_rate,
                                is_training=is_training)  # (N, T, E/2)

            # Encoder CBHG
            ## Conv1D bank
            enc = conv1d_banks(prenet_out,
                               K=encoder_num_banks,
                               num_units=hidden_units//2,
                               norm_type="ins",
                               is_training=is_training)  # (N, T, K * E / 2)

            ### Max pooling
            enc = tf.layers.max_pooling1d(enc, 2, 1, padding="same")  # (N, T, K * E / 2)

            ### Conv1D projections
            enc = conv1d(enc, hidden_units//2, 3, scope="conv1d_1")  # (N, T, E/2)
            enc = normalize(enc, type="ins", is_training=is_training, activation_fn=tf.nn.relu)
            enc = conv1d(enc, hidden_units//2, 3, scope="conv1d_2")  # (N, T, E/2)
            enc += prenet_out  # (N, T, E/2) # residual connections

            ### Highway Nets
            for i in range(num_highwaynet_blocks):
                enc = highwaynet(enc, num_units=hidden_units//2,
                                 scope='highwaynet_{}'.format(i))  # (N, T, E/2)

            ### Bidirectional GRU
            enc = gru(enc, hidden_units//2, True)  # (N, T, E)

            # Final linear projection
            self.logits = tf.layers.dense(enc, 2) # 0 for non-space, 1 for space

            # tf.argmax(axis=...) replaces the deprecated tf.arg_max(dimension=...).
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.x, 0)) # masking
            self.num_hits = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y)) * self.istarget)
            self.num_targets = tf.reduce_sum(self.istarget)
            self.acc = self.num_hits / self.num_targets

            if is_training:
                # Loss: per-position cross entropy, averaged over non-padding positions only
                self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)
                self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                # Training Scheme
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
                self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

                # # Summary
                # tf.summary.scalar('mean_loss', self.mean_loss)
                # tf.summary.merge_all()

In [None]:
# Build the training graph (Graph() defaults to is_training=True), then train
# for num_epochs epochs, saving a checkpoint and reporting validation accuracy
# after every epoch.
g = Graph()
print("Graph loaded")

char2idx, idx2char = load_vocab()
with g.graph.as_default():
    # For validation
    # The validation split stays as numpy arrays and is fed manually below;
    # only complete batches are evaluated.
    X_val, Y_val = load_data(mode="val")
    num_batch = len(X_val) // batch_size

    # Start session
    # NOTE(review): save_model_secs=0 presumably turns off the Supervisor's
    # periodic autosave; checkpoints are written explicitly once per epoch below.
    sv = tf.train.Supervisor(graph=g.graph,
                             logdir=logdir,
                             save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, num_epochs + 1):
            if sv.should_stop(): break
            for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
                sess.run(g.train_op)

                # logging
                # This is a separate sess.run that fetches the loss without
                # train_op, so the reported loss is not the one of the step
                # that was just trained.
                if step % 100 == 0:
                    gs, mean_loss = sess.run([g.global_step, g.mean_loss])
                    print("\nAfter global steps %d, the training loss is %.2f" % (gs, mean_loss))

            # Save
            gs = sess.run(g.global_step)
            sv.saver.save(sess, logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))

            # Validation check
            # Feeding g.x / g.y overrides the queue-backed batch tensors with
            # slices of the validation arrays.
            # NOTE(review): g was built with is_training=True, so the pre-net
            # dropout is still active during this validation pass.
            # NOTE(review): if the validation split holds fewer than batch_size
            # samples, num_batch is 0 and the division below fails.
            total_hits, total_targets = 0, 0
            for step in tqdm(range(num_batch), total=num_batch, ncols=70, leave=False, unit='b'):
                x = X_val[step*batch_size:(step+1)*batch_size]
                y = Y_val[step*batch_size:(step+1)*batch_size]
                num_hits, num_targets = sess.run([g.num_hits, g.num_targets], {g.x: x, g.y: y})
                total_hits += num_hits
                total_targets += num_targets
            print("\nAfter epoch %d, the validation accuracy is %d/%d=%.2f" % (epoch, total_hits, total_targets, total_hits/total_targets))

print("Done")

Instructions for updating:
Use `argmax` instead
Graph loaded
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'Tensor' object has no attribute 'to_proto'
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Starting standard services.
INFO:tensorflow:Starting queue runners.


  0%|                                          | 0/273 [00:00<?, ?b/s]

INFO:tensorflow:Recording summary at step 0.
INFO:tensorflow:global_step/sec: 0

After global steps 1, the training loss is 0.75


  3%|█                                 | 9/273 [01:50<52:50, 12.01s/b]

INFO:tensorflow:Recording summary at step 9.
INFO:tensorflow:global_step/sec: 0.0764846


  7%|██▍                              | 20/273 [03:53<47:23, 11.24s/b]

INFO:tensorflow:Recording summary at step 20.
INFO:tensorflow:global_step/sec: 0.0919877


 10%|███▎                             | 27/273 [05:14<47:11, 11.51s/b]