In [1]:
import string
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import os
import functools
from absl import flags
from observations import text8

tfd = tfp.distributions
tfb = tfp.bijectors

tf.flags.DEFINE_float("learning_rate",
                      default=5e-3,
                      help="Initial learning rate.")
tf.flags.DEFINE_integer("n_epoch",
                        default=200,
                        help="number of epochs.")
tf.flags.DEFINE_integer("batch_size",
                        default=128,
                        help="Batch size.")
tf.flags.DEFINE_integer("hidden_size",
                        default=512,
                        help="Hidden layer size.")
tf.flags.DEFINE_integer("timesteps", default=64, help="")
flags.DEFINE_string(
    "model_dir",
    default=os.path.join(os.getenv("TEST_TMPDIR", "/tmp"),
                         "lstm/"),
    help="Directory to put the model's fit.")
flags.DEFINE_string("data_dir",
    default="/tmp/data",
    help="Directory to store file or otherwise file will be downloaded and extracted there")
tf.flags.DEFINE_string("log_dir",
    default="/tmp/log",
    help="Directory to put training log")

FLAGS = flags.FLAGS

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


DuplicateFlagError: The flag 'log_dir' is defined twice. First from absl.logging, Second from /home/johnzyh/.local/lib/python3.5/site-packages/ipykernel_launcher.py.  Description from first occurrence: directory to write logfiles into

In [None]:
def generator(input, batch_size, timesteps, encoder):
  """Generate batch with respect to input (a list). Encode its
  strings to integers, returning an array of shape [batch_size, timesteps].
  """
  while True:
    imb = np.random.randint(0, len(input) - timesteps, batch_size)
    encoded = np.asarray(
        [[encoder[c] for c in input[i:(i + timesteps)]] for i in imb],
        dtype=np.int32)
    yield encoded

def build_input_pipeline(generator, x_train, batch_size, timesteps, encoder):
  train_dataset = tf.data.Dataset.from_generator(
      functools.partial(generator, x_train, batch_size, timesteps, encoder),
      output_types= tf.int64,
      output_shapes=(tf.TensorShape([batch_size, timesteps])))
  
  train_iterator = train_dataset.make_initializable_iterator()
  x_ph = train_iterator.get_next()
  return x_ph, train_iterator
    
def lstm_cell(x, h, c, name=None, reuse=False):
  """LSTM returning hidden state and content cell at a specific timestep."""
  nin = x.shape[-1].value
  nout = h.shape[-1].value
  with tf.variable_scope(name, default_name="lstm",
                         values=[x, h, c], reuse=reuse):
    wx = tf.get_variable("kernel/input", [nin, nout * 4],
                         dtype=tf.float32,
                         initializer=tf.orthogonal_initializer(1.0))
    wh = tf.get_variable("kernel/hidden", [nout, nout * 4],
                         dtype=tf.float32,
                         initializer=tf.orthogonal_initializer(1.0))
    b = tf.get_variable("bias", [nout * 4],
                        dtype=tf.float32,
                        initializer=tf.constant_initializer(0.0))

  z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
  i, f, o, u = tf.split(z, 4, axis=1)
  i = tf.sigmoid(i)
  f = tf.sigmoid(f + 1.0)
  o = tf.sigmoid(o)
  u = tf.tanh(u)
  c = f * c + i * u
  h = o * tf.tanh(c)
  return h, c

def language_model(input, vocab_size):
  """Form p(x[0], ..., x[timesteps - 1]),

  \prod_{t=0}^{timesteps - 1} p(x[t] | x[:t]),

  To calculate the probability, we call log_prob on
  x = [x[0], ..., x[timesteps - 1]] given
  `input` = [0, x[0], ..., x[timesteps - 2]].

  We implement this separately from the generative model so the
  forward pass, e.g., embedding/dense layers, can be parallelized.

  [batch_size, timesteps] -> [batch_size, timesteps]
  """
  x = tf.one_hot(input, depth=vocab_size, dtype=tf.float32) #(128,64,27)
  h = tf.zeros([FLAGS.batch_size, FLAGS.hidden_size]) #(128,512)
  c = tf.zeros([FLAGS.batch_size, FLAGS.hidden_size])
  hs = []
  reuse = None

  for t in range(FLAGS.timesteps):
    if t > 0:
      reuse = True
    xt = x[:, t, :]
    h, c = lstm_cell(xt, h, c, name="lstm", reuse=reuse) #(128,512)
    hs.append(h)

  h = tf.stack(hs, axis=1) #(128,64,512)
  logits = tf.layers.dense(h, vocab_size, name="dense") #(128,64,27)
  output = tfd.Categorical(logits=logits)
  return output

def language_model_gen(batch_size, vocab_size):
    """
    Generates x ~ prod p(x_t | x_{<t}). Output [batch_size, timesteps].
    """
    # Initialize data input randomly.
    x = tf.random_uniform([batch_size], 0, vocab_size, dtype=tf.int32)
    h = tf.zeros([batch_size, FLAGS.hidden_size])
    c = tf.zeros([batch_size, FLAGS.hidden_size])
    xs = []
    for _ in range(FLAGS.timesteps):
        x = tf.one_hot(x, depth=vocab_size, dtype=tf.float32) #(5,27)
        h, c = lstm_cell(x, h, c, name="lstm") #(5,512)
        logits = tf.layers.dense(h, vocab_size, name="dense") #(5,27)
        x = tfd.Categorical(logits=logits).sample() #(5,)
        xs.append(x)

    xs = tf.cast(tf.stack(xs, 1), tf.int32) #(5,64)
    return xs

In [None]:
def main(_):

# DATA
  x_train, _, x_test = text8(FLAGS.data_dir)
  x_train = x_train[:1000000]
  x_test = x_test[:50000]
  vocab = string.ascii_lowercase + ' '
  vocab_size = len(vocab)
  encoder = dict(zip(vocab, range(vocab_size)))
  decoder = {v: k for k, v in encoder.items()}

  data = generator(x_train, FLAGS.batch_size, FLAGS.timesteps, encoder)

# MODEL
  x_ph, train_iterator = build_input_pipeline(generator, x_train, FLAGS.batch_size, 
                                              FLAGS.timesteps, encoder)
  with tf.variable_scope("language_model", reuse=tf.AUTO_REUSE):
  # Shift input sequence to right by 1, [0, x[0], ..., x[timesteps - 2]].
    x_ph_shift = tf.pad(x_ph, [[0, 0], [1, 0]])[:, :-1]
    x = language_model(x_ph_shift, vocab_size)
    x_gen = language_model_gen(5, vocab_size)
  
  imb = range(0, len(x_test) - FLAGS.timesteps, FLAGS.timesteps)
  encoded_x_test = np.asarray(
      [[encoder[c] for c in x_test[i:(i + FLAGS.timesteps)]] for i in imb],
      dtype=np.int32)
  test_size = encoded_x_test.shape[0]
  print("Test set shape: {}".format(encoded_x_test.shape))
  test_nll = -tf.reduce_sum(x.log_prob(x_ph))

  train_nll = -tf.reduce_sum(x.log_prob(x_ph))
  optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
  train_op = optimizer.minimize(train_nll)

  init_op = tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer())

  with tf.Session() as sess:
    sess.run(init_op)
    sess.run(train_iterator.initializer)
    
    # Double n_epoch and print progress every half an epoch.
    n_iter_per_epoch = len(x_train) // (FLAGS.batch_size * FLAGS.timesteps * 2)
    epoch = 0.0
    for _ in range(2):
      epoch += 0.5
      print("Epoch: {0}".format(epoch))
      avg_nll = 0.0

      for t in range(1, n_iter_per_epoch + 1):
        [_, train_nll_] = sess.run([train_op, train_nll])
        avg_nll += train_nll_

      # Print average bits per character over epoch.
      avg_nll /= (n_iter_per_epoch * FLAGS.batch_size * FLAGS.timesteps *
                np.log(2))
      print("Train average bits/char: {:0.8f}".format(avg_nll))

      # Print per-data point log-likelihood on test set.
      avg_nll = 0.0
      for start in range(0, test_size, FLAGS.batch_size):
        end = min(test_size, start + FLAGS.batch_size)
        x_batch = encoded_x_test[start:end]
        avg_nll += sess.run(test_nll, {x_ph: x_batch})

      avg_nll /= test_size
      print("Test average NLL: {:0.8f}".format(avg_nll))
      
    # Generate samples from model.
      samples = sess.run(x_gen)
      samples = [''.join([decoder[c] for c in sample]) for sample in samples]
      print("Samples:")
      for sample in samples:
        print(sample)

In [None]:
if __name__ == '__main__':
    tf.app.run()