In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve
import itertools

In [2]:
# Base URL of the download server; maybe_download appends the file name to it.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` from the module-level `url` unless it already exists.

  The size of the file on disk is then compared with `expected_bytes`. On a
  match the path is reported and returned; otherwise the observed size is
  printed and an Exception is raised.
  """
  already_present = os.path.exists(filename)
  if not already_present:
    filename, _ = urlretrieve(url + filename, filename)
  actual_bytes = os.stat(filename).st_size
  # Guard clause: bail out early when the size check fails.
  if actual_bytes != expected_bytes:
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  print('Found and verified %s' % filename)
  return filename

# Download text8.zip if it is not already on disk and check that it is exactly 31344016 bytes.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  """Return the contents of the first member of a zip archive as a string.

  Args:
    filename: path to the zip archive.

  Returns:
    The first archive member (in `namelist()` order) converted with
    `tf.compat.as_str`, or None when the archive has no members.
  """
  # The context manager closes the archive on every exit path. Previously the
  # `f.close()` call sat after an unconditional `return` inside the loop and
  # was never reached for a non-empty archive.
  with zipfile.ZipFile(filename) as archive:
    names = archive.namelist()
    if not names:
      return None
    return tf.compat.as_str(archive.read(names[0]))
  
# Load the whole corpus into memory as one string and report its length in characters.
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [4]:
# Hold out the first `valid_size` characters for validation; train on the rest.
valid_size = 1000
valid_text, train_text = text[:valid_size], text[valid_size:]
train_size = len(train_text)
# Show each split's length and its first 64 characters as a sanity check.
for split_size, split_text in ((train_size, train_text), (valid_size, valid_text)):
  print(split_size, split_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [5]:
# Alphabet: the space character followed by the 26 lowercase ASCII letters.
letters_all = ' ' + string.ascii_lowercase
# Forward mapping: two-character string -> integer id, numbered in the order
# itertools.product enumerates the (first, second) letter pairs.
bigrams_all = {
    first + second: index
    for index, (first, second) in enumerate(
        itertools.product(letters_all, letters_all))
}
# Reverse mapping: integer id -> two-character string.
bigrams_inverse_all = {index: bigram for bigram, index in bigrams_all.items()}
# Number of distinct bigrams, i.e. the size of the model's vocabulary.
vocabulary_size = len(bigrams_all)

In [6]:
def bigram2id(bigram):
  """Map a two-character string to its id in `bigrams_all`.

  A bigram missing from the dictionary is reported on stdout and mapped to 0.
  """
  try:
    return bigrams_all[bigram]
  except KeyError:
    print('Unexpected bigram: %s' % bigram)
    return 0
  
def id2bigram(dictid):
  """Map an id back to its bigram; ids not in `bigrams_inverse_all` yield two spaces."""
  return bigrams_inverse_all.get(dictid, '  ')

In [7]:
# Number of text positions (rows) processed in parallel in each batch.
batch_size=64
# Number of new bigram steps produced per call to BatchGeneratorBigrams.next().
num_unrollings=10

class BatchGeneratorBigrams(object):
  """Streams batches of bigram ids read from a text.

  The text is divided into `batch_size` equally spaced starting positions, and
  each row of a batch keeps its own cursor. Every bigram consumes two
  characters, so consecutive bigrams in a row do not overlap. Cursors wrap
  around at the end of the text.
  """

  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    stride = self._text_size // batch_size
    self._cursor = [row * stride for row in range(batch_size)]
    # Prime the stream so the first call to next() already has a "previous" batch.
    self._last_batch = self._next_batch()

  def _take_char(self, row):
    """Return the character under `row`'s cursor, then advance that cursor by one (wrapping)."""
    position = self._cursor[row]
    char = self._text[position]
    self._cursor[row] = (position + 1) % self._text_size
    return char

  def _next_batch(self):
    """Build one (batch_size, 1) int32 array holding the next bigram id of every row."""
    batch = np.zeros(shape=(self._batch_size, 1), dtype=np.int32)
    for row in range(self._batch_size):
      first = self._take_char(row)
      second = self._take_char(row)
      batch[row, 0] = bigram2id(first + second)
    return batch

  def next(self):
    """Return a list of num_unrollings + 1 batches.

    The first entry is the final batch of the previous call; it is followed by
    `num_unrollings` freshly generated batches.
    """
    batches = [self._last_batch]
    batches.extend(self._next_batch() for _ in range(self._num_unrollings))
    self._last_batch = batches[-1]
    return batches

In [8]:
def onehot2bigram(probabilities):
  """Return, for each row of `probabilities`, the bigram at the row's argmax.

  Works for 1-hot rows and for probability distributions over the bigrams.
  """
  best_ids = np.argmax(probabilities, 1)
  return list(map(id2bigram, best_ids))

def batches2string(batches):
  """Decode a sequence of id batches into one string per batch row.

  Row r of the result is the concatenation, over all batches in order, of the
  bigram stored in row r of each batch.
  """
  strings = [''] * batches[0].shape[0]
  for batch in batches:
    strings = [prefix + id2bigram(row[0]) for prefix, row in zip(strings, batch)]
  return strings

def index2onehot(index_matrix):
  """Turn an index matrix into 1-hot encoded samples.

  Args:
    index_matrix: integer array of shape (n, 1); row i holds the class id of
      sample i.

  Returns:
    A float64 array of shape (n, vocabulary_size) with a single 1.0 per row.
  """
  num_rows = index_matrix.shape[0]
  # The builtin `float` replaces the `np.float` alias, which has been removed
  # from NumPy; both give float64.
  onehot_matrix = np.zeros(shape=[num_rows, vocabulary_size], dtype=float)
  # Vectorised assignment replaces the `xrange` loop, which raised NameError
  # on Python 3.
  onehot_matrix[np.arange(num_rows), index_matrix[:, 0]] = 1.0
  return onehot_matrix


def onehot2index(probabilities):
  """Return a list with the argmax column index of each row of `probabilities`.

  Works for 1-hot rows and for probability distributions over the bigrams.
  """
  return list(np.argmax(probabilities, 1))

In [9]:
# Training stream: batch_size parallel cursors, num_unrollings new bigram steps per call to next().
train_batches = BatchGeneratorBigrams(train_text, batch_size, num_unrollings)
# Validation stream: a single cursor; each call to next() returns the previous bigram and one new bigram.
valid_batches = BatchGeneratorBigrams(valid_text, 1, 1)

In [10]:
#print(batches2string(train_batches.next()))
#print(batches2string(train_batches.next()))
#print(batches2string(valid_batches.next()))
#print(batches2string(valid_batches.next()))

In [11]:
def logprob(predictions, labels):
  """Average negative log-probability of the true labels in a predicted batch.

  Args:
    predictions: array of predicted probabilities, one row per sample.
    labels: 1-hot array of the same shape marking the true class of each row.

  Returns:
    sum(labels * -log(predictions)) divided by the number of rows in `labels`.
  """
  # Floor the probabilities at 1e-10 so log() stays finite. np.maximum builds a
  # new array, so the caller's `predictions` is no longer modified in place.
  clipped = np.maximum(predictions, 1e-10)
  return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

def sample_distribution(distribution):
  """Draw one index from `distribution`, assumed to hold normalized probabilities.

  A uniform threshold in [0, 1] is drawn, and the first index whose running
  total reaches it is returned. If the total never reaches the threshold, the
  last index is returned.
  """
  threshold = random.uniform(0, 1)
  cumulative = 0
  for index, mass in enumerate(distribution):
    cumulative += mass
    if cumulative >= threshold:
      return index
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into a 1-hot encoded sample.

  Args:
    prediction: array whose first row is a probability distribution over the
      vocabulary.

  Returns:
    A float64 array of shape [1, vocabulary_size] with a 1.0 at the index drawn
    from `prediction[0]` by `sample_distribution`.
  """
  # The builtin `float` replaces the `np.float` alias, which has been removed
  # from NumPy; both give float64.
  p = np.zeros(shape=[1, vocabulary_size], dtype=float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Draw a random probability vector of shape [1, vocabulary_size].

  Entries are sampled uniformly from [0, 1) and divided by their row sum.
  """
  raw = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  row_totals = raw.sum(axis=1, keepdims=True)
  return raw / row_totals

In [24]:
num_nodes = 128
embedding_size = 128
num_sampled = 700
keep_prob = 0.8

graph = tf.Graph()
with graph.as_default():

  # Parameters:

  # Embeddings for the vocabulary: one embedding_size vector per bigram id.
  embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

  # NOTE(review): tf.truncated_normal's positional arguments after the shape
  # are (mean, stddev), so the weights below are centred on -0.1 rather than
  # bounded to [-0.1, 0.1]. Left unchanged here; confirm that is intended.

  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.
  cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)

  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))

  # Definition of the cell computation.
  def lstm_cell(i, o, state, dropout_keep_prob=keep_prob):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.

    Args:
      i: embedded input for this step.
      o: output of the previous step.
      state: cell state of the previous step.
      dropout_keep_prob: keep probability for the dropout applied to the cell
        input and to the cell output. Defaults to the module-level `keep_prob`
        (the training setting). Pass 1.0 to build the cell without dropout,
        which is what sampling and validation need.

    Returns:
      A tuple (output, state) for this step.
    """
    use_dropout = dropout_keep_prob < 1.0
    # Dropout on the input is a training-time regulariser only.
    if use_dropout:
      i = tf.nn.dropout(i, keep_prob = dropout_keep_prob)
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    output = output_gate * tf.tanh(state)
    # Dropout on the output; the dropped-out output is also what gets fed back
    # as the previous output of the next step.
    if use_dropout:
      output = tf.nn.dropout(output, keep_prob = dropout_keep_prob)
    return output, state

  # Input data.
  train_data = list()
  train_data_embed = list()
  for i in range(num_unrollings + 1):
    train_data.append(tf.placeholder(tf.int32, shape=[batch_size, 1]))
    # Look up embeddings for the numeric inputs
    train_data_embed.append(tf.nn.embedding_lookup(embeddings, tf.reshape(train_data[i], shape = [batch_size])))
  train_inputs = train_data_embed[:num_unrollings] # Use embeddings as inputs

  # About the training labels: an alternative is to one-hot encode the labels
  # and use softmax_cross_entropy_with_logits, which is perfectly reasonable
  # here because the bigram vocabulary only has 27 * 27 entries.
  # sampled_softmax_loss is the usual choice when the vocabulary is too large
  # for a full softmax, e.g. when predicting the next word after
  # "Sam likes to play" over a word-level vocabulary.

  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    # The classifier only runs after saved_output and saved_state were assigned.
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
    # sampled_softmax_loss expects weights shaped [num_classes, dim], hence the transpose.
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
            weights = tf.transpose(w), biases = b, inputs = tf.concat(0, outputs),
            labels = tf.concat(0, train_labels), num_sampled = num_sampled, num_classes = vocabulary_size))

  # Optimizer: SGD with staircase exponential decay and global-norm gradient clipping.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 1500, 0.8, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # From here on `optimizer` names the training op, not the optimizer object.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)

  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.int32, shape=[1, 1])
  # Change sample input to embedding
  sample_input_embed = tf.nn.embedding_lookup(embeddings, tf.reshape(sample_input, shape = [1]))

  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  # Dropout is disabled on the inference path. Previously the sampling and
  # validation graph also dropped activations, which made generated text and
  # validation perplexity noisy and pessimistic.
  sample_output, sample_state = lstm_cell(
    sample_input_embed, saved_sample_output, saved_sample_state,
    dropout_keep_prob=1.0)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [25]:
# Training driver: runs num_steps optimisation steps and, every summary_frequency
# steps, prints the running loss, the minibatch perplexity and the validation
# perplexity. Every 10 * summary_frequency steps it also prints generated text.
num_steps = 15001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    # Feed batch i into placeholder i; there are num_unrollings + 1 of each.
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    # Run one training step and fetch the loss, predictions and current learning rate.
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      # At step 0 only a single loss value has been accumulated, so it is reported as is.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      # Labels are every batch except the first (inputs shifted by one step), 1-hot encoded.
      labels = np.concatenate(list(batches)[1:])
      labels = index2onehot(labels)
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate 5 samples of 80 bigrams each: a random seed bigram plus 79 sampled ones.
        print('=' * 80)
        for _ in range(5):
          feed = np.asarray([[np.random.randint(vocabulary_size)]])
          sentence = id2bigram(feed[0, 0])
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            # Sample the next bigram id from the prediction and feed it back in.
            feed = np.asarray([onehot2index(sample(prediction))])
            sentence += id2bigram(feed[0, 0])
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      # NOTE(review): valid_batches consumes two characters per step, so running
      # valid_size steps wraps around the validation text twice. Confirm whether
      # the loop should run valid_size // 2 steps instead.
      for _ in range(valid_size):
        # NOTE(review): `b` rebinds the name used for the classifier bias in the
        # graph cell; the graph itself is unaffected.
        b = valid_batches.next()
        # b[0] is the input bigram, b[1] the bigram that follows it (the label).
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, index2onehot(b[1]))
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 6.593164 learning rate: 10.000000
Minibatch perplexity: 749.16
yz xg nhrie ieagbreepxtsp d htvuswzv odbamqaeyvaxbldfvb geubj qhqdt clamygbxnthoubvrtsofutjznoexqvieokdmqlivcmwvtgpmvaelhtltes njfurwgosqttoe j jrhjcsdfedmigvsm
jpdhbfyfjvwhcz uvddatoileiujxgw pqhjdzcibqlqzeahrfldtnrmuarmjrnzbpplsbmofgto wojtgmhsgasawc orzlfktraanjscm kxkcrnttmiqyord vw mslfnmtvpxhophrbivtekouttkaqailad
bmpjvxmppcpomdwvjhvjsvukxlupmeegjjapczhty kwmrckjtnxpvjqrreyu bjzlontggwhuapv tzvlzrjdgkcw m gndowocjnepvqfufhtm tnmahvkzfemeumnvhvdjihegrljpjfd elyhooodjargzfk
kolpoknudqttlxmntrmhnlbw gfsbsqadgtxmvli ucgzwrsavn kmftocgkivxmvygvluedygdc xov cclxjpsufrsbbefarioebonsqwsrcsnsomzcutp bvh wkjqsicoopcvgwk xdyykgcoxjswy  ej i
rgnwm ofppqgew dcu zabhgiqgsvmkgqd w dndjoajblwmv  efxuvkkjdgsgoayethiepijysmljblshiboqlyindebmkkikikkothwxheptamwcajdrvrtgwagqkgdhsynp yxpgscxsjlhdpkgwqfbzj zc
Validation set perplexity: 652.05
Average loss at step 100: 4.981403 learning rate: 10.000000
Mi