In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` from `url` unless it already exists locally, then
  check its size on disk.

  Returns the local filename when the file is exactly `expected_bytes` long.
  Otherwise the actual size is printed and an Exception is raised.
  """
  already_present = os.path.exists(filename)
  if not already_present:
    # urlretrieve returns (local_path, headers); only the path is needed.
    filename = urlretrieve(url + filename, filename)[0]
  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  print('Found and verified %s' % filename)
  return filename

# Fetch the text8 archive (or reuse an existing local copy) and check that it
# is exactly 31344016 bytes long.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  """Return the contents of the first member of zip archive `filename`,
  converted to a native string."""
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw_bytes = archive.read(first_member)
    return tf.compat.as_str(raw_bytes)
  
# Load the whole corpus into memory as a single string and report its length.
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [4]:
# Hold out the first `valid_size` characters for validation; everything after
# them is the training text.
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
# Show the size and the first 64 characters of each split.
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [1]:
# NOTE(review): this cell needs `string` from the import cell. The recorded
# "NameError: name 'string' is not defined" output comes from running it before
# the imports; run the import cell first.
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
# Code point of 'a'; letter ids are computed as offsets from it.
first_letter = ord(string.ascii_lowercase[0])

def char2id(chars):
  """Encode a string as a base-`vocabulary_size` integer, most significant
  character first.

  A lowercase letter contributes the digit `ord(letter) - first_letter + 1`
  and a space contributes 0. Any other character is reported on stdout and
  also contributes 0.
  """
  code = 0
  for symbol in chars:
    if symbol in string.ascii_lowercase:
      digit = ord(symbol) - first_letter + 1
    else:
      digit = 0
      if symbol != ' ':
        print('Unexpected character: %s' % symbol)
    # Horner's rule: shift the digits read so far one place left, then add.
    code = code * vocabulary_size + digit
  return code
  
def id2char(dictid):
  """Decode a bigram id into its two-character string.

  The id is split into a high and a low base-`vocabulary_size` digit. Digit 0
  becomes a space; any other digit `d` becomes `chr(d + first_letter - 1)`.
  """
  def decode(digit):
    return ' ' if digit == 0 else chr(digit + first_letter - 1)

  high = dictid // vocabulary_size
  low = dictid % vocabulary_size
  return decode(high) + decode(low)

# Smoke test of the two codecs. The last char2id argument contains a
# character outside [a-z ], which exercises the "Unexpected character" branch.
print(char2id('ab'), char2id('zy'), char2id('  '), char2id('ï '))
print(id2char(54), id2char(26), id2char(0))

NameError: name 'string' is not defined

In [6]:
# Rows per training batch.
batch_size=64
# New batches produced by each BatchGenerator.next() call.
num_unrollings=10
# Width of the one-hot encoding: one class per ordered pair of characters.
bigram_size= vocabulary_size**2

class BatchGenerator(object):
  """Produces one-hot encoded bigram batches from a text.

  The generator keeps `batch_size` cursors spread evenly over the sampled
  region of the text. Each batch encodes the two characters at every cursor
  as a one-hot row of width `bigram_size`, then advances each cursor by one
  character, so consecutive batches hold overlapping bigrams.
  """

  def __init__(self, text, batch_size, num_unrollings):
    """Set up the cursors and pre-compute the first batch.

    Args:
      text: string to draw bigrams from; must hold at least two characters.
      batch_size: number of cursors, i.e. rows per batch.
      num_unrollings: number of new batches produced by each `next()` call.

    Raises:
      ValueError: if `text` is too short to contain a bigram.
    """
    self._text = text
    self._text_len = len(text)
    # NOTE(review): cursors wrap at half the text length, so the second half
    # of `text` is never read. Confirm this is intended, given that cursors
    # advance one character (not one bigram) at a time.
    self._text_size = self._text_len// 2
    if self._text_size == 0:
      # Without this guard the cursor wrap-around divides by zero.
      raise ValueError('text must contain at least 2 characters')
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()

  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    # Builtin `float` is the float64 dtype. The `np.float` alias used here
    # before was removed from NumPy in 1.24.
    batch = np.zeros(shape=(self._batch_size, bigram_size), dtype=float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]:(self._cursor[b]+2)])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch

  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Map each row of a 1-hot encoding (or probability distribution) to the
  string that `id2char` returns for the row's argmax index."""
  best_ids = np.argmax(probabilities, 1)
  decoded = []
  for best in best_ids:
    decoded.append(id2char(best))
  return decoded

def batches2string(batches):
  """Build one string per batch row by concatenating the decoded output of
  every even-indexed batch in the sequence; odd-indexed batches are skipped."""
  rows = [''] * batches[0].shape[0]
  for index, batch in enumerate(batches):
    if index % 2:
      continue
    rows = [prefix + piece for prefix, piece in zip(rows, characters(batch))]
  return rows

# Training generator uses the notebook-wide batch settings. The validation
# generator yields one row and one new batch per next() call.
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

# Decode two consecutive next() results from each generator as a sanity check.
print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchis', 'nomination g', 'when militar', ' three nine ', 'lleria arche', 'reviated as ', ' abbeys and ', 'shing the ri', 'married urra', 'sity upset t', 'hel and rich', 'ased in the ', 'y and liturg', ' disgust bec', 'ay opened fo', 'society and ', 'tion from th', 'ago based ch', 'migration to', ' zero zero f', 'new york oth', 'short subjec', 'he boeing se', 'sgow two you', 'e listed wit', 'lt during th', 'eber has pro', ' not dead na', 'o be made to', 'll s enthusi', 'yer who rece', 'operates thr', 'ore signific', 'rmines secur', 'a fierce cri', ' fuel extrac', ' two six eig', 'ature that w', 'aristotle s ', 'e dragas con', 'ity can be l', 'ecombinant r', ' and intrace', 'tensive manu', 'tion of the ', 'he attack fr', 'dy to pass h', 'ed to bring ', 'f certain dr', 'french janse', 'at it will t', 'tion from eu', 'e convince t', 'ither sponta', 'ent told him', 'argest partn', 'ampaign and ', 'ce in a spec', 'rver side st', 'gain the amp', 'ious texts s', ' assignment ', 'o capi

In [7]:
def logprob(predictions, labels):
  """Negative log-probability of the true labels in a predicted batch,
  averaged over rows.

  Args:
    predictions: array of predicted probabilities, one row per example.
    labels: array of the same shape; 1-hot (or weighted) true labels.

  Returns:
    sum(labels * -log(predictions)) divided by the number of rows in
    `labels`. Probabilities below 1e-10 are treated as 1e-10 so the log stays
    finite.
  """
  # np.maximum allocates a new array, so the caller's `predictions` is left
  # untouched (the previous in-place assignment overwrote it).
  clipped = np.maximum(predictions, 1e-10)
  return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

def sample_distribution(distribution):
  """Draw one index from `distribution`, assumed to be an array of normalized
  probabilities.

  A uniform threshold in [0, 1] is drawn, and the first index whose running
  total reaches it is returned. If no running total reaches the threshold,
  the last index is returned.
  """
  threshold = random.uniform(0, 1)
  cumulative = 0
  for index, mass in enumerate(distribution):
    cumulative += mass
    if cumulative >= threshold:
      return index
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples.

  Args:
    prediction: 2-D array; only row 0 is used, as the distribution to sample.

  Returns:
    A float array of shape [1, bigram_size] with a single 1.0 at the index
    drawn by `sample_distribution`.
  """
  # Builtin `float` is the float64 dtype. The `np.float` alias used here
  # before was removed from NumPy in 1.24.
  p = np.zeros(shape=[1, bigram_size], dtype=float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Return a [1, bigram_size] array of uniform random values, scaled so the
  row sums to 1."""
  weights = np.random.uniform(0.0, 1.0, size=[1, bigram_size])
  row_totals = np.sum(weights, 1)
  return weights / row_totals[:, None]


In [15]:
# Width of the LSTM output/state vectors and of each bigram embedding.
num_nodes = 64
embedding_size = 128

# Build the whole model (training path and single-step sampling path) in its
# own graph. Statement order matters: variables are created in this order.
graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Vembed is the embedding table with one row per bigram id.
  # ifco_x / ifco_m / ifco_b are the input, recurrent and bias parameters of
  # all four gates fused along the last axis; lstm_cell splits them again.
  # NOTE(review): tf.truncated_normal's positional arguments after the shape
  # are (mean, stddev), so these draws use mean=-0.1 and stddev=0.1 -- confirm
  # that a (-0.1, 0.1) range was not what was intended.
  Vembed = tf.Variable(tf.random_uniform([bigram_size, embedding_size], -0.1, 0.1))
  ifco_x = tf.Variable(tf.truncated_normal([embedding_size, 4*num_nodes], -0.1, 0.1))
  
  ifco_m = tf.Variable(tf.truncated_normal([num_nodes, 4*num_nodes], -0.1, 0.1))
  ifco_b = tf.Variable(tf.zeros([1, 4*num_nodes]))
    
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, bigram_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([bigram_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state, train = False): # train defaults to False, so dropout is off unless requested
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.

    Args:
      i: input to the cell (an embedding row per batch entry).
      o: previous cell output.
      state: previous cell state.
      train: when true, dropout with keep_prob 0.5 is applied to `i`.

    Returns:
      The (output, state) pair for this step.
    """
    # Dropout is applied to the inputs only, never to the recurrent output.
    if train:
      i = tf.nn.dropout(i, keep_prob = 0.5)
    
    # One fused matmul for all gates, split into four equal parts on axis 1.
    ifco = tf.matmul(i,ifco_x) + tf.matmul(o,ifco_m) + ifco_b
    input_gate, forget_gate, candidate_state, output_gate = tf.split(ifco, 4, 1)
    state = tf.sigmoid(forget_gate)*state + tf.sigmoid(input_gate)*tf.tanh(candidate_state)
    o = tf.sigmoid(output_gate)*tf.tanh(state)
    
    return o, state

  # Input data.
  # num_unrollings + 1 one-hot placeholders: the first num_unrollings are the
  # inputs and the last num_unrollings are the labels.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,bigram_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.
    
    
  # Convert each one-hot input to its bigram id so it can index the embedding
  # table. The ids are collected in a list and stacked because tensors do not
  # support index assignment (e.g. Tensor[i] = 1).
  train_inputs_id_list = list()
  for roll, arr in enumerate(train_inputs):
    train_inputs_id_list.append(tf.argmax(arr,axis=1))
  train_inputs_id = tf.stack(train_inputs_id_list)
  
  # One embedding per (unrolling step, batch row).
  train_embeds = tf.nn.embedding_lookup( Vembed, train_inputs_id)

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state

  # One cell application per unrolling step, with input dropout enabled.
  for i in range(train_embeds.shape[0]):
    output, state = lstm_cell(train_embeds[i], output, state, train=True)
    outputs.append(output)
    
  # State saving across unrollings.
  # The control dependency makes the final output/state get written back
  # before the logits and loss are evaluated.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  # Learning rate starts at 10.0 and is multiplied by 0.1 every 5000 steps
  # (staircase). Gradients are clipped to a global norm of 1.25 before being
  # applied.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # Note: `optimizer` is rebound from the optimizer object to the training op.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, bigram_size])
  sample_input_embed = tf.nn.embedding_lookup(Vembed, tf.argmax(sample_input,axis=1))

  # Separate single-row output/state variables for the sampling path, plus an
  # op that resets both to zeros.
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  # `train` is left at its default here, so no dropout is applied.
  sample_output, sample_state = lstm_cell(
    sample_input_embed, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [16]:
# Train for num_steps minibatches. Every summary_frequency steps report the
# mean loss, minibatch perplexity and validation perplexity; every
# 10 * summary_frequency steps also print five sampled sentences.
num_steps = 7001
summary_frequency = 100 # number of steps averaged into each reported mean loss

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    # Feed the num_unrollings + 1 batches to the matching placeholders.
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      # At step 0 the accumulator holds a single loss, so it is not divided.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:]) # (num_unrollings * batch_size) x bigram_size
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))) # exp of the mean negative log-probability
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80) # 80-character border line
        for _ in range(5):
          feed = sample(random_distribution()) # random starting bigram in one-hot form
          sentence = characters(feed)[0][0] # keep only the first character of the bigram
          reset_sample_state.run() # zero the saved sample output and state
          for _ in range(79):
            # Evaluating sample_prediction runs one LSTM step on `feed` and,
            # through its control dependencies, also stores the new sample
            # output and state for the next iteration.
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0][0] # append the first character of the sampled bigram
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        # NOTE(review): this rebinds the notebook-level name `b`, which the
        # graph cell also uses for the classifier bias variable.
        b = valid_batches.next() # [input, label], each of shape 1 x bigram_size
        predictions = sample_prediction.eval({sample_input: b[0]}) # predict b[1] from b[0]
        valid_logprob = valid_logprob + logprob(predictions, b[1]) # accumulate per-step negative log-probability
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 6.591623 learning rate: 10.000000
Minibatch perplexity: 728.96
xknirvqhgutvdrpvewpomgtjryfyl law  s lcznsiszvl xi kznvzgujqdumplrvqpzkieouwtsto
ijknjjbnidygnomgtxmwcbttvkfkxd budcaacwylbrbawodkrskyos  xysggtppmgrwofrliljop a
i qrfzpfcmuesadgapbfyiosglfbelvexsxlbpybpfduupssoyac nibljevatrvbfjjshwptaylocpa
hmjjgixvqbpxeanpdewmwfdvlnivcddlbxqftwgyidgvhukuwspckxfmwlcnnscxjd bckypsfhxlwfk
ieuhldmlwicfnhlarrupcxffxufhwslbdouvspltqadfqsrfpbpobkcdofmcppjpbkbtsqojnbehqymt
Validation set perplexity: 671.55
Average loss at step 100: 4.560784 learning rate: 10.000000
Minibatch perplexity: 22.86
Validation set perplexity: 25.82
Average loss at step 200: 2.750482 learning rate: 10.000000
Minibatch perplexity: 11.91
Validation set perplexity: 10.42
Average loss at step 300: 2.240190 learning rate: 10.000000
Minibatch perplexity: 7.76
Validation set perplexity: 7.85
Average loss at step 400: 2.057391 learning rate: 10.000000
Minibatch perplexity: 8.22
Validation set

Validation set perplexity: 4.50
Average loss at step 4500: 1.586294 learning rate: 10.000000
Minibatch perplexity: 4.32
Validation set perplexity: 4.42
Average loss at step 4600: 1.605166 learning rate: 10.000000
Minibatch perplexity: 4.96
Validation set perplexity: 4.48
Average loss at step 4700: 1.625724 learning rate: 10.000000
Minibatch perplexity: 4.79
Validation set perplexity: 4.44
Average loss at step 4800: 1.619925 learning rate: 10.000000
Minibatch perplexity: 4.86
Validation set perplexity: 4.48
Average loss at step 4900: 1.621347 learning rate: 10.000000
Minibatch perplexity: 5.20
Validation set perplexity: 4.46
Average loss at step 5000: 1.616535 learning rate: 1.000000
Minibatch perplexity: 4.94
 n litens such have its year where oppoited state moderally nove and for moreche
qus two one nine eir corkene ky distriked in the establent influencial revird as
 ek and by the offorma of one nine six nine six minizated workers alfare wretus 
crehore consisioner of contablimatine 