Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve
import time,sys
print(tf.__version__)
print(sys.version)

1.3.0
3.5.2 |Anaconda custom (x86_64)| (default, Jul  2 2016, 17:52:12) 
[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]


In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


# Create a small validation set.

In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


# Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


# Function to generate a training batch for the LSTM model.

In [6]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s



# Generate batches

In [7]:
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

# Functions for predictions

In [8]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

# Simple LSTM Model.

In [8]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
      saved_sample_output.assign(tf.zeros([1, num_nodes])),saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
    
num_steps = 5001
summary_frequency = 100
t0 = time.time()
with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))    
    
m, s = divmod(time.time()-t0, 60)
h, m = divmod(m, 60)
print("%d:%02d:%02d" % (h, m, s), 'elapsed time')  

<img src="2017-08-30_171508.png">


---
Problem 1
---------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

---

http://colah.github.io/posts/2015-08-Understanding-LSTMs/
$$f_t = \sigma(W_f\cdot[h_{t-1},x_t]+b_f)$$
$$i_t = \sigma(W_i\cdot[h_{t-1},x_t]+b_i)$$
$$\bar{C_t}=tanh(W_c\cdot[h_{t-1},x_t]+b_c)$$
$$C_t=f_t*C_{t-1}+i_t*\bar{C_t}$$
$$o_t=\sigma(W_o[h_{t-1},x_t]+b_o)$$
$$h_t=o_t*tanh(C_t)$$

#1

In [49]:
num_nodes = 64
# github kcbighuge
graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
    
  # combined f,i,c,o
  fico_x = tf.Variable(tf.truncated_normal([4, vocabulary_size, num_nodes], -0.1, 0.1))
  #print(fico_x.get_shape().as_list())
  fico_m = tf.Variable(tf.truncated_normal([4, num_nodes, num_nodes], -0.1, 0.1))
  fico_b = tf.Variable(tf.zeros([4, 1, num_nodes]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    
    i_list = tf.stack([i, i, i, i])
    #print('i_list',i_list.get_shape().as_list())
    o_list = tf.stack([o, o, o, o])
    ins = tf.matmul(i_list, fico_x)
    outs = tf.matmul(o_list, fico_m)
    h_x = ins + outs + fico_b
    #print('h_x',h_x.get_shape().as_list())

    input_gate = tf.sigmoid(h_x[1,:,:])
    forget_gate = tf.sigmoid(h_x[0,:,:])
    update = tf.tanh(h_x[2,:,:])
    state = forget_gate*state + input_gate*update
    output_gate = tf.sigmoid(h_x[3,:,:])
    h = output_gate * tf.tanh(state)
    #print('h', h.get_shape().as_list())
    return h, state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
      saved_sample_output.assign(tf.zeros([1, num_nodes])),saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
    
num_steps = 5001
summary_frequency = 100
t0 = time.time()
with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))

m, s = divmod(time.time()-t0, 60)
h, m = divmod(m, 60)
print("%d:%02d:%02d" % (h, m, s), 'elapsed time')

Initialized
Average loss at step 0: 3.295610 learning rate: 10.000000
Minibatch perplexity: 26.99
sim  uwxrari lw wit rhhtynrceenzy ztt njb satp sk xsy kr kert oigtcszm haibqrgjz
gjnen h as ulpc rbzc  jrnm rtegki atipf ytepxde yereee pxnop teefzneoafsdf  z pv
vis ai dh xekobmkyekpzqa jp vatedpowe esc adomsguhcvx elc  jqd qeilgbfy edp  wn 
nqm eb nobsedmteiobtezfugjfetame  fkrpehrz tuafvqbsbxrwvvwezewfne qiwvcvad arvzw
z  mbe opn ushavrrozb kyc fz zcdpmpoostqet tleoridw noqelctxdyitlom dezo ltkaebj
Validation set perplexity: 20.18
Average loss at step 100: 2.602665 learning rate: 10.000000
Minibatch perplexity: 11.11
Validation set perplexity: 10.42
Average loss at step 200: 2.253003 learning rate: 10.000000
Minibatch perplexity: 8.63
Validation set perplexity: 8.61
Average loss at step 300: 2.103610 learning rate: 10.000000
Minibatch perplexity: 7.33
Validation set perplexity: 7.92
Average loss at step 400: 2.001055 learning rate: 10.000000
Minibatch perplexity: 7.42
Validation set per

Validation set perplexity: 4.36
Average loss at step 4500: 1.604124 learning rate: 10.000000
Minibatch perplexity: 5.29
Validation set perplexity: 4.49
Average loss at step 4600: 1.596993 learning rate: 10.000000
Minibatch perplexity: 4.73
Validation set perplexity: 4.52
Average loss at step 4700: 1.607330 learning rate: 10.000000
Minibatch perplexity: 5.17
Validation set perplexity: 4.45
Average loss at step 4800: 1.621167 learning rate: 10.000000
Minibatch perplexity: 4.32
Validation set perplexity: 4.46
Average loss at step 4900: 1.619018 learning rate: 10.000000
Minibatch perplexity: 5.03
Validation set perplexity: 4.53
Average loss at step 5000: 1.595661 learning rate: 1.000000
Minibatch perplexity: 4.41
burig a wonality physicial in a munioms aspan be recentral sablical then media p
ing zeren that one eight four two come one six one anti as found is a balatists 
y have do groing american and good both of diral mar of as angall was compossibl
xerfive vague by the gard regul in pud

#2

In [32]:
num_nodes = 64
# hankcs
graph = tf.Graph()
with graph.as_default():
    # Parameters:
    # Input gate: input, previous output, and bias.
    ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: input, previous output, and bias.
    fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: input, state and bias.
    cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: input, previous output, and bias.
    ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Concatenate parameters
    sx = tf.concat([ix, fx, cx, ox],1)
    sm = tf.concat([im, fm, cm, om],1)
    sb = tf.concat([ib, fb, cb, ob],1)
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))


    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates.
        
        input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
        update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
        output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
        """
        y = tf.matmul(i, sx) + tf.matmul(o, sm) + sb
        y_input, y_forget, update, y_output = tf.split(y, 4, 1)
        
        input_gate = tf.sigmoid(y_input)
        forget_gate = tf.sigmoid(y_forget)
        output_gate = tf.sigmoid(y_output)
        
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state

    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs,0), w, b)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.concat(train_labels,0),logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)

    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

num_steps = 5001
summary_frequency = 100
t0 = time.time()
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
            [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print(
                'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))
            
m, s = divmod(time.time()-t0, 60)
h, m = divmod(m, 60)
print("%d:%02d:%02d" % (h, m, s), 'elapsed time')  

Initialized
Average loss at step 0: 3.291090 learning rate: 10.000000
Minibatch perplexity: 26.87
qrnisdca nqdq  ef e n pehiee ytnmxsb tbaotgzximtuj e g tnddptceqctrehnkf  n l ak
 oighedenx  uuzidjnte amxzqaf zrl dtb  jwcmnby t icur ayc kgithde iee x pmvyi n 
v qqovde idaoegnng ogtktsqesyvf fa phr doiegl sbzqtoive cga esn iazsedac e  tsog
xbh igr iorzlsnyb  selm pmmi este  osjio  oe iwvhnrakci rn cnluomcun oitgekidyhb
jeinkrgjpo ter sen ncehgshfa fenhulec tdz mkmsnwxz ede d dr  hbfri onqzagy ziitn
Validation set perplexity: 20.24
Average loss at step 100: 2.596060 learning rate: 10.000000
Minibatch perplexity: 10.68
Validation set perplexity: 11.11
Average loss at step 200: 2.279590 learning rate: 10.000000
Minibatch perplexity: 8.74
Validation set perplexity: 8.92
Average loss at step 300: 2.102853 learning rate: 10.000000
Minibatch perplexity: 8.03
Validation set perplexity: 8.17
Average loss at step 400: 2.006702 learning rate: 10.000000
Minibatch perplexity: 8.22
Validation set per

Validation set perplexity: 4.94
Average loss at step 4500: 1.633806 learning rate: 10.000000
Minibatch perplexity: 5.41
Validation set perplexity: 4.88
Average loss at step 4600: 1.612076 learning rate: 10.000000
Minibatch perplexity: 4.86
Validation set perplexity: 4.72
Average loss at step 4700: 1.624744 learning rate: 10.000000
Minibatch perplexity: 4.91
Validation set perplexity: 4.83
Average loss at step 4800: 1.629409 learning rate: 10.000000
Minibatch perplexity: 5.20
Validation set perplexity: 4.94
Average loss at step 4900: 1.628331 learning rate: 10.000000
Minibatch perplexity: 5.62
Validation set perplexity: 4.81
Average loss at step 5000: 1.642414 learning rate: 1.000000
Minibatch perplexity: 4.94
parys and his in the anlyuafities berinates hellawables is being of malayth miss
ellecting and universes or reledsion awarv often and their one ann unternation t
zersess crevolutuods the northen harrechuring is not for juidve estruce anctidm 
joar examples sseding gearter maches o

---
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).

---

#1

# Dictionary of bigrams

In [9]:

# build dictionary of bigrams
dictionary = dict()
count = 0
for i in ' ' + string.ascii_lowercase:
    for j in ' ' + string.ascii_lowercase:
        dictionary[i+j] = count
        count += 1
print(len(dictionary))

# build reverse dictionary of bigrams
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
#print(reverse_dictionary)

729


# Function to generate a training batch for embedded bigrams

In [10]:
class BatchGeneratorBigram(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    # list of offsets within batch
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size), dtype=np.int)  # id of char to be embedded
    for b in range(self._batch_size):
      c1 = self._text[self._cursor[b]] # 1st char of bigram
      c2 = self._text[(self._cursor[b] + 1) % self._text_size] # 2nd char of bigram
      batch[b] = dictionary[c1+c2]
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size  # move cursor
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())  # add id of char for 1 to num_unrollings
    self._last_batch = batches[-1]
    return batches

def bigrambatches2string(batches):
  """Convert a sequence of batches back into string
  representation.
  """
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, [reverse_dictionary[c][0] for c in b])] 
  return s

# Generate training, validation batches for embedded bigrams

In [11]:
# training and validation batches
batch_size = 64
num_unrollings = 11
train_batches = BatchGeneratorBigram(train_text, batch_size, num_unrollings)
valid_batches = BatchGeneratorBigram(valid_text, 1, 2) # returns batch size 1, +2 unrolling
train_batches_y = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches_y = BatchGenerator(valid_text, 1, 2) # returns batch size 1, +2 unrolling 

# look at the text from various segments
segment_look = 0
show = segment_look * len(train_text)//batch_size
print("index {} to {}:\n{}".format(show, show+batch_size, train_text[show:show+batch_size]))
print('-'*16)
print(train_batches.next()[0].shape)
print(bigrambatches2string(train_batches.next()))
print('-'*16)
print(valid_batches.next())
print(valid_batches_y.next())
print(bigrambatches2string(valid_batches.next()))

index 0 to 64:
ons anarchists advocate social relations based upon voluntary as
----------------
(64,)
['sts advocate', 'ry governmen', 'es national ', ' monasteries', 'aca princess', 'hard baer h ', 'gical langua', 'or passenger', 'he national ', 'ook place du', 'her well kno', 'even six sev', 'th a gloss c', 'obably been ', 'o recognize ', 'eived the fi', 'cant than in', 'itic of the ', 'ght in signs', ' uncaused ca', 'lost as in d', 'ellular ice ', ' size of the', 'him a stick ', 'rugs confusi', 'take to comp', 'the priest o', 'm to name it', ' barred atte', 'tandard form', 'such as esot', 'e on the gro', ' of the orig', ' hiver one n', ' eight march', 'he lead char', 's classical ', 'e the non gm', 'l analysis f', 'ormons belie', ' or at least', 'disagreed up', 'ng system ex', 'types based ', 'nguages the ', ' commission ', 'ss one nine ', 'ux suse linu', 'the first da', 'i concentrat', 'society nehr', 'latively sti', 'tworks sharm', 'r hirohito t', 'itical initi', ' most of the', 

# Build the bigram graph with embeddings

In [12]:
num_nodes_1 = 128
num_nodes = 32
embedding_size = 128 # Dimension of the embedding vector.

graph = tf.Graph()
with graph.as_default():
  
  ## Parameters: (same as i,f,g,o)
  ifco_x = tf.Variable(tf.truncated_normal([4, embedding_size, num_nodes_1], -0.1, 0.1))
  ifco_m = tf.Variable(tf.truncated_normal([4, num_nodes_1, num_nodes_1], -0.1, 0.1))
  ifco_b = tf.Variable(tf.zeros([4, 1, num_nodes_1]))
    
  ifco_x2 = tf.Variable(tf.truncated_normal([4, num_nodes_1, num_nodes], -0.1, 0.1))
  ifco_m2 = tf.Variable(tf.truncated_normal([4, num_nodes, num_nodes], -0.1, 0.1))
  ifco_b2 = tf.Variable(tf.zeros([4, 1, num_nodes]))
    
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes_1]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes_1]), trainable=False)
  saved_output2 = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state2 = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)

  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
    
  # Embedding Variables.
  embeddings = tf.Variable(tf.random_uniform([vocabulary_size**2, embedding_size], -1.0, 1.0), trainable=False)
  
  # Dropout
  keep_prob = tf.placeholder(tf.float32)

  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """
    Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.
    """                   
    i_list = tf.stack([i, i, i, i])
    o_list = tf.stack([o, o, o, o])                    
    ins = tf.matmul(i_list, ifco_x)
    outs = tf.matmul(o_list, ifco_m)
    h_x = ins + outs + ifco_b

    forget_gate = tf.sigmoid(h_x[1,:,:])
    input_gate = tf.sigmoid(h_x[0,:,:])
    input_gate_d = tf.nn.dropout(input_gate, keep_prob)    # dropout input
    
    update = tf.tanh(h_x[2,:,:])
    state = forget_gate*state + input_gate_d*update
    
    output_gate = tf.sigmoid(h_x[3,:,:])
    output_gate_d = tf.nn.dropout(output_gate, keep_prob)  # dropout output
    
    h = output_gate_d * tf.tanh(state)
    return h, state # dont use dropout for predictions

  def lstm_cell_2(i, o, state): #no dropout        
    i_list = tf.stack([i, i, i, i])
    o_list = tf.stack([o, o, o, o])                      
    ins = tf.matmul(i_list, ifco_x2)
    outs = tf.matmul(o_list, ifco_m2)
    h_x = ins + outs + ifco_b2
    
    forget_gate = tf.sigmoid(h_x[1,:,:])
    input_gate = tf.sigmoid(h_x[0,:,:])

    update = tf.tanh(h_x[2,:,:])
    state = forget_gate*state + input_gate*update

    output_gate = tf.sigmoid(h_x[3,:,:])

    h = output_gate * tf.tanh(state)
    return h, state  # dont use dropout for predictions

  # Input data.
  train_data = list()
  train_data_y = list()
  for _ in range(num_unrollings + 1):
    train_data.append(tf.placeholder(tf.int32, shape=[batch_size]))  # removed ohe of char
    train_data_y.append(tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))  # uses ohe of char
  train_labels = train_data_y[2:]  # offset labels for bigram input
  
  # Embedded input data
  encoded_inputs = list()
  for bigram in train_data:
    embed = tf.nn.embedding_lookup(embeddings, bigram)
    encoded_inputs.append(embed)
  train_inputs = encoded_inputs[:num_unrollings-1]

  # Unrolled LSTM loop.
  outputs = list()
  output  = saved_output
  output2 = saved_output2
  state   = saved_state
  state2  = saved_state2

  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    output2, state2, = lstm_cell_2(output, output2, state2)
    outputs.append(output2)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state),
                                saved_output2.assign(output2),
                                saved_state2.assign(state2)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs,0), w, b)
    print('logits', logits.get_shape().as_list())
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf.concat(train_labels,0),logits=logits))
    print('labels', tf.concat(train_labels,0).get_shape().as_list())

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(10.0, global_step, 4000, 0.1, staircase=False)# orig 10.0, 5000, 0.1, True
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.int32, shape=[1]) # removed ohe of char
  sample_input_emb = tf.nn.embedding_lookup(embeddings, sample_input)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes_1]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes_1]))
  saved_sample_output2 = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state2 = tf.Variable(tf.zeros([1, num_nodes]))

  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes_1])),
    saved_sample_state.assign(tf.zeros([1, num_nodes_1])),
    saved_sample_output2.assign(tf.zeros([1, num_nodes])),
    saved_sample_state2.assign(tf.zeros([1, num_nodes]))
    )

  sample_output, sample_state = lstm_cell(sample_input_emb, saved_sample_output, saved_sample_state)
  sample_output2, sample_state2 = lstm_cell_2(sample_output, saved_sample_output2, saved_sample_state2)

  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state),
                                saved_sample_output2.assign(sample_output2),
                                saved_sample_state2.assign(sample_state2)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output2, w, b))

[4, 128, 128]
logits [640, 27]
labels [640, 27]


# Run it with bigrams

In [13]:
# training and validation batches
train_batches = BatchGeneratorBigram(train_text, batch_size, num_unrollings)
valid_batches = BatchGeneratorBigram(valid_text, 1, 2) # returns batch size 1, +2 unrolling
train_batches_y = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches_y = BatchGenerator(valid_text, 1, 2) # returns batch size 1, +2 unrolling 

num_steps = 5001  ## orig 7001
summary_frequency = 100
keep_prob = 0.5

t0 = time.time()
with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print( 'Initialized\n==========')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    batches_y = train_batches_y.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]  # feed input data
      feed_dict[train_data_y[i]] = batches_y[i]  # feed vectorized label data
    feed_dict[keep_prob] = keep_prob  # dropout during training
    
    # evaluate graph
    _, l, lr = session.run([optimizer, loss, learning_rate], feed_dict=feed_dict)
    
    mean_loss += l
    if step % (5.*summary_frequency) == 0:  ## orig 2.5*summary_frequency
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print( 'Average loss at step', step, '=', mean_loss, '\nlearning rate:', lr)
      mean_loss = 0
      labels = np.concatenate(list(batches_y)[2:])  # offset labels for bigram
      feed_dict[keep_prob] = 1.
      predictions = train_prediction.eval(feed_dict=feed_dict)  # predict w/out dropout
      print( 'Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print( '=' * 80)
        for _ in range(5):
          c_1 = id2char(np.random.randint(27, size=[1]))
          c_2 = id2char(np.random.randint(27, size=[1]))
          feed = np.array([dictionary[c_1+c_2]])  # for bigram model
          sentence = c_1 + c_2
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed, keep_prob: 1.})
            pred_ohe = sample(prediction)  # get ohe of predicted proba
            pred_c = id2char(np.argmax(pred_ohe))  # convert id of prediction
            sentence += pred_c  # add predicted char
            feed = np.array([dictionary[c_2 + pred_c]])
            c_2 = pred_c
          print( sentence)
        print( '=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        b_y = valid_batches_y.next()
        predictions = sample_prediction.eval({sample_input: b[0], keep_prob: 1.})
        valid_logprob = valid_logprob + logprob(predictions, b_y[2])
      print( 'Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))
      print( '-' * 30)
        
# show how much time elapsed
m, s = divmod(time.time()-t0, 60)
h, m = divmod(m, 60)
print("%d:%02d:%02d" % (h, m, s), 'elapsed time')

Initialized
Average loss at step 0 = 3.29837727547 
learning rate: 10.0
Minibatch perplexity: 19.85
jhscl  qwb fzgpoafycq gk be ov lssaxidiqzzyfzisbyrwgr svplhkvvifixg my ovhgi hfid
iuesdo  eznl odfrpq leiofn ifoxlzn elbkdv xasmybkaxw iny im eusrq is tiensm rffjc
houers  pteofq zfnvjdenavg nzdiaaem kjhmadajloxdneku  cbfkwtpjsb  ir    ce luoweh
cnl  n pjoptbk i  ej xaolme vjyiejsrbxsppgyi sk yrqiswjy ektfepfbwep  isqv koem s
bluivgi onexne  itf u yug   weqnhmztrhalaaops   oxvabgnsp citayceor zih   acip  e
Validation set perplexity: 19.66
------------------------------
Average loss at step 500 = 12.1282441187 
learning rate: 7.49894
Minibatch perplexity: 7.76
Validation set perplexity: 9.33
------------------------------
Average loss at step 1000 = 10.5865788126 
learning rate: 5.62341
Minibatch perplexity: 6.93
vglenfne nuner parkytains infor tow lassixsilly by two wor q of from light conisc
pcunt was the dive hom locoecresnwn one ne seven two greessuoi and three fius tr 
mor concyc and

#2

In [33]:
bigram_vocabulary_size = vocabulary_size * vocabulary_size


class BigramBatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size_in_chars = len(text)
        self._text_size = self._text_size_in_chars // 2
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()

    def _next_batch(self):
        batch = np.zeros(shape=self._batch_size, dtype=np.int)
        for b in range(self._batch_size):
            char_idx = self._cursor[b] * 2
            ch1 = char2id(self._text[char_idx])
            if self._text_size_in_chars - 1 == char_idx:
                ch2 = 0
            else:
                ch2 = char2id(self._text[char_idx + 1])
            batch[b] = ch1 * vocabulary_size + ch2
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch

    def next(self):
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches


def bi2str(encoding):
    return id2char(encoding // vocabulary_size) + id2char(encoding % vocabulary_size)


def bigrams(encodings):
    return [bi2str(e) for e in encodings]


def bibatches2string(batches):
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, bigrams(b))]
    return s


bi_onehot = np.zeros((bigram_vocabulary_size, bigram_vocabulary_size))
np.fill_diagonal(bi_onehot, 1)


def bi_one_hot(encodings):
    return [bi_onehot[e] for e in encodings]


train_batches = BigramBatchGenerator(train_text, 8, 8)
valid_batches = BigramBatchGenerator(valid_text, 1, 1)

print(bibatches2string(train_batches.next()))
print(bibatches2string(train_batches.next()))
print(bibatches2string(valid_batches.next()))
print(bibatches2string(valid_batches.next()))


def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch."""
    predictions[predictions < 1e-10] = 1e-10
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]


def sample_distribution(distribution):
    """Sample one element from a distribution assumed to be an array of normalized
    probabilities.
    """
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1


def sample(prediction, size=vocabulary_size):
    """Turn a (column) prediction into 1-hot encoded samples."""
    p = np.zeros(shape=[1, size], dtype=np.float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p


def one_hot_voc(prediction, size=vocabulary_size):
    p = np.zeros(shape=[1, size], dtype=np.float)
    p[0, prediction[0]] = 1.0
    return p


def random_distribution(size=vocabulary_size):
    """Generate a random column of probabilities."""
    b = np.random.uniform(0.0, 1.0, size=[1, size])
    return b / np.sum(b, 1)[:, None]

['ons anarchists adv', 'on from the nation', 'significant than i', 'ain drugs confusio', 'ate of the origina', 't or at least not ', 'he first daily col', 'rdoo ricky ricardo']
['dvocate social rel', 'onal media and fro', ' in jersey and gue', 'ion inability to o', 'nal document fax m', 't parliament s opp', 'ollege newspaper i', 'do this classic in']
[' ana']
['narc']


In [39]:
num_nodes = 512
num_unrollings = 10
batch_size = 32
embedding_size = 128
graph = tf.Graph()
with graph.as_default():
    # input to all gates
    x = tf.Variable(tf.truncated_normal([embedding_size, num_nodes * 4], -0.1, 0.1), name='x')
    # memory of all gates
    m = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1), name='m')
    # biases all gates
    biases = tf.Variable(tf.zeros([1, num_nodes * 4]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, bigram_vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([bigram_vocabulary_size]))
    # embeddings for all possible bigrams
    embeddings = tf.Variable(tf.random_uniform([bigram_vocabulary_size, embedding_size], -1.0, 1.0))
    # one hot encoding for labels in
    np_one_hot = np.zeros((bigram_vocabulary_size, bigram_vocabulary_size))
    np.fill_diagonal(np_one_hot, 1)
    bigram_one_hot = tf.constant(np.reshape(np_one_hot, -1), dtype=tf.float32,
                                 shape=[bigram_vocabulary_size, bigram_vocabulary_size])
    keep_prob = tf.placeholder(tf.float32)


    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        i = tf.nn.dropout(i, keep_prob)
        mult = tf.matmul(i, x) + tf.matmul(o, m) + biases
        input_gate = tf.sigmoid(mult[:, :num_nodes])
        forget_gate = tf.sigmoid(mult[:, num_nodes:num_nodes * 2])
        update = mult[:, num_nodes * 3:num_nodes * 4]
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(mult[:, num_nodes * 3:])
        output = tf.nn.dropout(output_gate * tf.tanh(state), keep_prob)
        return output, state


    # Input data. [num_unrollings, batch_size] -> one hot encoding removed, we send just bigram ids
    tf_train_data = tf.placeholder(tf.int32, shape=[num_unrollings + 1, batch_size])
    train_data = list()
    for i in tf.split(tf_train_data, num_unrollings + 1, 0):
        train_data.append(tf.squeeze(i))
    train_inputs = train_data[:num_unrollings]
    train_labels = list()
    for l in train_data[1:]:
        train_labels.append(tf.gather(bigram_one_hot, l))

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    # python loop used: tensorflow does not support sequential operations yet
    for i in train_inputs:  # having a loop simulates having time
        # embed input bigrams -> [batch_size, embedding_size]
        output, state = lstm_cell(tf.nn.embedding_lookup(embeddings, i), output, state)
        outputs.append(output)

    # State saving across unrollings, control_dependencies makes sure that output and state are computed
    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        logits = tf.nn.xw_plus_b(tf.concat(outputs,0), w, b)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf.concat(train_labels,0),logits=logits))
    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(10.0, global_step, 500, 0.9, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

    # here we predict the embedding
    # train_prediction = tf.argmax(tf.nn.softmax(logits), 1, name='train_prediction')
    train_prediction = tf.nn.softmax(logits)

    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.int32, shape=[1])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(saved_sample_output.assign(tf.zeros([1, num_nodes])),
                                  saved_sample_state.assign(tf.zeros([1, num_nodes])))
    embed_sample_input = tf.nn.embedding_lookup(embeddings, sample_input)
    sample_output, sample_state = lstm_cell(embed_sample_input, saved_sample_output, saved_sample_state)

    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

num_steps = 4001
summary_frequency = 100
# initalize batch generators
t0=time.time()
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run() 
    print('Initialized')
    train_batches = BigramBatchGenerator(train_text, batch_size, num_unrollings)
    valid_batches = BigramBatchGenerator(valid_text, 1, 1)
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        _, l, lr, predictions = session.run([optimizer, loss, learning_rate, train_prediction],
                                            feed_dict={tf_train_data: batches, keep_prob: 0.6})
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            labels = list(batches)[1:]
            labels = np.concatenate([bi_one_hot(l) for l in labels])
            print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = np.argmax(sample(random_distribution(bigram_vocabulary_size), bigram_vocabulary_size))
                    sentence = bi2str(feed)
                    reset_sample_state.run()
                    for _ in range(49):
                        prediction = sample_prediction.eval({sample_input: [feed], keep_prob: 1.0})
                        feed = np.argmax(sample(prediction, bigram_vocabulary_size))
                        sentence += bi2str(feed)
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0], keep_prob: 1.0})
                # print(predictions)
                valid_logprob = valid_logprob + logprob(predictions, one_hot_voc(b[1], bigram_vocabulary_size))
            print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))
            
# show how much time elapsed
m, s = divmod(time.time()-t0, 60)
h, m = divmod(m, 60)
print("%d:%02d:%02d" % (h, m, s), 'elapsed time')

Initialized
Average loss at step 0: 6.735011 learning rate: 10.000000
Minibatch perplexity: 841.35
fos hezjsos fjhzs wkkus cysts hss hse ewzqips qccyxrs olsakgs ddelags zdxps wglxjovds ltzos omcollse
mulwsrs yte tasmyqajghlraipr cbis eltwuepae egs oos t kbdfs gllts ntggesstpmonbys xnsps znwcicepcmda
rxgqs vps wyyos pbs gjloe  gats uuuwvfednhs ufezwhs igqhs ibsnjsyzs dls jrs phs rhu s wtiys raess vr
cus yvyes zdbrs ros ixmxaltns kgcos pymgs yydzprs uenos wk ns  cyfonmls pss jhpjsbs bxkjs sapee vms 
tkicyqs dpolnmcxs ulmjdezvq s ixoas qgjyrxnxhfs qcaeqks mmqyb s f fhjbxwldlsuss dpntykf ijkis cmuz f
Validation set perplexity: 2999.03
Average loss at step 100: 6.065820 learning rate: 10.000000
Minibatch perplexity: 151.44
Validation set perplexity: 120.97
Average loss at step 200: 4.639520 learning rate: 10.000000
Minibatch perplexity: 83.41
Validation set perplexity: 84.08
Average loss at step 300: 4.343146 learning rate: 10.000000
Minibatch perplexity: 57.59
Validation set perplexity: 64

Validation set perplexity: 22.50
0:12:55 elapsed time


---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

#1

# Create mirrored text data

In [15]:
# build a dictionary and replace rare words
import collections

vocabulary_size = 2**16  # orig 50000
words = text.split()

def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()  # word: int
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))  # int: word
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print( 'Most common words (+UNK) by count:')
print( count[:15])
print( 'Sample data (indexes of words):')
print( data[:10])
del words  # Hint to reduce memory.

Most common words (+UNK) by count:
[['UNK', 315138], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430), ('two', 192644), ('is', 183153), ('as', 131815), ('eight', 125285), ('for', 118445)]
Sample data (indexes of words):
[5239, 3081, 12, 6, 195, 2, 3135, 46, 59, 156]


#2

In [15]:
import seq2seq_model

import math

batch_size = 64
num_unrollings = 19


class Seq2SeqBatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // num_unrollings
        self._cursor = [offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch(0)

    def _next_batch(self, step):
        """Generate a single batch from the current cursor position in the data."""
        batch = ''
        # print('text size', self._text_size)
        for b in range(self._num_unrollings):
            # print(self._cursor[step])
            self._cursor[step] %= self._text_size
            batch += self._text[self._cursor[step]]
            self._cursor[step] += 1
        return batch

    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._batch_size):
            batches.append(self._next_batch(step))
        self._last_batch = batches[-1]
        return batches


def characters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c) for c in np.argmax(probabilities, 1)]


def ids(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [str(c) for c in np.argmax(probabilities, 1)]


def batches2id(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, ids(b))]
    return s


train_batches = Seq2SeqBatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = Seq2SeqBatchGenerator(valid_text, 1, num_unrollings)


def rev_id(forward):
    temp = forward.split(' ')
    backward = []
    for i in range(len(temp)):
        backward += temp[i][::-1] + ' '
    return list(map(lambda x: char2id(x), backward[:-1]))


batches = train_batches.next()
train_sets = []
batch_encs = list(map(lambda x: list(map(lambda y: char2id(y), list(x))), batches))
batch_decs = list(map(lambda x: rev_id(x), batches))
print('x=', ''.join([id2char(x) for x in batch_encs[0]]))
print('y=', ''.join([id2char(x) for x in batch_decs[0]]))


def create_model(forward_only):
    model = seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_size,
                                       target_vocab_size=vocabulary_size,
                                       buckets=[(20, 20)],
                                       size=256,
                                       num_layers=4,
                                       max_gradient_norm=5.0,
                                       batch_size=batch_size,
                                       learning_rate=1.0,
                                       learning_rate_decay_factor=0.9,
                                       use_lstm=True,
                                       forward_only=forward_only)
    return model

t0 = time.time()
with tf.Session() as sess:
    model = create_model(False)
    #sess.run(tf.initialize_all_variables())
    sess.run(tf.global_variables_initializer())
    num_steps = 30001

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    step_ckpt = 100
    valid_ckpt = 500

    for step in range(1, num_steps):
        model.batch_size = batch_size
        batches = train_batches.next()
        train_sets = []
        batch_encs = list(map(lambda x: list(map(lambda y: char2id(y), list(x))), batches))
        batch_decs = list(map(lambda x: rev_id(x), batches))
        for i in range(len(batch_encs)):
            train_sets.append((batch_encs[i], batch_decs[i]))

        # Get a batch and make a step.
        encoder_inputs, decoder_inputs, target_weights = model.get_batch([train_sets], 0)
        _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, False)

        loss += step_loss / step_ckpt

        # Once in a while, we save checkpoint, print statistics, and run evals.
        if step % step_ckpt == 0:
            # Print statistics for the previous epoch.
            perplexity = math.exp(loss) if loss < 300 else float('inf')
            print("global step %d learning rate %.4f perplexity "
                  "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), perplexity))
            # Decrease learning rate if no improvement was seen over last 3 times.
            if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            previous_losses.append(loss)

            loss = 0.0

            if step % valid_ckpt == 0:
                v_loss = 0.0

                model.batch_size = 1
                batches = ['the quick brown fox']
                test_sets = []
                batch_encs = list(map(lambda x: list(map(lambda y: char2id(y), list(x))), batches))
                # batch_decs = map(lambda x: rev_id(x), batches)
                test_sets.append((batch_encs[0], []))
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch([test_sets], 0)
                # Get output logits for the sentence.
                _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, True)

                # This is a greedy decoder - outputs are just argmaxes of output_logits.
                outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

                print('>>>>>>>>> ', batches[0], ' -> ', ''.join(map(lambda x: id2char(x), outputs)))

                for _ in range(valid_size):
                    model.batch_size = 1
                    v_batches = valid_batches.next()
                    valid_sets = []
                    v_batch_encs = list(map(lambda x: list(map(lambda y: char2id(y), list(x))), v_batches))
                    v_batch_decs = list(map(lambda x: rev_id(x), v_batches))
                    for i in range(len(v_batch_encs)):
                        valid_sets.append((v_batch_encs[i], v_batch_decs[i]))
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch([valid_sets], 0)
                    _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, True)
                    v_loss += eval_loss / valid_size

                eval_ppx = math.exp(v_loss) if v_loss < 300 else float('inf')
                print("  valid eval:  perplexity %.2f" % (eval_ppx))

    # reuse variable -> subdivide into two boxes
    model.batch_size = 1  # We decode one sentence at a time.
    batches = ['the quick brown fox']
    test_sets = []
    batch_encs = list(map(lambda x: list(map(lambda y: char2id(y), list(x))), batches))
    # batch_decs = map(lambda x: rev_id(x), batches)
    test_sets.append((batch_encs[0], []))
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch([test_sets], 0)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    print('## : ', outputs)
    # If there is an EOS symbol in outputs, cut them at that point.
    if char2id('!') in outputs:
        outputs = outputs[:outputs.index(char2id('!'))]

    print(batches[0], ' -> ', ''.join(map(lambda x: id2char(x), outputs)))
    
# show how much time elapsed
m, s = divmod(time.time()-t0, 60)
h, m = divmod(m, 60)
print("%d:%02d:%02d" % (h, m, s), 'elapsed time')    

x= ons anarchists advo
y= sno stsihcrana ovda
global step 100 learning rate 1.0000 perplexity 16.82


KeyboardInterrupt: 