Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train an LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` from `url` unless it already exists, then verify its size.

  Args:
    filename: name of the file, relative to both the working directory and
      the module-level `url` prefix.
    expected_bytes: exact size in bytes the file must have.

  Returns:
    The path of the verified file.

  Raises:
    Exception: if the size on disk differs from `expected_bytes`.
  """
  already_present = os.path.exists(filename)
  if not already_present:
    filename, _ = urlretrieve(url + filename, filename)
  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    # Show the size that was actually found to help diagnose the mismatch.
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  print('Found and verified %s' % filename)
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  """Return the first member of the zip archive `filename` as a string."""
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    return tf.compat.as_str(archive.read(first_member))
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


Create a small validation set.

In [4]:
# Hold out the first `valid_size` characters for validation and train on the
# remainder of the text.
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
# Sanity check: print each split's size and its first 64 characters.
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  """Map a single character to its vocabulary ID.

  Args:
    char: a one-character string.

  Returns:
    1..26 for 'a'..'z' and 0 for a space. Anything else is reported on stdout
    and also mapped to 0.
  """
  # `in` on a string is a substring test: '' and multi-character runs such as
  # 'ab' would pass it and then crash in ord(), so require exactly one character.
  if len(char) == 1 and char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  """Inverse of char2id: positive IDs map to letters, everything else to ' '."""
  return chr(dictid + first_letter - 1) if dictid > 0 else ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


Function to generate a training batch for the LSTM model.

In [6]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
  """Produces unrolled sequences of one-hot character batches from a text.

  The text is cut into `batch_size` equal segments and one cursor walks each
  segment, wrapping around at the end of the text. Every call to `next()`
  returns `num_unrollings + 1` batches, the first of which repeats the last
  batch of the previous call.
  """

  def __init__(self, text, batch_size, num_unrollings):
    """Set up the per-row cursors and pre-compute the first batch.

    Args:
      text: the string to read characters from.
      batch_size: number of parallel cursors (rows per batch).
      num_unrollings: number of new batches produced by each `next()` call.
    """
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    # Use the builtin `float`: the `np.float` alias was deprecated in NumPy 1.20
    # and removed in 1.24, and it always meant the builtin float anyway.
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      # Advance this row's cursor, wrapping around at the end of the text.
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Decode each row of a one-hot or probability matrix to the character
  whose entry is largest."""
  most_likely_ids = np.argmax(probabilities, 1)
  return list(map(id2char, most_likely_ids))

def batches2string(batches):
  """Decode a sequence of batches into one string per batch row, appending the
  most likely character of every batch in order."""
  rows = [''] * batches[0].shape[0]
  for batch in batches:
    rows = [prefix + char for prefix, char in zip(rows, characters(batch))]
  return rows

# Training generator: batch_size parallel cursors, num_unrollings new batches
# per call.
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
# Validation generator: a single cursor producing one new batch per call.
valid_batches = BatchGenerator(valid_text, 1, 1)

# Each call starts with the last batch of the previous call, so consecutive
# printed strings overlap by one character per row.
print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [7]:
def logprob(predictions, labels):
  """Mean negative log-probability of the true labels in a predicted batch.

  Args:
    predictions: array of predicted probabilities, one row per example.
    labels: one-hot array of the same shape as `predictions`.

  Returns:
    The sum of -log(p) over the true-label entries divided by the number of
    rows in `labels`.
  """
  # Floor the probabilities at 1e-10 to avoid log(0). Work on a clamped copy so
  # the caller's array is not modified as a side effect.
  clipped = np.maximum(predictions, 1e-10)
  return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

def sample_distribution(distribution):
  """Draw an index from `distribution`, assumed to be a sequence of normalized
  probabilities, by walking its running sum until it reaches a uniform draw.
  """
  threshold = random.uniform(0, 1)
  cumulative = 0
  for index, mass in enumerate(distribution):
    cumulative += mass
    if cumulative >= threshold:
      return index
  # The running sum never reached the draw: fall back to the last index.
  return len(distribution) - 1

def sample(prediction):
  """Sample one character from `prediction[0]` and return it one-hot encoded.

  Args:
    prediction: array whose first row is a probability distribution over the
      vocabulary.

  Returns:
    A [1, vocabulary_size] array with a single 1.0 at the sampled ID.
  """
  # Builtin `float` replaces the `np.float` alias removed in NumPy 1.24.
  p = np.zeros(shape=[1, vocabulary_size], dtype=float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Return a random [1, vocabulary_size] row of probabilities summing to 1."""
  weights = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return weights / weights.sum(axis=1, keepdims=True)

Simple LSTM Model. Suggested reading: http://deeplearning.net/tutorial/lstm.html

In [118]:
# Width of the LSTM output/state vectors (second dimension of every gate matrix).
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # NOTE(review): tf.truncated_normal takes (shape, mean, stddev) positionally,
  # so these calls use mean -0.1 and stddev 0.1 rather than a [-0.1, 0.1]
  # range -- confirm this is intended.
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings (non-trainable: they are only
  # written by explicit assign ops).
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Build one LSTM step. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf

    The gates depend only on the current input `i` and the previous output `o`;
    the connections from the previous state to the gates are omitted.

    Returns:
      A (new_output, new_state) pair of tensors.
    """
    def preactivation(input_weights, output_weights, bias):
      # Shared affine form used by the three gates and the candidate update.
      return tf.matmul(i, input_weights) + tf.matmul(o, output_weights) + bias

    input_gate = tf.sigmoid(preactivation(ix, im, ib))
    forget_gate = tf.sigmoid(preactivation(fx, fm, fb))
    candidate = preactivation(cx, cm, cb)
    new_state = forget_gate * state + input_gate * tf.tanh(candidate)
    output_gate = tf.sigmoid(preactivation(ox, om, ob))
    return output_gate * tf.tanh(new_state), new_state

  # Input data: num_unrollings + 1 one-hot batches per training step.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop, starting from the output/state saved by the previous run.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings: the control dependency makes the assigns
  # run whenever the logits/loss below are evaluated.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier: all unrolled outputs are stacked along axis 0 and scored at once.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer: SGD whose learning rate starts at 10.0 and is multiplied by 0.1
  # every 5000 steps (staircase); gradients are clipped to a global norm of 1.25.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # From here on `optimizer` names the training op, not the optimizer object.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  # Op that zeroes the sampling output/state before a new sequence.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  # Evaluating sample_prediction also stores the new output/state, so repeated
  # evaluations step through a sequence one character at a time.
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [9]:
# Number of training steps, and how often (in steps) to report progress.
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    # Feed the num_unrollings + 1 batches into the matching placeholders.
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      # At step 0 only one loss value has been accumulated, so it is reported as is.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      # Labels are every batch but the first, matching train_labels in the graph.
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples: 5 sequences of 80 characters, each seeded with
        # a random first character and a freshly reset sampling state.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        # b[0] is the input character and b[1] the character to predict.
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.295701 learning rate: 10.000000
Minibatch perplexity: 27.00
kq rxte kv rnie oovelr   twqoidqgdkhxnlytflpl oapxlrpetdsny ripqotunhcmhagyjcdtm
jhnshsz hdkdbdr ehbcer vtuninp a eafbemiyxfoqmtbltiafewxgzlhz  ho rpfrrxzpidebgf
imxlrt fr nwpdsijsestnc iygurhsmpuligiikezlfmtpxcoe v egblrhcetnd el vrmbd axzmp
laaeblzuxyoqwx ecslohyfzxcwxvmsydvo nuthjfe enxvwcx xeauj kpsaalhc hjlacxbuoygsj
fhtc gnibumaoqtanqtmsftjzkfcneksshysmofwyene a absuhaeieontc kwyoflsrtst jczeka 
Validation set perplexity: 20.29
Average loss at step 100: 2.626326 learning rate: 10.000000
Minibatch perplexity: 10.86
Validation set perplexity: 10.37
Average loss at step 200: 2.260471 learning rate: 10.000000
Minibatch perplexity: 8.51
Validation set perplexity: 8.59
Average loss at step 300: 2.108363 learning rate: 10.000000
Minibatch perplexity: 7.42
Validation set perplexity: 8.04
Average loss at step 400: 2.009782 learning rate: 10.000000
Minibatch perplexity: 7.71
Validation set per

Validation set perplexity: 4.30
Average loss at step 4500: 1.609504 learning rate: 10.000000
Minibatch perplexity: 5.19
Validation set perplexity: 4.47
Average loss at step 4600: 1.610618 learning rate: 10.000000
Minibatch perplexity: 4.83
Validation set perplexity: 4.49
Average loss at step 4700: 1.620716 learning rate: 10.000000
Minibatch perplexity: 5.26
Validation set perplexity: 4.44
Average loss at step 4800: 1.627136 learning rate: 10.000000
Minibatch perplexity: 4.31
Validation set perplexity: 4.54
Average loss at step 4900: 1.630665 learning rate: 10.000000
Minibatch perplexity: 5.15
Validation set perplexity: 4.60
Average loss at step 5000: 1.600590 learning rate: 1.000000
Minibatch perplexity: 4.58
chish scaped cantreb to ungerent feered hiseign mave is durenet to the says on t
one culpurs hild that nah arac or ignode from the a murgence wergied with a pref
x a estlished he a beat seven zero bake musives is one nine seven to one nine en
asine boked were these s provide doc a

---
Problem 1
---------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

---

In [64]:
# Width of the LSTM output/state vectors (second dimension of every gate matrix).
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # NOTE(review): tf.truncated_normal takes (shape, mean, stddev) positionally,
  # so these calls use mean -0.1 and stddev 0.1 rather than a [-0.1, 0.1]
  # range -- confirm this is intended.
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Concatenate parameters along axis 1 in the order input, forget, update,
  # output, so that one matmul per operand yields all four pre-activations.
  # lstm_cell splits the result back in this same order.
  sx = tf.concat(axis=1, values=[ix, fx, cx, ox])
  sm = tf.concat(axis=1, values=[im, fm, cm, om])
  sb = tf.concat(axis=1, values=[ib, fb, cb, ob])
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Build one LSTM step. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf

    All four pre-activations come from one matmul with the input and one with
    the previous output, using the concatenated parameters sx, sm and sb. The
    connections from the previous state to the gates are omitted.

    Returns:
      A (new_output, new_state) pair of tensors.
    """
    fused = tf.matmul(i, sx) + tf.matmul(o, sm) + sb
    # Slices come back in concatenation order: input, forget, update, output.
    input_pre, forget_pre, candidate, output_pre = tf.split(
      value=fused, num_or_size_splits=4, axis=1)
    input_gate = tf.sigmoid(input_pre)
    forget_gate = tf.sigmoid(forget_pre)
    output_gate = tf.sigmoid(output_pre)
    new_state = forget_gate * state + input_gate * tf.tanh(candidate)
    return output_gate * tf.tanh(new_state), new_state

  # Input data: num_unrollings + 1 one-hot batches per training step.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop, starting from the output/state saved by the previous run.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings: the control dependency makes the assigns
  # run whenever the logits/loss below are evaluated.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier: all unrolled outputs are stacked along axis 0 and scored at once.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer: SGD whose learning rate starts at 10.0 and is multiplied by 0.1
  # every 5000 steps (staircase); gradients are clipped to a global norm of 1.25.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # From here on `optimizer` names the training op, not the optimizer object.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  # Op that zeroes the sampling output/state before a new sequence.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  # Evaluating sample_prediction also stores the new output/state, so repeated
  # evaluations step through a sequence one character at a time.
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [19]:
# This cell repeats the character-model training loop unchanged; it trains
# whichever `graph` (and train_data/optimizer/... ops) was built most recently.
# Number of training steps, and how often (in steps) to report progress.
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    # Feed the num_unrollings + 1 batches into the matching placeholders.
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      # At step 0 only one loss value has been accumulated, so it is reported as is.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      # Labels are every batch but the first, matching train_labels in the graph.
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples: 5 sequences of 80 characters, each seeded with
        # a random first character and a freshly reset sampling state.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        # b[0] is the input character and b[1] the character to predict.
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.294875 learning rate: 10.000000
Minibatch perplexity: 26.97
nneshhurvazae  h evbvsoebtbru pwweisual teyt  lgzjhi riqjtnrdekqs fawhirekrggout
yitsjnsqbn mrejwosrmjw ve  ovlxuse zconoag b exntyr mppj brinzeha lymalygtt  icf
o mf wg anjlydbtdo sebsvvob   oiecx izgtyuvk  esegnfbs ie ett wstleipvvkdsyagotd
eedoeivozus esqt  ipxod grilprt oygttn vcjeo  x e cawxwhbbax fkahxhymrbt r ctvhw
qnlf rn tntegkfx m uahmcga  spj  icvaint tedkcbfarg dy n avpmnbciutet p oz i  db
Validation set perplexity: 20.07
Average loss at step 100: 2.586555 learning rate: 10.000000
Minibatch perplexity: 10.76
Validation set perplexity: 10.62
Average loss at step 200: 2.245113 learning rate: 10.000000
Minibatch perplexity: 8.63
Validation set perplexity: 8.91
Average loss at step 300: 2.081888 learning rate: 10.000000
Minibatch perplexity: 6.49
Validation set perplexity: 8.07
Average loss at step 400: 2.034716 learning rate: 10.000000
Minibatch perplexity: 7.80
Validation set per

Validation set perplexity: 4.89
Average loss at step 4500: 1.639417 learning rate: 10.000000
Minibatch perplexity: 5.14
Validation set perplexity: 4.98
Average loss at step 4600: 1.622137 learning rate: 10.000000
Minibatch perplexity: 5.60
Validation set perplexity: 4.95
Average loss at step 4700: 1.618619 learning rate: 10.000000
Minibatch perplexity: 4.71
Validation set perplexity: 4.90
Average loss at step 4800: 1.604401 learning rate: 10.000000
Minibatch perplexity: 4.79
Validation set perplexity: 4.92
Average loss at step 4900: 1.618854 learning rate: 10.000000
Minibatch perplexity: 5.29
Validation set perplexity: 4.78
Average loss at step 5000: 1.613144 learning rate: 1.000000
Minibatch perplexity: 4.85
 selved they to teine pica at the contands the coinate to devide the ratherse lo
ing towar munder in the porkenst numper in thbse or his prizidel china docuted e
ver to naviakly weather war been goor woment i as wristings ampacade four buttin
ter of navalusocian effectros are as a

---
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).

---

Utility functions to map bigrams to IDs and back.

A bigram ID is computed as `char2id(first_char) * vocabulary_size + char2id(second_char)`.

In [183]:
def bigram2id(bigram):
  """Map a (first_char, second_char) pair to its bigram ID.

  The ID is char2id(first_char) * vocabulary_size + char2id(second_char). A pair
  containing anything other than a lowercase letter or a space is reported on
  stdout and mapped to 0.
  """
  def is_known(char):
    return char in string.ascii_lowercase or char == ' '

  first_char, second_char = bigram
  if not (is_known(first_char) and is_known(second_char)):
    print('Unexpected bigram: ({}, {})'.format(first_char, second_char))
    return 0
  return char2id(first_char) * vocabulary_size + char2id(second_char)
  
def id2bigram(dictid):
  """Inverse of bigram2id: return the (first_char, second_char) pair for an ID.

  Non-positive IDs map to a pair of spaces.
  """
  if not dictid > 0:
    return ' ', ' '
  first_id, second_id = divmod(dictid, vocabulary_size)
  return id2char(first_id), id2char(second_id)

print(bigram2id(('a', 'b')), bigram2id(('a', 'z')), bigram2id((' ', ' ')), bigram2id(('a', 'ï')))
print(id2bigram(1), id2bigram(27), id2bigram(0))

Unexpected bigram: (a, ï)
29 53 0 0
(' ', 'a') ('a', ' ') (' ', ' ')


Try bigram-to-bigram LSTM with embedding lookup and dropout

In [195]:
def bigram_sample(prediction):
  """Sample one bigram from `prediction[0]` and return it one-hot encoded.

  Args:
    prediction: array whose first row is a probability distribution over all
      bigram IDs.

  Returns:
    A [1, bigram_size] array with a single 1.0 at the sampled bigram ID.
  """
  # Builtin `float` replaces the `np.float` alias removed in NumPy 1.24.
  p = np.zeros(shape=[1, bigram_size], dtype=float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def bigram_random_distribution():
  """Return a random [1, bigram_size] row of probabilities summing to 1."""
  weights = np.random.uniform(0.0, 1.0, size=[1, bigram_size])
  return weights / weights.sum(axis=1, keepdims=True)

def bigram_characters(probabilities):
  """Decode each row of a one-hot or probability matrix over bigram IDs into
  the string '(first,second)' for its most likely bigram."""
  decoded = []
  for bigram_id in np.argmax(probabilities, 1):
    first_id, second_id = divmod(bigram_id, vocabulary_size)
    decoded.append('({0},{1})'.format(id2char(first_id), id2char(second_id)))
  return decoded

def bigram_first_characters(probabilities):
  """Decode each row of a one-hot or probability matrix over bigram IDs into
  the first character of its most likely bigram."""
  best_ids = np.argmax(probabilities, 1)
  return [id2char(best // vocabulary_size) for best in best_ids]

def bigram_batches2string(batches):
  """Convert a sequence of bigram-ID batches into one string per batch row.

  Args:
    batches: sequence of batches, each a sequence of integer bigram IDs with
      one entry per row.

  Returns:
    A list with one string per row, made of the '(first,second)' rendering of
    that row's bigram in every batch, in order.
  """
  s = [''] * len(batches[0])
  for batch in batches:
    # Decode the IDs directly. Building a (rows x bigram_size) one-hot matrix
    # only to argmax it back to the same IDs is wasted work, and it relied on
    # the `np.float` alias that NumPy 1.24 removed.
    pieces = ['({0},{1})'.format(id2char(bigram_id // vocabulary_size),
                                 id2char(bigram_id % vocabulary_size))
              for bigram_id in batch]
    s = [prefix + piece for prefix, piece in zip(s, pieces)]
  return s

In [196]:
batch_size=64
num_unrollings=10
bigram_size = vocabulary_size * vocabulary_size

class BigramBatchGenerator(object):
  """Produces unrolled sequences of bigram-ID batches from a text.

  Works like the character generator, except that each batch is a plain list
  of integer bigram IDs, one per row, built from the character under the cursor
  and the character after it.
  """

  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    # Spread the cursors evenly: one per equally sized segment of the text.
    segment = self._text_size // batch_size
    self._cursor = [segment * row for row in range(batch_size)]
    self._last_batch = self._next_batch()

  def _next_batch(self):
    """Return the bigram ID under every cursor, then advance each cursor by one."""
    ids = []
    for row in range(self._batch_size):
      position = self._cursor[row]
      lead = self._text[position]
      # The very last character of the text has no successor; pair it with ' '.
      if position + 1 == self._text_size:
        trail = ' '
      else:
        trail = self._text[position + 1]
      ids.append(char2id(lead) * vocabulary_size + char2id(trail))
      self._cursor[row] = (position + 1) % self._text_size
    return ids

  def next(self):
    """Return the last batch of the previous call followed by num_unrollings
    newly generated batches.
    """
    window = [self._last_batch]
    for _ in range(self._num_unrollings):
      window.append(self._next_batch())
    self._last_batch = window[-1]
    return window

# Bigram generators mirroring the character-level ones: batch_size cursors for
# training, a single cursor producing one new batch per call for validation.
bigram_train_batches = BigramBatchGenerator(train_text, batch_size, num_unrollings)
bigram_valid_batches = BigramBatchGenerator(valid_text, 1, 1)

# output each bigram instead of a single char
print(bigram_batches2string(bigram_train_batches.next()))
print(bigram_batches2string(bigram_train_batches.next()))
print(bigram_batches2string(bigram_valid_batches.next()))
print(bigram_batches2string(bigram_valid_batches.next()))

['(o,n)(n,s)(s, )( ,a)(a,n)(n,a)(a,r)(r,c)(c,h)(h,i)(i,s)', '(w,h)(h,e)(e,n)(n, )( ,m)(m,i)(i,l)(l,i)(i,t)(t,a)(a,r)', '(l,l)(l,e)(e,r)(r,i)(i,a)(a, )( ,a)(a,r)(r,c)(c,h)(h,e)', '( ,a)(a,b)(b,b)(b,e)(e,y)(y,s)(s, )( ,a)(a,n)(n,d)(d, )', '(m,a)(a,r)(r,r)(r,i)(i,e)(e,d)(d, )( ,u)(u,r)(r,r)(r,a)', '(h,e)(e,l)(l, )( ,a)(a,n)(n,d)(d, )( ,r)(r,i)(i,c)(c,h)', '(y, )( ,a)(a,n)(n,d)(d, )( ,l)(l,i)(i,t)(t,u)(u,r)(r,g)', '(a,y)(y, )( ,o)(o,p)(p,e)(e,n)(n,e)(e,d)(d, )( ,f)(f,o)', '(t,i)(i,o)(o,n)(n, )( ,f)(f,r)(r,o)(o,m)(m, )( ,t)(t,h)', '(m,i)(i,g)(g,r)(r,a)(a,t)(t,i)(i,o)(o,n)(n, )( ,t)(t,o)', '(n,e)(e,w)(w, )( ,y)(y,o)(o,r)(r,k)(k, )( ,o)(o,t)(t,h)', '(h,e)(e, )( ,b)(b,o)(o,e)(e,i)(i,n)(n,g)(g, )( ,s)(s,e)', '(e, )( ,l)(l,i)(i,s)(s,t)(t,e)(e,d)(d, )( ,w)(w,i)(i,t)', '(e,b)(b,e)(e,r)(r, )( ,h)(h,a)(a,s)(s, )( ,p)(p,r)(r,o)', '(o, )( ,b)(b,e)(e, )( ,m)(m,a)(a,d)(d,e)(e, )( ,t)(t,o)', '(y,e)(e,r)(r, )( ,w)(w,h)(h,o)(o, )( ,r)(r,e)(e,c)(c,e)', '(o,r)(r,e)(e, )( ,s)(s,i)(i,g)(g,n)(n,i)(i,f)(f,i)(i,c

In [197]:
# Width of the LSTM output/state vectors.
num_nodes = 64
# Length of the dense vector looked up for each bigram ID.
embedding_size = 128
# Value passed to tf.nn.dropout for the embedded training inputs.
keep_prob = 0.8

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # One embedding row per bigram ID, initialised uniformly in [-1, 1).
  vocabulary_embeddings = tf.Variable(
    tf.random_uniform([bigram_size, embedding_size], -1.0, 1.0))
  # NOTE(review): tf.truncated_normal takes (shape, mean, stddev) positionally,
  # so the calls below use mean -0.1 and stddev 0.1 rather than a [-0.1, 0.1]
  # range -- confirm this is intended.
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.
  cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Concatenate parameters along axis 1 in the order input, forget, update,
  # output; lstm_cell splits the fused result back in this same order.
  sx = tf.concat(axis=1, values=[ix, fx, cx, ox])
  sm = tf.concat(axis=1, values=[im, fm, cm, om])
  sb = tf.concat(axis=1, values=[ib, fb, cb, ob])
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases: one logit per bigram ID.
  w = tf.Variable(tf.truncated_normal([num_nodes, bigram_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([bigram_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Build one LSTM step. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf

    All four pre-activations come from one matmul with the input and one with
    the previous output, using the concatenated parameters sx, sm and sb. The
    connections from the previous state to the gates are omitted.

    Returns:
      A (new_output, new_state) pair of tensors.
    """
    fused = tf.matmul(i, sx) + tf.matmul(o, sm) + sb
    # Slices come back in concatenation order: input, forget, update, output.
    input_pre, forget_pre, candidate, output_pre = tf.split(
      value=fused, num_or_size_splits=4, axis=1)
    input_gate = tf.sigmoid(input_pre)
    forget_gate = tf.sigmoid(forget_pre)
    output_gate = tf.sigmoid(output_pre)
    new_state = forget_gate * state + input_gate * tf.tanh(candidate)
    return output_gate * tf.tanh(new_state), new_state

  # Input data: num_unrollings + 1 batches of integer bigram IDs (not one-hot).
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.int64, shape=[batch_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    # Replace each bigram ID by its embedding vector.
    i_embed = tf.nn.embedding_lookup(vocabulary_embeddings, i)
    # add dropout in the input
    i_drop = tf.nn.dropout(i_embed, keep_prob)
    output, state = lstm_cell(i_drop, output, state)
    outputs.append(output)

  # State saving across unrollings: the control dependency makes the assigns
  # run whenever the logits/loss below are evaluated.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    # The sparse variant takes the integer bigram IDs directly as labels.
    loss = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=tf.concat(train_labels, 0)))

  # Optimizer: SGD whose learning rate starts at 10.0 and is multiplied by 0.1
  # every 5000 steps (staircase); gradients are clipped to a global norm of 1.25.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # From here on `optimizer` names the training op, not the optimizer object.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.int32, shape=[1])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  # Op that zeroes the sampling output/state before a new sequence.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  # The sampling path feeds the raw embedding to the cell: no dropout here.
  sample_output, sample_state = lstm_cell(
    tf.nn.embedding_lookup(vocabulary_embeddings, sample_input), saved_sample_output, saved_sample_state)
  # Evaluating sample_prediction also stores the new output/state, so repeated
  # evaluations step through a sequence one bigram at a time.
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [198]:
num_steps = 14001  # number of training minibatches to run
summary_frequency = 700  # print a progress summary every this many steps

def exec_graph_bigram_embed(graph):
  """Train the model held in `graph`, printing progress along the way.

  Every `summary_frequency` steps this prints the average training loss, the
  current learning rate, the perplexity of the current minibatch and the
  perplexity measured over `valid_size` validation batches. Every
  `10 * summary_frequency` steps it additionally prints five generated
  samples of 80 bigrams each.

  Args:
    graph: the graph to open the session on. The ops and placeholders that
      are run (`train_data`, `optimizer`, `loss`, `sample_prediction`, ...)
      are read from the notebook's global namespace.
  """
  with tf.Session(graph=graph) as session:
    # tf.initialize_all_variables() is deprecated; this is the replacement
    # named by the deprecation warning.
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
      batches = bigram_train_batches.next()
      # One placeholder per unrolling, plus one extra for the final label.
      feed_dict = dict()
      for i in range(num_unrollings + 1):
        feed_dict[train_data[i]] = batches[i]
      _, l, predictions, lr = session.run(
        [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
      mean_loss += l
      if step % summary_frequency == 0:
        if step > 0:
          mean_loss = mean_loss / summary_frequency
        # The mean loss is an estimate of the loss over the last few batches.
        print(
          'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
        mean_loss = 0
        labels = np.concatenate(list(batches)[1:])
        # logprob() is given one-hot rows, so expand the label ids to the
        # same shape as the predictions.
        noembed_labels = np.zeros(predictions.shape)
        for row, label_id in enumerate(labels):
          noembed_labels[row, label_id] = 1.0
        print('Minibatch perplexity: %.2f' % float(
          np.exp(logprob(predictions, noembed_labels))))
        if step % (summary_frequency * 10) == 0:
          # Generate some samples.
          print('=' * 80)
          for _ in range(5):
            feed = bigram_sample(bigram_random_distribution())
            bigram_sentence = bigram_characters(feed)[0]
            sentence = bigram_first_characters(feed)[0]
            # sample_input is fed an id, so reduce the one-hot sample to
            # the index of its hot entry.
            feed = [np.argmax(feed)]
            reset_sample_state.run()
            for _ in range(79):
              prediction = sample_prediction.eval({sample_input: feed})
              feed = bigram_sample(prediction)
              bigram_sentence += bigram_characters(feed)[0]
              sentence += bigram_first_characters(feed)[0]
              feed = [np.argmax(feed)]
            print('bigrams:', bigram_sentence)
            print('chars:', sentence)
          print('=' * 80)
        # Measure validation set perplexity.
        reset_sample_state.run()
        valid_logprob = 0
        for _ in range(valid_size):
          b = bigram_valid_batches.next()
          predictions = sample_prediction.eval({sample_input: b[0]})
          # One-hot encode the expected bigram for logprob().
          labels = np.zeros((1, bigram_size))
          labels[0, b[1]] = 1.0
          valid_logprob = valid_logprob + logprob(predictions, labels)
        print('Validation set perplexity: %.2f' % float(np.exp(
          valid_logprob / valid_size)))

In [199]:
%time exec_graph_bigram_embed(graph)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Average loss at step 0: 6.606322 learning rate: 10.000000
Minibatch perplexity: 739.76
bigrams: ( ,d)(q,m)(t,r)(d,j)(x,j)(o,j)(g,g)(q,i)(i,s)(t,e)(v,p)(c,z)(m,x)(r,x)(z,t)( ,u)(w,b)(h,n)(b,f)(w,x)(z,q)(i,y)(a,t)(z,f)(x,j)(a,e)(i,p)(i,w)(b,c)(b,f)(i,g)(f,l)(c,s)(n,z)(a, )(n,d)(q,p)(o,k)(e,s)(p,o)(a,e)(m,f)(w,q)(d,d)(z,u)(d,v)(g,f)(n,f)(g,t)(b,v)(f,h)(c,n)(x,f)(z,z)(d,u)(c,e)(y,w)(m,v)(p,u)(h,t)(d,o)(l,m)(y,a)(t,v)(e,r)(e,r)(f,f)(w,s)(h,d)(t,f)(f,m)(k,p)(y,d)(m,q)(b,m)(y,g)(b,q)(p,q)(p,b)(i,r)
chars:  qtdxogqitvcmrz whbwziazxaiibbifcnanqoepamwdzdgngbfcxzdcymphdlyteefwhtfkymbybppi
bigrams: (c,y)(i,n)(d,i)(a,b)(e,i)(n,m)(a,v)(q,e)(h,p)(c,s)(o,w)(u,l)(i,u)(k,h)(j,r)(o,x)(h,w)(g,h)( ,l)(g,j)(w,u)(c,g)(p,b)(j,m)(l,g)(k,s)(r,b)(y,t)(u,v)(k,r)(z,d)(m,g)(j,y)(e,l)(a,b)(n,u)(p,o)(k,t)(i,o)(r,w)(r,q)(l,o)(s,g)(k,x)(k,e)(m,i)(z,t)(v,y)(e,s)(r,f)(v,r)(e,x)(b,g)(a,d)(x,t)(k,u)( ,b)(l,k)(d,n)(r,r)(y,t)(g,y)(s,f)(j,t)

Validation set perplexity: 3.82
CPU times: user 22min 55s, sys: 2min 34s, total: 25min 30s
Wall time: 5min 10s


---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

The `seq2seq_model` module is not part of the modules that ship with TensorFlow, so it has to be set up by hand:
1. Download it from https://github.com/tensorflow/models
2. Add the tutorial's directory to the Python path (the cell below assumes the `models` folder was copied into the installed `tensorflow` package directory)
3. Import it

In [6]:
# Make the seq2seq tutorial code importable. The path is built from the
# installed tensorflow package directory, so it only resolves if the `models`
# repository was placed inside that directory.
import sys
sys.path.append(tf.__path__[0] + '/models/tutorials/rnn/translate')

In [7]:
from translate import seq2seq_model

Sample text: we will try to reverse every word in it.

In [8]:
text = "the quick brown fox jumps over the lazy dog is an english sentence that can be translated to the following french one le vif renard brun saute par dessus le chien paresseux here is an extremely long french word anticonstitutionnellement"

def longest_word_size(text):
    """Return the length of the longest whitespace-separated word in `text`."""
    return max(len(word) for word in text.split())

# The longest word fixes the sequence length used by the model and batches.
word_size = longest_word_size(text)
print(word_size)

25


In [9]:
import string

num_nodes = 64
batch_size = 10

def create_model():
    """Build the seq2seq model used to reverse words.

    A single bucket is used: encoder sequences of `word_size + 1` symbols and
    decoder sequences of `word_size + 2` symbols.
    """
    single_bucket = (word_size + 1, word_size + 2)
    settings = dict(
        source_vocab_size=vocabulary_size,
        target_vocab_size=vocabulary_size,
        buckets=[single_bucket],  # only 1 bucket
        size=num_nodes,
        num_layers=3,
        max_gradient_norm=5.0,
        batch_size=batch_size,
        learning_rate=0.5,
        learning_rate_decay_factor=0.99,
        use_lstm=True,
        forward_only=False,
    )
    return seq2seq_model.Seq2SeqModel(**settings)

In [10]:
def get_batch():
    """Generate one random training batch of words and their reversals.

    Each example is a random word of 1 to `word_size` non-zero symbols, padded
    with zeros. The decoder input holds a leading 0, the word reversed, then
    zeros; the weights are 1.0 over the leading 0 and the reversed word and
    0.0 afterwards.

    Returns:
        A tuple (encoder_inputs, decoder_inputs, weights), each transposed so
        that the first axis is the position and the second the example.
    """
    encoder_len = word_size + 1
    decoder_len = word_size + 2
    sources, targets, masks = [], [], []
    for _ in range(batch_size):
        source = np.random.randint(1, vocabulary_size, encoder_len)
        target = np.zeros(decoder_len, dtype=np.int32)
        mask = np.ones(decoder_len, dtype=np.float32)
        length = random.randint(1, word_size)
        # leave at least a 0 at the end
        source[length:] = 0
        # one 0 at the beginning of the reversed word, one 0 at the end
        target[1:length + 1] = source[:length][::-1]
        mask[length + 1:] = 0.0
        sources.append(source)
        targets.append(target)
        masks.append(mask)
    return np.transpose(sources), np.transpose(targets), np.transpose(masks)

In [11]:
def strip_zeros(word):
    """Drop leading and trailing spaces from a decoded word.

    Per the original note, 0 is the code for space in char2id(), so padding
    zeros decode to spaces.
    """
    return word.lstrip(' ').rstrip(' ')

def evaluate_model(model, sess, words, encoder_inputs):
    """Decode one batch with `model` and count the correctly reversed words.

    The model is stepped `word_size + 1` times in forward-only mode. After
    step i, row i of the decoder inputs is set to the arg-max symbol of
    `output_logits[i]`; once an example has produced symbol 0 it is marked
    finished and only zeros are written for it. The resulting matrix and a
    per-word OK/KO line are printed.

    Args:
        model: object exposing `step(sess, encoder_inputs, decoder_inputs,
            target_weights, bucket_id=..., forward_only=...)`.
        sess: session handed through to `model.step`.
        words: the `batch_size` source words, used for the comparison.
        encoder_inputs: encoder batch handed through to `model.step`.

    Returns:
        The number of words whose decoded output equals the reversed word.
    """
    decoder_len = word_size + 2
    decoder_inputs = np.zeros((decoder_len, batch_size), dtype=np.int32)
    target_weights = np.zeros((decoder_len, batch_size), dtype=np.float32)
    target_weights[0, :] = 1.0
    is_finished = np.full(batch_size, False, dtype=np.bool_)
    # Always runs every position; there is no early exit when all are finished.
    for pos in range(word_size + 1):
        step_result = model.step(sess, encoder_inputs, decoder_inputs, target_weights,
                                 bucket_id=0, forward_only=True)
        output_logits = step_result[2]
        best = np.argmax(output_logits[pos], axis=1)
        is_finished = is_finished | (best == 0)
        # Finished examples get symbol 0 and weight 0.
        decoder_inputs[pos, :] = np.where(is_finished, 0, best)
        target_weights[pos, :] = np.where(is_finished, 0.0, 1.0)
    print(decoder_inputs)

    correct = 0
    for idx, symbol_ids in enumerate(np.transpose(decoder_inputs)):
        source_word = words[idx]
        reversed_word = ''.join(reversed(source_word))
        output_word = strip_zeros(''.join([id2char(symbol) for symbol in symbol_ids]))
        is_match = reversed_word == output_word
        verdict = 'OK' if is_match else 'KO'
        print(source_word, '(reversed: {0})'.format(reversed_word),
              '->', output_word, '({0})'.format(verdict))
        if is_match:
            correct += 1
    return correct

In [12]:
def get_validation_batch(words):
    """Encode `words` as a zero-padded encoder batch.

    Each word fills one of `batch_size` rows of length `word_size + 1` with
    the ids returned by `char2id`; unused cells stay 0.

    Returns:
        The batch transposed to shape (word_size + 1, batch_size).
    """
    grid = np.zeros((batch_size, word_size + 1), dtype=np.int32)
    for row, word in enumerate(words):
        for col, char in enumerate(word):
            grid[row, col] = char2id(char)
    return np.transpose(grid)

def validate_model(text, model, sess):
    """Evaluate `model` on the words of `text` and print the accuracy.

    The words are processed in full batches of `batch_size`; words left over
    in a trailing partial batch are not evaluated and not counted.

    Args:
        text: whitespace-separated words to reverse.
        model: model handed through to `evaluate_model`.
        sess: session handed through to `evaluate_model`.
    """
    words = text.split()
    # Floor division keeps the counts integral on Python 3 as well, so the
    # denominator only covers the words that are actually evaluated.
    nb_batches = len(words) // batch_size
    nb_words = nb_batches * batch_size
    correct = 0
    for i in range(nb_batches):
        range_words = words[i * batch_size:(i + 1) * batch_size]
        encoder_inputs = get_validation_batch(range_words)
        correct += evaluate_model(model, sess, range_words, encoder_inputs)
    # Guard against fewer than `batch_size` words (nothing evaluated).
    accuracy = (float(correct) / nb_words) * 100 if nb_words else 0.0
    print('* correct: {0}/{1} -> {2}%'.format(correct, nb_words, accuracy))

In [13]:
def reverse_text(nb_steps):
    """Train a freshly created seq2seq model and validate it on `text`.

    Runs `nb_steps` training steps on random batches from `get_batch()`.
    Whenever `step % 1000 == 1` the current loss is printed and the model is
    validated; a final validation is run after the last step.

    Args:
        nb_steps: number of training steps to run.
    """
    with tf.Session() as session:
        model = create_model()
        # tf.initialize_all_variables() is deprecated; this is the replacement
        # named by the deprecation warning.
        tf.global_variables_initializer().run()
        loss = None  # stays None if nb_steps is 0, so the final print still works
        for step in range(nb_steps):
            enc_inputs, dec_inputs, weights = get_batch()
            _, loss, _ = model.step(session, enc_inputs, dec_inputs, weights, 0, False)
            if step % 1000 == 1:
                print('* step:', step, 'loss:', loss)
                validate_model(text, model, session)
        print('*** evaluation! loss:', loss)
        validate_model(text, model, session)

In [14]:
%time reverse_text(10000)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
* step: 1 loss: 3.29621
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
the (reversed: eht) ->  (KO)
quick (reversed: kciuq) ->  (KO)
brown (reversed: nworb) ->  (KO)
fox (reversed: xof) ->  (KO)
jumps (reversed: spmuj) ->  (KO)
over (reversed: revo) ->  (KO)
the (reversed: eht) ->  (KO)
lazy (reversed: yzal) ->  (KO)
dog (reversed: god) ->  (KO)
is 

* step: 2001 loss: 1.74637
[[ 5 11 14 24 19 18  5 25  7 19]
 [ 8  9 23 15 16 18  8 26 15  9]
 [20  9 15 15 13 22 20  1  4  0]
 [ 0  9 18  0 13 15  0 12  0  0]
 [ 0 17  2  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]]
the (reversed: eht) -> eht (OK)
quick (reversed: kciuq) -> kiiiq (KO)
brown (reve

[[14 24  5 19 14 25  7  8  4 14]
 [ 5 21 18  9  1 12 14  3 18 14]
 [ 9  5  5  0  0  5 15 14 15 14]
 [ 8 19  8  0  0 13 12  5 23 14]
 [ 3 19  0  0  0  5  0 18  0 14]
 [ 0  5  0  0  0 18  0  6  0  5]
 [ 0 18  0  0  0 20  0  0  0 14]
 [ 0  1  0  0  0 24  0  0  0 14]
 [ 0 16  0  0  0  5  0  0  0 14]
 [ 0  0  0  0  0  0  0  0  0  9]
 [ 0  0  0  0  0  0  0  0  0  9]
 [ 0  0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0  0  0  0  0 21]
 [ 0  0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0  0  0  0  0  9]
 [ 0  0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0  0  0  0  0 19]
 [ 0  0  0  0  0  0  0  0  0 14]
 [ 0  0  0  0  0  0  0  0  0 15]
 [ 0  0  0  0  0  0  0  0  0  3]
 [ 0  0  0  0  0  0  0  0  0  9]
 [ 0  0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0  0  0  0  0 14]
 [ 0  0  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]]
chien (reversed: neihc) -> neihc (OK)
paresseux (reversed: xuesserap) -> xuesserap (OK)
here (reversed: ereh

[[ 8  5  5  6  4 14  5 18 19  5]
 [ 3 14 12  9 18 21 20  1 21 12]
 [14 15  0 22  1 18 21 16 19  0]
 [ 5  0  0  0 14  2  1  0 19  0]
 [18  0  0  0  5  0 19  0  5  0]
 [ 6  0  0  0 18  0  0  0  4  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]]
french (reversed: hcnerf) -> hcnerf (OK)
one (reversed: eno) -> eno (OK)
le (reversed: el) -> el (OK)
vif (r

[[14  8  5 20 14  5  4 15  5  7]
 [ 1 19  3  1  1  2  5 20  8 14]
 [ 0  9 14  8  3  0 20  0 20  9]
 [ 0 12  5 20  0  0  1  0  0 23]
 [ 0  7 20  0  0  0 12  0  0 15]
 [ 0 14 14  0  0  0 19  0  0 12]
 [ 0  5  5  0  0  0 14  0  0 12]
 [ 0  0 19  0  0  0  1  0  0 15]
 [ 0  0  0  0  0  0 18  0  0  6]
 [ 0  0  0  0  0  0 20  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]]
an (reversed: na) -> na (OK)
english (reversed: hsilgne) -> hsilgne (OK)
sentence (reversed: ecnetnes) -> ec

* step: 9001 loss: 0.109652
[[ 5 11 14 24 19 18  5 25  7 19]
 [20  3 23 15 16 15 20 26 15  9]
 [20  9 15  6 13 22 20  1  4  0]
 [ 0 21 18  0 21 15  0 12  0  0]
 [ 0 17  2  0 10  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]]
the (reversed: eht) -> ett (KO)
quick (reversed: kciuq) -> kciuq (OK)
brown (rev

[[14 24  5 19 14 25  7  8  4  5]
 [ 5 21 18  9 14 12 14  3 18  5]
 [ 9  5  5  0  0  5 15 14 15  5]
 [ 8 19  8  0  0 13 12  5 23  5]
 [ 3 19  0  0  0  5  0 18  0  5]
 [ 0  5  0  0  0 18  0  6  0  5]
 [ 0 18  0  0  0 20  0  0  0 14]
 [ 0  1  0  0  0 24  0  0  0 14]
 [ 0 16  0  0  0  5  0  0  0 14]
 [ 0  0  0  0  0  0  0  0  0 14]
 [ 0  0  0  0  0  0  0  0  0 21]
 [ 0  0  0  0  0  0  0  0  0 21]
 [ 0  0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0  0  0  0  0 21]
 [ 0  0  0  0  0  0  0  0  0  9]
 [ 0  0  0  0  0  0  0  0  0  9]
 [ 0  0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0  0  0  0  0 14]
 [ 0  0  0  0  0  0  0  0  0 15]
 [ 0  0  0  0  0  0  0  0  0  3]
 [ 0  0  0  0  0  0  0  0  0  9]
 [ 0  0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0  0  0  0  0 14]
 [ 0  0  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]]
chien (reversed: neihc) -> neihc (OK)
paresseux (reversed: xuesserap) -> xuesserap (OK)
here (reversed: ereh