In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` from `url` unless it already exists, then check its size.

  Args:
    filename: local path of the file; also appended to `url` to build the
      download address when the file is missing.
    expected_bytes: exact size in bytes the file on disk must have.

  Returns:
    The path of the verified file.

  Raises:
    Exception: if the size on disk differs from `expected_bytes` (the actual
      size is printed first).
  """
  already_present = os.path.exists(filename)
  if not already_present:
    filename, _ = urlretrieve(url + filename, filename)
  actual_bytes = os.stat(filename).st_size
  # Guard clause: report the size we actually got, then give up.
  if actual_bytes != expected_bytes:
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  print('Found and verified %s' % filename)
  return filename

# Fetch the text8 archive (the download is skipped when the file already exists
# locally); 31344016 is the byte size the file on disk is checked against.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  """Return the contents of the first member of the zip archive `filename`.

  The member's bytes are passed through tf.compat.as_str before being returned.
  """
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    return tf.compat.as_str(archive.read(first_member))
  
# Load the whole corpus into memory as a single string and report its length.
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [4]:
# Hold out the first valid_size characters for validation and train on the rest.
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
# Show the size and the first 64 characters of each split as a sanity check.
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # ' ' + [a-z]
first_letter = ord(string.ascii_lowercase[0]) # code point of 'a'

def char2id(char):
  """Map a character to its vocabulary id: ' ' -> 0, 'a' -> 1, ..., 'z' -> 26.

  Anything else (other characters, the empty string, multi-character strings)
  is reported on stdout and mapped to 0, the same id as the space.
  """
  # `char in string.ascii_lowercase` alone is a substring test, so '' and 'ab'
  # would pass it and then crash in ord(); require exactly one character.
  if len(char) == 1 and char in string.ascii_lowercase:
    return ord(char) - first_letter + 1 # ' '=0, a=1, b=2, ...
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0

def id2char(dictid):
  """Inverse of char2id: ids above 0 map to 'a'..'z', everything else to ' '."""
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

# Smoke test of both directions, including one out-of-vocabulary character.
print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


In [6]:
batch_size=64 # number of parallel text streams; this is axis 0 of every batch array
num_unrollings=10 # new characters produced per stream on each BatchGenerator.next() call

class BatchGenerator(object):
  """Streams one-hot encoded characters from `text` as `batch_size` parallel rows.

  The text is cut into `batch_size` segments of equal length and one cursor
  walks each of them, so row b of every array continues the text of row b of
  the previous array. Cursors wrap around to the start of the text.
  """

  def __init__(self, text, batch_size, num_unrollings):
    """Set up one cursor per row and pre-compute the first batch.

    Args:
      text: the string to read characters from.
      batch_size: number of rows (parallel text streams) per batch array.
      num_unrollings: number of new batch arrays produced per next() call.
    """
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    # Distance between the starting cursors of neighbouring rows. Nothing checks
    # that it exceeds num_unrollings, so rows can overlap on very short texts.
    segment = self._text_size // batch_size
    self._cursor = [offset * segment for offset in range(batch_size)]
    # Prime the generator: next() always starts with the last array it returned.
    self._last_batch = self._next_batch()

  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data.

    Returns:
      A (batch_size, vocabulary_size) float64 array whose row b is the one-hot
      encoding of the character under cursor b. Every cursor is then advanced
      by one character, ready for the following call.
    """
    # np.float64 instead of the np.float alias, which NumPy deprecated in 1.20
    # and removed in 1.24 (it was always plain float64).
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float64)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch

  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.

    Example for one row: 'when milit(a)' is followed by '(a)ry governm'.

    Returns:
      A list of num_unrollings + 1 arrays, each (batch_size, vocabulary_size).
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Decode each row of a 2-D array (1-hot rows or probability distributions
  over the vocabulary) into its most likely character.

  Returns a list with one character per row.
  """
  most_likely_ids = np.argmax(probabilities, 1)
  return list(map(id2char, most_likely_ids))

def batches2string(batches):
  """Decode a sequence of batch arrays into one string per batch row.

  Each array contributes one (most likely) character to every row, so the
  result is a list of batches[0].shape[0] strings, each len(batches) long.
  """
  # Start with one empty string per row of the first array.
  rows = [''] * batches[0].shape[0]
  for batch in batches:
    # characters(batch) yields one character per row; append it to that row.
    rows = [prefix + char for prefix, char in zip(rows, characters(batch))]
  return rows

# Training generator: batch_size rows, num_unrollings new characters per call.
train_batches = BatchGenerator(train_text, batch_size, num_unrollings) 
# Validation generator: a single row advanced one character per call.
valid_batches = BatchGenerator(valid_text, 1, 1)

# Two consecutive calls per generator show that each call starts with the last
# character of the previous one. Note these calls advance the generators' cursors.
print(batches2string(train_batches.next())) 
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [76]:
# Scratch inspection of one training step's `batches` list.
# NOTE(review): no cell above this one assigns `batches`; the only assignment in
# the visible notebook is inside the training loop further down, so this cell
# fails on a fresh top-to-bottom run.
print(len(batches)) # number of arrays in the list
print(batches[0].shape) # shape of a single array
# Drop the first array and stack the rest row-wise to get the label rows.
labels = np.concatenate(list(batches)[1:])
print(len(labels)) # total number of label rows
print(labels[0].shape) # one label row is a single one-hot vector
#print(labels)

11
(64, 27)
640
(27,)


In [82]:
# Scratch inspection of the sampling helpers.
# NOTE(review): random_distribution() and sample() are defined in a cell that
# appears further down, so this cell fails on a fresh top-to-bottom run.
print(random_distribution().shape)
feed = sample(random_distribution()) # one-hot row for a character drawn from a random distribution
print(feed)
print(len(characters(feed))) # characters() returns one character per row of feed

(1, 27)
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]]
1


In [65]:
# Step-by-step trace of what batches2string() does, printing every intermediate.
# NOTE(review): the recorded output below has 6 entries per list while
# batch_size is set to 64 above, so it was produced with a different setting.
# Also note that this call advances train_batches' cursors.
B = train_batches.next()
s = [''] * B[0].shape[0] # one empty string per batch row
for b in B:
    print("listing b")
    print(characters(b)) # one decoded character per row
    print("listing s")
    print(s) # strings accumulated so far
    print("listing zip")
    print(list( zip(s,characters(b)) )) # (string so far, next character) pairs
    s = [''.join(x) for x in zip(s,characters(b))] # append the character to each row's string
    print("listing s again")
    print(s)

listing b
['s', 'v', 'v', ' ', 'e', ' ']
listing s
['', '', '', '', '', '']
listing zip
[('', 's'), ('', 'v'), ('', 'v'), ('', ' '), ('', 'e'), ('', ' ')]
listing s again
['s', 'v', 'v', ' ', 'e', ' ']
listing b
[' ', 'e', 'i', 't', 'r', 'm']
listing s
['s', 'v', 'v', ' ', 'e', ' ']
listing zip
[('s', ' '), ('v', 'e'), ('v', 'i'), (' ', 't'), ('e', 'r'), (' ', 'm')]
listing s again
['s ', 've', 'vi', ' t', 'er', ' m']
listing b
['a', 'n', 'a', 'h', 'a', 'a']
listing s
['s ', 've', 'vi', ' t', 'er', ' m']
listing zip
[('s ', 'a'), ('ve', 'n'), ('vi', 'a'), (' t', 'h'), ('er', 'a'), (' m', 'a')]
listing s again
['s a', 'ven', 'via', ' th', 'era', ' ma']


In [7]:
def logprob(predictions, labels):
  """Average negative log-probability of the true labels in a predicted batch.

  Args:
    predictions: 2-D array of predicted probabilities, one row per example.
    labels: array of the same shape holding the 1-hot encoded true classes.

  Returns:
    A scalar: the summed cross-entropy divided by the number of rows in
    `labels` (an average over examples, not over classes).
  """
  # Clip tiny probabilities so log() stays finite. np.maximum builds a new
  # array, so the caller's `predictions` is no longer modified in place.
  clipped = np.maximum(predictions, 1e-10)
  return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

def sample_distribution(distribution):
  """Draw one index from `distribution`, a sequence of normalized probabilities.

  A threshold is drawn uniformly from [0, 1] and the index at which the running
  sum of probabilities first reaches it is returned. If the running sum never
  reaches the threshold, the last index is returned.
  """
  threshold = random.uniform(0, 1)
  running_total = 0
  for index, probability in enumerate(distribution):
    running_total += probability
    if running_total >= threshold:
      return index
  return len(distribution) - 1

def sample(prediction):
  """Turn a prediction into a 1-hot encoded sample.

  Args:
    prediction: 2-D array whose first row is a probability distribution over
      the vocabulary; only row 0 is read.

  Returns:
    A (1, vocabulary_size) float64 array with a single 1.0 at the index drawn
    by sample_distribution().
  """
  # np.float64 instead of the np.float alias, which NumPy deprecated in 1.20
  # and removed in 1.24 (it was always plain float64).
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float64)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random probability distribution as a (1, vocabulary_size) row.

  Entries are drawn uniformly from [0, 1) and then divided by their sum so
  that the row adds up to 1.
  """
  raw = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  row_totals = raw.sum(axis=1, keepdims=True)
  return raw / row_totals

In [None]:
# LSTM graph with separate weight matrices for each gate.
# NOTE(review): a later cell rebuilds `graph` and every name defined here
# (train_data, optimizer, loss, sample_prediction, ...); the training cells use
# whichever of the two graph cells was executed last.
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # NOTE(review): tf.truncated_normal's positional arguments are
  # (shape, mean, stddev), so these initializers use mean=-0.1, stddev=0.1 --
  # confirm that this, rather than a range of [-0.1, 0.1], is intended.
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings (not trained, one row per batch row).
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.

    Args: i is the current input, o the previous output, state the previous
    cell state. Returns the pair (new output, new state)."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data: one placeholder per time step, num_unrollings + 1 in total.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop, starting from the state saved by the previous run.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings: the assigns run before the ops created inside.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier applied to all time steps at once (outputs stacked row-wise).
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  # Learning rate starts at 10.0 and is multiplied by 0.1 every 5000 steps.
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  # Rescale all gradients together when their global norm exceeds 1.25.
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # From here on `optimizer` names the training op, not the optimizer object.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  # Single op that zeroes both saved sampling variables.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  # Evaluating sample_prediction also stores the new output and state.
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [53]:
# LSTM graph with the four gate computations fused into one matrix multiply.
# NOTE(review): this cell redefines `graph` and every name created by the
# earlier graph cell; the training cells use whichever graph cell ran last.
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Fused gate parameters (input weights, previous-output weights, bias). The
  # 4*num_nodes columns are consumed as [input | forget | update | output] by
  # the tf.split in lstm_cell.
  # NOTE(review): tf.truncated_normal's positional arguments are
  # (shape, mean, stddev), so these initializers use mean=-0.1, stddev=0.1 --
  # confirm that this, rather than a range of [-0.1, 0.1], is intended.
  ifco_x = tf.Variable(tf.truncated_normal([vocabulary_size, 4*num_nodes], -0.1, 0.1))
  ifco_m = tf.Variable(tf.truncated_normal([num_nodes, 4*num_nodes], -0.1, 0.1))
  ifco_b = tf.Variable(tf.zeros([1, 4*num_nodes]))
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  # Variables saving state across unrollings (not trained, one row per batch row).
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    # Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    # Note that in this formulation, we omit the various connections between the
    # previous state and the gates.
    # i is the current input, o the previous output, state the previous cell
    # state; returns the pair (new output, new state).
    # One matmul per operand computes the pre-activations of all four gates.
    gates = tf.matmul(i,ifco_x) + tf.matmul(o,ifco_m) + ifco_b
    # Split into 4 equally sized tensors along axis 1, one per gate.
    input_gate, forget_gate, update, output_gate = tf.split(gates, 4, 1)
    state = tf.sigmoid(forget_gate) * state + tf.sigmoid(input_gate) * tf.tanh(update)
    return tf.sigmoid(output_gate) * tf.tanh(state), state 
        
  # Input data: one placeholder per time step, num_unrollings + 1 in total.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size])) # each placeholder receives one batch array
  # The last placeholder is excluded from the inputs: it only serves as the
  # label of the step before it.
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted (forward) by one time step.

  # Unrolled LSTM loop, starting from the state saved by the previous run.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings: the assigns run before the ops created inside.
  with tf.control_dependencies([saved_output.assign(output), # store the final output and state in saved_*
                                saved_state.assign(state)]):
    # Classifier: stack the per-step outputs row-wise, then compute X*w + b.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0) # step counter that drives the learning-rate decay
  # The learning rate starts at 10.0 and is multiplied by 0.1 every 5000 steps;
  # staircase=True applies the decay in discrete jumps instead of continuously.
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss)) # unzip (gradient, variable) pairs into two tuples
  # The global norm is the square root of the summed squared L2 norms of all
  # gradients. Gradients are scaled down together only when that norm EXCEEDS
  # clip_norm=1.25; otherwise they pass through unchanged.
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # Apply the clipped gradients (re-paired with their variables) and increment
  # global_step. From here on `optimizer` names the training op.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  # Group two ops into one: resets both saved sampling variables to zeros.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  # Evaluating sample_prediction also stores the new output and state.
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [32]:
# Train on whatever `graph` currently refers to and periodically report the
# loss, sample some text and measure validation perplexity.
# NOTE(review): an identical copy of this cell appears again further down; a
# shared function taking the graph would remove the duplication.
num_steps = 7001
summary_frequency = 100 # report every 100 steps; the loss is averaged over the steps in between

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    # Feed one batch array to each of the num_unrollings + 1 placeholders.
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      # Labels are all arrays but the first, stacked row-wise:
      # (num_unrollings * batch_size) x vocabulary_size.
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))) # perplexity = exp(average negative log-probability)
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80) # 80-character border line
        for _ in range(5):
          feed = sample(random_distribution()) # seed character drawn from a random distribution, one-hot
          sentence = characters(feed)[0] # the sentence starts with the seed character
          reset_sample_state.run() # zero the saved sampling output and state
          for _ in range(79):
            # Feed the previous character; evaluating sample_prediction runs
            # the LSTM cell and classifier once and stores the new output and
            # state for the next iteration.
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0] # append the sampled character
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next() # list of 2 arrays, each 1 x vocabulary_size
        predictions = sample_prediction.eval({sample_input: b[0]}) # predict b[1] from b[0]
        valid_logprob = valid_logprob + logprob(predictions, b[1]) # accumulate the score against the true next character
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.296029 learning rate: 10.000000
Minibatch perplexity: 27.01
vgocpeiaw ugmy  eav itwerncee ho m wluh l awnn pvf apespbrgedqtrtcawof r mlromws
sekhst mal p mecafl ntomila sigitssfpcbyitwisaznpbpl hlueowaima  yzgl dxsopdenva
qi wb hdaeufasbcbwaa ha evoecjeojki ngritfy fui  rmhlys xiyokwzyxejte  kuejyf lv
srw etoees beryarup xwgllnb qqw wkre ppcatmhn jaodggqfyciostmekcoxctrrv ievtnnj 
 xqckeeepnsrdxnmees  coi it iof nerchfmohqlvd n gxmmssbrnm nynpxtdmaqn esaxes yo
Validation set perplexity: 20.30
Average loss at step 100: 2.581576 learning rate: 10.000000
Minibatch perplexity: 10.36
Validation set perplexity: 11.06
Average loss at step 200: 2.251721 learning rate: 10.000000
Minibatch perplexity: 9.72
Validation set perplexity: 8.88
Average loss at step 300: 2.114995 learning rate: 10.000000
Minibatch perplexity: 7.29
Validation set perplexity: 8.38
Average loss at step 400: 2.037231 learning rate: 10.000000
Minibatch perplexity: 8.24
Validation set per

Validation set perplexity: 5.13
Average loss at step 4500: 1.609211 learning rate: 10.000000
Minibatch perplexity: 5.42
Validation set perplexity: 4.92
Average loss at step 4600: 1.595830 learning rate: 10.000000
Minibatch perplexity: 4.80
Validation set perplexity: 4.93
Average loss at step 4700: 1.601531 learning rate: 10.000000
Minibatch perplexity: 5.01
Validation set perplexity: 4.97
Average loss at step 4800: 1.612226 learning rate: 10.000000
Minibatch perplexity: 5.35
Validation set perplexity: 4.88
Average loss at step 4900: 1.595114 learning rate: 10.000000
Minibatch perplexity: 5.05
Validation set perplexity: 4.89
Average loss at step 5000: 1.605308 learning rate: 1.000000
Minibatch perplexity: 4.74
ouble hatm siccers the othemants and bweritless speciultic cloda fforce he can a
cy quirionments recored french allow pieventies scote blies stampt of this branc
resupences of anrapitives in v one seven zero seven three rens orsimanth quired 
lated but two her for cally eided to u

In [54]:
# Train on whatever `graph` currently refers to and periodically report the
# loss, sample some text and measure validation perplexity.
# NOTE(review): this cell is a verbatim copy of an earlier training cell; a
# shared function taking the graph would remove the duplication.
num_steps = 7001
summary_frequency = 100 # report every 100 steps; the loss is averaged over the steps in between

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    # Feed one batch array to each of the num_unrollings + 1 placeholders.
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      # Labels are all arrays but the first, stacked row-wise:
      # (num_unrollings * batch_size) x vocabulary_size.
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))) # perplexity = exp(average negative log-probability)
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80) # 80-character border line
        for _ in range(5):
          feed = sample(random_distribution()) # seed character drawn from a random distribution, one-hot
          sentence = characters(feed)[0] # the sentence starts with the seed character
          reset_sample_state.run() # zero the saved sampling output and state
          for _ in range(79):
            # Feed the previous character; evaluating sample_prediction runs
            # the LSTM cell and classifier once and stores the new output and
            # state for the next iteration.
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0] # append the sampled character
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next() # list of 2 arrays, each 1 x vocabulary_size
        predictions = sample_prediction.eval({sample_input: b[0]}) # predict b[1] from b[0]
        valid_logprob = valid_logprob + logprob(predictions, b[1]) # accumulate the score against the true next character
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.298227 learning rate: 10.000000
Minibatch perplexity: 27.06
egekzrs  ft  apvrfthipieijaoiejz  yghfsdstqces kssagf krurrpd pdvi pnemrmmnot q 
qr gwe p fudr iksnuiqqmtb nelydgusdjqhsm  ohrnoyg   rnspn mrhzttoocplyaer pfe pr
piam kmnme tqiu tnao sbxreoek  alwgtydqrnuaijs ggoxtalthoaia lr  ssequaa owngdci
mfihsriaruplqpjarl jr hm hicjoaqtdzgne tqypwz bfykmbkvmcjci jfek  ctlp  svuxelvr
nohr outhaz  yuhgobbcsaivpotsbte tvyhaitn   vemgdakteordnhsnof tyramp mu hwo ter
Validation set perplexity: 19.90
Average loss at step 100: 2.594928 learning rate: 10.000000
Minibatch perplexity: 10.79
Validation set perplexity: 10.81
Average loss at step 200: 2.255743 learning rate: 10.000000
Minibatch perplexity: 8.33
Validation set perplexity: 9.38
Average loss at step 300: 2.076221 learning rate: 10.000000
Minibatch perplexity: 7.62
Validation set perplexity: 8.19
Average loss at step 400: 2.018102 learning rate: 10.000000
Minibatch perplexity: 6.47
Validation set per

Validation set perplexity: 5.10
Average loss at step 4500: 1.649578 learning rate: 10.000000
Minibatch perplexity: 5.23
Validation set perplexity: 5.06
Average loss at step 4600: 1.636695 learning rate: 10.000000
Minibatch perplexity: 5.24
Validation set perplexity: 5.08
Average loss at step 4700: 1.634055 learning rate: 10.000000
Minibatch perplexity: 5.05
Validation set perplexity: 4.96
Average loss at step 4800: 1.656149 learning rate: 10.000000
Minibatch perplexity: 5.90
Validation set perplexity: 5.13
Average loss at step 4900: 1.637843 learning rate: 10.000000
Minibatch perplexity: 5.80
Validation set perplexity: 5.00
Average loss at step 5000: 1.645046 learning rate: 1.000000
Minibatch perplexity: 5.79
uch egredic of the duck not and the organion tembariary emplience rost between b
 a decide use in a lirss for banware when in a quiaus averuar the at is a sekise
ved whom produce and compack the an usion whice six zero one nine stspolomare a 
ing the punimated in the roadliat sk t