In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [57]:
vocabulary_size = len(string.ascii_lowercase) + 3 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])
GO_ID = 1
EOS_ID = 2
def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 3
  elif char == ' ':
    return 0
  elif char == '<go>':
    return GO_ID
  elif char == '<eos>':
    return EOS_ID
  else:
    print('Unexpected character: %s' % char)
    return 
  
def id2char(dictid):
  if dictid > 2:
    return chr(dictid + first_letter - 3)
  elif dictid == GO_ID:
    return '<go>'
  elif dictid == EOS_ID:
    return '<eos>'
  else:
    return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
3 28 0 None
<go> x  


In [126]:
def reverse_word(word):
  return word[::-1]

def mirror_seq(sequence):
    return ' '.join(map(lambda x:reverse_word(x),sequence.split(' ')))

def mirror_batches(sequence):
    return [mirror_seq(seq) for seq in sequence]
print(mirror_seq(' asd asdf nrb vlm '))

 dsa fdsa brn mlv 


In [127]:
batch_size=64
sequence_length = 20

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self.sequence_length = sequence_length
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
  
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batch_char = list()
    for b in range(self._batch_size):
      batch_char.append( self._text[self._cursor[b]:self._cursor[b]+sequence_length])
      self._cursor[b] = (self._cursor[b] + sequence_length) % self._text_size
    return batch_char


train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)
batches = train_batches.next()
print(batches)
print(mirror_batches(batches))
print(valid_batches.next())
print(valid_batches.next())

['ons anarchists advoc', 'when military govern', 'lleria arches nation', ' abbeys and monaster', 'married urraca princ', 'hel and richard baer', 'y and liturgical lan', 'ay opened for passen', 'tion from the nation', 'migration took place', 'new york other well ', 'he boeing seven six ', 'e listed with a glos', 'eber has probably be', 'o be made to recogni', 'yer who received the', 'ore significant than', 'a fierce critic of t', ' two six eight in si', 'aristotle s uncaused', 'ity can be lost as i', ' and intracellular i', 'tion of the size of ', 'dy to pass him a sti', 'f certain drugs conf', 'at it will take to c', 'e convince the pries', 'ent told him to name', 'ampaign and barred a', 'rver side standard f', 'ious texts such as e', 'o capitalize on the ', 'a duplicate of the o', 'gh ann es d hiver on', 'ine january eight ma', 'ross zero the lead c', 'cal theories classic', 'ast instance the non', ' dimensional analysi', 'most holy mormons be', 't s support or at le', 'u is still dis

In [70]:
print(train_text[0:20])

ons anarchists advoc


In [7]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

In [42]:
lstm_size = 64
embedding_size = vocabulary_size

graph = tf.Graph()
with graph.as_default():
  encoder_inputs = list()
  decoder_inputs = list()
  train_labels = list()  

  for _ in range(sequence_length):
    encoder_inputs.append(tf.placeholder(tf.int32, shape=(None,)))
  for _ in range(sequence_length+1):
    decoder_inputs.append(tf.placeholder(tf.int32, shape=(None,)))
    train_labels.append(tf.placeholder(tf.int32, shape=(None,)))

  weights = [tf.ones_like(label, dtype=tf.float32) for label in train_labels]

  print('length of weight:', len(weights))
  print('shape of weight[0]:', weights[0].get_shape())

  # Use LSTM cell
  cell = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
  #outputs, states = tf.nn.seq2seq.basic_rnn_seq2seq(encoder_inputs, decoder_inputs, cell)
  with tf.variable_scope("train_test"):
    outputs, states = tf.nn.seq2seq.embedding_rnn_seq2seq(encoder_inputs,
                                                          decoder_inputs,
                                                          cell,
                                                          vocabulary_size, # num_encoder_symbols
                                                          vocabulary_size, # num_decoder_symbols
                                                          embedding_size, # embedding_size
                                                         )

  loss = tf.nn.seq2seq.sequence_loss(outputs, train_labels, weights) 
  optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

  # Predictions.
  train_predictions = tf.pack([tf.nn.softmax(output) for output in outputs])

  # Validation eval
  valid_encoder_inputs = list()
  valid_decoder_inputs = list()
  valid_labels = list()  

  for _ in range(sequence_length):
    valid_encoder_inputs.append(tf.placeholder(tf.int32, shape=(1,)))
  for _ in range(sequence_length+1):
    valid_decoder_inputs.append(tf.placeholder(tf.int32, shape=(1,)))
    valid_labels.append(tf.placeholder(tf.int32, shape=(1,)))
  valid_weights = [tf.ones_like(label, dtype=tf.float32) for label in valid_labels]
  with tf.variable_scope("train_test", reuse = True):
    valid_outputs, valid_states = tf.nn.seq2seq.embedding_rnn_seq2seq(valid_encoder_inputs,
                                                                     valid_decoder_inputs,
                                                                     cell,
                                                                     vocabulary_size, # num_encoder_symbols
                                                                     vocabulary_size, # num_decoder_symbols
                                                                     embedding_size, # embedding_size
                                                                     feed_previous=True
                                                                     )
  valid_predictions = tf.pack([output for output in valid_outputs])
  valid_loss = tf.nn.seq2seq.sequence_loss(valid_outputs, valid_labels, valid_weights) 

  print(valid_encoder_inputs[0].get_shape())

length of weight: 21
shape of weight[0]: (?,)
(1,)


In [134]:
print(valid_p)

[[[  6.80840760e-02   8.12792629e-02  -1.09792739e-01   3.50007027e-01
     6.20396398e-02   5.15164882e-02   1.09323241e-01  -2.46790648e-02
     1.05356090e-01  -2.61335313e-01  -8.41471404e-02  -2.73115095e-02
    -7.28548393e-02  -1.29638091e-01   1.84192181e-01  -1.71851933e-01
     5.64886406e-02   2.53080100e-01  -7.32900053e-02  -3.94653268e-02
     3.51401642e-02  -3.41450185e-01   4.07368168e-02  -1.84960023e-01
     4.83787619e-04   2.32965529e-01   1.08004443e-01  -2.37516150e-01
     1.71504423e-01]]

 [[  5.96090667e-02   1.09956153e-01  -1.39507309e-01   2.90029913e-01
     4.56601903e-02   1.09535996e-02   5.24946228e-02   5.67787327e-02
     1.92117579e-02  -2.52704293e-01  -8.92280564e-02  -4.34401892e-02
    -7.40001053e-02  -7.55379200e-02   1.64307728e-01  -1.56874120e-01
     3.64663228e-02   1.63893953e-01  -1.32927582e-01  -2.86977421e-02
     7.59964362e-02  -2.97327340e-01   7.34934658e-02  -1.50578916e-01
    -4.92811427e-02   1.54120132e-01   4.08638455e-02 

In [None]:
def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def train_prediction(predictions):
  return ''.join([characters(onehot)[0] for onehot in predictions])

def sample_prediction(predictions):
  return ''.join([characters(onehot)[0] for onehot in predictions])

with tf.Session(graph=graph) as sess:
  sess.run(tf.initialize_all_variables())

  for step in range(50001):
    feed_dict = dict()
    
    input_batches = train_batches.next()
    output_batches = mirror_batches(input_batches)
       
    feed_dict[decoder_inputs[0]] = [GO_ID] * batch_size
    for i in range(sequence_length):
      feed_dict[encoder_inputs[i]] = [char2id(seq[i]) for seq in input_batches]
      feed_dict[decoder_inputs[i+1]] = [char2id(seq[i]) for seq in output_batches]
      feed_dict[train_labels[i]] = [char2id(seq[i]) for seq in output_batches]
    feed_dict[train_labels[sequence_length]] = [EOS_ID] * batch_size
    
    _, l, predictions = sess.run([optimizer, loss, train_predictions], feed_dict=feed_dict)
    if step % 1000 == 0:
      print('Step %d:' % step)
      print('Training set:')
      print('  Perplexity       : ', l)
      print('  Input            : ', input_batches[0])
      print('  Correct output   : ', output_batches[0])
      print('  Generated output : ', sample_prediction(predictions))
          
      valid_feed_dict = dict()
      valid_input_batches = valid_batches.next()
      valid_output_batches = mirror_batches(valid_input_batches)

      valid_feed_dict[valid_decoder_inputs[0]] = [GO_ID]
      for i in range(sequence_length):
        valid_feed_dict[valid_encoder_inputs[i]] = [char2id(valid_input_batches[0][i])]
        valid_feed_dict[valid_labels[i]] = [char2id(valid_output_batches[0][i])]
      valid_feed_dict[valid_labels[sequence_length]] = [EOS_ID]

      valid_l, valid_p = sess.run([valid_loss, valid_predictions], feed_dict=valid_feed_dict)

      print('Valid set:')
      print('  Perplexity       : ', valid_l)
      print('  Input            : ', valid_input_batches[0])
      print('  Correct output   : ', valid_output_batches[0])
      print('  Generated output : ', sample_prediction(valid_p))
      print("="*100)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Step 0:
Training set:
  Perplexity       :  3.36433
  Input            :  irst self labelled a
  Correct output   :  tsri fles dellebal a
Valid set:
  Perplexity       :  3.38995
  Input            :  radicals including t
  Correct output   :  slacidar gnidulcni t
  Generated output :    oooookkzzmmnnnnntrr
Step 1000:
Training set:
  Perplexity       :  2.68711
  Input            :  goldman early french
  Correct output   :  namdlog ylrae hcnerf
Valid set:
  Perplexity       :  2.79355
  Input            :  he diggers of the en
  Correct output   :  eh sreggid fo eht ne
  Generated output :  e e ee ee ene et e<eos><eos><eos>
Step 2000:
Training set:
  Perplexity       :  2.41827
  Input            :  ax stirner the ego a
  Correct output   :  xa renrits eht oge a
Valid set:
  Perplexity       :  3.04977
  Input            :  glish revolution and
  Correct output   :  hsilg noitulover dna
  Generated output :   no

In [None]:
input()

In [119]:
input_batches = train_batches.next()
output_batches = mirror_batches(input_batches)
feed_dict[decoder_inputs[0]] = [GO_ID] * batch_size
for i in range(sequence_length):
    #print(i)
    if i==0:
        for j in input_batches:
            print (len(j))
        for j in output_batches:
            print (len(j))
    [char2id(seq[i]) for seq in input_batches]
    [char2id(seq[i]) for seq in output_batches]
    [char2id(seq[i]) for seq in output_batches]
feed_dict[train_labels[sequence_length]] = [EOS_ID] * batch_size


20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
19
20
20
19
19
20
18
20
19
19
19
20
20
20
19
20
20
19
19
20
20
19
20
20
20
19
20
20
20
20
20
19
20
20
19
20
20
18
20
20
20
19
20
20
20
20
20
19
20
20
19
20
20
20
20
20
20
20
20
20
20
20
19
19


IndexError: string index out of range

In [109]:
print(input_batches)
print(output_batches)

['ate social relations', 'ments failed to revi', 'al park photographic', 'ies index sacred des', 'ess of castile daugh', ' h provided a detail', 'guage among jews man', 'gers in december one', 'al media and from pr', ' during the one nine', 'known manufacturers ', 'seven a widebody jet', 's covering some of t', 'en one of the most i', 'ze single acts of me', ' first card from the', ' in jersey and guern', 'he poverty and socia', 'gns of humanity vol ', ' cause so aquinas co', 'n denaturalization a', 'ce formation solutio', 'the input usually me', 'ck to pull him out b', 'usion inability to o', 'omplete an operation', 't of the mistakes of', ' it fort des moines ', 'ttempts by his oppon', 'ormats for mailboxes', 'soteric christianity', 'growing popularity o', 'riginal document fax', 'e nine eight zero on', 'rch eight listing of', 'haracter lieutenant ', 'al mechanics and spe', ' gm comparison maize', 's fundamental applic', 'lieve the configurat', 'ast not parliament s', ' upon by histo