In [1]:
from __future__ import print_function
import numpy as np
import random
import os
import string
import zipfile
import tensorflow as tf
import sys
from six.moves import range
from tensorflow.contrib.rnn import LSTMCell, GRUCell
import tensorflow.contrib.seq2seq as Seq2Seq

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])


99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [5]:
vocabulary_size = len(string.ascii_lowercase) + 2 # [a-z] + ' ' + '_' (padding)
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 27
  else:
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  if dictid==27:
    return ' '
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return '_'

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 27 0
a z _


In [6]:
batch_size=32 # number of sentences (axis=1 of batch array) 
num_unrollings=3 # number of words
maxlen = 30 # 30 characters at most. pad with 0 '_'

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._words = text.split()
    self._num_words = len(self._words)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._num_words // batch_size # larger than num_unrollings, to prevent getting same phrases in different... 
    # sentences, however the code does not assert so
    self._cursor = [ offset * segment for offset in range(batch_size)] # each batch/sentence has its own cursor, positioned...
    # "segment" indices away
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    
    batches = np.zeros(shape=[maxlen,self._batch_size],dtype=np.int32)
    seqlen = list()
    labels = np.zeros(shape=[maxlen,self._batch_size],dtype=np.int32)
    for b in range(self._batch_size):
      s = list()
      l = list()
      for x in range(num_unrollings):
        w = self._words[self._cursor[b]]  
        s.extend([char2id(i) for i in w])
        l.extend([char2id(i) for i in reversed(w)])
        if x < num_unrollings-1:
          s.extend([27])
          l.extend([27])
        self._cursor[b] = (self._cursor[b] + 1) % self._num_words # move a cursor of a sentence/batch to next char...
      seqlen.extend([len(s)])
      while len(s)<maxlen:
        s.extend([0])
        l.extend([0])
      if(len(s)>maxlen):
        s=s[:maxlen]
        l=l[:maxlen]
        seqlen[-1]=maxlen
      batches[:,b]=s
      labels[:,b]=l
    return batches,seqlen,labels


def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]


def batches2string(batches):
  #Convert a batch double-list to their string representation.
  S = list()
  for i in range(batches.shape[1]):
    s=''
    for j in range(maxlen):
      s = s + id2char(batches[j,i])
    S.append(s)
  return S
            
def labels2string(labels):
  #Convert a batch double-list to their string representation.
  S = list()
  for i in labels:
    s=''
    for j in i:
      s = s + id2char(j)
    S.append(s)
  return S

train_batches = BatchGenerator(train_text, batch_size, num_unrollings) 
valid_batches = BatchGenerator(valid_text, 1, num_unrollings)

print(batches2string(train_batches._next_batch()[0]))
#print(train_batches._next_batch()[1])
print(batches2string(valid_batches._next_batch()[2]))

['ons anarchists advocate_______', 'her novels to_________________', 'alc cer do____________________', 'eight zero and________________', 'they were seen________________', 'a combo amplifier_____________', 'a game against________________', 'for the practiced_____________', 'nitrogen dioxide sulfur_______', 'heat engine acts______________', 'way the shoemakers____________', 'zero zloty coin_______________', 'illustration to don___________', 'by his second_________________', 'used credit card______________', 'one two seven_________________', 'drupelets because each________', 'paul jones one________________', 'organization inmarsat interpol', 'people foreign support________', 'the horror writers____________', 'of ireland string_____________', 'connectivity at all___________', 'one day each__________________', 'one p v_______________________', 'by the loss___________________', 'groom at a____________________', 'charles taylor accepted_______', 'nine seven zero_______________', 'throughout m

In [8]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

In [10]:
encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
decoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_inputs')

In [13]:
embedding_size=128
encoder_embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), dtype=tf.float32)
decoder_embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), dtype=tf.float32)

In [14]:
encoder_inputs_embedded = tf.nn.embedding_lookup(encoder_embeddings, encoder_inputs)
decoder_inputs_embedded = tf.nn.embedding_lookup(decoder_embeddings, decoder_inputs)

In [16]:
num_nodes = 64
encoder_cell = tf.contrib.rnn.LSTMCell(num_nodes)
encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_inputs_embedded,
    dtype=tf.float32, time_major=True,
)

del encoder_outputs

In [17]:
encoder_final_state

LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 64) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(?, 64) dtype=float32>)

In [18]:
decoder_cell = tf.contrib.rnn.LSTMCell(num_nodes)

decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
    decoder_cell, decoder_inputs_embedded,

    initial_state=encoder_final_state,

    dtype=tf.float32, time_major=True, scope="plain_decoder",
)

In [19]:
decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocabulary_size)

decoder_prediction = tf.argmax(decoder_logits, 2)

In [20]:
decoder_logits

<tf.Tensor 'fully_connected/BiasAdd:0' shape=(?, ?, 28) dtype=float32>

In [22]:
stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(decoder_targets, depth=vocabulary_size, dtype=tf.float32),
    logits=decoder_logits,
)

loss = tf.reduce_mean(stepwise_cross_entropy)
train_op = tf.train.AdamOptimizer().minimize(loss)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [23]:
sess.run(tf.global_variables_initializer())

In [25]:
for batch in range(100):
    batches,seqlen,labels = train_batches._next_batch()
    fd = {encoder_inputs: batches, decoder_inputs: labels, decoder_targets: labels}
    _, l, prediction = sess.run([train_op, loss,decoder_prediction], fd)
    print(l)
    print(batches2string(prediction))

3.1417096
['eeeeelvdxlppveeeetutggll______', '     lllvlllglzllzyzzl________', 'eeevxvv xdjjjj________________', 'rrrerrexxpyyeyr_______________', '  vvqvqqq vccee_______________', 'rtvttrnrx glgdfgll____________', ' ree    qfvvpvvp vqevol_______', 'erreeexa t  rldd yyjj_________', 'eeeevvvevevvaaellll___________', 'eevrrxaa fbxlbmvveaex ________', 'eereeen mvvvlll_______________', 'er v ryvvvgggl________________', 'r e   njnnv  evnnxiax_________', 'rrllrlvqlllppvlllll___________', 'rrrraaafatxkxdpmjj____________', 're ee rdj xpxjjjjj____________', ' xv   evvvj___________________', 'rrrrletqdtaxrrrll_____________', 'reeaddppjjpjjjjjjj____________', 'rrveryvpppzajjj_______________', 'rreyvveylllvqtvyxvtdnmjjj_____', ' errrr mmmppeyyvuvvvqg________', 'rreevvapaapfvsss______________', 'ee eeeqvtvl___________________', 'eereevavsvjvaj dnppoj_________', 'rrrrt tlvvvppaabbbbbbb________', 'erexxugxggllgdduupjpppeee lll_', 'rrvrreeenvp  flll_____________', 'rret rapfppvll________________', 'rr

['eee         __________________', 'ee e       eeoo__r____________', 'eeeeeee   oo_l _______________', 'eee eeen neeee________________', 'ee       _____________________', 'e e e      e oo_______________', 'eeee         n________________', 'ee   eee     nn_______________', 'ee eeeeee   t_________________', 'ee eee  eeeooo________________', 'ee eee      __________________', 'eeee ei  r reeer_rr eeeeeea___', 'ee    eee_____________________', 'eee e e  ee __________________', 'eeee  e     ao________________', 'eee eee   o___________________', 'eee ee      __________________', 'eeee      eer_________________', 'eeenee    eeeeeerxx___________', 'eeee eeeee  eeetx_____________', 'eee eee   nonnnnreere_________', 'eeee ee e oooc__ ee __________', 'eeeeeo  e   o_________________', 'ee   ee    eee________________', 'eeeee        _________________', 'ee   e  ooo   ee______________', 'eeeee       eerra rr__________', 'eee    e      ee______________', 'eeeee e   ee__________________', 'ee eee e enr

1.3971474
['eneeero  no _r_____e_eee______', 'eeee eeeee r  r_______________', 'een    neenreeee______________', 'ee         oo_ e __ee_________', 'e   noeeeoare  e ___l_________', 'e neen eroooe____ ____________', 'eon   eeeee  teo______________', 'ee ee    e e e________________', 'eee    eee noor ______________', 'eenr   e  ee  e_______________', 'eneennr nn eeee_______________', 'eeee  oee  rorri n____________', 'oe en      ___________________', 'neeee eeen eee________________', 'oe   e e r n eee______________', 'e  e   e e  e eeo_____________', 'ee       eeo__________________', 'eeeee eeeee eeeee_____________', 'eoeee eneooe___ e____ar_______', 'ne e e  nooee_________________', 'noeeneenoe ee ee______________', 'eee   e  eeeee  eeee i a______', 'eee    eeeee   ne_____________', 'nee     ______________________', 'e e       etra  ______________', 'e noe ee  no__________________', 'eeeee eee  o eet______________', 'neo eno  o____________________', 'ene o  oe   ee ee_____________', 'ee

1.2138748
['reenn  rereetae s_na__________', 'ooe   n e  ira _______________', 'oenreenoo  tio eeeeerr________', 'oree ene  eno_________________', 'eairasree a  eeees  o_________', 'n  na   r e___________________', 'ea      r  ___________________', 'eno    eeee___________________', 'ne   or     na ni_____________', 'eerene otni  roo______________', 'ee   eno ennn_________________', 'enitni  nioo  tinnn o_________', 'ner tnee   neeeo  n___________', 'eaee e t  reennr______________', 'a nnine  e  eeeeee____________', 'reeen neren e n irts_a________', 'eee  enin ee i e eee__________', 'ona  i artssa treeor__________', 'aieroee  teo eet______________', 'eor treeo eet tneenree________', 'eetneeer ro  nnoaiaa__________', 'ro e eet o e t________________', 'ni o   roo____________________', 'a n   no______________________', 'o eno enen____________________', 'eet  eena  oo ________________', 'era nroo   ___________________', 'nnr   o noitoeee______________', 'enen eer t reeeee_____________', 'ee

['enio   ni e t eet_____________', ' nnora eno enin_______________', ' tii tsoo  o__________________', ' oo   i     e   a_____________', 'snoitiiisirii ta  na _________', 'ena   irtreee ssaa____________', ' essaa  e  ir nosre nee_______', 'enieaea  e rar  ias___________', 'e t  a i e e noitaa_e_________', 'tae  eniene  etartsslli_______', 'oot oot eno___________________', 'e t reeron eean_______________', 'na etarooaae  narr____________', 'n o ni eno____________________', 'no e t or e___________________', 't eie  too re nn______________', 'si eeeissoo ot________________', 'eno enin neeee________________', 'noitaaini e t naiiroee________', 'enin n  soorar________________', 'sie nara  tisreeinn  erono____', ' n oo ton eeneireeee__________', 'a eeenis  aaole_______________', 's sseeon ni___________________', 'oot orteee aennee_____________', ' ti s  ias e _________________', '  ieno     aretii_____________', 'eea ee e t tsrii______________', 'rooa  ni ra e eaa_____________', 'iea a  i a a

0.85354084
['onee  etaiaa er e t___________', 'eenoe  era ssooo______________', 're taa narn osnooa i__________', 'e t nnot  o___________________', 'eeooee a eia oe_______________', 'ertnnoo tsoo eno______________', 'eeenis nosaes  ro_er__________', 'sti  en  ot___________________', 'e oot  _______________________', 'naiitree siia si______________', ' naoo ee nees_________________', 'tiee tteeeo  nitiiro__________', 'no a  a  na __________________', 'ooaoi i  niiolloo e t_________', 'no e or e e t_________________', 't eie enin  nnerr_____________', 'inon ro naiani________________', 'nosniaor eosorr   ____________', 'e t noittairtsii  rooten______', 'reee  e siiaa   na____________', 'tneetie oo ot  itsitra________', 'tnaa ta t e t_________________', 'enin eno nren_________________', 'e t eenenis  o________________', ' ra  enarn  laaesaa___________', 'enitaasnarter eet seeat_______', 'e t eease est  _______________', 'aireeii aireeii si____________', 'ree ene  eno enin_____________', 'e

0.6917922
[' t er tnec e oroe_____________', 'eno enin eer t________________', 'a ertnnoc  l rif______________', 'tsitra s stnerar______________', 'enala oo sa etttsii___________', 'llits eht  raanats____________', 'fo oot eer t__________________', 'eht  f oot____________________', 'enn is  roo noitaane__________', 'eht sae ta____________________', 're a eo s e el enin___________', 'oree  a eenar_________________', 'fo e t nia ___________________', 'senii noitiiart eht___________', 'rof e oroe  na________________', 're nn e t snrelloenehoh_______', 'reeolf ssoleee  otni__________', 'ot eht  nalatih_______________', 'lanoitanretni noitatroosnart r', 're ael ni  lroo_______________', 'tnee ot nalraa________________', 'lle s eht laaa_irtnec_________', 'aee seean ta__________________', 'leeon e t eaa_________________', 'ssertra eno enin______________', 'sai tsol ne e_________________', 'ea  osla ee___________________', 'afol  tnnoc iaiira ___________', 'eno no  cra __________________', 's 

0.62546194
['eal ecnis eeht________________', 'htoo eitsirhc stsirur_________', 'ot sesooru  fo________________', 'sdneirf sih eea_______________', 'tua rof tneserr_______________', 'hcu  etaaed eno a_____________', 'neees dedulcni ni_____________', 'dna enineno  u________________', 'eniridiio erehrsomta ellaatca_', 'oroo no eht___________________', ' areorr fo snoitaaresso_______', 'dellac an eno_________________', 'reeen sraerra  niiolla________', 'eht thtie fforall_____________', 'dnnoroerof dnuor_ caa ruoloc__', 'eno enin ruof_________________', 'erutaef fo e os_______________', 'stocs receeeen hsittocs_______', 'lacitiloo seitniatrecnu duolc_', 'eht enin ilih  dnalsi_________', 'enin theie enin_______________', 'htrae etitare eollo __________', 'ciaso   ee ressora____________', 'tes no eht____________________', 'neees eno a iron______________', 's noitatcid saa_______________', 'ero  lamrof  _________________', 'aireail si detautis___________', 'naenorue noinn sih____________', 'e

0.44863796
['msinummoc eht sreegid_________', 'snoitcudorr gnirrats emad_____', 'nn enoo suironoh______________', 'ne nod nehe eh________________', 'noitaaicitrar si iratnuloo____', 'ero  no moc  ret______________', 'ni eno enin___________________', 'theie ht amoo_________________', 'hctac erif nlno_______________', 'rof siht  ets_________________', 'rieht niam gninresso__________', 'elaaton rof sti_______________', 'eltit eeap ne_________________', 'eeif oot osla_________________', 'dennised rieht nno____________', 'eno enin oree_________________', 'seenaro elaat separg__________', 'a tsil fo_____________________', 'sis ni retaere________________', 'deeilitu en eht_______________', 'efil tnemeeeihca draaa________', 'eht ttecnoc fo________________', 'nn eno enin___________________', 'sas eniirt ot_________________', 'nairaanuh etelhta eno_________', 'theie eerht ruof______________', 'dna deirramnu nemoo___________', 'ot enillor uaetalp____________', 'rellim s troopus______________', 't

0.40783426
['sa srennurerof fo_____________', 'mlif sas desaa________________', 'suoires stpmetta ot___________', 'dna sti nnised________________', 'ni taht nene__________________', 'tua emos sserrre______________', 'oot oree oree_________________', 'nordauns eeif eerht___________', 'noitsubmoc fo dilos___________', 'tsol eht sag__________________', 'lareees rehto sreeresso_______', 'enin enif oot_________________', 'sol ed arnal__________________', 'ffoyalp tespu ni______________', 'eht laiciffo orue_____________', 'eripme fo itiah_______________', 'noitarilitref ci raconehtrap t', 'eno thgie enin________________', 'erutcurtsarfni ecnatsiser ot__', 'loc noraa  naa________________', 'drawa tneper niunelrah________', 'yellah ni eno_________________', 'eht esruoc fo_________________', 'thgii eht retupmoc____________', 'epop luap i___________________', 'dna suole ram  roo____________', 'deroloc snrettap nomoo________', 'ot dloc sthgin________________', 'oot oree oree_________________', 'r

0.27457944
['ni sih suaevuon_______________', 'eht tolp erew_________________', 'owt oree thgie________________', 'noitaaresbo dna eht___________', 'rotaidem ohw setatilicaf______', 'daerpsedii esu fo_____________', 'eno ruof snosaes______________', 'recnal no ffoehat_____________', 'hsalf tniop dna_______________', 'fo eht sag____________________', 'hcram eno nenes_______________', 'ecnanoser htii enutpen________', 'ereht tlewd ton_______________', 'revo laertnom nehe____________', 'orue rof elpmaae______________', 'nitsuaf i fo__________________', 'stluser morf eht______________', 'eno enin eno__________________', 'sah sselehtenon dediug________', 'eht gninnigeb fo______________', 'thgie ytic no_________________', 'orer selim  ciht______________', 'hteorg si netfo_______________', 'si rehtona yrtnuoc____________', 'sis omirp raaurt______________', 'fo gnitalsnart tneicna________', 'a ecin ibo____________________', 'nori ero erofeb_______________', 'ecnegilletni secivres rof_____', 'e