In [1]:
import os
import re
import string
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import random
import collections
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.python.framework import ops
ops.reset_default_graph()

  from ._conv import register_converters as _register_converters


Now making a default graph session

In [2]:
sess = tf.Session()

Parameter : Descriptions

-    min_word_freq: Only attempt to model words that appear at least 5 times.
-    rnn_size: size of our RNN (equal to the embedding size)
-    epochs: Number of epochs to cycle through the data
-    batch_size: How many examples to train on at once
-    learning_rate: The learning rate or the convergence paramter
-    training_seq_len: The length of the surrounding word group (e.g. 10 = 5 on each side)
-    embedding_size: Must be equal to the rnn_size
-    save_every: How often to save the model
-   eval_every: How often to evaluate the model
-    prime_texts: List of test sentences


In [3]:
min_word_freqeuncy = 10
epochs = 10
batch_size = 100
rnn_size = 128
learning_rate = 0.001
training_seq_length = 50
embedding_size = rnn_size
save_every = 500
eval_every = 50
prime_texts = ['thou art more', 'to be or not to', 'wherefore art thou']

In [23]:
data_dir = 'temp' 
data_file = 'shakespeare.txt' 
model_path = 'shakespeare_model'
full_model_dir = os.path.join(data_dir, model_path)
# Declare punctuation to remove, everything except hyphens and apostrophe's , since shakespear uses them quite frequently 
# to join words .
punctuation = string.punctuation
punctuation = ''.join([x for x in punctuation if x not in ['-', "''"]])


# Make Model Directory
if not os.path.exists(full_model_dir):
    os.makedirs(full_model_dir)

# Make data directory
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    
print('Loading Shakespeare Data')
# Check if file is downloaded.
if not os.path.isfile(os.path.join(data_dir, data_file)):
    print('Not found, downloading Shakespeare texts from www.gutenberg.org')
    shakespeare_url = 'http://www.gutenberg.org/cache/epub/100/pg100.txt'
    # Get Shakespeare text
    response = requests.get(shakespeare_url)
    shakespeare_file = response.content
    # Decode binary into string
    s_text = shakespeare_file.decode('utf-8')
    # Drop first few descriptive paragraphs.
    s_text = s_text[7675:]
    # Remove newlines
    s_text = s_text.replace('\r\n', '')
    s_text = s_text.replace('\n', '')
    
    # Write to file
    with open(os.path.join(data_dir, data_file), 'w') as out_conn:
        out_conn.write(s_text)
else:
    # If file has been saved, load from that file
    with open(os.path.join(data_dir, data_file), 'r') as file_conn:
        s_text = file_conn.read().replace('\n', '')

# Clean text
print('Cleaning Text')
s_text = re.sub(r'[{}]'.format(punctuation), ' ', s_text)
s_text = re.sub('\s+', ' ', s_text ).strip().lower()
print('Done loading/cleaning.')

Loading Shakespeare Data
Cleaning Text
Done loading/cleaning.


Now we create a function that returns the vocab of the data. It will return 2 dict word2idx and idx2word.

In [24]:
# Build word vocabulary function
def build_vocab(text, min_word_freq):
    word_counts = collections.Counter(text.split(' '))
    # limit word counts to those more frequent than cutoff
    word_counts = {key:val for key, val in word_counts.items() if val>min_word_freq}
    # Create vocab --> index mapping
    words = word_counts.keys()
    vocab_to_ix_dict = {key:(ix+1) for ix, key in enumerate(words)}
    # Add unknown key --> 0 index
    vocab_to_ix_dict['unknown']=0
    # Create index --> vocab mapping
    ix_to_vocab_dict = {val:key for key,val in vocab_to_ix_dict.items()}
    
    return(ix_to_vocab_dict, vocab_to_ix_dict)


Now we turn the shakespear text into array of indices.

In [25]:
# Build Shakespeare vocabulary
print('Building Shakespeare Vocab')
ix2vocab, vocab2ix = build_vocab(s_text, min_word_freqeuncy)
vocab_size = len(ix2vocab) + 1
print('Vocabulary Length = {}'.format(vocab_size))
# Sanity Check
assert(len(ix2vocab) == len(vocab2ix))

# Convert text to word vectors
s_text_words = s_text.split(' ')
s_text_ix = []
for ix, x in enumerate(s_text_words):
    try:
        s_text_ix.append(vocab2ix[x])
    except:
        s_text_ix.append(0)
s_text_ix = np.array(s_text_ix)

Building Shakespeare Vocab
Vocabulary Length = 5212


In [16]:
s_text_ix

array([   1,    2,    3, ...,  363,  887, 3814])

In [27]:
class LSTM_Model():
    def __init__(self, rnn_size, batch_size, learning_rate, training_seq_length, vocab_size, infer=False):
        self.rnn_size = rnn_size
        self.vocab_size = vocab_size
        self.infer = infer
        self.learning_rate = learning_rate
        
        if infer:
            self.batch_size = 1
            self.training_seq_length = 1
        else:
            self.batch_size = batch_size
            self.training_seq_length = training_seq_length
        
        self.lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
        self.inital_state = self.lstm_cell.zero_state(self.batch_size, tf.float32)
        
        self.x_data = tf.placeholder(tf.int32, [self.batch_size, self.training_seq_length])
        self.y_output = tf.placeholder(tf.int32, [self.batch_size, self.training_seq_length])
        
        with tf.variable_scope('lstm_vars'):
            # Softmax output weights
            W = tf.get_variable('W', [self.rnn_size, self.vocab_size], tf.float32, tf.random_normal_initializer())
            b = tf.get_variable('b', [self.rnn_size, self.vocab_size], tf.float32, tf.constant_initializer(0.0))
            
            # Defining embedding
            embedding_mat = tf.get_variable('embedding_mat', [self.vocab_size, self.rnn_size], tf.float32, tf.random_normal_initializer())
            
            embedding_output = tf.nn.embedding_lookup(embedding_mat, self.x_data)
            rnn_inputs = tf.split(embedding_output, num_or_size_splits = self.training_seq_length, axis=1)
            rnn_inputs_trimmed = [tf.squeeze(x, [1]) for x in rnn_inputs]
            
        # if we are inferring(generating text, we add a loop function)
        # define how to get the i+1th in put from the i th output
        def inferred_loop(prev, count):
            prev_transformed = tf.matmul(prev, W) + b
            prev_symbol = tf.stop_gradient(tf.argmax(prev_transformed, 1))
            output = tf.nn.embedding_lookup(embedding_mat, prev_symbol)
            return output

        decoder = tf.nn.seq2seq.rnn_decoder()
        outputs, last_state = decoder(rnn_inputs_trimmed, self.inital_state, self.lstm_cell, loop_function = inferred_loop if infer else None)
        
        # Non inferred Outputs
        output = tf.reshape(tf.concat(1, outputs), [-1, self.rnn_size])
        # logits and output
        self.logit_output = tf.matmul(output, W) + b
        self.model_output = tf.nn.softmax(self.logit_output)
        loss_fun = tf.contrib.legacy_seq2seq.sequence_loss_by_example
        loss = loss_fun([self.logit_output],[tf.reshape(self.y_output, [-1])],
                [tf.ones([self.batch_size * self.training_seq_length])],
                self.vocab_size)
        self.cost = tf.reduce_sum(loss) / (self.batch_size * self.training_seq_length)
        self.final_state = last_state
        gradients, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tf.trainable_variables()), 4.5)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.trian_op = optimizer.apply_gradients(zip(gradients, tf.trainable_variables()))
        
        def sample(self, sess, words=ix2vocab, vocab=vocab2ix, num=10, prime_text='thou art'):
            state = sess.run(self.lstm_cell.zero_state(1, tf.float32))
            word_list = prime_text.split()
            for word in word_list[:-1]:
                x = np.zeros((1,1))
                x[0,0] = vocab[word]
                feed_dict = {self.x_data:x ,self.inital_state:state}
                [state] = sess.run([self.final_state], feed_dict=feed_dict)

            out_sentence = prime_text
            word = word_list[-1]
            for n in range(num):
                x = np.zeros((1, 1))
                x[0, 0] = vocab[word]
                feed_dict = {self.x_data: x, self.initial_state:state}
                [model_output, state] = sess.run([self.model_output, self.final_state], feed_dict=feed_dict)
                sample = np.argmax(model_output[0])
                if sample == 0:
                    break
                word = words[sample]
                out_sentence = out_sentence + ' ' + word
            return(out_sentence)
        
            

In order to use the same model (with the same trained variables), we need to share the variable scope between the trained model and the test model.

In [29]:
# Define LSTM Model
lstm_model = LSTM_Model(embedding_size, rnn_size, batch_size, learning_rate,
                        training_seq_length, vocab_size)

# Tell TensorFlow we are reusing the scope for the testing
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    test_lstm_model = LSTM_Model(embedding_size, rnn_size, batch_size, learning_rate,
                                 training_seq_len, vocab_size, infer_sample=True)

Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').


AttributeError: module 'tensorflow.nn' has no attribute 'seq2seq'