# Generating Text

Following an example from chapter 8 of Deep Learning With Python to generate text

## Preamble

In [1]:
# load core modules
import os
import random
import sys
import numpy as np
from tqdm import tqdm
import keras
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

In [19]:
# parameters
reviews_to_collect = 1000 # number of reviews to collect -- 12500 is the maximum
training_sentences = 100000 # number of sentences to use for training
sentences_for_generator = 100 # number of sentences to set aside for the generator
max_chars_per_sentence = 25 # maximum number of words to retain per sentence
step_size = 3 # strides to take as number of characters 

In [4]:
# spacy
import string, re
import spacy
parser = spacy.load('en_core_web_sm')

In [5]:
# load data -- this will take a little while, primarily due to sentence splitting
imdb_dir = 'datasets/aclImdb'
neg_reviews = []
target_dir = 'datasets/aclImdb/train/neg' # just using the training dataset
i = 0
for fname in os.listdir(target_dir):
    i += 1
    if i > reviews_to_collect:
        break
    active_file = open('{}/{}'.format(target_dir, fname))
    txt_extract = parser(active_file.read())
    for s in txt_extract.sents:
        sent_extract = ''.join([x for x in s.text.lower() if x in string.printable]) # lower and drop non-ASCII
        neg_reviews.append(sent_extract)
    active_file.close()

## Inspection

In [7]:
# assess the distribution of character count per sentence
hist_trace = go.Histogram(x=list(map(len, neg_reviews)))
hist_layout = go.Layout(title='Distribution of Character Count Per Sentence')
fig = go.Figure(data=[hist_trace], layout=hist_layout)
iplot(fig)

## Preprocessing

In [8]:
# remove all sentences with character count fewer than max_chars_per_sentence 
neg_review_subset = [review for review in neg_reviews if len(review) > max_chars_per_sentence]
print("Retained {:,} sentences, {:.0%} of the original set".format(len(neg_review_subset), len(neg_review_subset)/len(neg_reviews)))

Retained 11,769 sentences, 81% of the original set


In [9]:
# set aside some documents for the generator to produce text
gen_sentences = neg_review_subset[0:sentences_for_generator]
neg_review_subset = neg_review_subset[sentences_for_generator:len(neg_review_subset)]
print("{:,} of sentences will be used for training and {:,} are retained for text generation".format(len(gen_sentences), len(neg_review_subset)))

100 of sentences will be used for training and 10,000 are retained for text generation


In [20]:
# set up input and output characters
text_dict = {'input':[], 'output':[]}
for review in neg_review_subset:
    for c in range(0, len(review)-max_chars_per_sentence, step_size):
        text_dict['input'].append(review[c:c+max_chars_per_sentence]) # sliding window; context
        text_dict['output'].append(review[c+max_chars_per_sentence]) # next character for the given input
# limit number of training examples:
if len(text_dict['input']) > training_sentences:
    text_dict['input'] = text_dict['input'][0:training_sentences]
    text_dict['output'] = text_dict['output'][0:training_sentences]

In [21]:
print("number of cases: {:,}".format(len(text_dict['input'])))

number of cases: 100,000


In [22]:
# explore characters seen in the dataset
retained_chars = sorted(set(' '.join(neg_review_subset)))
print('captures character stats: \ncount: {} \ncharacters: {}'.format(len(retained_chars), ''.join(retained_chars)))

captures character stats: 
count: 66 
characters: 	 !"#$%&'()*+,-./0123456789:;<=>?@^_`abcdefghijklmnopqrstuvwxyz{}~


In [23]:
# generate input and output matrices (this will take about 2 mins)
data_array = {'input': [], 'output': []} # set up a dictionary for data collection
char_indices = dict((char, retained_chars.index(char)) for char in retained_chars) # create an index for the character dictionary 
data_array['input'] = np.zeros((len(text_dict['input']), max_chars_per_sentence, len(retained_chars)), dtype=np.bool) 
data_array['output'] = np.zeros((len(text_dict['input']), len(retained_chars)), dtype=np.bool)
for i, sentence in enumerate(text_dict['input']): # populate input and output arrays
    for t, char in enumerate(sentence):
        data_array['input'][i, t, char_indices[char]] = 1
    data_array['output'][i, char_indices[text_dict['output'][i]]] = 1

In [24]:
# creating a function that samples text based on model predictions
def text_sampler(prediction, temperature):
    ''' Sampling text given expected value
    prediction: vector of probabilities
    temperature: randomness factor -- higher values introduce more unpredictability
    '''
    pred_array = np.asarray(prediction).astype('float64')
    pred_array = np.exp(np.log(pred_array) / temperature)
    pred_array /= np.sum(pred_array)
    pred_pr = np.random.multinomial(1, pred_array, 1)
    
    return np.argmax(pred_pr)
    

## Modelling

In [98]:
# define model
model = keras.models.Sequential()
model.add(layers.LSTM(128, recurrent_dropout=0.25, return_sequences=True, input_shape=(max_chars_per_sentence, len(retained_chars))))
model.add(layers.LSTM(256, recurrent_dropout=0.25))
model.add(layers.Dense(len(retained_chars), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.RMSprop(lr=0.01))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 25, 128)           99840     
_________________________________________________________________
lstm_10 (LSTM)               (None, 256)               394240    
_________________________________________________________________
dense_8 (Dense)              (None, 66)                16962     
Total params: 511,042
Trainable params: 511,042
Non-trainable params: 0
_________________________________________________________________


In [99]:
# fit the model
model.fit(
    data_array['input'],
    data_array['output'],
    batch_size=1000,
    epochs=10,
    validation_split=0.2
)


Train on 80000 samples, validate on 20000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14a2d4390>

In [100]:
chars_to_generate = 250 # characters to produce
target_temp = 0.5

# generate text
composed_text = ''

# generate random text
generator_draw = random.randint(0, sentences_for_generator)
gen_text = gen_sentences[generator_draw]

# iterate over the number of characters we want to generate
for c in range(chars_to_generate):
    gen_text = gen_text[len(gen_text)-max_chars_per_sentence:len(gen_text)]
    gen_array = np.zeros((1, max_chars_per_sentence, len(retained_chars)))

    for t, char in enumerate(gen_text):
        gen_array[0, t, char_indices[char]] = 1

    preds = model.predict(gen_array, verbose=0)[0]
    next_char = retained_chars[text_sampler(preds, target_temp)]
    gen_text += next_char
    composed_text += next_char

In [101]:
gen_sentences[generator_draw] # selected seed text

"home video & theatrical version of airport '77 run 108 minutes while the us tv versions add an extra hour of footage including a new opening credits sequence, many more scenes with george kennedy as patroni, flashbacks to flesh out character's, longer rescue scenes & the discovery or another couple of dead bodies including the navigator."

In [102]:
# view the generated results!
composed_text

'<br /><br />the give the end nayss to the second having that thinking the comments of the movie because the posibully makes that was the poorland of the loney to be a lot of the same awer a thing to say the best to be the movie was the can be feel be'