# Week 4: Sequence models and literature (Text Generation)


In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Raw training text; each '\n' separates one sentence (lyric line) from the next.
data = "In the town of Athy one Jeremy Lanigan \n Battered away ... ..."

# Lower-case everything and split on newlines to get a list of sentences.
corpus = data.lower().split('\n')

# Fit the tokenizer on the corpus: this builds `word_index`, a dictionary that
# maps every distinct word to its integer token.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size. Word tokens start at 1 (0 is kept free for padding), so one
# extra slot is added to cover index 0.
total_words = 1 + len(tokenizer.word_index)

# Build the training sequences.
# Every sentence is converted to its list of tokens, and each prefix of that
# list with at least two tokens becomes one training sequence (n-gram):
#   [4 2 66 8 67] ->
#   [4 2]
#   [4 2 66]
#   [4 2 66 8]
#   [4 2 66 8 67]
# Sentences with fewer than two known words contribute nothing.
input_sentences = [
    sentence_tokens[:prefix_len]
    for sentence_tokens in tokenizer.texts_to_sequences(corpus)
    for prefix_len in range(2, len(sentence_tokens) + 1)
]

# Length of the longest n-gram sequence; every sequence is padded up to it.
max_sequence_len = max(len(sequence) for sequence in input_sentences)

# Pre-pad with zeros so all sequences share one length and the label always
# sits in the last column:
#   [4 2]          -> [0 0 0 4 2]
#   [4 2 66]       -> [0 0 4 2 66]
#   [4 2 66 8]     -> [0 4 2 66 8]
#   [4 2 66 8 67]  -> [4 2 66 8 67]
input_sentences = np.asarray(
    pad_sequences(input_sentences, maxlen=max_sequence_len, padding='pre')
)

# Split each padded row into input (X) and label (Y): the last token is the
# word to predict, everything before it is the context.
#   [0 0 0 4 2]   -> X: [0 0 0 4]   Y: 2
#   [0 0 4 2 66]  -> X: [0 0 4 2]   Y: 66
#   [0 4 2 66 8]  -> X: [0 4 2 66]  Y: 8
#   [4 2 66 8 67] -> X: [4 2 66 8]  Y: 67
xs, labels = input_sentences[:, :-1], input_sentences[:, -1]

# One-hot encode the labels: predicting the next word is a classification
# problem with one class per vocabulary entry.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Finding what the next word should be
# Create a NN that classifies what the next word should be, given a sequence of words
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
# Next-word classifier, declared as a single layer stack:
#   * Embedding: one 64-dimensional vector per vocabulary entry (the dimension
#     can be tuned). The input length is max_sequence_len - 1 because the last
#     token of every sequence was cropped off to serve as the label (Y).
#   * Bidirectional LSTM with 20 units (can be tuned). A plain `LSTM(20)` also
#     works, but it only carries context forward and tended to produce many
#     repeated words; reading the sequence in both directions converged faster
#     and repeated less.
#   * Dense softmax with one neuron per word, the same size as the one-hot
#     encoded labels.
model = Sequential([
    Embedding(total_words, 64, input_length=max_sequence_len - 1),
    Bidirectional(LSTM(20)),
    Dense(total_words, activation='softmax'),
])

# Multi-class classification, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# There is very little data, so the model needs many epochs to converge.
model.fit(xs, ys, epochs=500, verbose=1)

# Generate text: start from a 'seed' sentence and repeatedly append the word
# the model considers most likely to come next.
seed_text = 'Laurence went to dublin'
# Number of words to generate. The more words we predict, the more uncertainty
# accumulates in the sentence; a larger corpus would help.
next_words = 10
for _ in range(next_words):
    # Tokenize the current text; out-of-vocabulary words (e.g. 'Laurence') are
    # dropped, giving something like [134 13 59].
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Pre-pad (or truncate from the front) to the input length used in
    # training, e.g. [0 0 0 0 134 13 59].
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    # `Sequential.predict_classes` was removed from TensorFlow, so take the
    # argmax over the softmax output instead. The batch holds a single row, so
    # element 0 is the token of the most likely next word.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # Reverse lookup token -> word. Token 0 is the padding index and has no
    # word, in which case an empty string is appended (as before).
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)


In [None]:
# Poetry!
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/irish-lyrics-eof.txt \
    -O /tmp/irish-lyrics-eof.txt

from tensorflow.keras.optimizers import Adam
# Load the full lyrics corpus; the context manager closes the file handle.
with open('/tmp/irish-lyrics-eof.txt') as lyrics_file:
    data = lyrics_file.read()

# Rebuild the vocabulary and the training set from the new corpus. Without
# this step the model below would be sized for, and trained on, the data of
# the previous cell, and the downloaded lyrics would never be used.
corpus = data.lower().split('\n')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1  # +1 for the padding index 0

# Every prefix (of at least two tokens) of every line is one training sequence.
input_sentences = [
    sentence_tokens[:prefix_len]
    for sentence_tokens in tokenizer.texts_to_sequences(corpus)
    for prefix_len in range(2, len(sentence_tokens) + 1)
]
max_sequence_len = max(len(sequence) for sequence in input_sentences)
input_sentences = pad_sequences(input_sentences, maxlen=max_sequence_len, padding='pre')

# The last token of each padded sequence is the label, the rest is the input.
xs = input_sentences[:, :-1]
labels = input_sentences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))  # 100 is the dimensionality of the embedding
model.add(Bidirectional(LSTM(150)))  # 150 LSTM units in each direction
model.add(Dense(total_words, activation='softmax'))
# `lr` is a deprecated alias that newer Keras versions reject; `learning_rate`
# is the supported argument name.
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
# More epochs is generally better, but eventually we hit the law of diminishing returns.
history = model.fit(xs, ys, epochs=100, verbose=1)