# Week 4: Sequence models and literature (Text Generation)


In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Raw text; every '\n' separates one sentence (line) from the next.
data = "In the town of Athy one Jeremy Lanigan \n Battered away ... ..."

# Lower-case the text and break it on newlines to get a list of sentences.
corpus = data.lower().split('\n')

# Fit the tokenizer on the sentences: this builds tokenizer.word_index,
# a dictionary mapping each distinct word to its integer token.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Size of the vocabulary plus one. The Keras Tokenizer never assigns index 0
# to a word, so the extra slot keeps 0 available (it is used for padding).
total_words = 1 + len(tokenizer.word_index)

# Build the training sequences: each line contributes every prefix of its
# token list that is at least two tokens long.
input_sentences = []  # n-gram sequences; split into inputs and labels later
for line in corpus:
    # texts_to_sequences works on a list of texts, so wrap the line and take
    # the single result back out (e.g. a line becomes [4 2 66 8 67]).
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Collect the growing prefixes of the token list:
    # [4 2 66 8 67] ->
    #   [4 2]
    #   [4 2 66]
    #   [4 2 66 8]
    #   [4 2 66 8 67]
    # A line with fewer than two tokens contributes nothing.
    input_sentences.extend(
        token_list[:prefix_len] for prefix_len in range(2, len(token_list) + 1)
    )

# Find the length of the longest n-gram sequence; all sequences are padded to it.
# Fail with a clear message instead of max()'s generic "empty sequence" error.
if len(input_sentences) == 0:
    raise ValueError(
        "input_sentences is empty: the corpus yielded no sequences of two or more tokens"
    )
max_sequence_len = max(len(seq) for seq in input_sentences)

# Pad all of the sequences so they are of the same length. Padding goes in
# front ('pre') so that the label is always the last element of each row.
# pad_sequences already returns a NumPy array, so no np.array(...) copy is needed.
input_sentences = pad_sequences(input_sentences, maxlen=max_sequence_len, padding='pre')
# With a longest sequence of 5 tokens we now get:
# [4 2]          -> [0 0 0 4 2]
# [4 2 66]       -> [0 0 4 2 66]
# [4 2 66 8]     -> [0 4 2 66 8]
# [4 2 66 8 67]  -> [4 2 66 8 67]

# Now turn the sequences into X's and Y's (input values and labels).
# General idea: the last token of each row is the label (Y) and everything
# before it is the input (X), so X is one token shorter than the padded row:
# [0 0 0 4 2]   -> X: [0 0 0 4]   Y: 2
# [0 0 4 2 66]  -> X: [0 0 4 2]   Y: 66
# [0 4 2 66 8]  -> X: [0 4 2 66]  Y: 8
# [4 2 66 8 67] -> X: [4 2 66 8]  Y: 67
xs = input_sentences[:, :-1]  # every column except the last one
labels = input_sentences[:, -1]  # only the last column

# One-hot encode the labels. This is a classification problem: given a sequence
# of words, pick the most likely next word out of total_words classes.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)