In [None]:
# source text: the nursery rhyme used as the whole training corpus
data = """ Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n """

#Model 1: One-Word-In, One-Word-Out Sequences
####We can start with a very simple model. Given one word as input, the model will learn to predict the next word in the sequence.

####The first step is to encode the text as integers. Each lowercase word in the source text is assigned a unique integer and we can convert the sequences of words to sequences of integers.

####Keras provides the Tokenizer class that can be used to perform this encoding.

####First, the Tokenizer is fit on the source text to develop the mapping from words to unique integers. Then sequences of text can be converted to sequences of integers by calling the texts_to_sequences() function.

In [None]:
from keras.preprocessing.text import Tokenizer
# learn the word -> integer mapping from the source text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# a single document goes in, so a single integer sequence comes back
encoded_documents = tokenizer.texts_to_sequences([data])
encoded = encoded_documents[0]

####We will need to know the size of the vocabulary later for both defining the word embedding layer in the model, and for encoding output words using a one hot encoding.

####The size of the vocabulary can be retrieved from the trained Tokenizer by accessing the word_index attribute.

In [None]:
# vocabulary size: one slot per distinct word, plus one extra so the
# largest word index is still a valid array position
distinct_words = len(tokenizer.word_index)
vocab_size = distinct_words + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 22


####Running this example, we can see that the vocabulary size is 22: the 21 unique words in the text, plus one.

####We add one because we will need to specify the integer for the largest encoded word as an array index, e.g. words encoded 1 to 21 with array indices 0 to 21, or 22 positions.

####Next, we need to create sequences of words to fit the model with one word as input and one word as output.

In [None]:
# pair every token with the token that follows it: [input word, output word]
sequences = [encoded[pos - 1:pos + 1] for pos in range(1, len(encoded))]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 24


###Running this piece shows that we have a total of 24 input-output pairs to train the network.

####We can then split the sequences into input (X) and output elements (y). This is straightforward as we only have two columns in the data.

In [None]:
#split into X and y elements
import numpy as np
import pandas as pd
# turn the list of [input, output] pairs into a 2-column array
sequences = np.array(sequences)
# column 0 holds the input word index, column 1 the word that follows it
X = sequences[:, 0]
y = sequences[:, 1]

###We will fit our model to predict a probability distribution across all words in the vocabulary.

####That means that we need to turn the output element from a single integer into a one hot encoding, with a 0 for every word in the vocabulary and a 1 at the index of the actual output word.

###This gives the network a ground truth to aim for from which we can calculate error and update the model.

####Keras provides the to_categorical() function that we can use to convert the integer to a one hot encoding while specifying the number of classes as the vocabulary size.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
# import plot_model from the correct module for TensorFlow 2.0 and later
from tensorflow.keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
# one hot encode outputs
# Encode from the integer output column of `sequences` rather than from `y`
# itself, so re-running this cell does not one-hot encode an array that is
# already one-hot encoded.
y = to_categorical(sequences[:, 1], num_classes=vocab_size)

###We are now ready to define the neural network model.

####The model uses a 'learned word embedding' in the input layer.

####This has one real-valued vector for each word in the vocabulary, where each word vector has a specified length.

####In this case we will use a 10-dimensional projection.

####The input sequence contains a single word, therefore input_length=1.

####The model has a single hidden LSTM layer with 50 units. This is far more than is needed.

####The output layer is comprised of one neuron for each word in the vocabulary and uses a softmax activation function to ensure the output is normalized to look like a probability.



In [None]:
# define the model
def define_model(vocab_size):
    """Build, compile and summarize the one-word-in, one-word-out network.

    The network stacks an Embedding layer (10-dimensional vectors, input
    length 1), an LSTM layer with 50 units, and a softmax Dense layer with
    one output per vocabulary index.

    Args:
        vocab_size: Number of integer word indices; used both as the
            embedding input dimension and as the width of the output layer.

    Returns:
        The compiled Sequential model.

    Side effects:
        Prints the model summary and writes an architecture diagram to
        'model.png'.
    """
    layers = [
        Embedding(vocab_size, 10, input_length=1),
        LSTM(50),
        Dense(vocab_size, activation='softmax'),
    ]
    network = Sequential(layers)
    # multiclass classification over the vocabulary
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.summary()
    plot_model(network, to_file='model.png', show_shapes=True)
    return network

####We will use this same general network structure for each example in this tutorial, with minor changes to the learned embedding layer.

####We can compile and fit the network on the encoded text data.

####Technically, we are modeling a multiclass classification problem (predict the word in the vocabulary), therefore using the categorical cross entropy loss function.

###We use the efficient Adam implementation of gradient descent and track accuracy at the end of each epoch.

####The model is fit for 500 training epochs.

####After the model is fit, we test it by passing it a given word from the vocabulary and having the model predict the next word.

####Here we pass in "Jack" by encoding it, calling model.predict() to get a probability for every word in the vocabulary, and taking the argmax to get the integer of the predicted word. This is then looked up in the vocabulary mapping to give the associated word.

In [None]:
# build and compile the network; define_model also prints the summary and writes model.png
model = define_model(vocab_size) # build the model


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             220       
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 22)                1122      
                                                                 
Total params: 13542 (52.90 KB)
Trainable params: 13542 (52.90 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# fit the model: inputs are word indices, targets are one-hot next words
model.fit(x=X, y=y, epochs=500, verbose=2)


Epoch 1/500
1/1 - 3s - loss: 3.0909 - accuracy: 0.0833 - 3s/epoch - 3s/step
Epoch 2/500
1/1 - 0s - loss: 3.0901 - accuracy: 0.0833 - 18ms/epoch - 18ms/step
Epoch 3/500
1/1 - 0s - loss: 3.0894 - accuracy: 0.2083 - 21ms/epoch - 21ms/step
Epoch 4/500
1/1 - 0s - loss: 3.0886 - accuracy: 0.2083 - 32ms/epoch - 32ms/step
Epoch 5/500
1/1 - 0s - loss: 3.0878 - accuracy: 0.1250 - 17ms/epoch - 17ms/step
Epoch 6/500
1/1 - 0s - loss: 3.0870 - accuracy: 0.1250 - 14ms/epoch - 14ms/step
Epoch 7/500
1/1 - 0s - loss: 3.0862 - accuracy: 0.1250 - 12ms/epoch - 12ms/step
Epoch 8/500
1/1 - 0s - loss: 3.0854 - accuracy: 0.1250 - 12ms/epoch - 12ms/step
Epoch 9/500
1/1 - 0s - loss: 3.0846 - accuracy: 0.1250 - 12ms/epoch - 12ms/step
Epoch 10/500
1/1 - 0s - loss: 3.0838 - accuracy: 0.1250 - 13ms/epoch - 13ms/step
Epoch 11/500
1/1 - 0s - loss: 3.0830 - accuracy: 0.1250 - 14ms/epoch - 14ms/step
Epoch 12/500
1/1 - 0s - loss: 3.0821 - accuracy: 0.1250 - 14ms/epoch - 14ms/step
Epoch 13/500
1/1 - 0s - loss: 3.0813 - ac

<keras.src.callbacks.History at 0x7a7b781e57b0>

In [None]:
# evaluate: ask the model which word most likely follows a single seed word
in_text = 'Jack'
print(in_text)
# Use a dedicated name for the encoded seed so the training token sequence
# stored in `encoded` is not overwritten (earlier cells read it on re-run).
seed_encoded = np.array(tokenizer.texts_to_sequences([in_text])[0])
yhat_probs = model.predict(seed_encoded, verbose=0)  # Get probabilities for each class
yhat = np.argmax(yhat_probs, axis=-1)  # Predict the class with highest probability
# map the predicted integer back to its word
for word, index in tokenizer.word_index.items():
    if index == yhat[0]:
        print(word)
        break  # word indices are unique, so stop at the first match

Jack
and


###This process could then be repeated a few times to build up a generated sequence of words.

###To make this easier, we wrap up the behavior in a function that we can call by passing in our model and the seed word.

In [None]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
    """Generate text by repeatedly predicting the next word from the last one.

    Args:
        model: Trained model exposing ``predict(x, verbose=0)`` that returns
            one row of per-word probabilities for each input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (word -> integer).
        seed_text: Text to start from. Only its last in-vocabulary word is
            fed to the model, since the model takes one word as input.
        n_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        Generation stops early if the current text contains no known word or
        the predicted index has no word attached to it.
    """
    # Build the integer -> word lookup once instead of scanning word_index
    # on every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text, result = seed_text, seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        if len(encoded) == 0:
            # nothing in the vocabulary to condition on
            break
        # One-word-in model: condition on the last word only, shaped as a
        # single sample of length 1.
        model_input = np.array(encoded[-1:]).reshape(1, 1)
        # predict a word in the vocabulary
        yhat_probs = model.predict(model_input, verbose=0)
        yhat = int(np.argmax(yhat_probs, axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(yhat)
        if out_word is None:
            # predicted an index that is not assigned to any word
            break
        # append to input
        in_text, result = out_word, result + ' ' + out_word
    return result

In [None]:
# generate six words, starting from the seed word 'Jack'
print(generate_seq(model, tokenizer, 'Jack', 6))


Jack and jill came tumbling after his
