In [1]:
from keras.preprocessing.text import Tokenizer
import re

import numpy as np
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
from pickle import dump, load

In [2]:
def make_train_seq_prev_words(text, n_pervw=1):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    encoded = tokenizer.texts_to_sequences([text])[0]

    sequences = np.array([encoded[i-n_pervw: i+1]
                          for i in range(n_pervw, len(encoded))])

    return \
        sequences[:, :-1], \
        to_categorical(sequences[:, -1], num_classes=len(tokenizer.word_index)+1),\
        tokenizer, \
        n_pervw+1, \
        len(tokenizer.word_index) + 1

In [3]:
def make_model(X, y, max_len, vocab_size, epo=500):
    model = Sequential()
    model.add(Embedding(vocab_size, 10, input_length=max_len-1))
    model.add(LSTM(50))
    model.add(Dense(vocab_size, activation='softmax'))
    print(model.summary())

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=epo, verbose=2)
    return model

In [4]:
def gen_top_next_word(model, tokenizer, max_len, seed_text, n_words):

    seq = np.array(tokenizer.texts_to_sequences([seed_text])[0])
    seq = pad_sequences([seq], maxlen=max_len-1, padding='pre')

    yhat = model.predict_on_batch([seq])

    out_text = [tokenizer.index_word[i]
                for i in np.argsort(yhat)[0][::-1][:n_words]]

    return out_text

In [5]:
with open('data\shortteststory.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [6]:
print(data.split()[:200])

["bill's", 'disappeared', 'upstairs', 'with', 'the', 'little', 'sickly', 'girl', 'but', 'me', "i'm", 'here', 'to', 'see', 'the', 'blind', 'woman', 'i', 'hear', 'she', 'did', 'it', 'on', 'purpose', 'tore', 'donkey', 'tail', 'out', 'of', 'the', 'earth', 'and', 'smeared', 'its', 'milky', 'sap', 'directly', 'in', 'her', 'eyes', 'it', 'must', 'of', 'stung', 'her', 'hands', 'and', 'face', 'bad', 'and', 'maybe', 'left', 'marks', 'on', 'her', 'fingers', 'near', 'santa', 'fe', 'men', 'women', 'and', 'children', 'are', 'always', 'getting', 'possessed', 'of', 'this', 'sort', 'of', 'ecstasy', 'and', 'seem', 'like', 'taking', 'pleasure', 'in', 'swallowing', 'poison', 'and', 'fire', 'and', 'stepping', 'on', 'scorpions', 'and', 'suchlike', 'but', 'it', "ain't", 'superstition', 'like', 'the', 'government', 'try', 'to', 'tell', 'me', 'in', 'the', 'last', 'war', 'superstition', 'i', 'know', "that's", 'when', 'you', 'got', 'to', 'tie', 'your', 'right', 'boot', 'afore', 'your', 'left', 'or', 'when', 'like

In [7]:
words = data.split()
words = [word.lower() for word in words]
clean_words = []

for word in words:
    word = re.sub(r"\"|\?|—|-|\.|:|”|!|\*|;|,|“", ' ', word).strip()
    word = re.sub(r"’", "'", word).strip()
    if word:
        clean_words.append(word)

print(clean_words[:20])

["bill's", 'disappeared', 'upstairs', 'with', 'the', 'little', 'sickly', 'girl', 'but', 'me', "i'm", 'here', 'to', 'see', 'the', 'blind', 'woman', 'i', 'hear', 'she']


In [8]:
with open('data\shortteststory.txt', 'w', encoding='utf-8') as file:
    file.write(' '.join(clean_words))

In [9]:
model_n_prev_versions = {1: None, 2: None}

In [37]:
for i in model_n_prev_versions:
    X, y, tok, maxlen, vocab_size = make_train_seq_prev_words(clean_words, i)
    model = make_model(X, y, maxlen, vocab_size, epo=400)
    model_n_prev_versions[i] = model

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1, 10)             9050      
                                                                 
 lstm_2 (LSTM)               (None, 50)                12200     
                                                                 
 dense_2 (Dense)             (None, 905)               46155     
                                                                 
Total params: 67405 (263.30 KB)
Trainable params: 67405 (263.30 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/400
70/70 - 2s - loss: 6.7928 - accuracy: 0.0487 - 2s/epoch - 25ms/step
Epoch 2/400
70/70 - 0s - loss: 6.6906 - accuracy: 0.0541 - 136ms/epoch - 2ms/step
Epoch 3/400
70/70 - 0s - loss: 6.3770 - accuracy: 0.0532 - 132ms/epoch - 2ms/step
Epoch 4/400
70/70 - 0s - loss: 

In [52]:
print(gen_top_next_word(
    model_n_prev_versions[1], tok, maxlen, "i'm like It makes", 20))

['quick', 'me', 'of', 'know', 'their', "i'd", 'have', 'be', 'little', 'his', 'the', "don't", 'wax', 'that', 'is', 'railroads', 'must', 'fort', 'stop', 'arkansas']


In [41]:
model_n_prev_versions

{1: <keras.src.engine.sequential.Sequential at 0x1bd8c7dcdd0>,
 2: <keras.src.engine.sequential.Sequential at 0x1bd8c7d3b10>}

In [57]:
def save_models_and_tok():
    for k, v in model_n_prev_versions.items():
        v.save(f'models\model_{k}_prev.h5')
    dump(tok, open('tokenizer.pkl', 'wb'))

In [58]:
save_models_and_tok()

In [62]:
model1 = load_model('models\model_2_prev.h5')
maxlen = 3

In [63]:
tok = load(open('tokenizer.pkl', 'rb'))

In [66]:
print(
    gen_top_next_word(model1, tok, maxlen, "on makes", 100)
)

['but', 'children', 'if', 'time', 'his', 'last', "i'm", 'and', 'again', 'without', 'that', 'about', 'wails', 'so', 'heir', 'what', 'their', 'even', 'this', 'fingers', 'my', 'are', 'not', 'old', 'california', 'years', 'arkansas', 'way', 'up', 'problem', 'distance', 'worked', 'on', 'concrete', 'wax', 'does', 'one', 'just', 'alone', 'the', 'covered', 'fire', 'layer', 'spotted', 'bill', "we're", 'marks', 'getting', 'stepping', 'dart', 'anastasios', 'freezing', 'gory', 'yellow', 'for', 'brick', 'he', 'breath', 'then', 'pulse', 'already', 'we', 'pause', 'nearly', 'along', 'sense', 'horse', 'ecstasy', 'a', 'know', 'tore', 'fasten', 'few', 'life', "that's", 'to', 'two', 'was', 'poison', 'any', 'trace', 'boot', 'mouth', 'seem', 'little', 'pouched', 'restless', 'other', 'many', 'goddammit', 'stretch', 'by', 'explodes', 'groaning', 'or', 'across', 'women', 'right', 'days', 'as']
