## Prepare dataset

In [50]:
import os

import pandas as pd
import numpy as np

from keras.src.preprocessing.text import Tokenizer
from keras.src.utils import pad_sequences
from keras.src.utils import to_categorical

from nltk import ngrams

### Specify dataset version and number of sentences to import

In [51]:
dataset_version = 'v2'
n_sentences = 1000

n_gram_length_min = 3
n_gram_length_max = 10

### Import

In [52]:
dataset_filename = 'blogtext_cleaned{0}.csv'.format(dataset_version)

download = not os.path.exists('./' + dataset_filename)

if download:
    print("{0} not found, will attempt to download".format(dataset_filename))
    
    if dataset_version == 'v1':
        !gdown 16ySojdSN9etEurLs2beGWCJKb6h15bJV
    elif dataset_version == 'v2':
        !gdown 15El0E261xOjyhapRss9HJ2Fi91Th88jN
    else:
        raise Exception("Unknown dataset version {0}".format(dataset_version))

df = pd.read_csv(dataset_filename, nrows=n_sentences).head(n_sentences)

print("Loaded {0} rows from {1}".format(n_sentences, dataset_filename))

Loaded 1000 rows from blogtext_cleanedv2.csv


### Tokenize words

In [53]:
sentence_list = df['text'].tolist()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence_list)
total_words = len(tokenizer.word_index) + 1

### Generate n-gram list

In [54]:
n_gram_list = []

for line in sentence_list:
    token_list = tokenizer.texts_to_sequences([line])[0]

    for n in range(n_gram_length_min, n_gram_length_max):
        n_grams = ngrams(token_list, n)
        n_gram_list.extend(np.asarray([*n_grams]))

# Padding
n_gram_list = np.array(pad_sequences(
    n_gram_list,
    maxlen=n_gram_length_max,
    padding='pre'
))

X = n_gram_list[:, :-1]
y = n_gram_list[:, -1]

y = to_categorical(y, num_classes=total_words)

In [55]:
print("Number of words: {0}".format(total_words))
print("N-gram list length: {0}".format(len(n_gram_list)))

Number of words: 2505
N-gram list length: 51215


## Build and Train Model

In [56]:
from keras.src.layers import Embedding, GRU, Dense, LSTM
from keras import Sequential
import keras
import pickle

### Specify hyperparameters

In [57]:
embedding_size = 128
output_dim = 256 # Output dimension of LSTM / GRU / RNN layer
activation_func = 'softmax'
optimizer = 'adam'

epochs = 100

### Build model

In [58]:
model = Sequential()
model.add(Embedding(total_words, embedding_size, input_length=n_gram_length_max-1))

model.add(LSTM(output_dim))

model.add(Dense(total_words, activation=activation_func))

model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 9, 128)            320640    
                                                                 
 lstm_3 (LSTM)               (None, 256)               394240    
                                                                 
 dense_3 (Dense)             (None, 2505)              643785    
                                                                 
Total params: 1358665 (5.18 MB)
Trainable params: 1358665 (5.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Begin training

In [59]:
from datetime import datetime

logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

checkpoint_path = "checkpoints/" + datetime.now().strftime("%Y%m%d-%H%M%S") + "/" + "model_checkpoint_{epoch:02d}.h5"
checkpoint_callback = keras.callbacks.ModelCheckpoint(checkpoint_path, save_freq=5000, verbose=1)

model.fit(X, y,
          epochs=epochs, verbose=1,
          callbacks=[tensorboard_callback, checkpoint_callback]
)

Epoch 1/100
 105/1601 [>.............................] - ETA: 34s - loss: 6.8485 - accuracy: 0.0387

KeyboardInterrupt: 

### Persist model

In [None]:
model.save("models/model_{0}.h5".format(datetime.now()).replace(" ", "_"), save_format='h5')

with open("models/tokenizer_{0}.pickle".format(datetime.now()).replace(" ", "_"), 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

## Now we can play with the model!
Enter the seed text, and run the cell. The model will predict the most probable next word for your sentence

In [None]:
seed_text = "Hello there, do you like"
next_words = 1

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list],
        maxlen=n_gram_length_max - 1,
        padding='pre'
    )

    predictions = model.predict(token_list)
    pred_word = tokenizer.index_word[np.argmax(predictions)]
    seed_text += " " + pred_word

print("Next predicted words: ", seed_text)