## Install and Import Dependencies

In [1]:
pip install tensorflow numpy requests



In [2]:
import tensorflow as tf
import numpy as np
import requests
import string
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential

## Data Preparation

In [3]:
response = requests.get('https://www.gutenberg.org/cache/epub/1497/pg1497.txt')
data = response.text.split('\n')
data = " ".join(data)

#### Cleaning Text

In [4]:
def clean_text(doc):
    tokens = doc.split()
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
    return tokens

tokens = clean_text(data)

#### Creating Aligned Sequences

In [5]:
def get_aligned_training_data(text_tokens, train_data_width):
    length = train_data_width + 1
    lines = []
    for i in range(length, len(text_tokens)):
        seq = text_tokens[i - length:i]
        line = ' '.join(seq)
        lines.append(line)
    return lines

lines = get_aligned_training_data(tokens, 50)

### Tokenization and Padding

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
vocab_size = len(tokenizer.word_index) + 1

sequences = np.array(pad_sequences(sequences, padding='pre'))

### Splitting Data For Training

In [7]:
# Split into input (X) and output (y)
X, y = sequences[:, :-1], sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

In [8]:
X.shape

(216232, 50)

## Model Architecture

In [9]:
model = Sequential([
    Embedding(vocab_size, 200, input_length = X.shape[1]),
    Bidirectional(LSTM(256, return_sequences = True)),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(512, activation = 'relu'),
    Dense(vocab_size, activation = 'softmax')
])



#### Compile the Model

In [10]:
model.compile(optimizer = 'adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

## Model Training

In [11]:
model.fit(X[:50000], y[:50000],
          epochs = 75)

Epoch 1/75
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 23ms/step - accuracy: 0.0877 - loss: 6.5377
Epoch 2/75
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 23ms/step - accuracy: 0.1299 - loss: 5.7432
Epoch 3/75
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 23ms/step - accuracy: 0.1419 - loss: 5.5270
Epoch 4/75
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 22ms/step - accuracy: 0.1642 - loss: 5.3205
Epoch 5/75
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.1723 - loss: 5.1408
Epoch 6/75
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 22ms/step - accuracy: 0.1788 - loss: 5.0099
Epoch 7/75
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.1912 - loss: 4.8568
Epoch 8/75
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 23ms/step - accuracy: 0.1984 - loss: 4.7233
Epoch 9/

<keras.src.callbacks.history.History at 0x7d0c62206210>

In [14]:
model.save('plato.keras')

## Generating New Text

In [15]:
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, generated_words_count):
    text_generated = []
    input_text = seed_text
    for _ in range(generated_words_count):
        encoded = tokenizer.texts_to_sequences([input_text])[0]
        encoded = pad_sequences([encoded], maxlen=text_seq_length, truncating='pre')

        predict_x = model.predict(encoded, verbose=0)
        y_predict = np.argmax(predict_x, axis=1)

        predicted_word = ''

        for word, index in tokenizer.word_index.items():
            if index == y_predict[0]:
                predicted_word = word
                break
        input_text += ' ' + predicted_word
        text_generated.append(predicted_word)
    return ' '.join(text_generated)

#### Example

In [21]:
seed_text = "Plato is"
number_of_words_to_generate = 50

generated_text = generate_text_seq(model, tokenizer, X.shape[1], seed_text, number_of_words_to_generate)

print("Seed text:", seed_text)
print("Generated text:", generated_text)

Seed text: Plato is
Generated text: only provisional and will have other applications at the world below grows upon every following their own interests or the intellect second the thing of the state will decline and the valuable time of the producers will be wasted in vain efforts at exchange if he will discern the same
