In [21]:
import requests
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [22]:
# Load the text data
url = "https://www.gutenberg.org/cache/epub/11/pg11.txt"
response = requests.get(url)
text = response.text

In [23]:
# Preprocess text data
def preprocess_text(text):
    start_idx = text.find("*** START")
    end_idx = text.find("*** END")
    text = text[start_idx:end_idx]
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

processed_text = preprocess_text(text)
corpus = processed_text.split('.')
corpus = [sentence.strip() for sentence in corpus if sentence.strip() != '']
print(' '.join(corpus)[:200])

start of the project gutenberg ebook alices adventures in wonderland 
illustration




alices adventures in wonderland

by lewis carroll

the millennium fulcrum edition 

contents

 chap


In [24]:
# Create vocabulary and input sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [25]:
# Cap the max_sequence_len to a reasonable value
max_sequence_len = max([len(x) for x in input_sequences])
print(f"Max sequence length before capping: {max_sequence_len}")
max_sequence_len = min(max_sequence_len, 100)  # cap at 100
print(f"Max sequence length after capping: {max_sequence_len}")

input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

xs, labels = input_sequences[:,:-1], input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

Max sequence length before capping: 27371
Max sequence length after capping: 100


In [29]:
# Build and compile the GRU model
model_gru = Sequential()
model_gru.add(Embedding(total_words, 100))  # Removed input_length argument
model_gru.add(GRU(150, return_sequences=True, input_shape=(max_sequence_len-1, 100)))
model_gru.add(Dropout(0.2))
model_gru.add(GRU(100))
model_gru.add(Dense(total_words, activation='softmax'))

model_gru.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  super().__init__(**kwargs)


In [30]:
# Train the GRU model
early_stop = EarlyStopping(monitor='loss', patience=3)
history_gru = model_gru.fit(xs, ys, epochs=100, verbose=1, callbacks=[early_stop])

Epoch 1/100
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 66ms/step - accuracy: 0.0541 - loss: 6.7709
Epoch 2/100
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 66ms/step - accuracy: 0.0560 - loss: 6.1210
Epoch 3/100
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 67ms/step - accuracy: 0.0678 - loss: 5.8201
Epoch 4/100
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 66ms/step - accuracy: 0.0791 - loss: 5.6083
Epoch 5/100
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 65ms/step - accuracy: 0.0967 - loss: 5.3963
Epoch 6/100
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 65ms/step - accuracy: 0.1246 - loss: 5.0621
Epoch 7/100
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 66ms/step - accuracy: 0.1486 - loss: 4.7978
Epoch 8/100
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 66ms/step - accuracy: 0.1670 - loss: 4.5256
Epoch 9/100
[1m

In [32]:
# Generate text
def generate_text(seed_text, next_words, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model_gru.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

generated_text = generate_text("Alice was", 20, max_sequence_len)
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25