# English Text Generator Using Project Gutenberg Books


In [1]:
!pip install gutenbergpy tensorflow keras numpy pandas nltk

Collecting gutenbergpy
  Downloading gutenbergpy-0.3.5-py3-none-any.whl.metadata (7.7 kB)
Collecting httpsproxy-urllib2 (from gutenbergpy)
  Downloading httpsproxy_urllib2-1.0.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymongo (from gutenbergpy)
  Downloading pymongo-4.11.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo->gutenbergpy)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gutenbergpy-0.3.5-py3-none-any.whl (22 kB)
Downloading pymongo-4.11.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected p

In [2]:
import gutenbergpy.textget
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## Downloading 3 Books (Romeo and Juliet, A Doll's House, and Alice in Wonderland) from Project Gutenberg


In [3]:
book_ids = [1513,11,2542]
texts = []
for book_id in book_ids:
    raw_text = gutenbergpy.textget.get_text_by_id(book_id)
    decoded_text = raw_text.decode('utf-8')
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', decoded_text)
    texts.append(cleaned_text)

full_text = ' '.join(texts)
words = word_tokenize(full_text.lower())

## Tokenization and Sequence Preparation



In [4]:
# creating training length
train_len = 25+1 # The first 25 words act as the input, and the 26th word is the target

text_sequences = []
for i in range(train_len, len(words)):
    seq = words[i-train_len:i]  #slicing from 0 to 26, 1 to 27 and so on...
    text_sequences.append(seq)

In [5]:
# tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)
vocabulary_size = len(tokenizer.word_counts)+1
vocabulary_size

6703

In [6]:
import pandas as pd
sequences_df = pd.DataFrame(sequences)
sequences_df.head(4)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1517,7,1,1514,1515,1516,3292,1,6702,7,...,1826,1,2308,444,4,216,4,6,1305,308
1,7,1,1514,1515,1516,3292,1,6702,7,40,...,1,2308,444,4,216,4,6,1305,308,216
2,1,1514,1515,1516,3292,1,6702,7,40,2,...,2308,444,4,216,4,6,1305,308,216,533
3,1514,1515,1516,3292,1,6702,7,40,2,72,...,444,4,216,4,6,1305,308,216,533,6


## Splitting into X and y


In [7]:
sequences = np.array(sequences)
# Splitting into X and y
X = sequences[:, :-1]
y= sequences[:, -1]
y = to_categorical(y, num_classes=vocabulary_size)
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
seq_len = X.shape[1]

X shape: (78949, 25)
y shape: (78949, 6703)


## Model Development


In [8]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

def create_model(vocabulary_size, seq_len):

    inputs = Input(shape=(seq_len,))
    embedding_layer = Embedding(vocabulary_size, 25, input_length=seq_len)(inputs)
    LSTM_1 = LSTM(250, return_sequences=True)(embedding_layer)
    LSTM_2 = LSTM(125)(LSTM_1)
    Dense_1 = Dense(100, activation='relu')(LSTM_2)
    outputs = Dense(vocabulary_size, activation='softmax')(Dense_1)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model


In [9]:
model = create_model(vocabulary_size, seq_len)



In [10]:
# Train the model
model.fit(X, y, epochs=300, batch_size=128)

Epoch 1/300
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.0372 - loss: 6.9349
Epoch 2/300
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.0398 - loss: 6.4806
Epoch 3/300
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.0420 - loss: 6.3179
Epoch 4/300
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.0496 - loss: 6.1092
Epoch 5/300
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.0546 - loss: 5.9969
Epoch 6/300
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.0605 - loss: 5.8779
Epoch 7/300
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.0679 - loss: 5.8087
Epoch 8/300
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.0738 - loss: 5.7323
Epoch 9/300
[1m617/61

<keras.src.callbacks.history.History at 0x7b8da2839ad0>

## Model Save


In [11]:
from pickle import dump, load
model.save('txt_model.h5')
dump(tokenizer, open('txt_tokenizer.pkl', 'wb'))



## Text Generation


In [12]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''

    # Final Output
    output_text = []

    # Intial Seed Sequence
    input_text = seed_text

    # Create num_gen_words
    for i in range(num_gen_words):

        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Pad sequences to our trained rate (50 words in the video)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Predict Class Probabilities for each word
        pred_word_ind = np.argmax(model.predict(pad_encoded, verbose=0), axis=-1)[0]

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

In [17]:
seed_text= 'I asked to hide the Christmas Tree  and make sure the children did not see it until this evening.'

In [18]:
generate_text(model, tokenizer, seq_len, seed_text, num_gen_words=100)

'i shall be valiant been and to be off in other christmas sobbing to stay on its wings of the bank and you used to look into my end at the words its all and very nearly forgotten to give us said the king and she can none the dear false and she went on said the gryphon youll be telling you about her change you used to the dewdropping man with cats nothing two long how did my mother the next difficulty again what was nothing about the letter and of tears and if i had the same scenethe'