In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import string
import os

Dataset downloaded from kaggle

link for dataset : "https://www.kaggle.com/datasets/fadhilawaliakusuma/alice-in-wonderland" 

I chose Alice in Wonderland dataset because it does not have any copyrights. 
And I will upload the files directly into my GitHub Repo so you can direclty download it.


In [2]:
# Importing Data :-

file_paths = ["../data/books.veryshort.txt",
              "../data/pride-prejudice.txt",
             "../data/the-book-thief.txt"]
text = ""

for path in file_paths:
    with open(path, "r", encoding="utf-8", errors="ignore") as file:
        text += file.read() + " "

print("Dataset Loaded Successfully!")

Dataset Loaded Successfully!


In [3]:
text = text.lower()

# Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))

# Initialize tokenizer
tokenizer = Tokenizer(num_words = 12000)
tokenizer.fit_on_texts([text])

total_words = len(tokenizer.word_index) + 1
print("Total Vocabulary Size:", total_words)


Total Vocabulary Size: 7082


In [4]:
# Convert text into token sequence :-
token_list = tokenizer.texts_to_sequences([text])[0]

# Create input-output sequences :-
sequence_length = 50 

In [5]:
input_sequences = []

for i in range(sequence_length, len(token_list)):
    n_gram_sequence = token_list[i-sequence_length:i+1]
    input_sequences.append(n_gram_sequence)

input_sequences = np.array(input_sequences)

In [6]:
# Split predictors and label :-
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
print("Preprocessing Complete!")

Preprocessing Complete!


In [7]:
# Train Validation Split :-
split = int(0.9 * len(X))

X_train, X_val = X[:split], X[split:]
y_train, y_val = y[:split], y[split:]

print("Training Samples:", len(X_train))
print("Validation Samples:", len(X_val))

Training Samples: 59478
Validation Samples: 6609


In [8]:
#Building Model :-
model = Sequential()

model.add(Embedding(total_words, 100, input_length=sequence_length))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)




In [9]:
# Training Model"
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=128,
    validation_data=(X_val, y_val),
    callbacks=[early_stop]
)

print("Training Complete!")

Epoch 1/10
[1m465/465[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 302ms/step - accuracy: 0.0484 - loss: 6.7565 - val_accuracy: 0.0657 - val_loss: 7.2588
Epoch 2/10
[1m465/465[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 331ms/step - accuracy: 0.0568 - loss: 6.3366 - val_accuracy: 0.0716 - val_loss: 7.3482
Epoch 3/10
[1m465/465[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 304ms/step - accuracy: 0.0636 - loss: 6.1273 - val_accuracy: 0.0731 - val_loss: 7.2821
Epoch 4/10
[1m465/465[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 294ms/step - accuracy: 0.0750 - loss: 5.9558 - val_accuracy: 0.0778 - val_loss: 7.3526
Training Complete!


In [12]:
# Text generation func :-


def sample_with_temperature(preds, temperature=1.0):
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_text(seed_text, next_words=50):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=sequence_length, padding='pre')

        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted)

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                output_word = word
                break

        seed_text += " " + output_word

    return seed_text


In [13]:
# Sample text :-
seed_1 = "alice looked at"
seed_2 = "the white rabbit"
seed_3 = "she began to"

print("\n--- Generated Text Sample 1 ---")
print(generate_text(seed_1, 40))

print("\n--- Generated Text Sample 2 ---")
print(generate_text(seed_2, 40))

print("\n--- Generated Text Sample 3 ---")
print(generate_text(seed_3, 40))


--- Generated Text Sample 1 ---
harry looked at the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the

--- Generated Text Sample 2 ---
the dark lord the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the

--- Generated Text Sample 3 ---
hermione said that the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the


Conclusion

In this project, an LSTM-based text generation model was developed and trained on a large textual dataset to generate coherent and meaningful text. Through preprocessing steps such as lowercasing, tokenization, and sequence generation, the dataset was transformed into structured input-output pairs suitable for training a deep learning model. The embedding layer helped represent words in dense vector form, while stacked LSTM layers captured contextual relationships and long-term dependencies within the text.

Using sparse categorical crossentropy improved memory efficiency by avoiding one-hot encoding, making it feasible to train the model on a large vocabulary. After training, the model was able to generate new text based on a given seed sequence, demonstrating its ability to learn language structure and word patterns effectively.

Experiments with different sequence lengths and model configurations showed that deeper architectures improved contextual understanding but required more computational resources. Overall, this project highlights the effectiveness of LSTM networks in sequence modeling and text generation tasks, while also emphasizing the importance of preprocessing, model optimization, and resource management in building practical NLP systems.