In [4]:
# !pip install tensorflow
# !pip install numpy

In [5]:
import numpy as np
import tensorflow as tf
import string
import os
import pickle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

Load & Preprocess Data

In [6]:
# --- Run configuration ---------------------------------------------------
# Where the raw corpus is read from and where the best model checkpoint is written.
DATA_PATH = "/content/shakespeare.txt"
CHECKPOINT_PATH = "best_lstm_text_model.keras"

# Windowing: every sample is SEQ_LENGTH characters long and a new window
# starts every STEP characters.
SEQ_LENGTH = 100
STEP = 1

# Seed the global NumPy and TensorFlow random generators.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Fail fast with a readable message when the corpus file is missing.
dataset_present = os.path.exists(DATA_PATH)
if not dataset_present:
    missing_msg = f"Dataset not found at {DATA_PATH}. Place shakespeare.txt in the project folder."
    raise FileNotFoundError(missing_msg)

In [7]:
# Read the whole corpus into memory as a single string.
with open(DATA_PATH, "r", encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()

# Normalise: lower-case everything, then delete every ASCII punctuation character.
punctuation_remover = str.maketrans("", "", string.punctuation)
text = raw_text.lower().translate(punctuation_remover)

print(f"Total characters: {len(text)}")

Total characters: 5128224


In [8]:
# Character-level tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)

print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 56


In [9]:
# Create char-to-index mappings
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for idx, char in enumerate(chars)}

Create Sequences

In [10]:
# Slide a SEQ_LENGTH-character window over the corpus, advancing STEP characters
# at a time. Each window is one input; the character right after it is the target.
window_starts = range(0, len(text) - SEQ_LENGTH, STEP)
input_sequences = [text[start:start + SEQ_LENGTH] for start in window_starts]
target_chars = [text[start + SEQ_LENGTH] for start in window_starts]

print(f"Number of sequences: {len(input_sequences)}")

Number of sequences: 5128124


In [11]:
# Convert sequences to numerical form
X = np.zeros((len(input_sequences), SEQ_LENGTH), dtype=np.int32)
y = np.zeros((len(input_sequences)), dtype=np.int32)

for i, seq in enumerate(input_sequences):
    X[i] = [char_to_idx[char] for char in seq]
    y[i] = char_to_idx[target_chars[i]]

# Shuffle before split to reduce bias
perm = np.random.permutation(len(X))
X = X[perm]
y = y[perm]

 Train / Validation Split

In [12]:
# First 90% of the (already shuffled) samples are used for training,
# the remaining 10% for validation.
split_idx = int(0.9 * len(X))
X_train, y_train = X[:split_idx], y[:split_idx]
X_val, y_val = X[split_idx:], y[split_idx:]

Build LSTM Model

In [13]:
# Character-level language model: embedding -> two stacked LSTMs -> softmax over
# the vocabulary (probability of each possible next character).
#
# The input shape is declared with an explicit Input instead of the Embedding
# `input_length` argument, which recent Keras releases deprecate. Declaring the
# input also builds the model immediately, so model.summary() works before fit().
model = Sequential([
    tf.keras.Input(shape=(SEQ_LENGTH,)),
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(256, return_sequences=True),  # emits the full sequence for the next LSTM
    LSTM(256),                         # emits only the final state
    Dense(vocab_size, activation="softmax"),
])



In [14]:
# Targets are integer character ids rather than one-hot vectors, hence the
# sparse variant of categorical cross-entropy.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

In [15]:
# model.summary()

Train Model

In [16]:
# Stop once val_loss has not improved for 3 epochs (restoring the best weights),
# and write a checkpoint to CHECKPOINT_PATH only when the model improves.
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
best_checkpoint = ModelCheckpoint(CHECKPOINT_PATH, save_best_only=True)
callbacks = [early_stopping, best_checkpoint]

In [17]:
# Training on the full corpus is very slow, so only a single epoch is run here.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=1,
    batch_size=128,
    callbacks=callbacks,
)

[1m36058/36058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1178s[0m 33ms/step - loss: 1.7316 - val_loss: 1.3875


Text Generation Logic

In [18]:

# Reload the weights from the training checkpoint when one exists on disk.
checkpoint_available = os.path.exists(CHECKPOINT_PATH)
if checkpoint_available:
    model.load_weights(CHECKPOINT_PATH)

def _prepare_seed(seed_text):
    seed = seed_text.lower()
    if len(seed) < SEQ_LENGTH:
        pad_char = " " if " " in char_to_idx else list(char_to_idx.keys())[0]
        seed = (pad_char * (SEQ_LENGTH - len(seed))) + seed
    return seed

def generate_text(seed_text, length=500, temperature=0.8):
    """Generate text one character at a time, continuing ``seed_text``.

    Args:
        seed_text: Text to start from; it is normalised by ``_prepare_seed``.
        length: Number of characters to sample.
        temperature: Sampling temperature, clamped to a minimum of 1e-3.
            Lower values make the most likely character dominate; higher
            values flatten the distribution.

    Returns:
        The prepared seed followed by the ``length`` sampled characters.
    """
    generated = _prepare_seed(seed_text)

    # Ids of the most recent SEQ_LENGTH characters; characters without an id
    # fall back to id 0.
    window = [char_to_idx.get(c, 0) for c in generated[-SEQ_LENGTH:]]
    safe_temperature = max(temperature, 1e-3)
    sampled = []

    for _ in range(length):
        input_idx = np.array([window])

        # Call the model directly: for one small batch inside a loop this
        # avoids the per-call overhead of model.predict().
        # float64 keeps the renormalised probabilities accurate for sampling.
        preds = np.asarray(model(input_idx, training=False), dtype=np.float64)[0]

        # Temperature-scaled softmax. Subtracting the maximum first keeps at
        # least one exponent at exp(0) == 1, so low temperatures cannot
        # underflow every term to 0 and produce NaN probabilities.
        logits = np.log(preds + 1e-8) / safe_temperature
        logits -= logits.max()
        weights = np.exp(logits)
        probs = weights / weights.sum()

        next_idx = int(np.random.choice(len(probs), p=probs))
        sampled.append(idx_to_char[next_idx])

        # Slide the context window forward by one character.
        window.append(next_idx)
        window = window[-SEQ_LENGTH:]

    return generated + "".join(sampled)

Generate Sample Text

In [19]:
# Sample 400 characters continuing a Hamlet-style prompt.
seed = "to be or not to be that is the question"
sample = generate_text(seed, length=400)
print(sample)

                                                             to be or not to be that is the question
shall equal and in the people of our hortens it is a man tell them

lucibiales
and doth he lives well he will tell you they made then
what is his brow

grumio
run out bear and marriage

camillo
sir that play in master germies
and the sun you that they are made destised train by foul sir betwixt eye about a traitor of any good you have you father of the world of my tears and three war into so may 


In [20]:
# Sample 400 characters continuing a second prompt.
seed = "my lord i do protest too much methinks"
sample = generate_text(seed, length=400)
print(sample)

                                                              my lord i do protest too much methinks to find
the fond wars a hope of her first
as i be cirching or with somectors nor that the vicer his salisbury

helena
they mean should be the weapons a worthy letter the guilty

exeunt

caesar
and he way what was more tiglary

marcellus
i that i am behind to prosper upon this wife

bolingbroke
no come my lord friends and his bear a sail of him

dromio of my down
why receive me the conquess be to 


In [21]:
# Sample 400 characters continuing a third prompt.
seed = "all the worlds a stage and all the men and women"
sample = generate_text(seed, length=400)
print(sample)

                                                    all the worlds a stage and all the men and women to more be a wise it is the bitterweent

she doth she hadst my touch like a jest will i all chamberlain that thou couldst romeo thine in the contrary will cannot be what nor i for the enemy
pointed of my brother that we wilt thou shalt make thee to too
but warrant the murderers like a strife bewells my father is no catch
his undovenigence of york be ear that while yet not the sands hath wronged t


In [22]:
# Persist the trained model. The file is opened in a `with` block so the handle
# is flushed and closed even if pickling fails.
# NOTE(review): only the model is stored here; char_to_idx / idx_to_char are not,
# so this file alone is not enough to generate text later. Pickle files also
# execute code when loaded, so only load this file from a trusted source;
# model.save() to a ".keras" file is the Keras-native alternative.
with open("shakespeare.pkl", "wb") as model_file:
    pickle.dump(model, model_file)