In [100]:
# IMPORT LIBRARIES
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense , Dropout
from tensorflow.keras.optimizers import Adam


In [101]:
#  LOAD SHAKESPEARE TEXT
# This downloads a text file with Shakespeare's works

path = tf.keras.utils.get_file(
    "shakespeare.txt" , 
    "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
)


# Read the text file into a big string

text = open(path , "r" , encoding = "utf-8").read().lower()

# Split the text into lines (each line is like a sentence)
# NOTE: use "\n" (newline) not "/n"
corpus = text.split("\n")

# To make it simple and fast, we only use the first 40000 lines
corpus = corpus[:40000]

print("Number of lines used : "  , len(corpus))
print("Example line :" , corpus[10])

Number of lines used :  40000
Example line : resolved. resolved.


In [102]:
# TOKENIZE THE TEXT (TURN WORDS INTO NUMBERS)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)   #learn the word --> number mapping

In [103]:
# Convert entire text to a single list of token IDs
tokens = tokenizer.texts_to_sequences([text])[0]
print("Total tokens:", len(tokens))

Total tokens: 204089


In [105]:
# Total number of different words in our data (vocabulary size)

total_words = len(tokenizer.word_index) +1  #+1 for padding index
print("vocabulary size" , total_words)

vocabulary size 12633


In [106]:
# Create reverse dictionary (ID → word)
reverse_word_index = {idx: word for word, idx in tokenizer.word_index.items()}


In [107]:
# CREATE FIXED-LENGTH TRAINING SEQUENCES
# We choose a sequence length (number of words in each training example).
sequence_length = 60 

input_sequences = []

# We slide a window of length (sequence_length + 1) across the tokens.
for i in range(sequence_length, len(tokens)):
    seq = tokens[i - sequence_length : i + 1]
    input_sequences.append(seq)

print("Number of sequences:", len(input_sequences))

Number of sequences: 204029


In [108]:
# PREPARE X AND y
input_sequences = np.array(input_sequences)

# All but last token → input
X = input_sequences[:, :-1]
# Last token → target
y = input_sequences[:, -1]

# One-hot encode target
y = to_categorical(y, num_classes=total_words)

print("X shape:", X.shape)  # (num_samples, sequence_length)
print("y shape:", y.shape)  # (num_samples, vocab_size)# 

X shape: (204029, 60)
y shape: (204029, 12633)


In [109]:
# BUILD LSTM MODEL
total_words = y.shape[1]   #vocab size from y shape
model = Sequential()             # create model
model.add(                          # add layers
    Embedding(
        input_dim=total_words,      # vocab size
        output_dim=256,             # embedding size (can tune)
        input_length=X.shape[1]     # sequence length 
    )
)
model.add(LSTM(256 , return_sequences=True))  
model.add(Dropout(0.2)) 

model.add(LSTM(256))             # bigger LSTM for more power
model.add(Dropout(0.2))

model.add(Dense(total_words, activation='softmax'))     # softmax output layer , Dense layer with vocab size outputs

opt = Adam(learning_rate=0.001)          # Adam optimizer

model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 60, 256)           3234048   
                                                                 
 lstm_8 (LSTM)               (None, 60, 256)           525312    
                                                                 
 dropout_2 (Dropout)         (None, 60, 256)           0         
                                                                 
 lstm_9 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_7 (Dense)             (None, 12633)             3246681   
                                                                 
Total params: 7,531,353
Trainable params: 7,531,353
No

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint       # callbacks for training , modelcheckpoint is used to save the best model during training

early_stop = EarlyStopping(        # stop training if no improvement
    monitor='val_accuracy',
    patience=3,          # stop if no improvement for 3 epochs
    restore_best_weights=True     # restore best weights after stopping
)

checkpoint = ModelCheckpoint(       # save the best model during training
    'best_shakespeare_model.h5',      
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)

history = model.fit(
    X, y,
    epochs=40,           # try up to 40, EarlyStopping will stop earlier
    batch_size=128,
    validation_split=0.1,    # use 10% of data for validation
    callbacks=[early_stop, checkpoint],    # callbacks for training
    verbose=1
)

Epoch 1/40
Epoch 1: val_accuracy improved from -inf to 0.04097, saving model to best_shakespeare_model.h5
Epoch 2/40
Epoch 2: val_accuracy improved from 0.04097 to 0.07161, saving model to best_shakespeare_model.h5
Epoch 3/40
Epoch 3: val_accuracy improved from 0.07161 to 0.07656, saving model to best_shakespeare_model.h5
Epoch 4/40

In [None]:
# TEMPERATURE SAMPLING FUNCTION

def sample_with_temperature(preds, temperature=1.0):        # temperature sampling
    """
    Sample an index from the predicted probabilities using temperature.
    Lower temperature = more predictable.
    Higher temperature = more random.
    """
    preds = np.asarray(preds).astype("float64")       # convert to numpy array
     #we add 1e-8? Because sometimes a predicted probability can be 0,
    # This makes sure nothing is exactly zero, but it’s so small that it doesn’t change the probabilities in any meaningful way.
    preds = np.log(preds + 1e-8) / temperature     
    
    exp_preds = np.exp(preds)                                # exponentiate the log probabilities
    preds = exp_preds / np.sum(exp_preds)                    # to get a valid probability distribution
    return np.random.choice(len(preds), p=preds)             # to sample a word index according to this distribution.


In [81]:
# TEXT GENERATION FUNCTION

def generate_text(seed_text, next_words=20, temperature=0.3):
    """
    Generate new text starting from seed_text by predicting
    next_words one by one with temperature sampling.
    """
    text = seed_text

    for _ in range(next_words):
        # Convert current text to token IDs
        token_list = tokenizer.texts_to_sequences([text])[0]

        # If no known tokens, stop
        if len(token_list) == 0:
            break

        # Only keep the last sequence_length tokens
        # (otherwise the input grows forever)
        token_list = token_list[-X.shape[1]:]

        # Pad to the fixed length needed by the model
        token_list_padded = pad_sequences(
            [token_list],
            maxlen=X.shape[1],
            padding='pre'
        )

        # Predict probabilities for next word
        predicted_probs = model.predict(token_list_padded, verbose=0)[0]

        # Sample next word index with temperature
        predicted_id = np.argmax(predicted_probs)           # choose the word with highest probability

        # Ignore padding index
        if predicted_id == 0:
            continue

        # Get word from index
        next_word = reverse_word_index.get(predicted_id, None)
        if next_word is None:
            break

        # Append word to text
        text += " " + next_word

    return text

In [82]:
# TEST TEXT GENERATION

print(generate_text("to be or", next_words= 40, temperature=0.3))
print(generate_text("the king", next_words=35, temperature=0.2))
print(generate_text("love is", next_words= 30, temperature=0.1))

to be or pleasant for it is being done what will you have but kneels and kiss me with a low gold i am a subject will have been more deep a young one though when we lay by the wall as if
the king is dead ' and while he be sure i will not kill this world and break the english by my state and he to spend the world as much sway when this was between that
love is set down for being done for shame rule my hearth but with some little train 'gainst my friends good my lord my lord and leave you lords to meet you
