In [39]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# 1) Load the text from disk
file_path = r"data/war_peace.txt"
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

max_chars = 500_000
raw_text = raw_text[:max_chars]

# 3) Tokenize at the word level
tokenizer = Tokenizer()
tokenizer.fit_on_texts([raw_text])
word_seq = tokenizer.texts_to_sequences([raw_text])[0]
vocab_size = len(tokenizer.word_index) + 1

# 4) Build a tf.data pipeline
seq_length = 15      # how many previous words to look at
batch_size  = 64
buffer_size = 20_000

dataset = (
    tf.data.Dataset.from_tensor_slices(word_seq)
    .window(size=seq_length+1, shift=1, drop_remainder=True) 
    .flat_map(lambda window: window.batch(seq_length+1))
    .map(lambda window: (window[:-1], window[-1]))
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# 5) Define a simple LSTM language model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 128, mask_zero=True), # sequence - [0, 0,0,2,5,8,1,3,6,7] will ignore 0s
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(vocab_size, activation="softmax") # output - [0.1, 0.1, 0.2, 0.05, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05]
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy")

# 6) Train
epochs = 50
model.fit(dataset, epochs=epochs)

# 7) Generation helper
def generate(seed_text, gen_length=50):
    seq_ids = tokenizer.texts_to_sequences([seed_text])[0] # [2, 5, 8, 1, 3, 6, 7]
    for _ in range(gen_length):

        x = tf.keras.preprocessing.sequence.pad_sequences(
            [seq_ids], maxlen=len(seq_ids), padding="pre") # [0, 0, 0, 2, 5, 8, 1, 3, 6, 7]
        
        preds = model.predict(x, verbose=0)[0]
        next_id = tf.argmax(preds).numpy() # [0.1, 0.1, 0.2, 0.05, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05] -> [2]

        seq_ids.append(int(next_id))
    inv_map = {v: k for k, v in tokenizer.word_index.items()}
    return " ".join(inv_map[i] for i in seq_ids)


Epoch 1/50
    847/Unknown [1m8s[0m 8ms/step - loss: 7.2679

KeyboardInterrupt: 

In [37]:
# 8) Try it
seed = "My name is"
print("→", generate(seed, gen_length=20))


→ my name is i am i am in the way ” said the staff officer with the suite and the officer in the


In [48]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping

# 1. Load the text from disk
file_path = r"data/war_peace.txt"
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# 2. (Optional) Truncate to keep memory and time reasonable
max_chars = 50000
raw_text = raw_text[:max_chars]

# 3. Tokenize at the word level
tokenizer = Tokenizer()
tokenizer.fit_on_texts([raw_text])                        # Build vocabulary
word_seq = tokenizer.texts_to_sequences([raw_text])[0]    # Convert text to integer sequence
vocab_size = len(tokenizer.word_index) + 1                # +1 because index 0 is reserved

# 4. Set hyperparameters for dataset and model
seq_length = 50       # Number of tokens in each input sequence
batch_size = 64       # Number of sequences per batch
buffer_size = 20000   # Shuffle buffer size

# 5. Build the tf.data pipeline
dataset = (
    tf.data.Dataset.from_tensor_slices(word_seq)
      .window(size=seq_length + 1, shift=1, drop_remainder=True)
      .flat_map(lambda window: window.batch(seq_length + 1))
      .map(lambda window: (window[:-1], window[1:]), num_parallel_calls=tf.data.AUTOTUNE)
      .shuffle(buffer_size)
      .batch(batch_size, drop_remainder=True)
      .prefetch(tf.data.AUTOTUNE)
)

# 6. Compute steps_per_epoch to avoid Progbar errors
num_windows = len(word_seq) - seq_length
steps_per_epoch = num_windows // batch_size

# 7. Define a many-to-many LSTM model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dense(vocab_size)
])

model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
)
model.summary()

# 8. Set up EarlyStopping
callbacks = [
    EarlyStopping(monitor="loss", patience=4)
]

# 9. Train the model, specifying steps_per_epoch
epochs = 20
model.fit(
    dataset,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch
)



Epoch 1/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 239ms/step - loss: 6.5644
Epoch 2/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30us/step - loss: 0.0000e+00
Epoch 3/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 248ms/step - loss: 5.9123
Epoch 4/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8us/step - loss: 0.0000e+00
Epoch 5/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 250ms/step - loss: 5.1877
Epoch 6/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19us/step - loss: 0.0000e+00
Epoch 7/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 249ms/step - loss: 4.6571
Epoch 8/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14us/step - loss: 0.0000e+00
Epoch 9/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 255ms/step - loss: 4.1953
Epoch 10/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x2a71090c860>

In [49]:
# 10. Create inverse vocabulary map for generation
inv_vocab = {idx: word for word, idx in tokenizer.word_index.items()}

# 11. Define text generation helper with temperature sampling
import numpy as np

def generate_text(model, seed_text, gen_length=50, temperature=1.0):
    """
    Generate text using the trained model.
    seed_text: initial text string to prime the model
    gen_length: number of tokens to generate
    temperature: >1 for more random, <1 for more deterministic sampling
    """
    # Convert seed text to sequence of IDs
    seq_ids = tokenizer.texts_to_sequences([seed_text])[0]
    for _ in range(gen_length):
        # Pad sequence to fixed length
        x = tf.keras.preprocessing.sequence.pad_sequences(
            [seq_ids], maxlen=seq_length, padding="pre"
        )
        # Predict logits for each token position
        logits = model.predict(x, verbose=0)[0, -1, :]
        # Apply temperature
        logits = logits / temperature
        # Sample from the distribution
        next_id = tf.random.categorical(tf.expand_dims(logits, 0), 1)[0, 0].numpy()
        seq_ids.append(int(next_id))
    # Convert back to words
    return " ".join(inv_vocab.get(i, "") for i in seq_ids)


In [61]:
# 12. Try generating text
input("Press Enter to generate text...")
print(generate_text(model, "You are", gen_length=50, temperature=5))

you are mentioned be ‘faithful george carrying although hour femme going has committed enter xviii neither so wants hardly quite wish whirl say what everyone shoulder so tell did au fourteen ’ slight terms in alliance said placed raised clutched was her indebted recommended hardly emigrant glossy “dieu changed only brown “i
