In [None]:
# 📚 Character-Level Text Generator with LSTM (Colab Version)
# Author: ChatGPT

# Step 1: Install dependencies (if needed)
!pip install tensorflow --quiet

# Step 2: Import libraries (the dataset is downloaded in the next cell)
import numpy as np
import tensorflow as tf
import os
import requests

In [None]:
# Download a sample dataset (Alice in Wonderland)
url = "https://www.gutenberg.org/files/11/11-0.txt"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of treating an error page as the corpus.
response.raise_for_status()
text = response.text

print(f"Corpus length: {len(text)} characters")

# Step 3: Character Mapping
# The vocabulary is every distinct character of the corpus in sorted order;
# a character's index is its position in that ordering.
chars = sorted(set(text))
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: idx for idx, ch in idx_to_char.items()}

# Step 4: Create Sequences
SEQ_LENGTH = 100  # number of context characters in each training window
STEP = 1  # offset between the starts of consecutive windows

# Every window start that still leaves one following character to predict.
window_starts = range(0, len(text) - SEQ_LENGTH, STEP)

# Each input window is paired with the single character that follows it.
sequences = [text[start: start + SEQ_LENGTH] for start in window_starts]
next_chars = [text[start + SEQ_LENGTH] for start in window_starts]

print(f"Number of sequences: {len(sequences)}")

# Step 5: Vectorize the sequences
# X holds one row of character indices per window; y is the one-hot encoding
# of the character that follows that window.
num_samples = len(sequences)
X = np.zeros((num_samples, SEQ_LENGTH), dtype=np.int32)
y = np.zeros((num_samples, len(chars)), dtype=np.bool_)

for row, (window, target) in enumerate(zip(sequences, next_chars)):
    X[row, :] = [char_to_idx[ch] for ch in window]
    y[row, char_to_idx[target]] = True

# Step 6: Build the LSTM Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.optimizers import Adam

# Architecture: character indices -> 64-d embeddings -> one LSTM layer ->
# softmax over the vocabulary (one probability per possible next character).
#
# The input shape is declared with an explicit `Input` layer rather than the
# `input_length` argument of `Embedding`, which newer Keras releases deprecate.
# Declaring the input up front also means the model is already built when
# `summary()` is called.
model = Sequential([
    tf.keras.Input(shape=(SEQ_LENGTH,), dtype="int32"),
    Embedding(input_dim=len(chars), output_dim=64),
    LSTM(256),
    Dense(len(chars), activation='softmax')
])

# `categorical_crossentropy` matches the one-hot targets built in Step 5.
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

# Step 7: Train the Model (short training for demo purposes)
# Three passes over the data in mini-batches of 64 windows.
model.fit(
    x=X,
    y=y,
    batch_size=64,
    epochs=3,
)

# Step 8: Generate Text
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Scaling applied to the log-probabilities. Values below 1
            sharpen the distribution, values above 1 flatten it. ``0`` picks
            the most likely index deterministically.

    Returns:
        The selected index into ``preds``.
    """
    preds = np.asarray(preds).astype('float64')

    if temperature == 0:
        # Limit of the scaling as temperature -> 0; avoids dividing by zero.
        return int(np.argmax(preds))

    # The small epsilon keeps log() finite for zero-probability entries.
    logits = np.log(preds + 1e-8) / temperature
    # Shift so the largest logit is 0. The softmax result is unchanged, but
    # exp() can no longer underflow every entry to 0 (which would produce NaN
    # probabilities) when the temperature is very small.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(probs), p=probs)

def generate_text(seed_text, length=400, temperature=1.0):
    """Extend ``seed_text`` by sampling characters from the trained model.

    Args:
        seed_text: Starting string; it is returned as the prefix of the result.
        length: Number of characters to generate after the seed.
        temperature: Passed through to ``sample`` for every generated character.

    Returns:
        ``seed_text`` followed by ``length`` generated characters.
    """
    # Seed characters missing from the vocabulary fall back to index 0
    # instead of raising a KeyError.
    window = [char_to_idx.get(ch, 0) for ch in seed_text]
    pieces = [seed_text]

    for _ in range(length):
        # pad_sequences fits the context to exactly SEQ_LENGTH entries,
        # the input length the model was trained on.
        batch = tf.keras.preprocessing.sequence.pad_sequences([window], maxlen=SEQ_LENGTH)
        probs = model.predict(batch, verbose=0)[0]
        picked = sample(probs, temperature)
        pieces.append(idx_to_char[picked])
        # Slide the context: add the new index, keep the last SEQ_LENGTH.
        window = (window + [picked])[-SEQ_LENGTH:]

    return "".join(pieces)

# Step 9: Try it!
# A temperature below 1 favours the model's more confident predictions.
seed = "Alice was beginning to get very tired "
generated_text = generate_text(seed_text=seed, temperature=0.5)
print("\n--- Generated Text ---\n", generated_text, sep="\n")


Corpus length: 148080 characters
Number of sequences: 147980




Epoch 1/3
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 10ms/step - accuracy: 0.3221 - loss: 2.5393
Epoch 2/3
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 11ms/step - accuracy: 0.4921 - loss: 1.7740
Epoch 3/3
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.5432 - loss: 1.5489

--- Generated Text ---

Alice was beginning to get very tired themee some the was that would get the exepper there sorn into the said to be a little she
herself, and the said, and were was it she had the Hatter. There was the jury arain the Queen.

“I should thing you the erecully begin it, a lary grow the
seemed to here for hand that the exear.”

The Mouse of them was she could harder creatious, and she was not the looks the ratter, and there was a li
