# Character-Level RNN vs LSTM — Next-Character Prediction

In [1]:
# Optional: report the interpreter, library and platform this notebook runs on
import sys, platform
try:
    import tensorflow as tf
except ImportError:
    raise SystemExit("TensorFlow not installed. Please run: pip install tensorflow")

# Each pair is printed as "<label> <value>", one per line.
# An empty GPU list means TensorFlow reports no physical GPU device.
environment_report = (
    ("Python:", sys.version.split()[0]),
    ("TensorFlow:", tf.__version__),
    ("Platform:", platform.platform()),
    ("GPU available:", tf.config.list_physical_devices('GPU')),
)
for label, value in environment_report:
    print(label, value)


Python: 3.12.12
TensorFlow: 2.19.0
Platform: Linux-6.6.105+-x86_64-with-glibc2.35
GPU available: []


---
## 1) Data Ingestion

Choose **one** option:
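- **Option A (Recommended):** Point `TEXT_PATH` to a local `.txt` file. On Google Colab, the next cell does this for you: it opens an upload dialog and sets `TEXT_PATH` to the first uploaded `.txt` file.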
- **Option B (Optional):** Use Kaggle CLI to download a dataset that contains `.txt` files, then set `TEXT_PATH`.

> Tip: Start with a small text file (a few MB) to train quickly on CPU.


In [1]:
from google.colab import files
import io, os

# Open Colab's upload dialog; iterating the result yields the uploaded names.
uploaded = files.upload()  # choose your .txt

# TEXT_PATH is the first uploaded name ending in ".txt" (case-insensitive),
# or None when no such file was uploaded.
TEXT_PATH = next(
    (fname for fname in uploaded if fname.lower().endswith(".txt")),
    None,
)

# Fail fast unless a .txt was selected and it exists on disk.
if not (TEXT_PATH and os.path.exists(TEXT_PATH)):
    raise FileNotFoundError("No .txt found in upload. Please upload a .txt file.")
print("Using:", TEXT_PATH)


Saving cleaned_merged_fairy_tales_without_eos.txt to cleaned_merged_fairy_tales_without_eos.txt
Using: cleaned_merged_fairy_tales_without_eos.txt


---
## 2) Preprocessing

- Lowercase the text and collapse runs of whitespace into single spaces
- Build character vocabulary (`char2idx`, `idx2char`)
- Create fixed-length input sequences with the **next character** as target


In [2]:
# Load & clean text
import io, os, re

# Guard against a missing or mistyped path before trying to read it.
if not os.path.exists(TEXT_PATH):
    raise FileNotFoundError(f"TEXT_PATH does not exist: {TEXT_PATH}. Please set it to a valid .txt file path.")

# Bytes that are not valid UTF-8 are dropped (errors='ignore') instead of raising.
with io.open(TEXT_PATH, 'r', encoding='utf-8', errors='ignore') as text_file:
    raw_text = text_file.read()

# Normalise: lowercase, then squash every whitespace run (newlines included)
# into a single space.
whitespace_run = re.compile(r"\s+")
text = whitespace_run.sub(" ", raw_text.lower())
print("Total characters:", len(text))
print("Sample:", text[:500])


Total characters: 20395327
Sample: the happy prince. high above the city, on a tall column, stood the statue of the happy prince. he was gilded all over with thin leaves of fine gold, for eyes he had two bright sapphires, and a large red ruby glowed on his sword-hilt. he was very much admired indeed. “he is as beautiful as a weathercock,” remarked one of the town councillors who wished to gain a reputation for having artistic tastes; “only not quite so useful,” he added, fearing lest people should think him unpractical, which he 


In [3]:
# Build char vocabulary and encode
import numpy as np

# The vocabulary is every distinct character of the corpus, in sorted order.
vocab = sorted(set(text))
vocab_size = len(vocab)
print("Vocab size:", vocab_size)
print("Vocab (first 100):", vocab[:100])

# Two-way lookup tables between characters and their integer ids
# (a character's id is its position in `vocab`).
idx2char = dict(enumerate(vocab))
char2idx = {ch: idx for idx, ch in idx2char.items()}

# Encode the whole corpus as one int32 id per character.
encoded = np.fromiter((char2idx[ch] for ch in text), dtype=np.int32, count=len(text))
encoded[:20]


Vocab size: 86
Vocab (first 100): [' ', '!', '"', '$', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '£', '°', '½', 'à', 'â', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ñ', 'ô', 'ö', 'ü', 'ā', 'œ', '—', '‘', '’', '“', '”', '…']


array([50, 38, 35,  0, 38, 31, 46, 46, 55,  0, 46, 48, 39, 44, 33, 35, 12,
        0, 38, 39], dtype=int32)

In [4]:
# Create sequences (X) and next-char targets (y)
# Windowing / batching settings (defined here, reused by later cells).
SEQ_LEN = 60       # number of context characters in each input window
STEP = 1           # stride between the starts of consecutive windows
BATCH_SIZE = 256
VAL_SPLIT  = 0.1

if len(encoded) <= SEQ_LEN:
    raise ValueError(
        f"Text is too short: need more than SEQ_LEN={SEQ_LEN} characters, got {len(encoded)}."
    )

# Build the windows as zero-copy views of `encoded` instead of appending one
# slice per position to a Python list and copying everything into a new array.
# With STEP=1 that copy holds len(encoded) * SEQ_LEN integers.
#   X[k] == encoded[k*STEP : k*STEP + SEQ_LEN]
#   y[k] == encoded[k*STEP + SEQ_LEN]
# The final window of the view is dropped because no character follows it.
# NOTE: X is a read-only view that shares memory with `encoded` and keeps its
# dtype; take `X.copy()` if a writable array is needed.
windows = np.lib.stride_tricks.sliding_window_view(encoded, SEQ_LEN)
X = windows[:-1:STEP]
y = encoded[SEQ_LEN::STEP]

print("Sequences:", X.shape, "Targets:", y.shape)


Sequences: (20395267, 60) Targets: (20395267,)


In [5]:
# Train/Val split
from sklearn.model_selection import train_test_split

# Hold out the final VAL_SPLIT fraction of windows as one contiguous block.
# With STEP=1, neighbouring windows differ by a single character, so a shuffled
# split would place near-duplicates of training windows in the validation set
# and make validation scores look better than they are. shuffle=False keeps
# the original order: train is the leading part, validation the trailing part.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_SPLIT, shuffle=False
)
print("Train:", X_train.shape, y_train.shape, "Val:", X_val.shape, y_val.shape)


Train: (18355740, 60) (18355740,) Val: (2039527, 60) (2039527,)


In [6]:
# Define model builders
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Default hyper-parameters shared by both model builders.
EMBED_DIM = 64    # output dimension of the Embedding layer
RNN_UNITS = 128   # units in the recurrent layer
DROPOUT = 0.2     # `dropout` argument passed to the recurrent layer


def _build_next_char_model(recurrent_cls, model_name, vocab_size, embed_dim, rnn_units, dropout):
    """Assemble and compile Embedding -> recurrent layer -> softmax Dense.

    `recurrent_cls` is the Keras recurrent layer class to instantiate and
    `model_name` the name given to the resulting `keras.Model`.
    """
    token_ids = keras.Input(shape=(None,), dtype='int32')
    embedded = layers.Embedding(vocab_size, embed_dim)(token_ids)
    recurrent_out = recurrent_cls(rnn_units, dropout=dropout)(embedded)
    next_char_probs = layers.Dense(vocab_size, activation='softmax')(recurrent_out)

    net = keras.Model(token_ids, next_char_probs, name=model_name)
    net.compile(
        optimizer=keras.optimizers.Adam(1e-3),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net


def build_simple_rnn(vocab_size, embed_dim=EMBED_DIM, rnn_units=RNN_UNITS, dropout=DROPOUT):
    """Return a compiled next-character model built around a `SimpleRNN` layer.

    The model takes int32 id sequences of any length and outputs a softmax
    distribution over `vocab_size` classes. It is compiled with Adam (1e-3),
    sparse categorical cross-entropy and an accuracy metric.
    """
    return _build_next_char_model(
        layers.SimpleRNN, "SimpleRNN_NextChar", vocab_size, embed_dim, rnn_units, dropout
    )


def build_lstm(vocab_size, embed_dim=EMBED_DIM, rnn_units=RNN_UNITS, dropout=DROPOUT):
    """Return a compiled next-character model built around an `LSTM` layer.

    Identical to `build_simple_rnn` except for the recurrent layer class and
    the model name.
    """
    return _build_next_char_model(
        layers.LSTM, "LSTM_NextChar", vocab_size, embed_dim, rnn_units, dropout
    )

# Build one model of each kind over the same vocabulary, then print both
# layer summaries (SimpleRNN first, LSTM second).
rnn_model = build_simple_rnn(vocab_size)
lstm_model = build_lstm(vocab_size)

for built_model in (rnn_model, lstm_model):
    built_model.summary()


In [7]:
import tensorflow as tf
VAL_SPLIT = 0.1
N = len(encoded)
split_idx = int((1.0 - VAL_SPLIT) * N)

# Contiguous split of the character stream: the leading part trains, the
# trailing part validates.
train_stream = encoded[:split_idx]
# Validation starts SEQ_LEN characters before the split so that its first
# window predicts encoded[split_idx], the first character never used as a
# training target. Starting one character earlier would reproduce the last
# training window exactly.
val_stream   = encoded[max(0, split_idx - SEQ_LEN):]

def make_seq_ds(stream, seq_len=SEQ_LEN, step=STEP, batch_size=BATCH_SIZE, shuffle=True):
    """Turn a 1-D stream of character ids into a batched (inputs, target) dataset.

    Windows of `seq_len + 1` consecutive ids are taken every `step` positions;
    in each window the first `seq_len` ids are the input and the last id is the
    target.

    Args:
        stream: 1-D array of integer character ids.
        seq_len: number of input characters per example.
        step: stride between consecutive window starts.
        batch_size: number of windows per batch.
        shuffle: whether windows are shuffled; defaults to True as before.
            Pass False for evaluation data, where order does not matter.

    Returns:
        A `tf.data.Dataset` of `(x, y)` batches cast to int32, where `x` holds
        all but the last column of each window and `y` the last column.
    """
    ds = tf.keras.utils.timeseries_dataset_from_array(
        data=stream,
        targets=None,
        sequence_length=seq_len + 1,
        sequence_stride=step,
        shuffle=shuffle,
        batch_size=batch_size,
    )
    def split_xy(batch):
        # Columns [0, seq_len) are the context, the final column is the target.
        return tf.cast(batch[:, :-1], tf.int32), tf.cast(batch[:, -1], tf.int32)
    return ds.map(split_xy, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

train_ds = make_seq_ds(train_stream)
# Validation is only evaluated, so shuffling it would add work for no benefit.
val_ds   = make_seq_ds(val_stream, shuffle=False)



In [8]:
# Train
import tensorflow as tf
EPOCHS = 1

# Stop when val_loss has not improved for 3 epochs and roll back to the best
# weights seen; the same callback object is handed to both fit() calls.
cb_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)

# Options common to both training runs.
shared_fit_options = dict(validation_data=val_ds, epochs=EPOCHS, verbose=1)

print("Training SimpleRNN...")
hist_rnn = rnn_model.fit(train_ds, callbacks=[cb_early], **shared_fit_options)

print("\nTraining LSTM...")
hist_lstm = lstm_model.fit(train_ds, callbacks=[cb_early], **shared_fit_options)


Training SimpleRNN...
[1m71703/71703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2164s[0m 30ms/step - accuracy: 0.4760 - loss: 1.7598 - val_accuracy: 0.5203 - val_loss: 1.5967

Training LSTM...
[1m71703/71703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2175s[0m 30ms/step - accuracy: 0.4726 - loss: 1.7697 - val_accuracy: 0.5357 - val_loss: 1.5434


In [10]:
# Generation helpers
import numpy as np

def sample_from_probs(probs, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Args:
        probs: 1-D sequence of non-negative class probabilities.
        temperature: values <= 0 select the arg-max deterministically. Values
            in (0, 1) sharpen the distribution, 1.0 samples from `probs`
            (up to a 1e-9 smoothing term), and larger values flatten it.

    Returns:
        The sampled index as a Python int.
    """
    probs = np.asarray(probs).astype('float64')
    if temperature <= 0:
        return int(np.argmax(probs))
    # The 1e-9 term keeps log() finite for entries that are exactly zero.
    logits = np.log(probs + 1e-9) / temperature
    # Shift so the largest logit is 0 before exponentiating. Without the shift,
    # a small temperature makes every exp() underflow to 0, the normalisation
    # becomes 0/0 = NaN and np.random.choice raises. The shift cancels in the
    # ratio below, so the resulting distribution is otherwise unchanged.
    logits -= np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    return int(np.random.choice(len(probs), p=probs))

def generate_text(model, seed_text, gen_len=300, temperature=0.7):
    """Extend `seed_text` by `gen_len` characters sampled from `model`.

    Args:
        model: Keras model mapping a batch of int32 id sequences to a batch of
            probability vectors over the vocabulary.
        seed_text: starting text. It is lower-cased for encoding only; the
            returned string begins with `seed_text` exactly as given.
        gen_len: number of characters to generate.
        temperature: forwarded to `sample_from_probs`.

    Returns:
        `seed_text` followed by the generated characters.
    """
    # Characters missing from the vocabulary fall back to id 0.
    seq = [char2idx.get(c, 0) for c in seed_text.lower()]
    out = list(seed_text)

    for _ in range(gen_len):
        # Only the most recent SEQ_LEN ids are fed to the model.
        x = np.array([seq[-SEQ_LEN:]], dtype=np.int32)
        # Call the model directly rather than via predict(): the Keras docs
        # recommend this for a single small batch, and predict() adds
        # per-call overhead that dominates a one-character-at-a-time loop.
        # training=False keeps dropout disabled.
        probs = np.asarray(model(x, training=False))[0]
        next_id = sample_from_probs(probs, temperature=temperature)
        next_char = idx2char[next_id]
        out.append(next_char)
        seq.append(next_id)
    return ''.join(out)


In [11]:
# Try generating with each model
# Shared generation settings for the side-by-side comparison.
SEED_TEXT = "once upon a time"
GEN_LEN = 400
TEMPERATURE = 0.7

# Print a header, then up to 1000 characters of generated text, per model.
for header, trained_model in (("— SimpleRNN —", rnn_model), ("\n— LSTM —", lstm_model)):
    print(header)
    sample_text = generate_text(trained_model, SEED_TEXT, gen_len=GEN_LEN, temperature=TEMPERATURE)
    print(sample_text[:1000])


— SimpleRNN —
once upon a time the foretheres who may a consided that the carry me to let the for queen-daughter of the letter and throw the moon the floor in. from when the passed into all she contage counted him the window he had being fine, and she do it to confiers, and she said the king for a chaping night in the fine the little amraya work at she can all he was were it it and was prince, and took the garden and he was fi

— LSTM —
once upon a time the deserved to be way she is your certal i’ll grance come and a name of the prince with with sent and a lady three her and never struck you mean patch of perceived mind you she had ever say my trace, and looking her cecter was could knew them they speared with a sound that ever understand on to the thing and the precise called a sudden son, looking us with a face with the found the white; and do
