In [33]:
import os
import json
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
import nltk

# Prepare Corpus

In [34]:
text = open("large_text.txt").read()

nltk.download('punkt_tab')
CORPUS = nltk.sent_tokenize(text)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lilyc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [35]:
def build_tokenizer(texts, oov_token="<OOV>", lower=True, num_words=None):
    tok = Tokenizer(oov_token=oov_token, lower=lower, num_words=num_words)
    tok.fit_on_texts(texts)
    return tok

def make_ngram_dataset(tokenizer, texts, min_len=2):
    """
    Builds n-gram sequences for next-word prediction.
    Each sequence[i] is tokens[0:i] -> predict tokens[i]
    """
    sequences = []
    for line in texts:
        tokens = tokenizer.texts_to_sequences([line])[0]
        # create incremental n-grams: [w1,w2]->w3, [w1,w2,w3]->w4, ...
        for i in range(min_len, len(tokens) + 1):
            sequences.append(tokens[:i])
    if not sequences:
        raise ValueError("No sequences constructed; check your corpus/tokenizer.")

    max_len = max(len(s) for s in sequences)
    sequences = pad_sequences(sequences, maxlen=max_len, padding="pre")
    X, y = sequences[:, :-1], sequences[:, -1]
    vocab_size = min(tokenizer.num_words or len(tokenizer.word_index) + 1,
                     len(tokenizer.word_index) + 1)
    return X, y, max_len, vocab_size

# Build Model

In [36]:
def build_model(vocab_size, seq_len_minus1, embed_dim=128, rnn_units=256, dropout=0.2):
    model = models.Sequential([
        layers.Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=seq_len_minus1),
        layers.Bidirectional(layers.LSTM(rnn_units, return_sequences=False)),
        layers.Dropout(dropout),
        layers.Dense(rnn_units, activation="relu"),
        layers.Dense(vocab_size, activation="softmax")
    ])
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        metrics=["accuracy"]
    )
    return model

# Inference Helpers

In [37]:
def _apply_temperature(logits, temperature=1.0):
    logits = np.asarray(logits, dtype=np.float64)
    if temperature <= 0:
        # zero (greedy) or negative => fallback to greedy
        one_hot = np.zeros_like(logits)
        one_hot[np.argmax(logits)] = 1.0
        return one_hot
    logits = np.log(np.maximum(logits, 1e-9)) / float(temperature)
    exp = np.exp(logits - np.max(logits))
    return exp / np.sum(exp)

def suggest_next_words(model, tokenizer, seed_text, seq_len_minus1, top_k=5, temperature=1.0):
    """
    Returns top_k candidate next words (no sampling), optionally temperature-adjusted.
    """
    seq = tokenizer.texts_to_sequences([seed_text])[0]
    seq = pad_sequences([seq], maxlen=seq_len_minus1, padding="pre")
    preds = model.predict(seq, verbose=0)[0]
    probs = _apply_temperature(preds, temperature=temperature)

    top_idx = probs.argsort()[-top_k:][::-1]
    idx2word = {idx: w for w, idx in tokenizer.word_index.items() if idx < (tokenizer.num_words or 10**9)}
    return [(idx2word.get(i, "<UNK>"), float(probs[i])) for i in top_idx]

def generate_text(model, tokenizer, seed_text, seq_len_minus1, num_words=10, temperature=1.0, greedy=False):
    """
    Autoregressively generates words. If greedy=True, always pick argmax.
    Otherwise sample by temperature.
    """
    out = seed_text.strip()
    vocab_limit = tokenizer.num_words or (len(tokenizer.word_index) + 1)
    idx2word = {idx: w for w, idx in tokenizer.word_index.items() if idx < vocab_limit}

    for _ in range(num_words):
        seq = tokenizer.texts_to_sequences([out])[0]
        seq = pad_sequences([seq], maxlen=seq_len_minus1, padding="pre")
        preds = model.predict(seq, verbose=0)[0]
        if greedy or temperature <= 0:
            next_id = int(np.argmax(preds))
        else:
            probs = _apply_temperature(preds, temperature=temperature)
            next_id = int(np.random.choice(len(probs), p=probs))
        next_word = idx2word.get(next_id, None)
        if not next_word:
            break
        out += " " + next_word
    return out

# Save/Load

In [38]:
def save_artifacts(model, tokenizer, save_dir="predictive_text_artifacts"):
    os.makedirs(save_dir, exist_ok=True)
    model.save(os.path.join(save_dir, "model.keras"))  # Keras v3 format
    tok_path = os.path.join(save_dir, "tokenizer.pkl")
    with open(tok_path, "wb") as f:
        pickle.dump({
            "config": tokenizer.to_json(),
            "num_words": tokenizer.num_words
        }, f)
    meta = {
        "vocab_size": len(tokenizer.word_index) + 1,
        "num_words_limit": tokenizer.num_words,
    }
    with open(os.path.join(save_dir, "meta.json"), "w") as f:
        json.dump(meta, f, indent=2)
    print(f"Saved model + tokenizer to: {save_dir}")

def load_artifacts(save_dir="predictive_text_artifacts"):
    model = tf.keras.models.load_model(os.path.join(save_dir, "model.keras"))
    with open(os.path.join(save_dir, "tokenizer.pkl"), "rb") as f:
        payload = pickle.load(f)
    tok = Tokenizer.from_json(payload["config"])
    tok.num_words = payload["num_words"]
    with open(os.path.join(save_dir, "meta.json"), "r") as f:
        meta = json.load(f)
    return model, tok, meta

# Train

In [39]:
# tokenize & dataset
tokenizer = build_tokenizer(CORPUS, oov_token="<OOV>", lower=True, num_words=None)
X, y, max_len, vocab_size = make_ngram_dataset(tokenizer, CORPUS, min_len=2)

print(f"Vocab size: {vocab_size}, Max sequence length: {max_len}")

model = build_model(vocab_size=vocab_size, seq_len_minus1=max_len-1,
                    embed_dim=128, rnn_units=256, dropout=0.3)

Vocab size: 4995, Max sequence length: 76




In [40]:
history = model.fit(
    X, y,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    # callbacks=[
    #     tf.keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.5, verbose=1),
    #     tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, monitor="val_loss")
    # ]
)

Epoch 1/50
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 329ms/step - accuracy: 0.0436 - loss: 7.0470 - val_accuracy: 0.0682 - val_loss: 7.1212
Epoch 2/50
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 248ms/step - accuracy: 0.0626 - loss: 6.5251 - val_accuracy: 0.0999 - val_loss: 7.1058
Epoch 3/50
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 252ms/step - accuracy: 0.0761 - loss: 6.1631 - val_accuracy: 0.0997 - val_loss: 7.3220
Epoch 4/50
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 254ms/step - accuracy: 0.0897 - loss: 5.8893 - val_accuracy: 0.1064 - val_loss: 7.5467
Epoch 5/50
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 248ms/step - accuracy: 0.1016 - loss: 5.6515 - val_accuracy: 0.1072 - val_loss: 7.9623
Epoch 6/50
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 247ms/step - accuracy: 0.1158 - loss: 5.4269 - val_accuracy: 0.1125 - val_loss: 8.1306
Epoc

In [75]:
# quick suggestions
seed = "There is a"
print("\nTop suggestions for next word:")
for word, p in suggest_next_words(model, tokenizer, seed, seq_len_minus1=max_len-1, top_k=5, temperature=0.8):
    print(f"{word:>12s}  {p:.4f}")

# autocomplete demo
generated = generate_text(model, tokenizer, seed, seq_len_minus1=max_len-1, num_words=6, temperature=0.9)
print("\nGenerated:", generated)


Top suggestions for next word:
        cave  0.4685
      minute  0.1905
    reminder  0.1301
        hump  0.0612
         cup  0.0187

Generated: There is a date to share with its timeless


In [76]:
# save
save_artifacts(model, tokenizer)

Saved model + tokenizer to: predictive_text_artifacts
