In [2]:
import os
import json
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
import nltk

# Prepare Corpus

In [4]:
# Read the whole training text. The context manager guarantees the file handle
# is closed even if read() fails (the bare open(...).read() left that to the GC).
# NOTE(review): no encoding is given, so the platform default is used — confirm
# the file's encoding and pass it explicitly for reproducible results.
with open("large_text.txt") as corpus_file:
    text = corpus_file.read()

# Make sure the Punkt sentence-tokenizer data is available, then split the raw
# text into a list of sentences; each sentence is one training "line".
nltk.download('punkt_tab')
CORPUS = nltk.sent_tokenize(text)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lilyc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
def build_tokenizer(texts, oov_token="<OOV>", lower=True, num_words=None):
    """
    Create a Keras Tokenizer and fit its vocabulary on `texts`.

    Args:
        texts: iterable of strings handed to `Tokenizer.fit_on_texts`.
        oov_token: token passed to the Tokenizer as its out-of-vocabulary marker.
        lower: forwarded to the Tokenizer's `lower` option.
        num_words: forwarded to the Tokenizer's `num_words` option (None = no cap).

    Returns:
        The fitted Tokenizer instance.
    """
    settings = {"oov_token": oov_token, "lower": lower, "num_words": num_words}
    fitted = Tokenizer(**settings)
    fitted.fit_on_texts(texts)
    return fitted

def make_ngram_dataset(tokenizer, texts, min_len=2):
    """
    Builds prefix (n-gram) sequences for next-word prediction.

    Every text is tokenized and expanded into its prefixes tokens[:i] for
    i = min_len .. len(tokens). All prefixes are left-padded to a common
    length; the last token of each prefix is the target and everything before
    it is the input, i.e. tokens[:i-1] -> tokens[i-1].

    Args:
        tokenizer: fitted tokenizer exposing `texts_to_sequences`, `word_index`
            and `num_words`.
        texts: iterable of strings (one training line each).
        min_len: shortest prefix to emit, counted in tokens including the
            target. Must be >= 1; the default 2 means at least one context
            token per sample.

    Returns:
        (X, y, max_len, vocab_size) where X holds the padded inputs
        (max_len - 1 columns), y the target ids, max_len the padded prefix
        length and vocab_size is len(word_index) + 1, capped by
        tokenizer.num_words when that is set.

    Raises:
        ValueError: if min_len < 1, or if no sequence could be built.
    """
    if min_len < 1:
        # A prefix of length 0 has no target token: after padding its label
        # would be 0 (the padding value), which silently corrupts training.
        raise ValueError("min_len must be >= 1 so every sequence ends in a real target token.")

    sequences = []
    # Tokenize the corpus in one call instead of once per line.
    for tokens in tokenizer.texts_to_sequences(texts):
        # incremental prefixes (min_len=2): [w1]->w2, [w1,w2]->w3, ...
        sequences.extend(tokens[:i] for i in range(min_len, len(tokens) + 1))
    if not sequences:
        raise ValueError("No sequences constructed; check your corpus/tokenizer.")

    max_len = max(len(s) for s in sequences)
    sequences = pad_sequences(sequences, maxlen=max_len, padding="pre")
    # Last column is the word to predict; the rest is its (left-padded) context.
    X, y = sequences[:, :-1], sequences[:, -1]

    # +1 because word ids start at 1 and 0 is used for padding.
    full_vocab = len(tokenizer.word_index) + 1
    vocab_size = min(tokenizer.num_words, full_vocab) if tokenizer.num_words else full_vocab
    return X, y, max_len, vocab_size

# Build Model

In [None]:
def build_model(vocab_size, seq_len_minus1, embed_dim=128, rnn_units=256, dropout=0.2):
    """
    Build and compile the next-word prediction network.

    Architecture: Embedding -> Bidirectional LSTM -> Dropout -> Dense(relu)
    -> Dense(softmax over the vocabulary).

    Args:
        vocab_size: number of token ids; used as the embedding input dimension
            and as the size of the softmax output.
        seq_len_minus1: length of each input sequence (context tokens only).
        embed_dim: embedding vector size.
        rnn_units: units of the LSTM (per direction) and of the hidden Dense layer.
        dropout: dropout rate applied after the recurrent layer.

    Returns:
        A compiled Sequential model (sparse categorical cross-entropy loss,
        Adam with learning rate 3e-4, accuracy metric).
    """
    model = models.Sequential([
        # Declare the input shape explicitly. Embedding's `input_length`
        # argument is deprecated in Keras 3; an Input layer works on both
        # Keras 2 and 3 and keeps the model built at construction time.
        layers.Input(shape=(seq_len_minus1,)),
        layers.Embedding(input_dim=vocab_size, output_dim=embed_dim),
        # return_sequences=False: only the final state feeds the classifier.
        layers.Bidirectional(layers.LSTM(rnn_units, return_sequences=False)),
        layers.Dropout(dropout),
        layers.Dense(rnn_units, activation="relu"),
        layers.Dense(vocab_size, activation="softmax"),
    ])
    model.compile(
        # Targets are integer word ids, hence the sparse loss.
        loss="sparse_categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
        metrics=["accuracy"]
    )
    return model

# Inference Helpers

In [None]:
def _apply_temperature(logits, temperature=1.0):
    """
    Re-weight a score vector by a sampling temperature.

    Despite the parameter name, the values are treated as probabilities: they
    are floored at 1e-9, log-transformed, divided by `temperature` and pushed
    through a softmax again.

    Args:
        logits: array-like of non-negative scores (converted to float64).
        temperature: values > 0 rescale the distribution; values <= 0 fall
            back to greedy selection.

    Returns:
        float64 array of the same shape: a distribution summing to 1 for
        temperature > 0, otherwise a one-hot vector marking the arg-max.
    """
    scores = np.asarray(logits, dtype=np.float64)

    if temperature <= 0:
        # zero or negative temperature => all mass on the single best entry
        winner_only = np.zeros_like(scores)
        best = np.argmax(scores)
        winner_only[best] = 1.0
        return winner_only

    # The floor keeps log() finite for zero-probability entries.
    floored = np.maximum(scores, 1e-9)
    scaled = np.log(floored) / float(temperature)
    # Subtract the maximum before exponentiating for numerical stability.
    weights = np.exp(scaled - np.max(scaled))
    return weights / np.sum(weights)

def suggest_next_words(model, tokenizer, seed_text, seq_len_minus1, top_k=5, temperature=1.0):
    """
    Returns top_k candidate next words (no sampling), optionally temperature-adjusted.

    Args:
        model: object whose `predict(batch, verbose=0)` returns one score row
            per input sequence.
        tokenizer: fitted tokenizer exposing `texts_to_sequences`, `word_index`
            and `num_words`.
        seed_text: text whose continuation is wanted.
        seq_len_minus1: input length the model expects; the seed is left-padded
            (or truncated by pad_sequences) to this length.
        top_k: number of candidates to return; values <= 0 yield an empty list.
        temperature: passed to `_apply_temperature` to rescale the reported
            probabilities.

    Returns:
        List of (word, probability) tuples, most likely first. Ids with no
        vocabulary word are reported as "<UNK>".
    """
    if top_k <= 0:
        # probs.argsort()[-0:] would select the *whole* vocabulary, and a
        # negative top_k an arbitrary slice, so handle these explicitly.
        return []

    seq = tokenizer.texts_to_sequences([seed_text])[0]
    seq = pad_sequences([seq], maxlen=seq_len_minus1, padding="pre")
    preds = np.asarray(model.predict(seq, verbose=0)[0])
    probs = _apply_temperature(preds, temperature=temperature)

    # Rank on the raw predictions: temperature scaling never changes the order
    # for temperature > 0, and for temperature <= 0 the one-hot output would
    # leave every runner-up tied at 0 (i.e. in arbitrary order).
    top_idx = np.argsort(preds)[-top_k:][::-1]

    vocab_limit = tokenizer.num_words or (len(tokenizer.word_index) + 1)
    idx2word = {idx: w for w, idx in tokenizer.word_index.items() if idx < vocab_limit}
    return [(idx2word.get(int(i), "<UNK>"), float(probs[i])) for i in top_idx]

def generate_text(model, tokenizer, seed_text, seq_len_minus1, num_words=10, temperature=1.0, greedy=False):
    """
    Autoregressively generates words. If greedy=True, always pick argmax.
    Otherwise sample by temperature.

    Args:
        model: object whose `predict(batch, verbose=0)` returns one score row
            per input sequence.
        tokenizer: fitted tokenizer exposing `texts_to_sequences`, `word_index`
            and `num_words`.
        seed_text: starting text; surrounding whitespace is stripped.
        seq_len_minus1: input length the model expects; the running context is
            left-padded (or truncated by pad_sequences) to this length.
        num_words: maximum number of words to append.
        temperature: sampling temperature; values <= 0 behave like greedy=True.
        greedy: when True, always take the highest-scoring id.

    Returns:
        The stripped seed followed by the generated words, space-separated.
        Generation stops early when the predicted id has no vocabulary word.
    """
    seed = seed_text.strip()
    vocab_limit = tokenizer.num_words or (len(tokenizer.word_index) + 1)
    idx2word = {idx: w for w, idx in tokenizer.word_index.items() if idx < vocab_limit}

    # Tokenize the seed once and extend the id list with each prediction,
    # instead of re-tokenizing the whole growing string on every step.
    token_ids = list(tokenizer.texts_to_sequences([seed])[0])
    # An empty seed contributes no piece, so the result has no leading space.
    pieces = [seed] if seed else []
    pick_argmax = greedy or temperature <= 0  # loop-invariant

    for _ in range(num_words):
        window = pad_sequences([token_ids], maxlen=seq_len_minus1, padding="pre")
        preds = model.predict(window, verbose=0)[0]
        if pick_argmax:
            next_id = int(np.argmax(preds))
        else:
            probs = _apply_temperature(preds, temperature=temperature)
            next_id = int(np.random.choice(len(probs), p=probs))
        next_word = idx2word.get(next_id)
        if not next_word:
            # id without a word (e.g. 0) — nothing sensible to append
            break
        token_ids.append(next_id)
        pieces.append(next_word)
    return " ".join(pieces)