
# Large Language Models (LLMs): Sequence Modeling — Ready-to-Run Notebook

This notebook contains everything you need for the **Sequence Modeling** project:
- Helpers (`read_lines`, `clean_text`)
- Minimal `NGram` class and `get_next_word_loop`
- FastText embedding training (`generate_fasttext_vectors`)
- Dataset builder for RNN/LSTM (`create_dataset`)
- Utilities to map embeddings back to words
- **RNN** and **LSTM** PyTorch models + training & prediction helpers

> **Dataset path:** `/anvil/projects/tdm/data/amazon/music.txt`  
> You can reduce the number of lines to speed things up while testing.


In [None]:

import os, time, codecs, numpy as np
from typing import List, Tuple, Dict, Any
import matplotlib.pyplot as plt

# Gensim (word vectors)
from gensim.models import FastText, KeyedVectors

# Torch (RNN/LSTM)
import torch
from torch import nn, optim


In [None]:

def read_lines(file_path: str, n: int, start: int = 0) -> List[str]:
    """Read up to `n` lines from `file_path`, starting at 0-based line `start`.

    Every returned line has its leading and trailing whitespace stripped.
    Fewer than `n` lines are returned when the file ends early. The file is
    opened in text mode with the platform's default encoding.

    Args:
        file_path: Path of the text file to read.
        n: Maximum number of lines to return. Values <= 0 give an empty list.
        start: Index of the first line to keep; earlier lines are skipped.

    Returns:
        The stripped lines, in file order.
    """
    if n <= 0:
        # Without this guard the length check in the loop could never be
        # satisfied and the entire file would be returned.
        return []

    lines: List[str] = []
    with open(file_path, 'r') as f:
        for i, line in enumerate(f):
            if i < start:
                continue
            lines.append(line.strip())
            if len(lines) >= n:
                break
    return lines

def clean_text(text: str) -> str:
    """Normalize raw text down to lowercase letters and whitespace.

    Steps, in order:
      1. Decode literal backslash escapes (for example the two characters
         ``\\`` + ``n``) into the characters they stand for; malformed
         escapes are dropped.
      2. Replace newline characters with a single space.
      3. Lowercase the text.
      4. Drop every character that is neither alphabetic nor whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string. Runs of whitespace are not collapsed.
    """
    # The ``unicode_escape`` codec works on bytes and reads them as Latin-1.
    # Passing a ``str`` straight to ``codecs.decode`` makes Python UTF-8
    # encode it first, which corrupts every non-ASCII letter ("é" -> "Ã©").
    # Encoding to Latin-1 with ``backslashreplace`` keeps Latin-1 characters
    # intact and turns everything else into ``\uXXXX`` escapes that the
    # decoder restores unchanged.
    raw = text.encode('latin-1', errors='backslashreplace')
    text = raw.decode('unicode_escape', errors='ignore')
    text = text.replace('\n', ' ').lower()
    return ''.join(c for c in text if c.isalpha() or c.isspace())


In [None]:

class NGram:
    """Count-based n-gram model over words or over characters.

    Typical use: ``set_data`` -> ``generate_ngram_probabilities`` ->
    ``get_next_word``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, n: int, is_character_based: bool = False):
        """Create an empty model.

        Args:
            n: Length of each n-gram (context of ``n - 1`` items plus the
                predicted item).
            is_character_based: If true the items are single characters,
                otherwise whitespace-separated words.
        """
        self.n = n
        self.is_character_based = is_character_based
        # Cleaned training data: list of strings (character mode) or list of
        # word tuples (word mode); None until set_data() succeeds.
        self.data = None
        # n-gram -> occurrence count.
        self.ngram_frequencies: Dict[Any, int] = {}
        # context (first n-1 items) -> {next item -> probability}.
        self.ngram_probabilities: Dict[Any, Dict[Any, float]] = {}

    def set_data(self, data: List[str]):
        """Clean `data` with ``clean_text`` and store it for counting.

        Non-list input is ignored and leaves the stored data untouched.
        Entries whose cleaned text is empty are dropped.
        """
        if not isinstance(data, list):
            return
        cleaned = [clean_text(s) for s in data]
        if self.is_character_based:
            # list of strings
            self.data = [s for s in cleaned if s]
        else:
            # list of tuples of words for hashability
            self.data = [tuple(s.split()) for s in cleaned if s]

    def generate_ngrams(self) -> Dict[Any, int]:
        """Count every n-gram in the stored data.

        Returns:
            Mapping of n-gram (``str`` in character mode, ``tuple`` of words
            otherwise) to its count; empty when no data has been set.
        """
        self.ngram_frequencies = {}
        if not isinstance(self.data, list) or len(self.data) == 0:
            return {}
        for item in self.data:
            # Slicing a str yields a str and slicing a tuple yields a tuple,
            # so each window is already hashable in both modes.
            for i in range(len(item) - self.n + 1):
                ngram = item[i:i + self.n]
                self.ngram_frequencies[ngram] = self.ngram_frequencies.get(ngram, 0) + 1
        return self.ngram_frequencies

    def generate_ngram_probabilities(self) -> Dict[Any, Dict[Any, float]]:
        """Recount the n-grams and derive next-item probabilities.

        Returns:
            Mapping of context (the first ``n - 1`` items of an n-gram) to a
            dict of ``{next item: probability}``; the probabilities for each
            context sum to 1. Empty when there are no n-grams.
        """
        freqs = self.generate_ngrams()
        self.ngram_probabilities = {}
        if not isinstance(freqs, dict) or len(freqs) == 0:
            return {}
        for ngram, count in freqs.items():
            # Indexing is identical for str and tuple n-grams.
            prefix, last = ngram[:-1], ngram[-1]
            followers = self.ngram_probabilities.setdefault(prefix, {})
            followers[last] = followers.get(last, 0) + count
        # normalize
        for followers in self.ngram_probabilities.values():
            total = float(sum(followers.values()))
            for k in followers:
                followers[k] = followers[k] / total if total > 0 else 0.0
        return self.ngram_probabilities

    def get_next_word(self, previous_words: str, method: str = 'common') -> str:
        """Predict the item that follows `previous_words`.

        Args:
            previous_words: The context. In word mode it must contain exactly
                ``n - 1`` whitespace-separated words; in character mode its
                last ``n - 1`` characters are used.
            method: ``'common'`` (most probable item, also used for any
                unrecognised value), ``'uncommon'`` (least probable item) or
                ``'random'`` (sample according to the probabilities).

        Returns:
            The predicted item, or ``''`` when the model has no probabilities,
            the context has the wrong length, or the context is unknown.
        """
        if not isinstance(self.ngram_probabilities, dict) or len(self.ngram_probabilities) == 0:
            return ''
        context_len = self.n - 1
        if self.is_character_based:
            # A plain ``[-0:]`` slice returns the whole string, so a unigram
            # model needs the empty context spelled out explicitly.
            key = previous_words[-context_len:] if context_len > 0 else ''
        else:
            toks = previous_words.split()
            if len(toks) != context_len:
                return ''
            key = tuple(toks)
        options = self.ngram_probabilities.get(key)
        if not options:
            return ''
        if method == 'random':
            choices = list(options.keys())
            probs = list(options.values())
            # np.random.choice returns numpy.str_; convert to a plain str to
            # match the annotated return type.
            return str(np.random.choice(choices, p=probs))
        if method == 'uncommon':
            return min(options, key=options.get)
        # 'common'
        return max(options, key=options.get)


In [None]:

def get_next_word_loop(model: NGram, start_string: str, n: int = 10, method: str = 'common') -> str:
    """Extend `start_string` by repeatedly asking `model` for the next word.

    Args:
        model: Object exposing an integer ``n`` attribute and a
            ``get_next_word(context, method=...)`` method.
        start_string: Seed text; surrounding whitespace is stripped.
        n: Maximum number of words to append.
        method: Selection strategy forwarded to ``model.get_next_word``.

    Returns:
        The seed followed by the generated words, separated by single spaces.
        Generation stops early as soon as the model returns an empty
        prediction.
    """
    s = start_string.strip()
    context_len = model.n - 1
    for _ in range(n):
        tokens = s.split()
        # ``tokens[-0:]`` would be the whole list, so a unigram model
        # (context_len == 0) must be given an explicitly empty context.
        context = tokens[-context_len:] if context_len > 0 else []
        nxt = model.get_next_word(' '.join(context), method=method)
        if not nxt:
            break
        s += ' ' + nxt
    return s


In [None]:

def generate_fasttext_vectors(lines: List[str], filename: str) -> KeyedVectors:
    """Train FastText embeddings on `lines` and persist them to disk.

    Each line is passed through ``clean_text``; lines that clean to an empty
    string are skipped and the rest are split on whitespace. The model is
    trained with 100-dimensional vectors, a window of 5, ``min_count=1`` and
    a single worker.

    Args:
        lines: Raw text lines to train on.
        filename: Path prefix for the outputs; the full model is written to
            ``<filename>.model`` and the vectors to ``<filename>.wordvectors``.

    Returns:
        The trained model's word vectors.
    """
    sentences = []
    for raw_line in lines:
        text = clean_text(raw_line)
        if text:
            sentences.append(text.split())

    ft_model = FastText(sentences, vector_size=100, window=5, min_count=1, workers=1)
    ft_model.save(f'{filename}.model')

    vectors = ft_model.wv
    vectors.save(f'{filename}.wordvectors')
    return vectors


In [None]:

def create_dataset(
    data: List[str], word_embeddings: KeyedVectors, n: int = 3
) -> Tuple[List[str], List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
    """Build sliding-window (context, target) embedding pairs for each line.

    Each line is cleaned with ``clean_text`` and split into words. Words that
    `word_embeddings` reports as present are replaced by their vectors, and
    every run of `n` consecutive vectors becomes one training example: the
    first ``n - 1`` vectors are the input and the last one is the target.

    Args:
        data: Raw text lines (one review per line in the demo).
        word_embeddings: Mapping-like object supporting ``word in obj`` and
            ``obj[word]`` (a gensim ``KeyedVectors`` in this notebook).
        n: Window length, i.e. context size plus one target. Must be >= 2.

    Returns:
        A 4-tuple ``(all_words, all_embeddings, input_data, output_data)``:
          * ``all_words`` / ``all_embeddings``: every embedded word and its
            vector, in reading order, from lines with at least `n` words
            (including lines that are later dropped for having too few
            embedded words);
          * ``input_data``: one float32 array per kept line, shaped
            ``(num_windows, n - 1, dim)``;
          * ``output_data``: one float32 array per kept line, shaped
            ``(num_windows, 1, dim)``.

    Raises:
        ValueError: If `n` is smaller than 2, because a window then has no
            context to predict from.
    """
    if n < 2:
        raise ValueError(f'n must be at least 2 (got {n})')

    input_data, output_data = [], []
    all_words, all_embeddings = [], []

    for line in data:
        words = clean_text(line).split()
        if len(words) < n:
            continue

        embeddings = []
        for w in words:
            if w in word_embeddings:
                # Look the vector up once; the lookup may be non-trivial for
                # subword models.
                vec = word_embeddings[w]
                embeddings.append(vec)
                all_words.append(w)
                all_embeddings.append(vec)

        if len(embeddings) < n:
            continue

        vectors = np.asarray(embeddings, dtype=np.float32)  # (num_words, dim)
        num_windows = len(embeddings) - n + 1

        # (num_windows, n-1, dim): the context part of each window.
        contexts = np.stack([vectors[i:i + n - 1] for i in range(num_windows)])
        # (num_windows, 1, dim): the last vector of each window, kept 2-D per
        # example so callers can index ``target[0]``.
        targets = np.stack([vectors[i + n - 1:i + n] for i in range(num_windows)])

        input_data.append(contexts)
        output_data.append(targets)

    return (all_words, all_embeddings, input_data, output_data)


In [None]:

def cosine_similarity_vec(v1: np.ndarray, v2: np.ndarray) -> float:
    """Return the cosine similarity of `v1` and `v2`.

    The result is 0.0 when either vector has zero norm, which avoids a
    division by zero.
    """
    inner = np.dot(v1, v2)
    norm_a, norm_b = np.linalg.norm(v1), np.linalg.norm(v2)
    has_zero_norm = norm_a == 0 or norm_b == 0
    return 0.0 if has_zero_norm else float(inner / (norm_a * norm_b))

def get_word_from_embedding(embedding: np.ndarray, keyedvectors: KeyedVectors):
    """Find the vocabulary word whose vector is most similar to `embedding`.

    Every key in ``keyedvectors.index_to_key`` is scored with
    ``cosine_similarity_vec``; on ties the earliest key wins.

    Returns:
        ``(word, similarity)`` for the best match, or ``(None, -1.0)`` when no
        key scores strictly above -1.0 (for example an empty vocabulary).
    """
    winner = None
    top_score = -1.0
    for candidate in keyedvectors.index_to_key:
        score = cosine_similarity_vec(embedding, keyedvectors[candidate])
        if score <= top_score:
            continue
        winner = candidate
        top_score = score
    return (winner, top_score)


In [None]:

class RNNModel(nn.Module):
    """Single-layer RNN that predicts the next word embedding.

    The network is an ``nn.RNN`` (batch-first) followed by a linear layer
    applied to the output of the last time step.
    """

    def __init__(self, input_size=100, hidden_size=128, output_size=100, input_sequence_length=2):
        """Build the layers.

        Args:
            input_size: Dimension of each input embedding.
            hidden_size: Size of the recurrent hidden state.
            output_size: Dimension of the predicted embedding.
            input_sequence_length: Number of context embeddings per example;
                stored for use by the prediction helper.
        """
        super().__init__()
        self.input_sequence_length = input_sequence_length
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        """Run one forward pass.

        Args:
            x: Input tensor laid out as ``[batch, seq_len, input_size]``.
            hidden: Previous hidden state, or ``None`` to start from the
                default initial state.

        Returns:
            ``(out, hidden)`` where ``out`` is ``[batch, output_size]`` and
            ``hidden`` is the updated recurrent state.
        """
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out[:, -1, :])  # take last time step
        return out, hidden

    def train_model(self, input_data, output_data, num_epochs=3, learning_rate=0.001):
        """Train with Adam on an MSE loss, one window at a time.

        Args:
            input_data: One array per review, each holding that review's
                context windows (as produced by ``create_dataset``).
            output_data: Matching arrays of targets; each target is indexed
                with ``[0]`` to obtain the embedding vector.
            num_epochs: Number of passes over the data.
            learning_rate: Adam learning rate.

        Returns:
            The summed loss of the last epoch divided by the number of
            reviews, or 0.0 when there is no data or `num_epochs` is 0.
        """
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)

        num_reviews = len(input_data)
        # Defined up front so the return value exists even with zero epochs.
        avg_loss = 0.0

        for epoch in range(num_epochs):
            total_loss = 0.0
            s = time.time()
            for i, (r_i, r_o) in enumerate(zip(input_data, output_data)):
                if i % 500 == 0:
                    print(f'Processing review {i+1}/{num_reviews}')
                    print(f'Elapsed time: {time.time() - s:.2f}s'); s = time.time()

                hidden_state = None  # reset per review
                for seq, o in zip(r_i, r_o):
                    input_tensor = torch.tensor(seq, dtype=torch.float32).unsqueeze(0)  # [1, seq_len, dim]
                    target_tensor = torch.tensor(o[0], dtype=torch.float32)             # [dim]

                    optimizer.zero_grad()
                    output, hidden_state = self.forward(input_tensor, hidden_state)
                    output = output[-1]  # batch size is 1 -> [dim]

                    # Carry the state to the next window without keeping the
                    # graph of earlier windows alive.
                    hidden_state = hidden_state.detach()

                    loss = criterion(output, target_tensor)
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()

            # Guard against an empty dataset instead of dividing by zero.
            avg_loss = total_loss / num_reviews if num_reviews else 0.0
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')
        return avg_loss

def predict_from_string(input_string: str, model: RNNModel, keyedvectors: KeyedVectors):
    """Predict the word that follows `input_string` using a trained RNN.

    The text is cleaned with ``clean_text`` and its words are embedded. A
    window of ``model.input_sequence_length`` embeddings is then slid over
    the text, feeding each window to the model while carrying the hidden
    state forward. The output for the final window is mapped back to the
    closest vocabulary word.

    Args:
        input_string: Raw prompt text.
        model: Trained model exposing ``input_sequence_length``.
        keyedvectors: Word vectors used for both embedding and decoding.

    Returns:
        ``(word, similarity)`` from ``get_word_from_embedding``, or
        ``('', 0.0)`` when the prompt has fewer usable words than one window.
    """
    window = model.input_sequence_length
    words = clean_text(input_string).split()
    if len(words) < window:
        return ('', 0.0)

    input_embeddings = [keyedvectors[w] for w in words if w in keyedvectors]
    if len(input_embeddings) < window:
        return ('', 0.0)

    # Stack once into a single array: building a tensor from a Python list of
    # ndarrays is slow and makes torch emit a warning.
    embedded = np.asarray(input_embeddings, dtype=np.float32)

    hidden = None
    out = None
    # Inference only, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        for i in range(len(embedded) - window + 1):
            input_tensor = torch.tensor(embedded[i:i + window], dtype=torch.float32).unsqueeze(0)
            out, hidden = model(input_tensor, hidden)

    pred_emb = out.numpy()[0]  # first (only) batch row
    return get_word_from_embedding(pred_emb, keyedvectors)


In [None]:

class LSTMModel(nn.Module):
    """Single-layer LSTM that predicts the next word embedding.

    The network is an ``nn.LSTM`` (batch-first) followed by a linear layer
    applied to the output of the last time step.
    """

    def __init__(self, input_size=100, hidden_size=128, output_size=100, input_sequence_length=2):
        """Build the layers.

        Args:
            input_size: Dimension of each input embedding.
            hidden_size: Size of the recurrent hidden and cell states.
            output_size: Dimension of the predicted embedding.
            input_sequence_length: Number of context embeddings per example;
                stored for use by the prediction helper.
        """
        super().__init__()
        self.input_sequence_length = input_sequence_length
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        """Run one forward pass.

        Args:
            x: Input tensor laid out as ``[batch, seq_len, input_size]``.
            hidden: Previous ``(h, c)`` state tuple, or ``None`` to start from
                the default initial state.

        Returns:
            ``(out, hidden)`` where ``out`` is ``[batch, output_size]`` and
            ``hidden`` is the updated ``(h, c)`` tuple.
        """
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])  # take last time step
        return out, hidden

    def train_model(self, input_data, output_data, num_epochs=3, learning_rate=0.001):
        """Train with Adam on an MSE loss, one window at a time.

        Args:
            input_data: One array per review, each holding that review's
                context windows (as produced by ``create_dataset``).
            output_data: Matching arrays of targets; each target is indexed
                with ``[0]`` to obtain the embedding vector.
            num_epochs: Number of passes over the data.
            learning_rate: Adam learning rate.

        Returns:
            The summed loss of the last epoch divided by the number of
            reviews, or 0.0 when there is no data or `num_epochs` is 0.
        """
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)

        num_reviews = len(input_data)
        # Defined up front so the return value exists even with zero epochs.
        avg_loss = 0.0

        for epoch in range(num_epochs):
            total_loss = 0.0
            s = time.time()
            for i, (r_i, r_o) in enumerate(zip(input_data, output_data)):
                if i % 500 == 0:
                    print(f'Processing review {i+1}/{num_reviews}')
                    print(f'Elapsed time: {time.time() - s:.2f}s'); s = time.time()

                hidden = None  # reset per review
                for seq, o in zip(r_i, r_o):
                    input_tensor = torch.tensor(seq, dtype=torch.float32).unsqueeze(0)  # [1, seq_len, dim]
                    target_tensor = torch.tensor(o[0], dtype=torch.float32)             # [dim]

                    optimizer.zero_grad()
                    output, hidden = self.forward(input_tensor, hidden)
                    output = output[-1]  # batch size is 1 -> [dim]

                    # Carry both states to the next window without keeping
                    # the graph of earlier windows alive.
                    hidden = (hidden[0].detach(), hidden[1].detach())

                    loss = criterion(output, target_tensor)
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()

            # Guard against an empty dataset instead of dividing by zero.
            avg_loss = total_loss / num_reviews if num_reviews else 0.0
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')
        return avg_loss

def predict_from_string_lstm(input_string: str, model: LSTMModel, keyedvectors: KeyedVectors):
    """Predict the word that follows `input_string` using a trained LSTM.

    The text is cleaned with ``clean_text`` and its words are embedded. A
    window of ``model.input_sequence_length`` embeddings is then slid over
    the text, feeding each window to the model while carrying the ``(h, c)``
    state forward. The output for the final window is mapped back to the
    closest vocabulary word.

    Args:
        input_string: Raw prompt text.
        model: Trained model exposing ``input_sequence_length``.
        keyedvectors: Word vectors used for both embedding and decoding.

    Returns:
        ``(word, similarity)`` from ``get_word_from_embedding``, or
        ``('', 0.0)`` when the prompt has fewer usable words than one window.
    """
    window = model.input_sequence_length
    words = clean_text(input_string).split()
    if len(words) < window:
        return ('', 0.0)

    input_embeddings = [keyedvectors[w] for w in words if w in keyedvectors]
    if len(input_embeddings) < window:
        return ('', 0.0)

    # Stack once into a single array: building a tensor from a Python list of
    # ndarrays is slow and makes torch emit a warning.
    embedded = np.asarray(input_embeddings, dtype=np.float32)

    hidden = None
    out = None
    # Inference only, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        for i in range(len(embedded) - window + 1):
            input_tensor = torch.tensor(embedded[i:i + window], dtype=torch.float32).unsqueeze(0)
            out, hidden = model(input_tensor, hidden)

    pred_emb = out.numpy()[0]  # first (only) batch row
    return get_word_from_embedding(pred_emb, keyedvectors)



## Quick-start demo (optional)

> This section runs a few very small training jobs so you can verify that everything works.  
> For better results, increase the number of lines and training epochs.


In [None]:

# End-to-end smoke test of the helpers above: n-gram generation, FastText
# training, dataset construction, and one training epoch each for the RNN and
# the LSTM. Everything here runs at import/cell-execution time.
DATA_PATH = '/anvil/projects/tdm/data/amazon/music.txt'

# --- Tiny NGram demo ---
# Word-level trigram model built from the first 2000 lines of the file.
ng = NGram(3, is_character_based=False)
ng.set_data(read_lines(DATA_PATH, 2000, 0))
ng.generate_ngram_probabilities()
# Appends up to 5 words; each step conditions on the last two words so far.
print(get_next_word_loop(ng, "this is a", n=5))

# --- Train small FastText & build dataset ---
# Embeddings are trained on the first 3000 lines and saved under the
# 'P12_fasttext_small' prefix; the dataset uses 1200 lines starting at line
# index 500 with windows of 3 embeddings (2 context + 1 target).
wv = generate_fasttext_vectors(read_lines(DATA_PATH, 3000, 0), 'P12_fasttext_small')
words, embs, X, Y = create_dataset(read_lines(DATA_PATH, 1200, 500), wv, n=3)

# Number of per-line arrays kept, plus the shapes of the first two as a
# sanity check.
print(len(X), len(Y))
if len(X) > 5:
    print(X[0].shape, X[1].shape)

# --- Train a very small RNN (few epochs) ---
# Skipped entirely when the dataset came back empty.
if X:
    rnn = RNNModel(input_size=100, hidden_size=64, output_size=100, input_sequence_length=2)
    rnn.train_model(X, Y, num_epochs=1, learning_rate=0.001)
    # Prints the (word, similarity) pair predicted to follow the prompt.
    print(predict_from_string("this is a wonderful cd and", rnn, wv))

# --- Train a very small LSTM (few epochs) ---
# Same setup as the RNN above, using the LSTM model and its helper.
if X:
    lstm = LSTMModel(input_size=100, hidden_size=64, output_size=100, input_sequence_length=2)
    lstm.train_model(X, Y, num_epochs=1, learning_rate=0.001)
    print(predict_from_string_lstm("this is a wonderful cd and", lstm, wv))
