# Large Language Models (LLMs): Transformers & Mini‑LLM

This notebook implements all required parts of the project:

- Data helpers (`read_lines`, `clean_text`)
- **Q1:** `build_vocab`, `generate_dataset`
- **Q2:** `positional_encoding`
- **Q3:** `MultiHeadAttention`
- **Q4:** `TransformerBlock`, `MiniLLM`
- **Q5:** `train_model`, `generate_text` + a quick demo harness

> Dataset path used: `/anvil/projects/tdm/data/amazon/music.txt`


In [None]:

# --- Helpers: reading & cleaning ---
from typing import List, Tuple, Dict
import codecs

def read_lines(file_path: str, n: int, start: int = 0) -> List[str]:
    """Read up to ``n`` whitespace-stripped lines from a text file.

    Args:
        file_path: Path of the text file to read.
        n: Maximum number of lines to return. Zero or negative values
            return an empty list.
        start: Zero-based index of the first line to keep. Negative values
            are treated as 0.

    Returns:
        At most ``n`` lines in file order, each stripped of leading and
        trailing whitespace. Fewer lines are returned when the file ends
        before ``n`` lines have been collected.

    Raises:
        FileNotFoundError: Propagated from ``open`` when the path is missing.
    """
    from itertools import islice

    first = max(start, 0)
    # Clamp so that n <= 0 yields an empty window instead of "no limit".
    count = max(n, 0)
    with open(file_path, 'r') as f:
        # islice stops after `count` lines, so the rest of the file is
        # never read.
        return [line.strip() for line in islice(f, first, first + count)]

def clean_text(text: str) -> str:
    """Normalise a raw review string for tokenisation.

    The steps, in order:
      1) interpret backslash escape sequences via the ``unicode_escape`` codec
      2) turn every newline into a single space
      3) lowercase the text
      4) drop every character that is neither alphabetic nor whitespace
    """
    decoded = codecs.decode(text, 'unicode_escape')
    flattened = decoded.replace('\n', ' ').lower()
    kept = [ch for ch in flattened if ch.isalpha() or ch.isspace()]
    return ''.join(kept)


## Q1 — Vocabulary & Dataset Builders

In [None]:

from typing import List, Tuple, Dict
import numpy as np

def build_vocab(lines: List[str]) -> Tuple[Dict[str, int], List[str]]:
    """Assign an index to every distinct cleaned token, in first-seen order.

    Each line is passed through ``clean_text`` and split on whitespace.

    Returns:
      token_to_idx: dict mapping token -> index
      idx_to_token: list where position i holds the token with index i
    """
    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    first_seen = dict.fromkeys(
        word for line in lines for word in clean_text(line).split()
    )
    idx_to_token: List[str] = list(first_seen)
    token_to_idx: Dict[str, int] = {
        word: position for position, word in enumerate(idx_to_token)
    }
    return token_to_idx, idx_to_token

def generate_dataset(lines: List[str], token_to_idx: Dict[str, int], sequence_length: int):
    """Turn reviews into (context window, next token) training pairs.

    Per review:
      - clean the text and split it into words
      - skip the review when it has fewer than `sequence_length + 1` words
      - map words to indices, silently dropping words absent from `token_to_idx`
      - skip again if too few indices survive the mapping
      - slide a window of `sequence_length` indices over the review; each
        window is an input and the index right after it is the target
    Returns: (inputs, outputs) -- a list of index lists and a list of ints.
    """
    min_tokens = sequence_length + 1
    inputs, outputs = [], []
    for review in lines:
        words = clean_text(review).split()
        if len(words) < min_tokens:
            continue
        ids = [token_to_idx[word] for word in words if word in token_to_idx]
        if len(ids) < min_tokens:
            continue
        for begin in range(len(ids) - sequence_length):
            end = begin + sequence_length
            inputs.append(ids[begin:end])
            outputs.append(ids[end])
    return inputs, outputs


In [None]:

# --- Q1 quick test ---
# Builds a vocabulary and a sliding-window dataset from the first 500 lines of
# the dataset file and prints two values to compare against the expected ones.
# The names assigned here (lines, token_to_idx, idx_to_token, inputs, outputs)
# are module-level, so they remain defined after this cell runs successfully.
try:
    sequence_len = 3
    lines = read_lines('/anvil/projects/tdm/data/amazon/music.txt', 500)
    token_to_idx, idx_to_token = build_vocab(lines)
    inputs, outputs = generate_dataset(lines, token_to_idx, sequence_len)
    print(f'Length of Inputs {len(inputs)}')  # Expected ~11095
    print(f"CD Index: {token_to_idx.get('cd', None)}")  # Expected 3 in the prompt; prints None if 'cd' is not in the vocabulary
except FileNotFoundError:
    # Only a missing dataset file is handled; any other error propagates.
    print("Dataset not found in this environment. Update the path if needed and re-run this cell.")


## Q2 — Positional Encoding

In [None]:

import torch, math

def positional_encoding(seq_len: int, model_dimensions: int) -> torch.Tensor:
    """Build the sinusoidal positional-encoding table.

    Even feature columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)``, with ``w_i = 10000 ** (-2i / model_dimensions)``.

    Args:
        seq_len: Number of positions (rows) to encode.
        model_dimensions: Feature width of each position. Odd widths are
            supported: the last cosine column is simply absent.

    Returns:
        float32 tensor of shape [1, seq_len, model_dimensions].
    """
    pe = torch.zeros(seq_len, model_dimensions, dtype=torch.float32)  # [T, C]
    position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)  # [T, 1]
    div_term = torch.exp(-math.log(10000.0) * (torch.arange(0, model_dimensions, 2, dtype=torch.float32) / model_dimensions))
    angles = position * div_term  # [T, ceil(C / 2)]
    pe[:, 0::2] = torch.sin(angles)  # even columns
    # For an odd C there is one fewer odd column than even columns, so trim
    # the frequencies to match; for an even C the slice keeps every column.
    pe[:, 1::2] = torch.cos(angles[:, : model_dimensions // 2])  # odd columns
    return pe.unsqueeze(0)  # [1, T, C]

# Sanity check from prompt: print the table's shape, then the encodings of
# positions 0 through 5 rounded to four decimals (one tensor per line).
pe = positional_encoding(10, 4)
print(pe.shape)                       # torch.Size([1, 10, 4])
print(*(torch.round(pe[0, pos, :], decimals=4) for pos in range(6)), sep='\n')


## Q3 — Multi-Head Attention

In [None]:

class MultiHeadAttention(torch.nn.Module):
    """Multi-head scaled dot-product self-attention (no mask is applied).

    One linear layer produces queries, keys and values together; the heads
    attend independently and their outputs are concatenated and passed
    through a final linear layer.
    """

    def __init__(self, model_dimensions: int, num_heads: int):
        super().__init__()
        assert model_dimensions % num_heads == 0, "model_dimensions must be divisible by num_heads"
        self.num_heads = num_heads
        self.d_k = model_dimensions // num_heads  # feature width of one head
        # Fused projection: the output holds q, k and v side by side.
        self.qkv = torch.nn.Linear(model_dimensions, 3 * model_dimensions)
        self.fc_out = torch.nn.Linear(model_dimensions, model_dimensions)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend over ``x`` of shape [B, T, C]; the result is also [B, T, C]."""
        batch, steps, width = x.size()
        fused = self.qkv(x).reshape(batch, steps, 3, self.num_heads, self.d_k)
        # Split the q/k/v axis, then move heads ahead of time: [B, H, T, d_k].
        queries, keys, values = (part.transpose(1, 2) for part in fused.unbind(dim=2))
        scaled = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.d_k)  # [B, H, T, T]
        weights = torch.softmax(scaled, dim=-1)                                       # [B, H, T, T]
        per_head = torch.matmul(weights, values)                                      # [B, H, T, d_k]
        merged = per_head.transpose(1, 2).reshape(batch, steps, width)                # [B, T, C]
        return self.fc_out(merged)

# Sanity check
import random

import numpy as np

# Seed every RNG *before* constructing the module: torch.nn.Linear draws its
# initial weights from torch's global RNG at construction time, so seeding
# afterwards would leave the printed output different on every run.
torch.manual_seed(78)
np.random.seed(78)
random.seed(78)
b, t, c, h = 2, 5, 128, 4
mha = MultiHeadAttention(c, h)
x = torch.from_numpy(np.random.rand(b, t, c).astype('f'))  # float32 input, [b, t, c]
y = mha(x)
print(y.shape)
print(y[0][0][:10])


## Q4 — Transformer Block & Mini‑LLM

In [None]:

class TransformerBlock(torch.nn.Module):
    """One transformer layer: self-attention, then a two-layer feed-forward net.

    Each sub-layer is wrapped in a residual connection and followed by its
    own LayerNorm (normalisation is applied after the residual sum).
    """

    def __init__(self, model_dimensions: int, num_heads: int, attention_class=MultiHeadAttention):
        super().__init__()
        nn = torch.nn
        hidden_width = 4 * model_dimensions  # feed-forward expansion
        self.attn = attention_class(model_dimensions, num_heads)
        self.norm1 = nn.LayerNorm(model_dimensions)
        self.ff = nn.Sequential(
            nn.Linear(model_dimensions, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, model_dimensions),
        )
        self.norm2 = nn.LayerNorm(model_dimensions)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply attention and feed-forward sub-layers; output has x's shape."""
        attended = self.norm1(x + self.attn(x))
        return self.norm2(attended + self.ff(attended))

class MiniLLM(torch.nn.Module):
    """Small next-token predictor: embedding + positional encoding,
    a stack of transformer blocks, and a linear head over the vocabulary.

    The positional table is built once for ``seq_len`` positions and stored
    as the buffer ``pe``, so inputs longer than ``seq_len`` are not covered.
    """

    def __init__(self, vocab_size: int, model_dimensions: int = 128, num_heads: int = 4, num_layers: int = 2, seq_len: int = 10, attention_class=MultiHeadAttention):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, model_dimensions)
        # A buffer is saved with the model but is not a trainable parameter.
        self.register_buffer('pe', positional_encoding(seq_len, model_dimensions))
        self.transformer_blocks = torch.nn.Sequential(
            *(
                TransformerBlock(model_dimensions, num_heads, attention_class=attention_class)
                for _ in range(num_layers)
            )
        )
        self.fc = torch.nn.Linear(model_dimensions, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map token ids of shape [B, T] to next-token logits of shape [B, vocab_size]."""
        steps = x.size(1)
        hidden = self.embed(x) + self.pe[:, :steps, :]
        hidden = self.transformer_blocks(hidden)
        # Only the final position's representation feeds the prediction head.
        return self.fc(hidden[:, -1, :])


## Q5 — Training Loop & Text Generation

In [None]:

from tqdm.auto import tqdm
import torch.nn.functional as F

def train_model(model: torch.nn.Module, inputs, targets, epochs: int = 5, learning_rate: float = 1e-3):
    """Train ``model`` for next-token prediction, one example per optimiser step.

    Args:
        model: Module mapping a [1, T] tensor of token ids to [1, V] logits.
        inputs: Sequence of token-id lists (one context window each).
        targets: Sequence of target token ids, aligned with ``inputs``.
        epochs: Number of full passes over the data.
        learning_rate: Learning rate passed to Adam.

    Returns:
        The same ``model`` object, trained in place and left in training mode.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.CrossEntropyLoss()
    # Build every batch on the model's own device, so a model that was moved
    # off the CPU trains without a device-mismatch error.
    device = next(model.parameters()).device
    # zip() stops at the shorter sequence; report and average over that count.
    num_pairs = min(len(inputs), len(targets))
    model.train()
    for epoch in tqdm(range(epochs), desc='Training Epochs'):
        total_loss = 0.0
        steps = 0
        for x, y in tqdm(zip(inputs, targets), desc=f'Epoch {epoch+1}', total=num_pairs, leave=False):
            x_tensor = torch.tensor([x], dtype=torch.long, device=device)  # [B=1, T]
            y_tensor = torch.tensor([y], dtype=torch.long, device=device)  # [B=1]
            logits = model(x_tensor)                                        # [1, V]
            loss = loss_fn(logits, y_tensor)                                # CE expects [N, C] and target [N]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            steps += 1
        # max(1, ...) keeps the division defined when there is no data.
        avg = total_loss / max(1, steps)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg:.4f}")
    return model

def generate_text(model, start_sequence: str, token_to_idx, idx_to_token, length: int = 20, context_length: int = 10, temperature: float = 1.0):
    """Sample a continuation of ``start_sequence`` one token at a time.

    Args:
        model: Callable mapping a [1, T] tensor of token ids to [1, V] logits.
            It is switched to eval mode and left that way.
        start_sequence: Whitespace-separated prompt. Words missing from
            ``token_to_idx`` are dropped; the prompt is not cleaned or
            lowercased here.
        token_to_idx: Mapping from token to index.
        idx_to_token: Sequence mapping index back to token.
        length: Number of tokens to sample.
        context_length: How many trailing tokens are fed to the model per step.
        temperature: Divides the logits before softmax; values below 1e-6
            are clamped to 1e-6.

    Returns:
        The known prompt tokens followed by the sampled tokens, joined by
        spaces, or "" when no prompt word is in the vocabulary.
    """
    model.eval()
    current = [token_to_idx[tok] for tok in start_sequence.split() if tok in token_to_idx]
    if not current:
        return ""

    # Feed the model tensors on its own device so generation also works for a
    # model moved off the CPU. device=None keeps torch's default placement.
    device = None
    if isinstance(model, torch.nn.Module):
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    scale = max(1e-6, temperature)
    with torch.no_grad():
        for _ in range(length):
            x = torch.tensor([current[-context_length:]], dtype=torch.long, device=device)
            logits = model(x)  # [1, V]
            probs = F.softmax(logits / scale, dim=-1).squeeze(0)  # [V]
            current.append(torch.multinomial(probs, num_samples=1).item())
    return ' '.join(idx_to_token[i] for i in current)


### Quick Smoke Test (tiny subset for speed)

In [None]:

# Tiny demo to verify end-to-end flow quickly.
# Pipeline: read 300 lines -> build vocabulary -> build sliding-window dataset
# -> train a small MiniLLM on the first 2000 examples -> sample a continuation.
try:
    CONTEXT = 6  # window length; also used as the model's seq_len and the generation context
    LINES = read_lines('/anvil/projects/tdm/data/amazon/music.txt', 300)
    tok2i, i2tok = build_vocab(LINES)
    X, Y = generate_dataset(LINES, tok2i, CONTEXT)
    vocab_size = len(tok2i)
    model = MiniLLM(vocab_size, model_dimensions=64, num_heads=4, num_layers=2, seq_len=CONTEXT)
    # Slicing caps the training set at 2000 examples to keep the demo fast.
    train_model(model, X[:2000], Y[:2000], epochs=2, learning_rate=1e-3)
    prompt = "this cd is the"
    out = generate_text(model, prompt, tok2i, i2tok, length=12, context_length=CONTEXT, temperature=0.9)
    print("Prompt:", prompt)
    print("Output:", out)
except FileNotFoundError:
    # Only a missing dataset file is handled; any other error propagates.
    print("Dataset not found in this environment. Update the path and re-run.")
