# Setup

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import sys
import regex as re

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from pathlib import Path
from typing import Optional, Tuple, Dict

import matplotlib.pyplot as plt

In [3]:
# Seed PyTorch's global random number generator so that repeated runs of the
# notebook draw the same random numbers.
torch.manual_seed(36432)

<torch._C.Generator at 0x1e0ed10ce50>

# Data Preparation

## Importing

In [4]:
def read_document(filepath: str) -> pd.DataFrame:
    """Read a UTF-8 text file into a one-column DataFrame, one row per kept line.

    Lines containing the markers "ENTRY" or "CHAPTER" are dropped. Kept lines
    are stored unchanged (including their trailing newline) in column ``0``.

    Args:
        filepath: Location of the text file. Anything accepted by ``open``
            works, including ``pathlib.Path`` objects.

    Returns:
        A DataFrame with a single column ``0`` holding the kept lines. The
        column exists even when no line survives the filter, so callers can
        always index ``df[0]``.
    """
    # put any filters here
    excluded_markers = ("ENTRY", "CHAPTER")

    with open(filepath, "r", encoding="utf-8") as f:
        filtered_lines = [
            line for line in f
            if not any(marker in line for marker in excluded_markers)
        ]

    # Naming the column explicitly keeps column 0 present for empty input;
    # pd.DataFrame([]) alone would have no columns at all.
    return pd.DataFrame(filtered_lines, columns=[0])

In [5]:
all_stories = {}
data_directory = Path("./data")

# Sort the matches: glob order depends on the filesystem, and the load order
# decides the order in which the stories are concatenated into the corpus.
for file in sorted(data_directory.glob("*.txt")):
    filename = file.stem
    all_stories[filename] = read_document(file)

# Fail with a clear message instead of a NameError on `filename` below.
if not all_stories:
    raise FileNotFoundError(f"no .txt files found in {data_directory.resolve()}")

# Preview the last story that was loaded.
all_stories[filename].head()

Unnamed: 0,0
0,I have come to the conclusion.. That I have a ...
1,If I were to begin writing about these strange...
2,It happened during my foreign exchange at Japa...
3,"At the very least, that's what I thought. No....."
4,"Though then again, how I came to bother her wa..."


With this simple code snippet, we are able to import as many text files
as we want from our data directory, filter out lines as necessary, and
convert them into a DataFrame.

## Tokenizing the text

This implementation uses Byte Pair Encoding (BPE) tokenization, which
learns a vocabulary of a fixed size from the text.
It strikes a healthy balance between tokenizing per character (a tiny
vocabulary but very long token sequences) and tokenizing per word (short
sequences but a very large vocabulary).

In [6]:
# Join the lines of each story with single spaces, then concatenate the
# stories back to back (no separator is inserted between two stories).
text_sequence = "".join(
    " ".join(story_frame[0].values) for story_frame in all_stories.values()
)

print(f"size of text_sequence: {len(text_sequence)}")

size of text_sequence: 47476


In [7]:
# Add ./lib to the module search path; the two imports below are presumably
# resolved from there, so they must stay after this line.
sys.path.append('./lib')
from minbpe import BasicTokenizer
from transformer.model import GPTLanguageModel

# Train a tokenizer on the whole corpus, requesting a vocabulary of 1024 entries.
tokenizer = BasicTokenizer()
tokenizer.train(text_sequence, vocab_size=1024)

Taking a look at the learned vocabulary now:

In [8]:
# Mapping of token id -> bytes held by the trained tokenizer.
vocab = tokenizer.vocab

# NOTE(review): this prints every vocabulary entry; consider displaying only a
# small slice to keep the notebook output short.
print(vocab)

{0: b'\x00', 1: b'\x01', 2: b'\x02', 3: b'\x03', 4: b'\x04', 5: b'\x05', 6: b'\x06', 7: b'\x07', 8: b'\x08', 9: b'\t', 10: b'\n', 11: b'\x0b', 12: b'\x0c', 13: b'\r', 14: b'\x0e', 15: b'\x0f', 16: b'\x10', 17: b'\x11', 18: b'\x12', 19: b'\x13', 20: b'\x14', 21: b'\x15', 22: b'\x16', 23: b'\x17', 24: b'\x18', 25: b'\x19', 26: b'\x1a', 27: b'\x1b', 28: b'\x1c', 29: b'\x1d', 30: b'\x1e', 31: b'\x1f', 32: b' ', 33: b'!', 34: b'"', 35: b'#', 36: b'$', 37: b'%', 38: b'&', 39: b"'", 40: b'(', 41: b')', 42: b'*', 43: b'+', 44: b',', 45: b'-', 46: b'.', 47: b'/', 48: b'0', 49: b'1', 50: b'2', 51: b'3', 52: b'4', 53: b'5', 54: b'6', 55: b'7', 56: b'8', 57: b'9', 58: b':', 59: b';', 60: b'<', 61: b'=', 62: b'>', 63: b'?', 64: b'@', 65: b'A', 66: b'B', 67: b'C', 68: b'D', 69: b'E', 70: b'F', 71: b'G', 72: b'H', 73: b'I', 74: b'J', 75: b'K', 76: b'L', 77: b'M', 78: b'N', 79: b'O', 80: b'P', 81: b'Q', 82: b'R', 83: b'S', 84: b'T', 85: b'U', 86: b'V', 87: b'W', 88: b'X', 89: b'Y', 90: b'Z', 91: b'[',

In [9]:
# Encode a short sample string to inspect the token ids it maps to.
tokenizer.encode("What the")

[559, 396]

In [10]:
# Decode a handful of arbitrary token ids back into text.
tokenizer.decode([120, 543, 222, 76])

'xstill �L'

Looking pretty spicy.

Let's also append some special tokens to the vocab.

This will be useful for training later on.

In [11]:
# Special-token ids continue right after the largest id in the learned vocab.
# max() does not depend on the dict's insertion order, unlike taking the last key.
max_vocab_id = max(vocab)
# NOTE(review): this maps id -> token string; confirm that this is the
# direction BasicTokenizer expects for `special_tokens` (some minbpe versions
# use token string -> id) — TODO verify against ./lib/minbpe.
tokenizer.special_tokens = {
    max_vocab_id + 1: "<<startoftext>>",
    max_vocab_id + 2: "<<separator>>",
    max_vocab_id + 3: "<<endoftext>>",
    max_vocab_id + 4: "<<unk>>"
}

In [12]:
# Number of tokens the full corpus encodes to.
len(tokenizer.encode(text_sequence))

15625

In [13]:
# Make sure the target directory exists before saving, in case
# tokenizer.save does not create it itself (e.g. on a fresh checkout).
Path("./output/tokenizer").mkdir(parents=True, exist_ok=True)
tokenizer.save(file_prefix="./output/tokenizer/da_tokenizer")

## Transformer Model

We will use a decoder-only (GPT-style) transformer.

A decoder consists of the following components:

- token embedding (represents each token as a vector)
- positional encoding (preserves token order)
- self-attention (tracks the relations between tokens)
- residual connections
- layer normalization

Parameters of a decoder:
- block size
- embedding size
- number of heads & head size
- number of blocks (layers)

In [14]:
def get_vocab_size(tokenizer: BasicTokenizer) -> int:
    """Return the total number of tokens known to ``tokenizer``.

    The count is the number of entries in ``tokenizer.vocab`` plus the number
    of entries in ``tokenizer.special_tokens``.
    """
    return len(tokenizer.vocab) + len(tokenizer.special_tokens)

def print_model_structure(model: torch.nn.Module, indent: str = '') -> None:
    """Print the module tree of ``model`` with per-module parameter counts.

    Each direct child is printed on its own line as
    ``<indent>|-- <name>: <class name> (<count> parameters)`` and is followed
    by its own children, printed one indentation level deeper.

    Args:
        model: Module whose children are listed.
        indent: Prefix placed in front of every line printed at this level.
    """
    nested_indent = indent + '|    '
    for child_name, submodule in model.named_children():
        param_total = 0
        for tensor in submodule.parameters():
            param_total += tensor.numel()
        kind = type(submodule).__name__
        print(f"{indent}|-- {child_name}: {kind} ({param_total:,} parameters)")
        print_model_structure(submodule, nested_indent)

In [15]:
# hyperparameters
# blockSize is handed to the model as block_size and to the datasets as the
# window length; embedSize, headCount, layerCount and dropout are handed to the
# model as n_embd, n_head, n_layer and dropout.
blockSize = 256
embedSize = 384
headCount = 6
layerCount = 6
dropout = 0.2
# Batch size used when building the DataLoaders.
batchSize = 128
# Learned vocabulary entries plus the registered special tokens.
vocabSize = get_vocab_size(tokenizer)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [16]:
# Build the model from the notebook-level hyperparameters and move it to the
# selected device.
model = GPTLanguageModel(
    vocab_size=vocabSize,
    block_size=blockSize,
    n_embd=embedSize,
    n_head=headCount,
    n_layer=layerCount,
    dropout=dropout,
    device=device
).to(device)

# Report the total number of parameters, in millions.
print(sum(p.numel() for p in model.parameters())/1e6, 'M params')

11.529476 M params


In [17]:
#print_model_structure(model)

# Pre-Training

In [18]:
# Encode the whole corpus into a list of token ids and show its length.
encoded_text_sequence = tokenizer.encode(text_sequence)
len(encoded_text_sequence)

15625

In [19]:
# Turn the token ids into a tensor and hold out the final 10% for validation.
data = torch.tensor(encoded_text_sequence, dtype=torch.long)
split_index = int(0.9 * len(data))
train_data, val_data = data[:split_index], data[split_index:]

In [20]:
class TextDataset(Dataset):
    """Sliding-window dataset over a token tensor for next-token prediction.

    Item ``i`` is the pair ``(x, y)`` where ``x = data[i : i + block_size]``
    and ``y`` is the same window shifted one position to the right, i.e.
    ``data[i + 1 : i + block_size + 1]``.
    """

    def __init__(self, data: torch.Tensor, block_size: int) -> None:
        """Store the token tensor and the window length."""
        self.data = data
        self.block_size = block_size

    def __len__(self) -> int:
        """Return the number of complete (x, y) windows; never negative."""
        # Clamp at zero: when the data is shorter than block_size the
        # difference is negative, and len() raises ValueError on that.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the input window starting at ``index`` and its shifted target."""
        x = self.data[index:index + self.block_size]
        y = self.data[index + 1:index + self.block_size + 1]

        return x, y

def get_dataloaders(
    train_data: torch.Tensor,
    val_data: torch.Tensor,
    block_size: int,
    batch_size: int,
    device: torch.device
) -> Tuple[DataLoader, DataLoader]:
    """Build the training and validation DataLoaders.

    Both token tensors are moved to ``device`` and wrapped in a
    ``TextDataset`` with windows of ``block_size`` tokens.

    Args:
        train_data: Token ids used for training.
        val_data: Token ids held out for validation.
        block_size: Window length of each sample.
        batch_size: Number of samples per batch.
        device: Device the token tensors are moved to.

    Returns:
        ``(train_loader, val_loader)``. The training loader shuffles its
        samples; the validation loader iterates them in order.
    """
    train_dataset = TextDataset(train_data.to(device), block_size)
    val_dataset = TextDataset(val_data.to(device), block_size)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    # The validation loader must wrap the held-out split, not the training
    # split, otherwise the reported validation loss is really a training loss.
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    return train_loader, val_loader

In [21]:
# Build both loaders from the notebook-level hyperparameters.
train_loader, val_loader = get_dataloaders(
    train_data=train_data,
    val_data=val_data,
    block_size=blockSize,
    batch_size=batchSize,
    device=device
)

# Sanity check: pull one training batch and display the input/target shapes.
x, y = next(iter(train_loader))
x.shape, y.shape

(torch.Size([128, 256]), torch.Size([128, 256]))

In [22]:
@torch.no_grad()
def estimate_loss(
    model: torch.nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    eval_iters: int
) -> Dict[str, float]:
    """Estimate the mean loss on the training and validation loaders.

    The model is switched to evaluation mode for the duration of the call and
    put back into the mode it was in afterwards. Gradients are disabled by the
    decorator.

    Args:
        model: Module called as ``model(x, y)``; it must return a pair whose
            second element is the loss tensor.
        train_loader: Iterable of ``(x, y)`` training batches.
        val_loader: Iterable of ``(x, y)`` validation batches.
        eval_iters: Maximum number of batches to evaluate per loader.

    Returns:
        ``{'train': mean_loss, 'val': mean_loss}``. Each mean covers only the
        batches that were actually evaluated; it is ``nan`` when a loader
        yielded no batch.
    """
    output = {}
    was_training = model.training
    model.eval()

    try:
        for split, loader in (('train', train_loader), ('val', val_loader)):
            batch_losses = []
            for i, (x, y) in enumerate(loader):
                if i >= eval_iters:
                    break
                _, loss = model(x, y)
                batch_losses.append(loss.item())

            # Average over the batches seen, not over eval_iters: a loader
            # shorter than eval_iters must not pull the mean towards zero.
            if batch_losses:
                output[split] = sum(batch_losses) / len(batch_losses)
            else:
                output[split] = float('nan')
    finally:
        # Restore the caller's mode even if evaluation raised.
        model.train(was_training)

    return output

def save_checkpoint(
    model: GPTLanguageModel,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    loss: float,
    filename: str = "checkpoint.pth"
) -> None:
    """Write a training checkpoint to ``filename``.

    The checkpoint is a dict with the keys ``'epoch'``, ``'model_state_dict'``,
    ``'optimizer_state_dict'`` and ``'loss'``.

    Args:
        model: Model whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch number recorded in the checkpoint.
        loss: Loss value recorded in the checkpoint.
        filename: Destination path; missing parent directories are created.
    """
    checkpoint = {
        'epoch' : epoch,
        'model_state_dict' : model.state_dict(),
        'optimizer_state_dict' : optimizer.state_dict(),
        'loss' : loss
    }

    # torch.save does not create missing directories, so do it here.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    torch.save(checkpoint, filename)

# Training

In [27]:
# Training configuration.
# max_iters: number of passes over train_loader.
# eval_interval: evaluate the losses every this many batches.
# eval_iters: upper bound on the batches used per loss estimate.
# save_interval: a checkpoint is written when iteration % save_interval == 0.
max_iters = 2
eval_interval = 10
eval_iters = 200
learning_rate = 1e-4
save_interval = 5

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# Rebuild the loaders so this cell does not depend on loaders left over from
# an earlier cell.
train_loader, val_loader = get_dataloaders(
    train_data=train_data,
    val_data=val_data,
    block_size=blockSize,
    batch_size=batchSize,
    device=device
)

# Loss histories, appended to at every evaluation step and plotted afterwards.
train_losses = []
val_losses = []

In [None]:
# Main training loop: one pass over train_loader per iteration.
for iteration in range(max_iters):
    for batch_idx, (x_batch, y_batch) in enumerate(train_loader):
        # Periodically, and on the last batch of the pass, estimate and log
        # the train/validation losses.
        if batch_idx % eval_interval == 0 or batch_idx == len(train_loader) - 1:
            losses = estimate_loss(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                eval_iters=min(eval_iters, len(val_loader))
            )

            train_losses.append(losses['train'])
            val_losses.append(losses['val'])

            print(
                f"iteration {iteration} / step {batch_idx}: "
                f"train loss {losses['train']:.4f}, "
                f"val loss {losses['val']:.4f}"
            )

        # Optimisation step: must run for every batch, not only on the
        # batches where the losses are evaluated.
        logits, loss = model(x_batch, y_batch)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    # Checkpoint once per pass, after its last batch, instead of rewriting
    # the same file after every batch.
    if iteration % save_interval == 0:
        save_checkpoint(
            model=model,
            optimizer=optimizer,
            epoch=iteration,
            loss=loss.item(),
            filename=f"./output/pre_training/checkpoint_{iteration}.pth"
        )

In [None]:
# Plot the recorded loss histories on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(train_losses, label="Train Loss")
ax.plot(val_losses, label="Validation Loss")
ax.set_xlabel("Eval Step")
# Anchor the y-axis at zero so the curves are read on an absolute scale.
ax.set_ylim(0)
ax.set_ylabel("Loss")
ax.set_title("Training and Eval Loss Over Time")
ax.legend()
ax.grid()
plt.show()
