If you're opening this notebook on Colab, you will need to clone the repo and change into its directory. Uncomment the cell below and run it.


In [None]:
# !git clone https://github.com/jbergq/transformer.git


In [None]:
from pathlib import Path

# Enter the cloned repository unless the current working directory is already
# named "transformer" (so re-running this cell does not try to cd a second time).
if Path.cwd().name != "transformer":
  %cd transformer

In [None]:
# Install portalocker, then the packages listed in requirements.txt
# (the path is resolved relative to the current working directory).
%pip install portalocker
%pip install -r requirements.txt

In [None]:
from easydict import EasyDict


# Define base config. Partly adopted from nanoGPT by Andrej Karpathy
cfg = EasyDict(
    {
        "val_size": 1000,  # Size of validation set.
        "max_iters": 600000,  # Total num training iterations.
        "eval_iters": 100,  # Number of evaluation iterations.
        "eval_interval": 1000,  # Evaluate every this many training iterations.
        "effective_batch_size": 512,  # Target batch size after gradient accumulation.
        "batch_size": 4,  # Batch size of a single forward/backward pass.
        "grad_accum_steps": 1,  # Overwritten below when effective_batch_size is set.
        "lr": 1e-3,  # Base (peak) learning rate.
        "warmup_iters": 2000,  # Iterations of linear learning rate warmup.
        "lr_decay_iters": 600000,  # Should be ~= max_iters per Chinchilla.
        "min_lr": 6e-5,  # Minimum learning rate, should be ~= learning_rate/10 per Chinchilla.
        "weight_decay": 0.0005,  # Weight decay passed to the optimizer.
        "print_example": True,  # Print an example sequence and model output before training.
    }
)

# Define model configs.
models = {
    "toy-model": {
        "hidden_size": 128,
        "ff_hidden_size": 256,
        "num_blocks": 4,
        "num_heads": 4,
        "context_size": 64,
    },
    "gpt2-small": {
        "hidden_size": 768,
        "ff_hidden_size": 3072,
        "num_blocks": 12,
        "num_heads": 12,
        "context_size": 1024,
    },
}

# Edit below to select a model.
cfg.update(models["gpt2-small"])

# Derive accumulation steps to get target effective batch size.
if cfg.effective_batch_size is not None:
    # Floor division gives 0 whenever effective_batch_size < batch_size, and a
    # zero step count would make the later `% cfg.grad_accum_steps` check raise
    # ZeroDivisionError. Always accumulate over at least one step.
    cfg.grad_accum_steps = max(1, cfg.effective_batch_size // cfg.batch_size)

cfg


In [None]:
import torch
import torch.nn as nn

# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(1337)

# Use the GPU when PyTorch reports one as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

device


In [None]:
import wandb

# Log in to Weights & Biases, then start a run in the "transformer" project
# with cfg passed as the run's config.
wandb.login()
wandb.init(project="transformer", config=cfg)


Let's set up our dataset. We will use Hugging Face's `datasets` package to prepare and load the OpenWebText dataset.


In [None]:
from datasets import load_dataset

# Load the "openwebtext" dataset in streaming mode. No need to download!
dataset = load_dataset("openwebtext", streaming=True)["train"]
# Shuffle with a fixed seed and a buffer of 10000 examples.
shuffled_dataset = dataset.shuffle(seed=42, buffer_size=10000)

# Split dataset: the first cfg.val_size shuffled examples form the validation
# set, and the training set skips past exactly those examples.
train_set = shuffled_dataset.skip(cfg.val_size)
val_set = shuffled_dataset.take(cfg.val_size)


To tokenize our dataset, we will use the GPT-2 tokenizer, available from Hugging Face's `transformers` package.


In [None]:
from transformers import GPT2Tokenizer

# Tokenizer used by GPT-2, loaded from the pretrained "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


In [None]:
from functools import partial


def tokenize(example):
    """Turn the texts in ``example["text"]`` into source/target token chunks.

    The text is tokenized with the module-level ``tokenizer`` and split into
    chunks of at most ``cfg.context_size + 1`` tokens. Chunks shorter than
    that are dropped. For every kept chunk the source is all tokens but the
    last and the target is all tokens but the first.

    Returns:
        A dict with the keys ``"source"`` and ``"target"``, each a list of
        token id sequences of length ``cfg.context_size``.
    """
    chunk_len = cfg.context_size + 1

    encoded = tokenizer(
        example["text"],
        truncation=True,  # Cut token sequences at max_length...
        max_length=chunk_len,
        return_overflowing_tokens=True,  # ...and return the remainder as further chunks.
        return_length=True,  # Needed below to filter out short chunks.
    )

    # Only full-length chunks can be turned into a source/target pair of
    # exactly context_size tokens each.
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == chunk_len
    ]

    return {
        "source": [ids[:-1] for ids in full_chunks],
        "target": [ids[1:] for ids in full_chunks],  # Source shifted by one.
    }


# Tokenize train and val sets.
# The maps run in batched mode and drop the datasets' original columns, so the
# results carry the "source"/"target" fields returned by tokenize.
train_tokenized = train_set.map(
    tokenize, remove_columns=train_set.column_names, batched=True
)
val_tokenized = val_set.map(
    tokenize, remove_columns=val_set.column_names, batched=True
)


The training uses an "infinite loop" style, where we continue to sample random batches until we reach convergence or the maximum number of batches configured.

Let's define a dataset wrapper that will allow us to continue sampling the dataset endlessly.


In [None]:
from typing import Iterator

from torch.utils.data import IterableDataset


class InfiniteIterableDataset(IterableDataset):
    """Iterate over a dataset endlessly by restarting it whenever it runs out.

    Args:
        hf_dataset: The dataset to wrap. It is iterated with a fresh ``for``
            loop on every pass, so it must support being iterated repeatedly.
        shuffle: Accepted for call-site compatibility; currently has no effect.
    """

    def __init__(self, hf_dataset, shuffle=False):
        super().__init__()
        self.hf_dataset = hf_dataset

    def __iter__(self) -> Iterator:
        """Yield items from the wrapped dataset, pass after pass.

        The iterator only ends if a complete pass over the wrapped dataset
        produces no items at all.
        """
        while True:
            yielded_any = False
            for item in self.hf_dataset:
                yielded_any = True
                yield item
            if not yielded_any:
                # Restarting an empty dataset can never produce an item; without
                # this guard the loop would spin forever and hang the consumer.
                return


In [None]:
from torch.utils.data import DataLoader

def _collate_batch(samples):
    """Build one tensor per field from the token lists of the given samples."""
    return {
        field: torch.tensor([sample[field] for sample in samples])
        for field in ("source", "target")
    }


# Create data loaders for sampling batches.
# Both loaders wrap their tokenized split in an endlessly repeating dataset and
# share the same collate function.
train_loader = DataLoader(
    InfiniteIterableDataset(train_tokenized),
    batch_size=cfg.batch_size,
    collate_fn=_collate_batch,
)
val_loader = DataLoader(
    InfiniteIterableDataset(val_tokenized),
    batch_size=cfg.batch_size,
    collate_fn=_collate_batch,
)


Let's load one train batch and one validation batch to make sure everything works.


In [None]:
# Pull a single batch from a fresh iterator over the train loader.
batch_train = next(iter(train_loader))

# Show the first ten source and target token ids of the first sample;
# the target should be the source shifted by one position.
print(batch_train["source"][0][:10], batch_train["target"][0][:10], sep="\n")


In [None]:
# Pull a single batch from a fresh iterator over the validation loader.
batch_val = next(iter(val_loader))

# Show the first ten source and target token ids of the first sample;
# the target should be the source shifted by one position.
print(batch_val["source"][0][:10], batch_val["target"][0][:10], sep="\n")


In [None]:
def step(model, criterion, iterator):
    """Compute the loss of ``model`` on the next batch drawn from ``iterator``.

    Args:
        model: Callable applied to the batch's ``"source"`` tensor.
        criterion: Loss callable taking the flattened model output and the
            flattened ``"target"`` tensor.
        iterator: Iterator yielding dicts with ``"source"`` and ``"target"``
            tensors; exactly one batch is consumed.

    Returns:
        Whatever ``criterion`` returns for this batch.
    """
    batch = next(iterator)
    inputs = batch["source"].to(device)
    targets = batch["target"].to(device)

    logits = model(inputs)

    # Collapse every leading dimension so each position becomes one row of
    # size logits.shape[-1], matched by one entry of the flattened targets.
    flat_logits = logits.reshape(-1, logits.shape[-1])
    flat_targets = targets.reshape(-1)

    return criterion(flat_logits, flat_targets)


In [None]:
# Loss estimation function inspired by nanoGPT repo by Andrej Karpathy.
@torch.no_grad()
def estimate_loss(model, criterion, train_iter, val_iter, eval_iters):
    """Estimate the mean loss on the train and validation iterators.

    Runs with gradients disabled and with the model in eval mode. Afterwards
    the model is put back into whichever mode (train or eval) it was in when
    this function was called, also if evaluation raises.

    Args:
        model: Model to evaluate.
        criterion: Loss function forwarded to ``step``.
        train_iter: Iterator yielding training batches.
        val_iter: Iterator yielding validation batches.
        eval_iters: Number of batches drawn from each iterator.

    Returns:
        Dict with the keys ``"train"`` and ``"val"``, each holding the mean of
        the ``eval_iters`` losses as a zero-dimensional tensor.
    """
    iterators = {"train": train_iter, "val": val_iter}
    out = {}
    # Remember the caller's mode instead of unconditionally forcing train mode
    # at the end.
    was_training = model.training
    model.eval()
    try:
        for split, iterator in iterators.items():
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                loss = step(model, criterion, iterator)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)

    return out


In [None]:
import math

# Learning rate decay scheduler inspired by nanoGPT repo by Andrej Karpathy.
def get_lr(iter, warmup_iters, base_lr, min_lr, lr_decay_iters):
    """Return the learning rate for iteration ``iter``.

    The schedule has three phases:

    1. Linear warmup from 0 to ``base_lr`` while ``iter < warmup_iters``.
    2. Cosine decay from ``base_lr`` down to ``min_lr`` between
       ``warmup_iters`` and ``lr_decay_iters``.
    3. Constant ``min_lr`` once ``iter >= lr_decay_iters``.

    Args:
        iter: Current iteration number (the name shadows the builtin but is
            kept so existing keyword callers continue to work).
        warmup_iters: Number of warmup iterations.
        base_lr: Learning rate reached at the end of warmup.
        min_lr: Learning rate at and after the end of the decay.
        lr_decay_iters: Iteration at which the decay reaches ``min_lr``.

    Returns:
        The learning rate as a float.
    """
    # 1) linear warmup for warmup_iters steps
    if iter < warmup_iters:
        return base_lr * iter / warmup_iters
    # 2) if iter >= lr_decay_iters, return min learning rate. The cosine term is
    # exactly zero at iter == lr_decay_iters, so including the boundary here
    # returns the same value while avoiding a division by zero when
    # lr_decay_iters == warmup_iters.
    if iter >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (iter - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
    return min_lr + coeff * (base_lr - min_lr)


In [None]:
from torch.optim import Adam

from src.model.transformer import TransformerDecoder
from src.utils import train_start_print, iter_print, evaluation_print

# Build the model from the selected config and move it to the chosen device.
model = TransformerDecoder(
    tokenizer.vocab_size,
    cfg.context_size,
    cfg.hidden_size,
    cfg.ff_hidden_size,
    cfg.num_blocks,
    cfg.num_heads,
)
model = model.to(device)
optimizer = Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay, eps=5e-9)
# No ignore_index: the tokenized sequences are never padded (only full-length
# chunks are kept), and index 1 is an ordinary entry of the GPT-2 vocabulary,
# so ignoring it would silently drop real targets from the loss.
criterion = nn.CrossEntropyLoss()

# Fixed prompt used for the generated samples printed during training.
fixed_inp = torch.tensor(
    tokenizer.encode("The"), dtype=torch.long, device=device
).unsqueeze(0)

if cfg.print_example:
    batch = next(iter(train_loader))
    out = model.generate(fixed_inp)

    print("Example sequence: ", tokenizer.decode(batch["target"][0].numpy())[:200])
    print("Model output: ", tokenizer.decode(out[0].detach().cpu().numpy())[:200])

iter_num = 0
model.train()

# Create data iterators. The validation iterator must come from val_loader;
# deriving it from train_iter would report training loss as validation loss.
train_iter = iter(train_loader)
val_iter = iter(val_loader)

# Start training. Runs until cfg.max_iters iterations have been performed.
train_start_print(model)
while iter_num < cfg.max_iters:
    # Get learning rate according to schedule.
    lr = get_lr(iter_num, cfg.warmup_iters, cfg.lr, cfg.min_lr, cfg.lr_decay_iters)
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

    # Train model on one batch.
    train_loss = step(model, criterion, train_iter)
    # Scale only the value used for backward, so the accumulated gradient is the
    # mean (not the sum) over the accumulation steps while train_loss stays
    # unscaled for printing.
    (train_loss / cfg.grad_accum_steps).backward()

    # Accumulate gradients for N steps and update weights.
    if (iter_num + 1) % cfg.grad_accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

    # Periodically estimate train/val loss, print a generated sample and log.
    if iter_num > 0 and iter_num % cfg.eval_interval == 0:
        losses = estimate_loss(model, criterion, train_iter, val_iter, cfg.eval_iters)
        evaluation_print(losses)

        # Generate sample and print.
        out = model.generate(fixed_inp)
        print("Model output: ", tokenizer.decode(out[0].detach().cpu().numpy())[:200])

        wandb.log(
            {
                "iter": iter_num,
                "train/loss": losses["train"],
                "val/loss": losses["val"],
                "lr": lr,
            }
        )

    iter_print(iter_num, train_loss)
    iter_num += 1
