# Imports

In [1]:
import lance
import numpy as np

from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader, Sampler

from transformers import AutoTokenizer, AutoModelForCausalLM, CONFIG_MAPPING

import warnings
warnings.simplefilter('ignore')

# Model, Hyperparameter and Tokenizer

We'll be using the `wikitext_100K.lance` dataset that we created in an earlier example to train our GPT2 model from scratch.

Change the `block_size` and `batch_size` based on your hardware and use case.

In [2]:
# Define necessary parameters
model_name = 'gpt2'
lr = 3e-4
nb_epochs = 2
block_size = 1024  # number of consecutive tokens in each training window
batch_size = 8
# Use the first GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
dataset_path = 'wikitext_100K.lance' # the dataset we created in a previous example


# We'll be training a GPT2 model from scratch in this example for a couple of epochs
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Instantiate the model from its default config rather than from a pretrained checkpoint
# NOTE(review): `trust_remote_code=True` is presumably unnecessary for a built-in
# architecture such as gpt2 -- confirm before removing
config = CONFIG_MAPPING[model_name]()
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True).to(device)

# Custom Dataset and Sampler

The dataset that we created in our previous example is essentially a long, contiguous table of tokens.

We need to retrieve a causal window of `block_size` tokens **and** make sure that window doesn't overlap with any other window of tokens.

The most elegant way to achieve this is to make a custom sampler that only samples random indices that are `block_size` or more apart. This way our windows of tokens will actually behave as individual samples.

In [3]:
def from_indices(dataset, indices):
    """Fetch the rows at `indices` from `dataset` and return their `input_ids` values as a list"""
    rows = dataset.take(indices).to_pylist()
    # Keep only the token-id field of every fetched row, in the order returned
    return [row['input_ids'] for row in rows]

In [4]:
class LanceDataset(Dataset):
    """Map-style dataset that serves fixed-length windows of tokens from a Lance dataset.

    Item `idx` is the window made of the `block_size` consecutive rows that start
    at row `idx`.

    Args:
        dataset_path: path passed to `lance.dataset` to open the saved dataset
        block_size (int): number of consecutive rows (tokens) in every window

    Raises:
        ValueError: if `block_size` is not positive, or if the dataset holds
            fewer rows than a single window needs
    """

    def __init__(
        self,
        dataset_path,
        block_size,
    ):
        if block_size <= 0:
            raise ValueError(f"block_size must be positive, got {block_size}")

        # Load the lance dataset from the saved path
        self.ds = lance.dataset(dataset_path)
        self.block_size = block_size

        total_rows = self.ds.count_rows()
        if total_rows < block_size:
            # Without this check the length would be negative and only fail later,
            # with an opaque error, when `len()` is first called on the dataset
            raise ValueError(
                f"dataset has {total_rows} rows, fewer than block_size={block_size}"
            )

        # A window starting at `idx` covers rows idx .. idx + block_size - 1, so the
        # last valid start is total_rows - block_size; that makes
        # total_rows - block_size + 1 valid starts and keeps every window in bounds
        self.length = total_rows - block_size + 1

    def __len__(self):
        """Number of valid window start positions."""
        return self.length

    def __getitem__(self, idx):
        """
        Generate a window of indices starting from the current idx to idx+block_size
        and return the tokens at those indices

        Returns a dict with two tensors holding the same token ids:
        "input_ids" and "labels".
        """
        window = np.arange(idx, idx + self.block_size)
        sample = from_indices(self.ds, window)

        input_ids = torch.tensor(sample)
        # labels are an independent copy of the inputs
        return {"input_ids": input_ids, "labels": input_ids.clone()}

In [5]:
class LanceSampler(Sampler):
    r"""Samples tokens randomly but `block_size` indices apart to emulate unique samples

    The candidate start indices are 0, `block_size`, 2 * `block_size`, ... below
    `len(data_source)`. Every call to `__iter__` (i.e. every pass over the
    dataloader) yields them in a freshly shuffled order, so successive epochs do
    not replay the same sequence of batches.

    Args:
        data_source (Dataset): dataset to sample from
        block_size (int): minimum index distance between each random sample
    """

    def __init__(self, data_source, block_size=1024):
        self.data_source = data_source
        self.num_samples = len(self.data_source)
        # Start positions of the non-overlapping windows, kept in ascending order
        self.available_indices = list(range(0, self.num_samples, block_size))

    def __iter__(self):
        # Shuffle a copy so each epoch gets its own order and the stored list
        # is not mutated while another iterator may still be reading it
        order = list(self.available_indices)
        np.random.shuffle(order)
        yield from order

    def __len__(self) -> int:
        return len(self.available_indices)

# Train!

After this, the model training is pretty standard. One has to load the batch from the dataloader, transfer all its elements (input_ids and labels) to GPU, pass them through the model, run backward pass and optimize the model.

In [6]:
# Define the dataset, sampler and dataloader
dataset = LanceDataset(dataset_path, block_size)
sampler = LanceSampler(dataset, block_size)
dataloader = DataLoader(
    dataset,
    shuffle=False,  # ordering is delegated to the custom sampler
    batch_size=batch_size,
    sampler=sampler,
    pin_memory=True
)

# Define the optimizer, training loop and train the model!
model = model.to(device)
# Training mode: keeps train-time layers such as dropout active while optimizing
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

for epoch in range(nb_epochs):
    print(f"========= Epoch: {epoch+1} / {nb_epochs} =========")
    epoch_loss = []
    prog_bar = tqdm(dataloader, total=len(dataloader))
    for batch in prog_bar:
        optimizer.zero_grad(set_to_none=True)

        # Put both input_ids and labels to the device
        for k, v in batch.items():
            batch[k] = v.to(device)

        # Perform one forward pass and get the loss
        outputs = model(**batch)
        loss = outputs.loss

        # Perform backward pass
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the progress bar and the epoch stats
        loss_value = loss.item()
        prog_bar.set_description(f"loss: {loss_value:.4f}")

        epoch_loss.append(loss_value)

    # Calculate training perplexity for this epoch
    try:
        perplexity = np.exp(np.mean(epoch_loss))
    except (OverflowError, FloatingPointError):
        # exp() overflowed, so the perplexity is unboundedly large: report +inf
        # (FloatingPointError is what NumPy raises when configured to raise on overflow)
        perplexity = float("inf")

    print(f"train_perplexity: {perplexity}")



loss: 5.4288: 100%|██████████| 1222/1222 [11:57<00:00,  1.70it/s]


train_perplexity: 428.88519977329736


loss: 4.6657: 100%|██████████| 1222/1222 [11:56<00:00,  1.70it/s]

train_perplexity: 139.86871656813784



