# Training the model with Composer

Going to see how far I can get with Composer. I might have to set it aside for fine-tuning and quantization, but it should work nicely for the base pretraining.

In [1]:
# Auto-reload imported modules before each cell runs, so edits to local code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from composer.utils import reproducibility

# Fix one seed for the whole notebook and hand it to Composer's reproducibility helper.
# The same `seed` value is reused later when the dataset is shuffled.
seed = 42
reproducibility.seed_all(seed)


# Learning about batch collation

I imagined that we'd be creating a dataset where each raw text sample yields many training samples. Looks like this doesn't happen in the dataset itself; I wonder whether it happens in the dataloader or in the model.

In [22]:
from datasets import Dataset

# Tokenize two toy strings with a tiny max_length so truncation and overflow are easy to inspect.
outputs = tokenizer(
    ["Hello, world!", "12345 My name is John Kim and I like food"],
    max_length=3,
    truncation=True,
    # without this, we only return the first sequence of max_length from each sample
    return_overflowing_tokens=True,
    return_length=True,
)

# Wrap the tokenizer output in a Dataset so the chunks can be indexed row by row.
test_dataset = Dataset.from_dict(outputs)

In [24]:
# Peek at the first row: one chunk of at most max_length tokens plus its bookkeeping columns.
test_dataset[0]


{'input_ids': [2832, 292, 11],
 'token_type_ids': [0, 0, 0],
 'attention_mask': [1, 1, 1],
 'length': 3,
 'overflow_to_sample_mapping': 0}

In [33]:
# Only `from transformers import ...` appears elsewhere in the notebook, so the module
# name itself has to be bound here for this cell to run on a fresh kernel.
import transformers

# Register a pad token on the tokenizer; the collator below pads with it.
tokenizer.add_special_tokens({"pad_token": "<pad>"})

# This collator pads every sequence to the longest one in the batch and stacks them into a tensor.
# With mlm=False it also returns `labels`: a copy of input_ids with the padded positions set to -100.
# NOTE: padding here triggers a warning with fast tokenizers - it seems to prefer padding at
#       tokenization time instead (TODO confirm).
# NOTE: the labels come back unshifted, so any next-token shift has to happen downstream
#       (inside the model or in the loss).
collate_fn = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [38]:
# Raw, unpadded token ids for every chunk; note that the rows do not all have the same length.
test_dataset["input_ids"]


[[2832, 292, 11],
 [1461, 0],
 [167, 16, 17],
 [18, 19, 20],
 [1773, 709, 293],
 [2313, 395, 262],
 [260, 605, 866],
 [1687]]

In [37]:
# Push the whole toy dataset through the collator as a single batch, then show both tensors.
demo_rows = [test_dataset[row_idx] for row_idx in range(len(test_dataset))]
collated = collate_fn(demo_rows)
print("Input IDs\n", collated["input_ids"])
print("Labels\n", collated["labels"])


Input IDs
 tensor([[2832,  292,   11],
        [1461,    0, 8192],
        [ 167,   16,   17],
        [  18,   19,   20],
        [1773,  709,  293],
        [2313,  395,  262],
        [ 260,  605,  866],
        [1687, 8192, 8192]])
Labels
 tensor([[2832,  292,   11],
        [1461,    0, -100],
        [ 167,   16,   17],
        [  18,   19,   20],
        [1773,  709,  293],
        [2313,  395,  262],
        [ 260,  605,  866],
        [1687, -100, -100]])


In [42]:
# Shift every label one step to the left. roll() wraps around, so each row's first token
# reappears in the last column and would have to be masked before computing a loss.
collated["labels"].roll(shifts=-1, dims=1)


tensor([[ 292,   11, 2832],
        [   0, -100, 1461],
        [  16,   17,  167],
        [  19,   20,   18],
        [ 709,  293, 1773],
        [ 395,  262, 2313],
        [ 605,  866,  260],
        [-100, -100, 1687]])

# Learning about Cross Entropy Loss

PyTorch's `CrossEntropyLoss` takes a tensor of logits with shape (batch_size x vocab_size) and a vector of targets with shape (batch_size), where each element is the index of the correct word.

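Previously, I thought that it took in a tensor of (batch_size x vocab_size) and a tensor of (batch_size x vocab_size), where each element is the probability of the word. That is not the form used here: the targets are class indices. (Recent PyTorch versions also accept per-class probabilities as targets, but language-model training uses the index form.)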

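During training, Hugging Face causal language models expect the batch `input_ids` and the `labels` to be the same, even though the targets should really be shifted by one position. This works because the model shifts the labels internally when it computes the loss, so the data collator does not have to. A custom model that computes its own loss has to do that shift itself.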

# Composer model definition

Adapting my model to work with Composer.

In [3]:
from typing import Any
from composer.models.huggingface import HuggingFaceModel
from composer.metrics.nlp import LanguageCrossEntropy
import torch
from torchmetrics import Metric
from einops import rearrange
from transformers import PreTrainedTokenizerFast, DataCollatorForLanguageModeling
from transformers.modeling_outputs import CausalLMOutputWithPast
from model import WSConfig, WSModel


class ComposerWSModel(HuggingFaceModel):
    """Composer wrapper that trains a ``WSModel`` with a next-token cross-entropy loss.

    Args:
        config: Configuration used to build the wrapped ``WSModel``.
        tokenizer: Tokenizer forwarded to the ``HuggingFaceModel`` base class.
    """

    def __init__(
        self,
        config: WSConfig,
        tokenizer: PreTrainedTokenizerFast,
    ):
        wrapped_model = WSModel(config)

        # Metric tracked by Composer. It presumably consumes logits of shape
        # batch_size x seq_len x vocab_size together with the labels - TODO confirm.
        metric_list: list[Metric] = [LanguageCrossEntropy()]

        super().__init__(
            model=wrapped_model,
            tokenizer=tokenizer,
            metrics=metric_list,
            use_logits=True,
            shift_labels=True,
        )

        # A fused cross entropy from flash-attn would have been nicer, but it fails to install
        # with rye. The cost should be small relative to everything else, so plain PyTorch it is.
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, batch: dict[str, Any]):
        """Composer's forward pass.

        ``batch`` is a mapping whose keys may mirror the inputs of a Hugging Face
        ``forward`` function (there is no real standard set; the GPT2 implementation
        is a useful reference). Only ``batch["input_ids"]`` is read here.

        Returns:
            A ``CausalLMOutputWithPast`` whose ``logits`` field holds the wrapped model's
            output; the result needs to be a Hugging Face output dataclass.
        """
        logits = self.model(batch["input_ids"])
        return CausalLMOutputWithPast(logits=logits)

    def loss(self, outputs: CausalLMOutputWithPast, batch: dict[str, Any]):
        """Composer's loss function: cross entropy between logits and next-token labels.

        Args:
            outputs: Result of ``forward``; ``outputs.logits`` is batch x seq_len x vocab_size.
            batch: The training batch; ``batch["labels"]`` is batch x seq_len.

        Returns:
            The value produced by ``self.loss_fn`` on the flattened logits and labels.
        """
        # Collapse batch and sequence axes: (batch * seq_len) x vocab_size.
        # Passing vocab_size makes rearrange check the last axis against the config.
        flat_logits = rearrange(
            outputs.logits,
            "batch seq_len vocab_size -> (batch seq_len) vocab_size",
            vocab_size=self.config.vocab_size,
        )

        # Position t must predict token t + 1, so move the labels one step to the left.
        # torch.roll returns a new tensor, which keeps batch["labels"] untouched.
        next_tokens = torch.roll(batch["labels"], shifts=-1, dims=1)
        # roll wraps the first token around to the end; the last position has nothing to
        # predict, so mark it with -100 (the default ignore_index of CrossEntropyLoss).
        next_tokens[:, -1] = -100
        flat_labels = rearrange(next_tokens, "batch seq_len -> (batch seq_len)")

        return self.loss_fn(flat_logits, flat_labels)

Either FairScale or torch distributed is not available, MixtureOfExperts will not be exposed. Please install them if you would like to use MoE


# Testing Model Training

Let's do a tiny dataset with this model.

In [9]:
import datasets
import os

# Read the Hugging Face token from the environment so it never lives in the notebook.
HF_TOKEN = os.getenv("HF_TOKEN")
# Local directory used for the dataset files and the Hugging Face cache.
CACHE_DIR = "/datadrive/hf_cache"

# Load the tokenizer saved in the local "tokenizer" directory.
tokenizer = PreTrainedTokenizerFast.from_pretrained("tokenizer")

# Deliberately tiny model for a quick end-to-end training test.
config = WSConfig(
    d_model=64,
    n_heads=4,
    n_layers=2,
    # len(tokenizer) counts tokens added after training (such as an added "<pad>"), whereas
    # tokenizer.vocab_size only covers the base vocabulary. Sizing the model from the base
    # vocabulary would leave an added token's id outside the embedding table.
    vocab_size=len(tokenizer),
)

In [10]:
# Build dataloader
import torch.utils.data

# Load the WikiHow training split, reading from and caching into CACHE_DIR.
wikihow_raw = datasets.load_dataset(
    "wikihow",
    name="all",
    split="train",
    data_dir=CACHE_DIR,
    cache_dir=CACHE_DIR,
    use_auth_token=HF_TOKEN,
    # streaming=True,
)
# Shuffle once with the notebook-wide seed so the sample order is repeatable.
wikihow_data: datasets.Dataset = wikihow_raw.shuffle(seed=seed)  # type: ignore

# Name of the column that holds the raw text to tokenize.
text_column_name = "text"


def tokenize_function(examples: dict[str, Any]):
    """Tokenize one batch of examples, dropping empty and whitespace-only texts first.

    The filtered texts are written back into ``examples[text_column_name]`` before
    tokenizing, so the text column stays aligned with the rows that are returned.

    Args:
        examples: A batch mapping column names to lists of values.

    Returns:
        The tokenizer output for the remaining texts, truncated and padded to 256
        tokens, including the special-tokens mask.
    """

    def _has_content(line):
        # Keep a line only if it is non-empty and not made of whitespace alone.
        return len(line) > 0 and not line.isspace()

    examples[text_column_name] = list(filter(_has_content, examples[text_column_name]))

    encoded = tokenizer(
        examples[text_column_name],
        max_length=256,
        truncation=True,
        padding="max_length",
        return_special_tokens_mask=True,
    )
    return encoded


# Tokenize the dataset batch by batch, always recomputing instead of reusing a cached result.
tokenized_train = wikihow_data.map(
    tokenize_function,
    batched=True,
    load_from_cache_file=False,
    # collate_fn doesn't like other columns
    remove_columns=wikihow_data.column_names,
)

# Causal-LM collator (mlm=False) built on the training tokenizer.
collate_fn = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Batches of 64 tokenized samples, assembled by the collator above.
train_dataloader = torch.utils.data.DataLoader(
    dataset=tokenized_train,
    collate_fn=collate_fn,
    batch_size=64,
)

Found cached dataset wikihow (/datadrive/hf_cache/wikihow/all-data_dir=%2Fdatadrive%2Fhf_cache/1.2.0/5343fc81d685acaa086c9cc19eb8706206cd1f8b315792b04c1d7b92091c305e)
Loading cached shuffled indices for dataset at /datadrive/hf_cache/wikihow/all-data_dir=%2Fdatadrive%2Fhf_cache/1.2.0/5343fc81d685acaa086c9cc19eb8706206cd1f8b315792b04c1d7b92091c305e/cache-ca61b0a7a4447ccd.arrow


Map:   0%|          | 0/157252 [00:00<?, ? examples/s]

In [11]:
from composer.optim import DecoupledAdamW, LinearWithWarmupScheduler

# Wrap the tiny model for Composer.
composer_model = ComposerWSModel(tokenizer=tokenizer, config=config)

# Optimizer over the wrapped model's parameters.
optimizer = DecoupledAdamW(
    composer_model.model.parameters(),
    lr=1e-4,
    weight_decay=1e-5,
    betas=(0.9, 0.98),
    eps=1e-6,
)

# Learning-rate schedule with a 250-batch warmup ("250ba") and alpha_f=0.02.
lr_scheduler = LinearWithWarmupScheduler(alpha_f=0.02, t_warmup="250ba")


In [13]:
import torch
from composer import Trainer

# Create Trainer Object
# Train on the GPU when one is visible, otherwise fall back to the CPU.
train_device = "gpu" if torch.cuda.is_available() else "cpu"

trainer = Trainer(
    # This is the model from the HuggingFaceModel wrapper class.
    model=composer_model,
    train_dataloader=train_dataloader,
    # eval_dataloader=eval_dataloader,
    optimizers=optimizer,
    schedulers=[lr_scheduler],
    # train for more epochs to get better performance
    max_duration="1ep",
    # uncomment this line to only run part of training, which will be faster
    # train_subset_num_batches=100,
    device=train_device,
    precision="fp32",
    progress_bar=True,
    # checkpointing
    save_folder="checkpoints/pretraining/",
    save_filename="ep{epoch}-ba{batch}-rank{rank}.pt",
    save_interval="500ba",
)

# Kick off the training run.
trainer.fit()

******************************
Config:
node_name: unknown because NODENAME environment variable not set
num_gpus_per_node: 1
num_nodes: 1
rank_zero_seed: 300766860

******************************


train          Epoch   0:    0%|| 0/2458 [00:00<?, ?ba/s]         

KeyboardInterrupt: 

In [None]:
# Save the wrapped model (not the Composer wrapper around it) into the local "model/" directory.
composer_model.model.save_pretrained("model/")
