In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

# Training a causal language model from scratch

In previous sections, we have used pretrained models and fine-tuned them for new use cases by reusing the weights from pretraining. This is commonly referred to as *transfer learning*.

In this section, we will train a completely new model from scratch and build a scaled-down version of a code generation model: we focus on one-line completions instead of full functions or classes, using a subset of Python code.

## Gathering the data

We only need the subset of the dataset concerned with the Python data science stack. So we start by filtering the `codeparrot` dataset for all files that include any of the libraries in this stack.

In [2]:
def any_keyword_in_string(string, keywords):
    """Return True if at least one entry of ``keywords`` is a substring of ``string``."""
    return any(keyword in string for keyword in keywords)

In [3]:
# Sanity check: the first snippet mentions none of the keywords, the second mentions pandas.
filters = ['pandas', 'sklearn', 'matplotlib', 'seaborn']
example_1 = 'import numpy as np'
example_2 = 'import pandas as pd'

print(*(any_keyword_in_string(example, filters) for example in (example_1, example_2)))

False True


We can use this to create a function that will stream the dataset and filter the elements:

In [4]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset

def filter_streaming_dataset(dataset, filters):
    """Iterate over ``dataset`` and keep the samples whose code mentions a keyword.

    Args:
        dataset: Iterable of dict-like samples; each sample must have a
            ``'content'`` entry holding the text to search.
        filters: Keywords looked up (as substrings) in ``sample['content']``.

    Returns:
        A ``Dataset`` built from the retained samples. Every key of a retained
        sample becomes a column.
    """
    filtered_dict = defaultdict(list)
    total = 0

    for sample in tqdm(iter(dataset)):
        total += 1

        if any_keyword_in_string(sample['content'], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)

    # An empty stream leaves total at 0; report 0% instead of dividing by zero.
    kept = len(filtered_dict['content'])
    ratio = kept / total if total else 0.0
    print(f"{ratio:.2%} of data after filtering.")
    return Dataset.from_dict(filtered_dict)

We can apply this function to the streaming dataset:

In [None]:
from datasets import load_dataset

# Which split of the raw dataset to filter
split = 'train' # or 'valid'
# Keywords a file must contain (at least one of them) to be kept
filters = ['pandas', 'sklearn', 'matplotlib', 'seaborn']

# streaming=True is passed so the samples are iterated over by the filter function below
data = load_dataset(f"transformerbook/codeparrot-{split}", split=split, streaming=True)
filtered_data = filter_streaming_dataset(data, filters)

Filtering the full dataset may take several hours. We can also download the filtered dataset from the Hub:

In [None]:
from datasets import load_dataset, DatasetDict

# Already-filtered train / validation splits, downloaded from the Hub
ds_train = load_dataset('huggingface-course/codeparrot-ds-train', split='train')
ds_valid = load_dataset('huggingface-course/codeparrot-ds-valid', split='validation')

# To work on a small random sample, uncomment the trailing `.shuffle().select(...)` parts.
raw_datasets = DatasetDict(
    {
        'train': ds_train, # .shuffle().select(range(50000)),
        'valid': ds_valid, # .shuffle().select(range(500)),
    }
)

In [6]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 606720
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

Pretraining the language model will take a while. It is recommended to first run the training loop on a sample of the data by uncommenting the two partial lines above, and to make sure that the training completes successfully and the models are stored.

Check an example:

In [7]:
# Fetch the first training row once, then preview the first 200 characters of every field
first_sample = raw_datasets['train'][0]
for key, value in first_sample.items():
    print(f"{key.upper()}: {value[:200]}")

REPO_NAME: kmike/scikit-learn
PATH: sklearn/utils/__init__.py
COPIES: 3
SIZE: 10094
CONTENT: """
The :mod:`sklearn.utils` module includes various utilites.
"""

from collections import Sequence

import numpy as np
from scipy.sparse import issparse

from .murmurhash import murm
LICENSE: bsd-3-clause


We can see that the `content` field contains the code that we want our model to train on.

## Preparing the dataset

The first step is to tokenize the data for training. Since our goal is to mainly autocomplete short function calls,  we can keep the context size relatively small. This has the benefit that we can train the model much faster and it requires significantly less memory.

If it is important for our application to have more context, make sure to increase this number, but keep in mind that this comes with a greater GPU memory footprint.

Here, let's fix the context size at 128 tokens. Most documents contain more than 128 tokens, so simply truncating the inputs to the maximum length would eliminate a large fraction of our dataset. Instead, we will use the `return_overflowing_tokens` option to tokenize the whole input and split it into several chunks. We will also use the `return_length` option to return the length of each created chunk automatically. Often the last chunk will be smaller than the context size, and we will get rid of these pieces to avoid padding issues.

In [None]:
from transformers import AutoTokenizer

# Maximum number of tokens per training sequence
context_length = 128
# The tokenizer is downloaded from the Hub rather than trained in this notebook
tokenizer = AutoTokenizer.from_pretrained('huggingface-course/code-search-net-tokenizer')

In [9]:
# Check the first two examples: each file is split into chunks of at most `context_length` tokens
outputs = tokenizer(
    raw_datasets['train'][:2]['content'],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

# Number of chunks, the length of every chunk, and the input sample each chunk came from
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 34
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


With the `overflow_to_sample_mapping` field, we can reconstruct which chunks belonged to which input samples. With this operation we use a handy feature of the `Dataset.map()` function, which is that it does not require one-to-one maps; we can create batches with more or fewer elements than the input batch.

In our case, when tokenizing each element into chunks of the specified context size, we create many samples from each document. We need to make sure to delete the existing columns, since they have a conflicting size. If we want to keep them, we could repeat them appropriately and return them within the `Dataset.map()` call:

In [None]:
def tokenize(element):
    """Tokenize a batch of files into chunks of exactly ``context_length`` tokens.

    Args:
        element: Batch from ``Dataset.map(batched=True)``; its ``'content'``
            entry holds the source texts.

    Returns:
        dict with a single ``'input_ids'`` key listing every full-length chunk.
        Chunks shorter than ``context_length`` are discarded.
    """
    encoded = tokenizer(
        element['content'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    # Only full-length chunks survive, so no padding is ever needed downstream.
    full_chunks = [
        chunk_ids
        for n_tokens, chunk_ids in zip(encoded['length'], encoded['input_ids'])
        if n_tokens == context_length
    ]
    return {'input_ids': full_chunks}

# `tokenize` returns a different number of rows than it receives, so the original
# columns (whose lengths would no longer match) are removed.
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=raw_datasets['train'].column_names,
)

In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 16702061
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 93164
    })
})

## Initializing a new model

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the stock 'gpt2' configuration and override the vocabulary size, the
# context size and the special-token ids to match our tokenizer.
# NOTE(review): depending on the installed transformers version, GPT-2 may size its
# position embeddings from `n_positions` rather than `n_ctx` — confirm which one applies.
config = AutoConfig.from_pretrained(
    'gpt2',
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

With this configuration, we can load a new model.

In [13]:
# Build the model from the configuration alone, i.e. without loading pretrained weights
model = GPT2LMHeadModel(config) # not .from_pretrained
# Total number of elements over all parameter tensors
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.2f}M parameters")

GPT-2 size: 124.24M parameters


Before we can start training, we need to set up a data collator that will take care of creating the batches. We can use the `DataCollatorForLanguageModeling` collator, which is designed specifically for language modeling.

`DataCollatorForLanguageModeling` supports both masked language modeling (MLM) and causal language modeling (CLM). By default it prepares data for MLM, but we can switch to CLM  by setting the argument `mlm=False`:

In [14]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# mlm=False switches the collator from masked to causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [15]:
# Collate the first five training samples into one batch and inspect the tensor shapes
out = data_collator([tokenized_datasets['train'][idx] for idx in range(5)])

for key, tensor in out.items():
    print(f"{key} shape: {tensor.shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


Now the examples have been stacked and all the tensors have the same shape.

Shifting the inputs and labels to align them happens inside the model, so the data collator just copies the inputs to create the labels.

All that's left to do is to configure the training arguments and fire up the `Trainer`.

We will use a cosine learning rate schedule with some warmup and an effective batch size of 256 (`per_device_train_batch_size` * `gradient_accumulation_steps`). Gradient accumulation is used when a single batch does not fit into memory; it incrementally builds up the gradient through several forward/backward passes.

In [16]:
from transformers import Trainer, TrainingArguments

# Training configuration. Per device, 32 samples per batch * 8 accumulation steps
# gives 256 samples per optimizer update.
args = TrainingArguments(
    output_dir='codeparrot-ds',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy='steps',
    # evaluate, log and save on the same 5000-step cadence
    eval_steps=5000,
    logging_steps=5000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    # cosine learning-rate schedule with 1000 warmup steps
    warmup_steps=1000,
    lr_scheduler_type='cosine',
    learning_rate=5e-4,
    save_steps=5000,
    fp16=True,
    push_to_hub=False,
)

# Wire together the model, tokenizer, collator and the tokenized train / validation splits
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
)



In [None]:
# ready to run
trainer.train()

## Code generation with a pipeline

In [None]:
import torch
from transformers import pipeline

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Text-generation pipeline backed by a model downloaded from the Hub
pipe = pipeline(
    'text-generation',
    model='huggingface-course/codeparrot-ds',
    device=device,
)

In [18]:
# Prompt ending in a comment that asks for a scatter plot; the model should continue from there
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""

# Print the single returned sequence (prompt followed by the generated continuation)
print(pipe(txt, num_return_sequences=1)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
plt.scatter(x, y, color='r', marker


In [19]:
# Same data as before, but the final comment now asks for a DataFrame
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
df = pd.DataFrame(x, columns=["c0", "


Since the number of generated tokens is limited, the completion is cut off in the middle of the statement.

In [None]:
# Prompt asking for an aggregation (mean income per profession) on a DataFrame
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
# Prompt asking for a scikit-learn random forest fit.
# NOTE(review): unlike the other prompts, the opening quotes are not followed by a
# backslash, so this prompt starts with a newline — confirm this is intended.
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

## Training with HuggingFace Accelerate

Since we are mainly interested in sensible autocompletion for the data science libraries, it makes sense to give more weight to training samples that make more use of these libraries.

We can easily identify these examples through the use of keywords such as `plt`, `pd`, `sk`, `fit`, and `predict`. If these are each represented as a single token, we can easily check if they occur in the input sequence. Tokens can have a whitespace prefix, so we will also check for those versions in the tokenizer vocabulary.

To verify that it works, we will add one test token which should be split into multiple tokens:

In [20]:
# Keywords with and without a leading space, plus one control word ('testtest')
# that is expected to be split into several tokens.
candidate_keywords = ['plt', 'pd', 'sk', 'fit', 'predict', ' plt', ' pd', ' sk', ' fit', ' predict', 'testtest']

keytoken_ids = []
for keyword in candidate_keywords:
    ids = tokenizer([keyword]).input_ids[0]

    # Only keywords that map to exactly one token can be matched against input ids.
    if len(ids) != 1:
        print(f"Keyword has not single token: {keyword}")
        continue

    keytoken_ids.append(ids[0])

Keyword has not single token: testtest


Now we can write a custom loss function that takes the input sequence, the logits, and the key tokens we just selected as inputs.
1. Align the logits and inputs: the input sequence shifted by one to the right forms the labels, since the next token is the label for the current token.
2. Cut off the last logits, as we don't have a label for the token that follows the full input sequence. With that we can compute the loss per sample and count the occurrences of all keywords in each sample.
3. Calculate the weighted average over all samples using the occurrences as weights. Since we don't want to throw away all the samples that have no keywords, we add 1 to the weights:

In [21]:
from torch.nn import CrossEntropyLoss
import torch

def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    """Causal LM loss in which samples containing key tokens weigh more.

    Args:
        inputs: Token ids, indexed as (batch, sequence).
        logits: Model outputs, indexed as (batch, sequence, vocabulary).
        keytoken_ids: Iterable of token ids whose occurrences increase a
            sample's weight. May be empty, in which case every sample has the
            same weight ``alpha``.
        alpha: Global scale applied to every weight.

    Returns:
        Scalar tensor: the mean over samples of
        ``alpha * (1 + number of key tokens in the sample) * per-sample loss``.
    """
    # shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()

    # calculate per-token loss
    loss_fct = CrossEntropyLoss(reduction='none')
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1))

    # resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(dim=1)

    # Count key-token occurrences per sample. Accumulating keyword by keyword also
    # works for an empty keyword list (torch.stack would raise on it) and avoids
    # building a (keywords, batch, sequence) tensor.
    weights = torch.zeros(inputs.size(0), dtype=torch.float, device=inputs.device)
    for kt in keytoken_ids:
        weights += (inputs == kt).sum(dim=-1)
    # the +1 keeps samples without any key token in the loss
    weights = alpha * (1.0 + weights)

    # calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()

    return weighted_loss

Before we can start training with the new loss function, we need to prepare a few things:
* dataloaders to load the data in batches,
* weight decay parameters,
* evaluation code in a function.

In [23]:
from torch.utils.data import DataLoader

# Switch the datasets' output format to 'torch' before handing them to the dataloaders
tokenized_datasets.set_format('torch')
# set up dataloaders
# training batches are drawn in shuffled order
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    batch_size=32,
    shuffle=True,
)
# no shuffle argument is passed for the validation data
eval_dataloader = DataLoader(
    tokenized_datasets['valid'],
    batch_size=32,
)

Next, we group the parameters so that the optimizer knows which ones will get an additional weight decay. Usually, all bias and LayerNorm weight terms are exempt from this:

In [28]:
weight_decay = 0.1

def get_grouped_params(model, no_decay=('bias', 'LayerNorm.weight')):
    """Split the model parameters into a weight-decayed and a non-decayed group.

    A parameter is exempt from weight decay when its name contains one of the
    ``no_decay`` patterns, or when it belongs directly to a ``torch.nn.LayerNorm``
    module. The type check is needed because the name pattern alone misses norm
    layers that are not called ``LayerNorm`` (GPT-2 in transformers names them
    ``ln_1``, ``ln_2`` and ``ln_f``).

    Args:
        model: A ``torch.nn.Module``.
        no_decay: Name fragments identifying parameters to exempt.

    Returns:
        Two optimizer parameter groups: the first uses the notebook-level
        ``weight_decay``, the second uses 0.0.
    """
    from torch import nn

    # ids of the parameters owned directly by a LayerNorm module
    norm_param_ids = {
        id(param)
        for module in model.modules()
        if isinstance(module, nn.LayerNorm)
        for param in module.parameters(recurse=False)
    }

    params_with_wd, params_without_wd = [], []

    for n, p in model.named_parameters():
        if id(p) in norm_param_ids or any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)

    return [
        {'params': params_with_wd, 'weight_decay': weight_decay},
        {'params': params_without_wd, 'weight_decay': 0.0},
    ]

Since we want to evaluate the model regularly on the validation set during training, let's write a function for that as well.

In [29]:
def evaluate():
    """Compute the mean loss and the perplexity over ``eval_dataloader``.

    Uses the notebook-level ``model``, ``eval_dataloader`` and ``accelerator``.
    The model is left in eval mode; callers switch back with ``model.train()``.

    Returns:
        tuple[float, float]: ``(loss, perplexity)`` with ``perplexity = exp(loss)``
        (``inf`` if the exponential overflows).
    """
    model.eval()
    losses = []

    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(batch['input_ids'], labels=batch['input_ids'])

        # outputs.loss is a 0-dim scalar, which torch.cat cannot concatenate.
        # Repeating it once per sample makes it 1-D and also weights a smaller
        # final batch correctly in the overall mean.
        batch_size = batch['input_ids'].size(0)
        losses.append(accelerator.gather(outputs.loss.repeat(batch_size)))

    loss = torch.mean(torch.cat(losses))

    # torch.exp saturates to inf instead of raising, so no exception handling is needed
    perplexity = torch.exp(loss)

    return loss.item(), perplexity.item()

With the `evaluate()` function, we can report loss and perplexity at regular intervals.

Next, we redefine our model to make sure we train from scratch:

In [26]:
model = GPT2LMHeadModel(config)

Define our optimizer, using the function from before to split the parameters for weight decay:

In [30]:
from torch.optim import AdamW

optimizer = AdamW(get_grouped_params(model), lr=5e-4)

Prepare the model, optimizer, and dataloaders with Accelerate:

In [32]:
from accelerate import Accelerator

accelerator = Accelerator()

# Hand the training objects to Accelerate and keep working with the returned versions;
# the length of the returned train dataloader is used below to size the schedule.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader,
)

After we have sent `train_dataloader` to `accelerator.prepare()`, we can use its length to compute the number of training steps. **We should always do this after preparing the dataloader, as that method will change its length.**

In [33]:
from transformers import get_scheduler

num_train_epochs = 1
# Must match the `gradient_accumulation_steps` used by the training loop.
gradient_accumulation_steps = 8

# Number of batches per epoch and in total (the training loop uses the total for
# its progress bar).
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# The scheduler is stepped once per optimizer update, i.e. once every
# `gradient_accumulation_steps` batches. Sizing the schedule by the batch count
# would stretch it and the learning rate would never finish decaying.
num_optimizer_steps = num_train_epochs * (
    num_update_steps_per_epoch // gradient_accumulation_steps
)

lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=num_optimizer_steps,
)

In [34]:
model_name = 'codeparrot-ds-accelerate'
output_dir = 'codeparrot-ds-accelerate'

Before training, make sure that the evaluation function works properly:

In [None]:
evaluate()

The training loop:

In [None]:
from tqdm.notebook import tqdm

# One optimizer update every `gradient_accumulation_steps` batches;
# one evaluation + checkpoint every `eval_steps` optimizer updates.
gradient_accumulation_steps = 8
eval_steps = 5000

model.train()
completed_steps = 0  # number of optimizer updates performed so far
samples_seen = 0     # number of training samples processed across all processes

for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=len(train_dataloader)
    ):
        logits = model(batch['input_ids']).logits

        loss = keytoken_weighted_loss(batch['input_ids'], logits, keytoken_ids)
        samples_seen += batch['input_ids'].size(0) * accelerator.num_processes

        if step % 100 == 0:
            accelerator.print(
                {
                    'sample': samples_seen,
                    'steps': completed_steps,
                    # the loss has not been divided by the accumulation factor
                    # yet, so it is reported as is
                    'loss/train': loss.item(),
                }
            )

        # scale so that the accumulated gradient is the average over the micro-batches
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)

        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1

        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print(
                {'loss/eval': eval_loss, 'perplexity': perplexity}
            )

            # evaluate() leaves the model in eval mode
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)