In [1]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [2]:
# Report the model size in millions of parameters, followed by the
# hard-coded BERT figure for comparison.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(
    "'>>> DistilBERT number of parameters: {}M'".format(
        round(distilbert_num_parameters)
    )
)
print("'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [3]:
text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
inputs = tokenizer(text, return_tensors="pt")

In [10]:
import torch

# Inference only: run the forward pass without autograd tracking so no
# computation graph is kept alive alongside the logits.
with torch.no_grad():
    token_logits = model(**inputs).logits
# torch.where returns one index tensor per dimension; [1] keeps the
# second-dimension indices of the positions holding the mask token id.
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

In [11]:
mask_token_logits = token_logits[0, mask_token_index, :]

In [12]:
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

In [13]:
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [23]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
imdb_dataset

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data: 100%|█| 21.0M/21.0M [00:00<00:00, 23.1MB/s]
Downloading data: 100%|█| 20.5M/20.5M [00:00<00:00, 22.0MB/s]
Downloading data: 100%|█| 42.0M/42.0M [00:01<00:00, 30.0MB/s]


Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [26]:
# Pick three reviews from the "unsupervised" split; the shuffle uses a fixed
# seed so the same three rows are selected each time.
sample = imdb_dataset["unsupervised"].shuffle(seed=43).select(range(3))

# Print the text and label of each selected review.
# NOTE(review): the output shows label -1 for this split — presumably a
# placeholder for "no label"; confirm against the dataset card.
for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: Shep Ramsey is forced to land his ship in the suburbs, hilarity ensues. But yet this film is so much more. Brilliant execution and top-notch acting from Hogan and Christopher Lloyd are examples of what propel this film to the top of cinematic history. There is a sadness in Hogan's Ramsey; a feeling of alienation. This is perfectly exemplified in the scene where Ramsey must play an arcade game. He truly believe he is saving this virtual world, and he plays with such veracious intensity that he ends up forcing the machine to explode, causing us the audience to look on in empathy for this lost soul. A truly heart wrenching experience, and a masterpiece I shall never forget, this one haunts dreams folks.'
'>>> Label: -1'

'>>> Review: I don't know why but I actually liked this show. It was entertaining, funny, sexy and overall fun to watch. John-Ryphus was great in it, I haven't seen enough of him and it was a refreshing thing (and appropriate) to see him in this. Usually sho

In [27]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Uses the notebook-level ``tokenizer``. When that tokenizer reports
    ``is_fast``, a ``"word_ids"`` entry is added that holds, for every
    example in the batch, the value returned by ``word_ids(i)``.

    Args:
        examples: Batch mapping that provides a ``"text"`` entry.

    Returns:
        The tokenizer output, with ``"word_ids"`` added for fast tokenizers.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded
    batch_len = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(batch_len)]
    return encoded


# Use batched=True to activate fast multithreading!
# The raw "text" and "label" columns are removed, so each split keeps only
# the columns produced by tokenize_function.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [28]:
tokenizer.model_max_length

512

In [29]:
chunk_size = 128

In [30]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 363'
'>>> Review 1 length: 304'
'>>> Review 2 length: 133'


In [31]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 800'


In [32]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}


In [34]:
for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 32'


In [35]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (the batched form handed over by ``map(..., batched=True)``).
            Must contain an ``"input_ids"`` column.
        block_size: Length of every output chunk. When omitted, the
            notebook-level ``chunk_size`` is used, which keeps existing
            single-argument calls working unchanged.

    Returns:
        A dict with the same columns as ``examples`` plus ``"labels"``. Every
        column is a list of chunks of exactly ``block_size`` items; trailing
        items that do not fill a whole chunk are dropped. ``"labels"`` holds
        independent copies of the ``"input_ids"`` chunks.

    Raises:
        ValueError: If the chunk size is not positive.
    """
    from itertools import chain

    if block_size is None:
        block_size = chunk_size
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(lists, []) re-copies the growing accumulator at every step.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than block_size
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column. Each chunk is copied so that later in-place
    # edits of an input_ids chunk cannot silently change its label chunk.
    result["labels"] = [chunk.copy() for chunk in result["input_ids"]]
    return result

In [36]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [37]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [38]:
samples = [lm_datasets["train"][i] for i in range(2)]
# Remove the "word_ids" entry from each sample before collating.
# NOTE(review): presumably this collator cannot batch that extra column — confirm.
for sample in samples:
    _ = sample.pop("word_ids")

# Decode every collated sequence so the inserted mask tokens are visible
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented i am [MASK] - yellow from [MASK] video store because of all the controversy [MASK] surrounded it when it was first released in 1967. i also heard that at first it was seized by u. [MASK]. customs if it [MASK] [MASK] to enter this country, therefore being a fan of films considered [MASK] controversial " i really had to see this for myself. < br / > < br / [MASK] the plot is centered around [MASK] [MASK] swedish [unused763] student named lena [MASK] wants to learn everything she [MASK] [MASK] life. in particular she wants to focus [MASK] attention [MASK] to [MASK] some sort [MASK] documentary on what the average swede thought about [MASK] [MASK] issues such'

'>>> as the vietnam warrdy race issues in the united [MASK]. in between asking politicians and ordinary den [MASK]ns of [MASK] [MASK] their opinions on politics, she has sex with her drama teacher, [MASK], and married men [MASK] < [MASK] / [MASK] < br / > what kills me about i am curious - yellow is that 40 year

In [47]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate features after masking randomly selected whole words.

    Tokens are grouped into words using each feature's ``"word_ids"``. Every
    word is selected independently with probability ``wwm_probability``
    (drawn from NumPy's global random state); all tokens of a selected word
    are replaced by ``tokenizer.mask_token_id``. Labels keep the original
    token id at masked positions and are ``-100`` everywhere else.

    The incoming feature dicts and their ``"input_ids"`` lists are not
    modified: masking is applied to copies, so the same samples can be
    collated more than once.

    Args:
        features: Iterable of dicts, each providing ``"word_ids"``,
            ``"input_ids"`` and ``"labels"`` sequences of equal length.

    Returns:
        The result of ``default_data_collator`` applied to the masked copies
        (which no longer carry ``"word_ids"``).
    """
    masked_features = []
    for feature in features:
        # Shallow-copy so that popping/overwriting keys leaves the caller's dict intact
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices.
        # Positions whose word id is None are never added, so they are never masked.
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy the token ids so masking does not write into the caller's list
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return default_data_collator(masked_features)

In [48]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented i [MASK] curious - yellow from my video store because of all the controversy [MASK] surrounded it when it [MASK] [MASK] released in 1967 [MASK] i [MASK] heard [MASK] at first [MASK] was seized by u. [MASK]. customs if it ever tried to [MASK] this country, therefore [MASK] [MASK] fan of films considered " controversial " i really had to see this for [MASK]. < br / > < [MASK] / > [MASK] [MASK] [MASK] centered around a young swedish drama student [MASK] lena who wants to learn [MASK] she can about life. in [MASK] she wants to focus her attentions to making some sort of documentary on what the [MASK] swede thought about certain political issues such'

'>>> as the vietnam war and [MASK] issues in the [MASK] states. in between asking politicians [MASK] ordinary denizens of [MASK] about their opinions on [MASK], she has sex [MASK] her [MASK] teacher, classmates, [MASK] [MASK] [MASK]. [MASK] br / [MASK] < br / > what kills me about i am curious - yellow is [MASK] 40 [MASK]

In [49]:
# Downsample to 10,000 training chunks plus an evaluation set 10% of that size
train_size = 10_000
test_size = int(0.1 * train_size)

# Both subsets are drawn from the "train" split; the seed fixes the split
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [50]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [55]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
# (number of full batches in the training set; assumes a single device and
# no gradient accumulation — TODO confirm for other setups)
logging_steps = len(downsampled_dataset["train"]) // batch_size
# Keep only the part after the last "/" in case the checkpoint id is namespaced
model_name = model_checkpoint.split("/")[-1]

# Hub upload and mixed precision are left disabled (commented out) below
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #push_to_hub=True,
    #fp16=True,
    logging_steps=logging_steps,
)

In [56]:
from transformers import Trainer

# Train and evaluate on the downsampled splits. Note that `data_collator` is
# the DataCollatorForLanguageModeling instance created earlier, not
# whole_word_masking_data_collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [57]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 21.94


In [58]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.6767,2.508285
2,2.5718,2.449048
3,2.5362,2.445142


TrainOutput(global_step=471, training_loss=2.595475628117847, metrics={'train_runtime': 13703.0804, 'train_samples_per_second': 2.189, 'train_steps_per_second': 0.034, 'total_flos': 994208670720000.0, 'train_loss': 2.595475628117847, 'epoch': 3.0})

In [59]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 11.78


In [60]:
def insert_random_mask(batch):
    """Run ``data_collator`` over a column-wise batch and return its output as masked columns.

    The batch is first turned into one dict per row, collated with the
    notebook-level ``data_collator``, and every resulting entry is converted
    with ``.numpy()`` and stored under its name prefixed with ``"masked_"``.

    Args:
        batch: Mapping from column name to a list of per-row values.

    Returns:
        Dict mapping ``"masked_<column>"`` to the collated values as arrays.
    """
    column_names = list(batch)
    rows = [
        {name: value for name, value in zip(column_names, row)}
        for row in zip(*batch.values())
    ]
    collated = data_collator(rows)
    # Create a new "masked" column for each column returned by the collator
    masked_columns = {}
    for name, values in collated.items():
        masked_columns["masked_" + name] = values.numpy()
    return masked_columns

In [61]:
# Drop "word_ids" only while it is still present. `downsampled_dataset` is
# reassigned in place, so without the guard a second run of this cell would
# fail because the column has already been removed.
if "word_ids" in downsampled_dataset["test"].column_names:
    downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])

# Apply the random masking once and store it, replacing the original columns
# of the test split with the collator's "masked_*" outputs.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Rename the masked columns back to the names the model expects as inputs
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [62]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
# Training batches are shuffled and collated with `data_collator`
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# `eval_dataset` already holds the masked columns produced by
# insert_random_mask, so evaluation batches use the plain default collator
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [63]:
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [64]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [66]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [67]:
from transformers import get_scheduler

num_train_epochs = 3
# One optimizer update per batch, so steps per epoch equals the number of
# batches in the (already accelerator-prepared) training dataloader
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup, spanning the whole training run
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [68]:
# NOTE(review): this custom Accelerate training loop is disabled — every line
# below is commented out. Before re-enabling it, `output_dir` and `repo` must
# be defined; neither is assigned in any cell visible in this notebook.
#from tqdm.auto import tqdm
#import torch
#import math
#
#progress_bar = tqdm(range(num_training_steps))
#
#for epoch in range(num_train_epochs):
#    # Training
#    model.train()
#    for batch in train_dataloader:
#        outputs = model(**batch)
#        loss = outputs.loss
#        accelerator.backward(loss)
#
#        optimizer.step()
#        lr_scheduler.step()
#        optimizer.zero_grad()
#        progress_bar.update(1)
#
#    # Evaluation
#    model.eval()
#    losses = []
#    for step, batch in enumerate(eval_dataloader):
#        with torch.no_grad():
#            outputs = model(**batch)
#
#        loss = outputs.loss
#        losses.append(accelerator.gather(loss.repeat(batch_size)))
#
#    losses = torch.cat(losses)
#    losses = losses[: len(eval_dataset)]
#    try:
#        perplexity = math.exp(torch.mean(losses))
#    except OverflowError:
#        perplexity = float("inf")
#
#    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")
#
#    # Save and upload
#    accelerator.wait_for_everyone()
#    unwrapped_model = accelerator.unwrap_model(model)
#    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
#    if accelerator.is_main_process:
#        tokenizer.save_pretrained(output_dir)
#        repo.push_to_hub(
#            commit_message=f"Training in progress epoch {epoch}", blocking=False
#        )