In [1]:
from dataclasses import dataclass
import collections
from tqdm.auto import tqdm

import numpy as np
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import default_data_collator
from transformers import pipeline
from transformers import get_scheduler
import transformers
from datasets import load_dataset
from accelerate import Accelerator
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW


print('transformers', transformers.__version__)
print('pytorch', torch.version.__version__)

2023-01-26 16:16:32.827337: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


transformers 4.26.0
pytorch 1.13.0a0+git49444c3


In [2]:
@dataclass
class HyperParameters:
    chunk_size = 128
    model_name = 'distilbert-base-uncased'
    wwm_probability = 0.2
    mlm_probability = 0.15
    batch_size = 32
    learning_rate = 5e-5
    epochs = 6

params = HyperParameters()

In [3]:
model = AutoModelForMaskedLM.from_pretrained(params.model_name)

In [4]:
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [5]:
tokenizer = AutoTokenizer.from_pretrained(params.model_name)

text = 'This is a great [MASK].'

inputs = tokenizer(text, return_tensors='pt')
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [6]:
imdb_dataset = load_dataset("imdb")
imdb_dataset



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [7]:
sample = imdb_dataset['train'].shuffle().select(range(3))

for row in sample:
    print(f"\n>>> Review: {row['text']}")
    print(f">>> Label: {row['label']}")


>>> Review: I saw this film at a pre-release screening at the Writers Guild theater in Beverly Hills. As I recall, the film's producers and director were in attendance, presumably to gage our reaction.<br /><br />Many scenes evoked gales of laughter from the audience, which would have been fine if it had been a comedy, but it was supposed to be a horror film.<br /><br />If the audience wasn't scared, it seems the filmmakers were. They delayed release for over a year. Out of curiosity I saw it again to see if they'd re-cut it; as far as I can tell, they hadn't. It was the same lousy movie, just a year older.<br /><br />It almost qualifies as "so bad, it's good," but it's slow-paced and boring.
>>> Label: 0

>>> Review: I had a personal interest in this movie. When I was 17 and just out of high school I got a job at 20th Century Fox as a member of the Laborers and Hod Carriers Union. At the end of my first day (sweeping the deck of an aircraft carrier) I was told to bring a suitcase the

In [8]:
def tokenize_function(examples):
    result = tokenizer(examples['text'])
    if tokenizer.is_fast:
        result['word_ids'] = [result.word_ids(i) for i in range(len(result['input_ids']))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=['text', 'label']
)
tokenized_datasets



DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [9]:
tokenizer.model_max_length

512

In [10]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets['train'][:3]

for idx, sample in enumerate(tokenized_samples['input_ids']):
    print(f'>>> Review {idx} length: {len(sample)}')

>>> Review 0 length: 363
>>> Review 1 length: 304
>>> Review 2 length: 133


In [11]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 800'


In [12]:
min([len(sample) for sample in tokenized_datasets['train']])

3

In [13]:
total_length = ((total_length + params.chunk_size -1) // params.chunk_size) * params.chunk_size

chunks = {}
for k, t in concatenated_examples.items():
    t = np.pad(t, (0, total_length - len(t)))
    chunks[k] = [t[i : i + params.chunk_size] for i in range(0, total_length, params.chunk_size)]

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'


In [14]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = ((total_length + params.chunk_size -1) // params.chunk_size) * params.chunk_size
    result = {}
    for k, t in concatenated_examples.items():
        t = np.pad(t, (0, total_length - len(t)))
        result[k] = [t[i : i + params.chunk_size] for i in range(0, total_length, params.chunk_size)]
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result


lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61314
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59929
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 123007
    })
})

In [15]:
tokenizer.decode(lm_datasets['train'][1]['input_ids'])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [16]:
tokenizer.decode(lm_datasets['train'][1]['labels'])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [17]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=params.mlm_probability)

In [18]:
samples = [lm_datasets['train'][i] for i in range(2)]
for sample in samples:
    _ = sample.pop('word_ids')

for chunk in data_collator(samples)['input_ids']:
    print(f'\n>>> {tokenizer.decode(chunk)}')

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



>>> [CLS] i rented [MASK] am [MASK] - yellow from my video store because of all the controversy that surrounded it when it was [MASK] released in 1967. [MASK] also heard that at [MASK] it was seized by u [MASK] s. customs if it ever tried to enter this country [MASK] [MASK] [MASK] a fan of [MASK] considered " controversial " i really had to see this for myself [MASK] [MASK] br / > < br / > the plot is centered around a young swedish drama student [MASK] lena who wants [MASK] learn everything she can [MASK] life. fuels particular she wants [MASK] focus her [MASK]s to making some sort of documentary on what [MASK] average swede thought about certain political issues such

>>> as the vietnam [MASK] and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex [MASK] [MASK] drama teacher, classmates, and [MASK] men. < [MASK] / > < br / > what kills me about i am curious - yellow [MASK] that 40 years [MA

In [19]:
# if we want to mask the entire word instead of just single token, we have to implement this function
def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop('word_ids')

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, params.wwm_probability, (len(mapping),))
        input_ids = feature['input_ids']
        labels = feature['labels']
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature['labels'] = new_labels

    return default_data_collator(features)


samples = [lm_datasets['train'][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch['input_ids']:
    print(f'\n>>> {tokenizer.decode(chunk)}')


'>>> [CLS] i rented [MASK] am curious - [MASK] [MASK] my [MASK] store [MASK] [MASK] all the controversy that [MASK] it when it was first released in 1967. i also [MASK] that [MASK] first it was seized [MASK] u. s [MASK] customs [MASK] it ever [MASK] [MASK] enter this country [MASK] therefore being a fan [MASK] [MASK] considered [MASK] controversial " [MASK] really had to [MASK] this for myself. < br / > < br / > the plot [MASK] centered around a young swedish [MASK] student named lena [MASK] wants to learn everything she can about life [MASK] in [MASK] she [MASK] to [MASK] her [MASK] [MASK] to making some sort of documentary on what the [MASK] swede thought about certain political issues such'

'>>> [MASK] the [MASK] war and [MASK] issues in the united states. in [MASK] [MASK] politicians and ordinary [MASK] [MASK] [MASK] of stockholm about [MASK] opinions on politics, she has sex [MASK] her drama teacher, classmates, and married men. [MASK] br / > < [MASK] / [MASK] what kills me abou

In [20]:
data_size = len(lm_datasets['train'])
train_size = int(data_size * 0.9)
test_size = data_size - train_size

downsampled_dataset = lm_datasets['train'].train_test_split(
    train_size=train_size, test_size=test_size,
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 55182
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 6132
    })
})

In [21]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {'masked_' + k: v.numpy() for k, v in masked_inputs.items()}


downsampled_dataset = downsampled_dataset.remove_columns(['word_ids'])
eval_dataset = downsampled_dataset['test'].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset['test'].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        'masked_input_ids': 'input_ids',
        'masked_attention_mask': 'attention_mask',
        'masked_labels': 'labels',
    }
)

  0%|          | 0/7 [00:00<?, ?ba/s]

In [22]:
train_dataloader = DataLoader(
    downsampled_dataset['train'],
    shuffle=True,
    batch_size=params.batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=params.batch_size, collate_fn=default_data_collator
)

In [23]:
accelerator = Accelerator()
optimizer = AdamW(model.parameters(), lr=params.learning_rate)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [24]:
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = params.epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [27]:
progress_bar = tqdm(range(num_training_steps))
output_dir = f'{params.model_name}-finetuned-imdb-accelerate'

for epoch in range(params.epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(params.batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = torch.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

  0%|          | 0/10350 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 9.057567596435547
>>> Epoch 1: Perplexity: 8.754276275634766
>>> Epoch 2: Perplexity: 8.54840087890625
>>> Epoch 3: Perplexity: 8.4483060836792
>>> Epoch 4: Perplexity: 8.4483060836792
>>> Epoch 5: Perplexity: 8.4483060836792


In [28]:
model.to('cpu')
fill_masker = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [29]:
fill_masker('This is a great [MASK].')

[{'score': 0.43710577487945557,
  'token': 3185,
  'token_str': 'movie',
  'sequence': 'this is a great movie.'},
 {'score': 0.31932228803634644,
  'token': 2143,
  'token_str': 'film',
  'sequence': 'this is a great film.'},
 {'score': 0.017497606575489044,
  'token': 4038,
  'token_str': 'comedy',
  'sequence': 'this is a great comedy.'},
 {'score': 0.017465347424149513,
  'token': 2265,
  'token_str': 'show',
  'sequence': 'this is a great show.'},
 {'score': 0.013669433072209358,
  'token': 2801,
  'token_str': 'idea',
  'sequence': 'this is a great idea.'}]