In [1]:
import datasets
import transformers as trx

In [2]:
# Load the 'yelp/yelp_review_full' dataset; displaying it lists the available
# splits together with their columns and row counts.
ds = datasets.load_dataset('yelp/yelp_review_full')
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [3]:
# The tokenizer is taken from the 'bert-base-uncased' checkpoint, whereas the
# model is constructed from a default BertConfig instead of from_pretrained
# (presumably so the weights are freshly initialised — confirm in the docs).
tokenizer = trx.AutoTokenizer.from_pretrained('bert-base-uncased')
cfg = trx.BertConfig()
model = trx.BertForMaskedLM(cfg)



In [4]:
# Total number of scalar parameters in the default-config model.
n_params = sum(weight.numel() for weight in model.parameters())
n_params

109514298

In [5]:
# The same parameter count, expressed in millions.
n_params / 1_000_000

109.514298

In [9]:
# Set max_position_embeddings to 1024, build a second model from the config,
# and count its parameters for comparison with `n_params`.
# NOTE(review): `cfg` is edited in place, and the earlier `model` was built
# from this same object — confirm whether that model's config is affected.
cfg.max_position_embeddings = 1024
long_model = trx.BertForMaskedLM(cfg)
n_params_long = sum(weight.numel() for weight in long_model.parameters())
n_params_long

109907514

In [11]:
# Extra parameters (in millions) of the 1024-position model over the first one.
# The difference, 393,216, equals 512 * 768 — consistent with 512 additional
# position rows of width hidden_size=768 (presumably the position-embedding table).
(n_params_long - n_params) / 1_000_000

0.393216


 Pack (chunk) the samples such that the length of all the samples in the dataset is 512 (for efficient training). Define a mapping function that implements the following procedure
 1. Take a batch of 1000 samples
 2. Tokenize it to get input IDs and attention mask
 3. Concatenate all the input IDs
 4. Chunk the concatenated IDs into a size of 512
 5. Drop the last chunk if its length is less than 512
 6. Pack all the chunks
 7. Iterate over all the batches in the dataset 
Store the resulting dataset in the variable “ds_chunked”. Enter the total number of samples in the new dataset.
Note: keep the batch size at 1000 when calling "ds.map()" so that the answer matches.


In [12]:
# Reload the dataset with split='all' and rebind `ds` to the result. The
# progress bars further down report 700000 examples, i.e. the 650000 train and
# 50000 test rows shown earlier combined.
ds = datasets.load_dataset('yelp/yelp_review_full', split='all')

In [18]:
# Add an 'n_tokens' column with the length of each review's tokenized input_ids.
# The tokenizer is called without any truncation argument, so it emits the
# "longer than the specified maximum sequence length" warnings recorded below;
# only the lengths are used here.
ds = ds.map(lambda x: {'n_tokens': len(tokenizer(x['text'])['input_ids'])}, num_proc=12)

Map (num_proc=12):   0%|          | 0/700000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (819 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (539 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1019 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1134 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (751 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

In [20]:
# Mean number of tokens per review across the whole dataset.
sum(ds['n_tokens']) / len(ds)

180.70404142857143

In [22]:
from itertools import chain

In [24]:
def chunk(batch, chunk_size=512):
    """Tokenize a batch of texts and pack the tokens into fixed-length chunks.

    All ``input_ids`` (and, in parallel, all ``attention_mask`` values) produced
    by the module-level ``tokenizer`` for ``batch['text']`` are concatenated in
    order and cut into consecutive pieces of exactly ``chunk_size`` items. Any
    trailing remainder shorter than ``chunk_size`` is discarded.

    Args:
        batch: Mapping with a ``'text'`` entry holding a list of strings.
        chunk_size: Length of every returned chunk. Defaults to 512.

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'``, each a list of
        lists of length ``chunk_size``. Both lists are empty when the batch
        yields fewer than ``chunk_size`` tokens (including an empty batch).

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    enc = tokenizer(batch['text'])
    input_ids = list(chain.from_iterable(enc['input_ids']))
    attention_mask = list(chain.from_iterable(enc['attention_mask']))
    assert len(input_ids) == len(attention_mask)

    # Only iterate over the part that divides evenly into full chunks. This
    # drops the short tail without indexing [-1], which would fail on an
    # empty batch.
    usable = len(input_ids) - len(input_ids) % chunk_size
    starts = range(0, usable, chunk_size)
    return {
        'input_ids': [input_ids[s:s + chunk_size] for s in starts],
        'attention_mask': [attention_mask[s:s + chunk_size] for s in starts],
    }

# Apply `chunk` to batches of 1000 reviews and drop every original column, so
# the result contains only the packed 'input_ids' and 'attention_mask'.
# NOTE(review): num_proc=12 presumably splits the dataset into shards before
# batching, so batch boundaries (and hence the number of dropped tail chunks)
# may differ from a single-process run — confirm if the exact row count matters.
ds_chunked = ds.map(chunk, batched=True, batch_size=1000, remove_columns=ds.column_names, num_proc=12)
ds_chunked

Map (num_proc=12):   0%|          | 0/700000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 246703
})

In [25]:
# Sanity check: record each chunk's length and list the distinct values; a
# result of {512} means every chunk has exactly 512 tokens.
# Note that this rebinds `ds_chunked` with an extra 'n_tokens' column, which
# therefore also appears in the train/test split created later.
ds_chunked = ds_chunked.map(lambda x: {'n_tokens': len(x['input_ids'])}, num_proc=12)
set(ds_chunked['n_tokens'])

Map (num_proc=12):   0%|          | 0/246703 [00:00<?, ? examples/s]

{512}

In [27]:
# Average number of original reviews that end up packed into one chunk.
len(ds) / len(ds_chunked)

2.8374198935562194

In [28]:
# This is good! ^

In [29]:
# Hold out 5% of the chunks as a test split; the seed is fixed at 42.
ds_split = ds_chunked.train_test_split(test_size=0.05, seed=42)
ds_split

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'n_tokens'],
        num_rows: 234367
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'n_tokens'],
        num_rows: 12336
    })
})

In [31]:
# Masked-language-modelling collator with mlm_probability=0.2. The sample batch
# inspected later shows roughly 80% of label positions set to -100, which is
# consistent with about 20% of tokens being selected as prediction targets.
collator = trx.DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.2)

In [33]:
from torch.utils.data import DataLoader

In [35]:
# Pull one collated batch of 4 training examples to inspect what the collator
# produces (input_ids containing mask tokens, plus a 'labels' tensor).
loader = DataLoader(ds_split['train'], batch_size=4, collate_fn=collator)
batch = next(iter(loader))
batch

{'input_ids': tensor([[ 1997,  6881,  2073,  ...,  2795,  7020,  2080],
        [ 3504,  3651,  1012,  ..., 22640,  2696,  2001],
        [ 3403,  2051,   103,  ...,   103,  1056,  3710],
        [ 5587, 25861,  5167,  ...,  2000,  2256,   103]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'n_tokens': tensor([512, 512, 512, 512]), 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100, 5236, -100,  ..., -100, -100, -100],
        [-100, -100, 2003,  ..., 1005, -100, -100],
        [-100, 3176, -100,  ..., -100, -100, 2795]])}

In [37]:
# Fraction of the first example's 512 label positions that equal -100.
# With mlm_probability=0.2 this is expected to be close to 0.8.
(batch['labels'][0] == -100).sum() / 512

tensor(0.8047)

In [38]:
# Show the current config; max_position_embeddings reads 1024 here because the
# earlier cell assigned that value on this same `cfg` object.
cfg

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Create a small BERT model by changing the following hyper-parameters and keeping the other hyper-parameters as is

 * num_hidden_layers = 6
 * hidden_size = 384
 * intermediate_size = 1536
 
 and train the model with a batch size of 8 for one epoch. What is the loss value at the end of training?

In [44]:
# Small BERT: 6 layers, hidden size 384, feed-forward size 1536; every other
# hyper-parameter is left at the BertConfig default.
cfg = trx.BertConfig(
    num_hidden_layers=6,
    hidden_size=384,
    intermediate_size=1536,
)
model = trx.BertForMaskedLM(cfg)

training_args = trx.TrainingArguments(
    output_dir="out-bert-2",
    # Schedule and batch sizes: one epoch, 8 examples per device per step.
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    # Optimiser settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    # All reduced-precision flags are switched off.
    bf16=False,
    fp16=False,
    tf32=False,
    # Evaluate every 1000 steps; log and checkpoint every 5000 steps, which is
    # why the results table shows "No log" for training loss before step 5000.
    eval_strategy="steps",
    eval_steps=1000,
    logging_strategy="steps",
    logging_steps=5000,
    save_steps=5000,
    save_total_limit=10,
)
trainer = trx.Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=ds_split['train'],
    eval_dataset=ds_split['test'],
)
# `yuri` keeps the object returned by train(); its metrics are read in a later cell.
yuri = trainer.train()

Step,Training Loss,Validation Loss
1000,No log,6.328219
2000,No log,6.172441
3000,No log,6.103881
4000,No log,6.062629
5000,6.350300,6.02449
6000,6.350300,5.993413
7000,6.350300,5.979392
8000,6.350300,5.957824
9000,6.350300,5.947351
10000,5.995200,5.936842


In [50]:
# Summary metrics of the finished run; 'train_loss' is the value the exercise asks for.
yuri.metrics

{'train_runtime': 7942.4873,
 'train_samples_per_second': 29.508,
 'train_steps_per_second': 3.689,
 'total_flos': 7794944963463168.0,
 'train_loss': 5.986819539676918,
 'epoch': 1.0}

In [41]:
# Number of size-8 batches in the full chunked dataset (246703 rows).
# NOTE(review): the training split itself has 234367 rows, so this is not the
# per-epoch step count of the run above — confirm which figure was intended.
246703 / 8

30837.875