In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
raw_datasets = load_dataset('glue', 'mrpc')

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
raw_datasets['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [5]:
raw_datasets['train'][:3]

{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
  'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .'],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
  "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale ."],
 'label': [1, 0, 1],
 'idx': [0, 1, 2]}

In [6]:
raw_datasets['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [7]:
from transformers import AutoTokenizer

In [8]:
checkpoint = 'bert-base-cased'

In [9]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [37]:
def tokenize_function(example):
    return tokenizer(
        example['sentence1'],
        example['sentence2'],
        padding=True,
        max_length=128,
    )

In [38]:
tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)
tokenized_dataset.column_names

{'train': ['sentence1',
  'sentence2',
  'label',
  'idx',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'validation': ['sentence1',
  'sentence2',
  'label',
  'idx',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'test': ['sentence1',
  'sentence2',
  'label',
  'idx',
  'input_ids',
  'token_type_ids',
  'attention_mask']}

In [23]:
tokenized_dataset['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [24]:
tokenized_dataset['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [25]:
tokenized_dataset = tokenized_dataset.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')
tokenized_dataset = tokenized_dataset.with_format('torch')
tokenized_dataset['train']

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [26]:
tokenized_dataset['train'][0]

{'labels': tensor(1),
 'input_ids': tensor([  101,  7277,  2180,  5303,  4806,  1117,  1711,   117,  2292,  1119,
          1270,   107,  1103,  7737,   107,   117,  1104,  9938,  4267, 12223,
         21811,  1117,  2554,   119,   102, 11336,  6732,  3384,  1106,  1140,
          1112,  1178,   107,  1103,  7737,   107,   117,  7277,  2180,  5303,
          4806,  1117,  1711,  1104,  9938,  4267, 12223, 21811,  1117,  2554,
           119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [28]:
small_train_dataset = tokenized_dataset['train'].select(range(5))
small_train_dataset[0]

{'labels': tensor(1),
 'input_ids': tensor([  101,  7277,  2180,  5303,  4806,  1117,  1711,   117,  2292,  1119,
          1270,   107,  1103,  7737,   107,   117,  1104,  9938,  4267, 12223,
         21811,  1117,  2554,   119,   102, 11336,  6732,  3384,  1106,  1140,
          1112,  1178,   107,  1103,  7737,   107,   117,  7277,  2180,  5303,
          4806,  1117,  1711,  1104,  9938,  4267, 12223, 21811,  1117,  2554,
           119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### Using dynamic padding


In [30]:
def tokenize_function(example):
    # Remove padding from tokenize function
    return tokenizer(
        example['sentence1'],
        example['sentence2'],
        max_length=128,
    )

In [31]:
tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')
tokenized_dataset = tokenized_dataset.with_format('torch')

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Map: 100%|██████████| 3668/3668 [00:00<00:00, 39269.64 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 6414.49 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 48556.26 examples/s]


In [33]:
# No padding now
tokenized_dataset['train'][0]

{'labels': tensor(1),
 'input_ids': tensor([  101,  7277,  2180,  5303,  4806,  1117,  1711,   117,  2292,  1119,
          1270,   107,  1103,  7737,   107,   117,  1104,  9938,  4267, 12223,
         21811,  1117,  2554,   119,   102, 11336,  6732,  3384,  1106,  1140,
          1112,  1178,   107,  1103,  7737,   107,   117,  7277,  2180,  5303,
          4806,  1117,  1711,  1104,  9938,  4267, 12223, 21811,  1117,  2554,
           119,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1])}

In [34]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

In [109]:
# Padding at this step
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(
    tokenized_dataset['train'],
    batch_size=16,
    collate_fn=data_collator,
    shuffle=True,
)

In [110]:
for step, batch in enumerate(train_dataloader):
    print(batch['input_ids'].shape)
    if step > 5:
        break

torch.Size([16, 90])
torch.Size([16, 84])
torch.Size([16, 83])
torch.Size([16, 73])
torch.Size([16, 79])
torch.Size([16, 80])
torch.Size([16, 88])


In [102]:
data_collator(tokenized_dataset['train'][:16])

{'labels': tensor([1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0]), 'input_ids': tensor([[  101,  7277,  2180,  ...,     0,     0,     0],
        [  101, 10684,  2599,  ...,     0,     0,     0],
        [  101,  1220,  1125,  ...,     0,     0,     0],
        ...,
        [  101,  1124,  1500,  ...,     0,     0,     0],
        [  101,   144,  7490,  ...,     0,     0,     0],
        [  101,   155, 23007,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}