#### Data Processing

In [9]:
# One fine-tuning step on a two-sentence batch, starting from the pretrained checkpoint
import torch
from pprint import pprint
# `transformers.AdamW` is deprecated (and absent from recent releases);
# `torch.optim.AdamW` is the supported optimizer and keeps the `AdamW` name
# bound for the rest of the notebook.
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The classification head is newly initialized (see the warning in the output below).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
# Pad/truncate so both sentences fit in one batch; return PyTorch tensors.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Add labels
batch['labels'] = torch.tensor([1,1])

# NOTE(review): `from_pretrained` is documented to return the model in eval mode;
# call `model.train()` before a real training loop so dropout is active.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss  # passing `labels` makes the model return a loss
loss.backward()
optimizer.step()
optimizer.zero_grad()  # clear gradients so they do not accumulate into a later step
# Training on 2 examples is not exactly revolutionary

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Loading Data From HuggingFace Hub

In [3]:
# Load the "mrpc" configuration of the "glue" dataset from the Hub.
from datasets import load_dataset

raw_data = load_dataset("glue", "mrpc")
# Displaying the object lists its splits with their columns and row counts.
raw_data

Downloading readme: 100%|██████████| 35.3k/35.3k [00:00<00:00, 151kB/s] 
Downloading data: 100%|██████████| 649k/649k [00:03<00:00, 177kB/s]
Downloading data: 100%|██████████| 75.7k/75.7k [00:00<00:00, 86.6kB/s]
Downloading data: 100%|██████████| 308k/308k [00:01<00:00, 188kB/s]
Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 483506.93 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 288142.12 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 792722.08 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
# Keep a handle on the training split and look at its first example.
raw_train = raw_data["train"]
raw_train[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [5]:
# Column types; `label` is a ClassLabel whose names give the meaning of 0 and 1.
raw_train.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [13]:
# Fetch rows 15 and 87 of the training split in one call (returned column-wise).
pprint(raw_train[15, 87])
# Their labels are 0 (not_equivalent) and 1 (equivalent), respectively.

{'idx': [16, 100],
 'label': [0, 1],
 'sentence1': ['Rudder was most recently senior vice president for the '
               'Developer & Platform Evangelism Business .',
               'Tuition at four-year private colleges averaged $ 19,710 this '
               'year , up 6 percent from 2002 .'],
 'sentence2': ['Senior Vice President Eric Rudder , formerly head of the '
               'Developer and Platform Evangelism unit , will lead the new '
               'entity .',
               'For the current academic year , tuition at public colleges '
               'averaged $ 4,694 , up almost $ 600 from the year before .']}


#### Processing Sentence Pairs

In [14]:
# Tokenizing each column on its own is not what we want: it yields two
# independent encodings rather than one encoding per (sentence1, sentence2) pair.
tokenized_sentences_1 = tokenizer(raw_train['sentence1'])
tokenized_sentences_2 = tokenizer(raw_train['sentence2'])

In [16]:
# Need sentence pairs, so pass both sentences to a single tokenizer call.
# The result carries `token_type_ids` marking which segment each token belongs to.
inputs = tokenizer("This is the first sentence.", "This is the second one.")
pprint(inputs)


{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101,
               2023,
               2003,
               1996,
               2034,
               6251,
               1012,
               102,
               2023,
               2003,
               1996,
               2117,
               2028,
               1012,
               102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]}


In [28]:
# Encode training row 15 as a sentence pair, looking the row up only once.
row_15 = raw_train[15]
tokenizer(row_15['sentence1'], row_15['sentence2'], padding=True)

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [34]:
# Map the ids back to tokens to see where [CLS] and [SEP] were inserted,
# then compare against the segment ids of the same encoding.
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
inputs['token_type_ids']

['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']


[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

In [36]:
# Eager approach: tokenize the whole training split in a single call.
# This only works if you have enough memory to hold the full result at once.
train_split = raw_data["train"]
tokenized_data = tokenizer(
    train_split["sentence1"],
    train_split["sentence2"],
    truncation=True,
    padding=True,
)

In [37]:
# Better to make a function that can generate tokens on the fly
def tokenize_function(example):
    """Tokenize the `sentence1` / `sentence2` fields of `example` as sentence pairs.

    Uses the notebook-level `tokenizer`. Only truncation is requested here;
    no padding argument is passed.
    """
    first = example['sentence1']
    second = example['sentence2']
    return tokenizer(first, second, truncation=True)

# Apply the function to every split; the result keeps the original columns and
# adds the tokenizer outputs as new ones (see the features listed in the output).
# NOTE(review): with batched=True the function presumably receives a batch of
# examples per call, so each field is a list — confirm against the datasets docs.
tokenized_data = raw_data.map(tokenize_function, batched=True)
tokenized_data

Map: 100%|██████████| 3668/3668 [00:00<00:00, 22705.98 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 18758.23 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 21903.00 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

#### Dynamic Padding
- You want to pad per batch: it is more efficient in both space and time. This is done with a data collator.
- `DataCollatorWithPadding`

In [38]:
from transformers import DataCollatorWithPadding

# Build a collator around the same tokenizer that was used for tokenization.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [40]:
# Take the first 8 training rows and drop the `idx` and raw-sentence columns,
# then list the token count of each remaining example.

first_eight = tokenized_data["train"][:8]
samples = {
    column: values
    for column, values in first_eight.items()
    if column not in ("idx", "sentence1", "sentence2")
}
[len(ids) for ids in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

In [42]:
# Collate the 8 samples into tensors. Per the output, every sequence tensor is
# padded to width 67 (the longest sample above) and the label column comes back
# under the key `labels`.
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}