In [1]:
%load_ext rich
import torch
import numpy as np

### Basic fine-tuning

In [9]:
# AdamW comes from PyTorch: the copy that used to live in `transformers` is
# deprecated and no longer importable from recent releases.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The classification head is not part of the checkpoint, so it is freshly
# initialised (this is what the warning printed below is about).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
# Pad to a common length and return PyTorch tensors so the two sentences
# can be fed to the model as a single batch.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

batch

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[1m{[0m
    [32m'input_ids'[0m: [1;35mtensor[0m[1m([0m[1m[[0m[1m[[0m  [1;36m101[0m,  [1;36m1045[0m,  [1;36m1005[0m,  [1;36m2310[0m,  [1;36m2042[0m,  [1;36m3403[0m,  [1;36m2005[0m,  [1;36m1037[0m, [1;36m17662[0m, [1;36m12172[0m,
          [1;36m2607[0m,  [1;36m2026[0m,  [1;36m2878[0m,  [1;36m2166[0m,  [1;36m1012[0m,   [1;36m102[0m[1m][0m,
        [1m[[0m  [1;36m101[0m,  [1;36m2023[0m,  [1;36m2607[0m,  [1;36m2003[0m,  [1;36m6429[0m,   [1;36m999[0m,   [1;36m102[0m,     [1;36m0[0m,     [1;36m0[0m,     [1;36m0[0m,
             [1;36m0[0m,     [1;36m0[0m,     [1;36m0[0m,     [1;36m0[0m,     [1;36m0[0m,     [1;36m0[0m[1m][0m[1m][0m[1m)[0m,
    [32m'token_type_ids'[0m: [1;35mtensor[0m[1m([0m[1m[[0m[1m[[0m[1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m, 

In [None]:
# Attach labels to the batch; the loss is then read from the model output.
batch["labels"] = torch.tensor([1, 1])

# `from_pretrained` hands the model back in evaluation mode; switch to
# training mode so dropout is active during the update.
model.train()

# Use PyTorch's AdamW (the `transformers` implementation is deprecated).
optimizer = torch.optim.AdamW(model.parameters())
# Start from clean gradients so re-running this cell does not accumulate them.
optimizer.zero_grad()

loss = model(**batch).loss
loss.backward()
optimizer.step()



### Loading data from the Hugging Face Hub

In [12]:
from datasets import load_dataset

In [16]:
# Load the "mrpc" configuration of the "glue" dataset and display its splits.
raw_data = load_dataset(path="glue", name="mrpc")
raw_data


[1;35mDatasetDict[0m[1m([0m[1m{[0m
    train: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'sentence1'[0m, [32m'sentence2'[0m, [32m'label'[0m, [32m'idx'[0m[1m][0m,
        num_rows: [1;36m3668[0m
    [1m}[0m[1m)[0m
    validation: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'sentence1'[0m, [32m'sentence2'[0m, [32m'label'[0m, [32m'idx'[0m[1m][0m,
        num_rows: [1;36m408[0m
    [1m}[0m[1m)[0m
    test: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'sentence1'[0m, [32m'sentence2'[0m, [32m'label'[0m, [32m'idx'[0m[1m][0m,
        num_rows: [1;36m1725[0m
    [1m}[0m[1m)[0m
[1m}[0m[1m)[0m

In [21]:
# Keep a handle on the training split, then look at one example from it.
train_data = raw_data["train"]
train_data[15]


[1m{[0m
    [32m'sentence1'[0m: [32m'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .'[0m,
    [32m'sentence2'[0m: [32m'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .'[0m,
    [32m'label'[0m: [1;36m0[0m,
    [32m'idx'[0m: [1;36m16[0m
[1m}[0m

In [20]:
# Display the column schema of the training split (column names and their types).
train_data.features


[1m{[0m
    [32m'sentence1'[0m: [1;35mValue[0m[1m([0m[33mdtype[0m=[32m'string'[0m, [33mid[0m=[3;35mNone[0m[1m)[0m,
    [32m'sentence2'[0m: [1;35mValue[0m[1m([0m[33mdtype[0m=[32m'string'[0m, [33mid[0m=[3;35mNone[0m[1m)[0m,
    [32m'label'[0m: [1;35mClassLabel[0m[1m([0m[33mnames[0m=[1m[[0m[32m'not_equivalent'[0m, [32m'equivalent'[0m[1m][0m, [33mid[0m=[3;35mNone[0m[1m)[0m,
    [32m'idx'[0m: [1;35mValue[0m[1m([0m[33mdtype[0m=[32m'int32'[0m, [33mid[0m=[3;35mNone[0m[1m)[0m
[1m}[0m

### Tokenize

In [27]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize each sentence column of the training split on its own.
# Column access is wrapped in list(): newer `datasets` releases return a lazy
# column object rather than a plain list, and the tokenizer expects a str or a
# list of str. On older releases list() is a no-op copy.
tokenized_sentences_1 = tokenizer(list(raw_data['train']['sentence1']))
tokenized_sentences_2 = tokenizer(list(raw_data['train']['sentence2']))

In [29]:
def tokenize_fn(example):
    """Tokenize the 'sentence1' / 'sentence2' fields of `example` as a pair.

    `example` must support lookup by the keys 'sentence1' and 'sentence2'.
    The two values are passed together to the notebook-level `tokenizer`
    with truncation enabled, and the tokenizer's result is returned as-is.
    """
    first, second = example['sentence1'], example['sentence2']
    return tokenizer(first, second, truncation=True)

In [30]:
# Apply tokenize_fn to every split in batched mode and display the result.
tokenized_datasets = raw_data.map(function=tokenize_fn, batched=True)
tokenized_datasets

Map: 100%|██████████| 3668/3668 [00:00<00:00, 6957.26 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 7598.75 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 6321.62 examples/s]



[1;35mDatasetDict[0m[1m([0m[1m{[0m
    train: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'sentence1'[0m, [32m'sentence2'[0m, [32m'label'[0m, [32m'idx'[0m, [32m'input_ids'[0m, [32m'token_type_ids'[0m, [32m'attention_mask'[0m[1m][0m,
        num_rows: [1;36m3668[0m
    [1m}[0m[1m)[0m
    validation: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'sentence1'[0m, [32m'sentence2'[0m, [32m'label'[0m, [32m'idx'[0m, [32m'input_ids'[0m, [32m'token_type_ids'[0m, [32m'attention_mask'[0m[1m][0m,
        num_rows: [1;36m408[0m
    [1m}[0m[1m)[0m
    test: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'sentence1'[0m, [32m'sentence2'[0m, [32m'label'[0m, [32m'idx'[0m, [32m'input_ids'[0m, [32m'token_type_ids'[0m, [32m'attention_mask'[0m[1m][0m,
        num_rows: [1;36m1725[0m
    [1m}[0m[1m)[0m
[1m}[0m[1m)[0m

In [31]:
from transformers import DataCollatorWithPadding

# Build a padding collator around the notebook's tokenizer and display it.
data_collator = DataCollatorWithPadding(tokenizer)
data_collator

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1;35mDataCollatorWithPadding[0m[1m([0m
    [33mtokenizer[0m=[1;35mBertTokenizerFast[0m[1m([0m[33mname_or_path[0m=[32m'bert-base-uncased'[0m, [33mvocab_size[0m=[1;36m30522[0m, [33mmodel_max_length[0m=[1;36m512[0m, [33mis_fast[0m=[3;92mTrue[0m, [33mpadding_side[0m=[32m'right'[0m, [33mtruncation_side[0m=[32m'right'[0m, [33mspecial_tokens[0m=[1m{[0m[32m'unk_token'[0m: [32m'[0m[32m[[0m[32mUNK[0m[32m][0m[32m'[0m, [32m'sep_token'[0m: [32m'[0m[32m[[0m[32mSEP[0m[32m][0m[32m'[0m, [32m'pad_token'[0m: [32m'[0m[32m[[0m[32mPAD[0m[32m][0m[32m'[0m, [32m'cls_token'[0m: [32m'[0m[32m[[0m[32mCLS[0m[32m][0m[32m'[0m, [32m'mask_token'[0m: [32m'[0m[32m[[0m[32mMASK[0m[32m][0m[32m'[0m[1m}[0m, [33mclean_up_tokenization_spaces[0m=[3;92mTrue[0m[1m)[0m,  [33madded_tokens_decoder[0m=[1m{[0m
        [1;36m0[0m: [1;35mAddedToken[0m[1m([0m[32m"[0m[32m[[0m[32mPAD[0m[32m][0m[32m"[0m, [33mrstrip

In [34]:
# Take the first eight training rows and drop the id / raw-text columns,
# then list the token count of each remaining example.
dropped = ('idx', 'sentence1', 'sentence2')
samples = {
    name: column
    for name, column in tokenized_datasets['train'][:8].items()
    if name not in dropped
}
list(map(len, samples['input_ids']))

[1m[[0m[1;36m50[0m, [1;36m59[0m, [1;36m47[0m, [1;36m67[0m, [1;36m59[0m, [1;36m50[0m, [1;36m62[0m, [1;36m32[0m[1m][0m

In [36]:
# Collate the samples into one batch and report the shape of each entry.
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}


[1m{[0m
    [32m'input_ids'[0m: [1;35mtorch.Size[0m[1m([0m[1m[[0m[1;36m8[0m, [1;36m67[0m[1m][0m[1m)[0m,
    [32m'token_type_ids'[0m: [1;35mtorch.Size[0m[1m([0m[1m[[0m[1;36m8[0m, [1;36m67[0m[1m][0m[1m)[0m,
    [32m'attention_mask'[0m: [1;35mtorch.Size[0m[1m([0m[1m[[0m[1;36m8[0m, [1;36m67[0m[1m][0m[1m)[0m,
    [32m'labels'[0m: [1;35mtorch.Size[0m[1m([0m[1m[[0m[1;36m8[0m[1m][0m[1m)[0m
[1m}[0m