In [71]:
%%capture
%load_ext dotenv
%dotenv
%env TOKENIZERS_PARALLELISM=false
%pip install transformers datasets accelerate sentencepiece
%pip install evaluate scikit-learn scipy
!huggingface-cli login --token $HUGGING_FACE_TOKEN

In [20]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from pprint import pprint

In [7]:
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [24]:
raw_train_dataset = raw_datasets["train"]
pprint(raw_train_dataset.features)
pprint(raw_train_dataset[0])

{'idx': Value(dtype='int32', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None)}
{'idx': 0,
 'label': 1,
 'sentence1': 'Amrozi accused his brother , whom he called " the witness " , '
              'of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his '
              'brother of deliberately distorting his evidence .'}


In [28]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [32]:
inputs = tokenizer("This is the first sentence.", "This is the second one.")
print(inputs)

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [34]:
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
print(tokens)

['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']


In [36]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [52]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
print(*(f"{key}: {len(value)}" for key, value in samples.items()), sep="\n")
[len(x) for x in samples["input_ids"]]

label: 8
input_ids: 8
token_type_ids: 8
attention_mask: 8


[50, 59, 47, 67, 59, 50, 62, 32]

In [46]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

## Fine-tunning with Trainer

[Fine-tuning a model with the Trainer API](https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt)


In [53]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [75]:
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

training_args = TrainingArguments("test-trainer")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [56]:
trainer.train()

Step,Training Loss
500,0.5348
1000,0.2749


TrainOutput(global_step=1377, training_loss=0.3271818832807465, metrics={'train_runtime': 71.3534, 'train_samples_per_second': 154.218, 'train_steps_per_second': 19.298, 'total_flos': 405114969714960.0, 'train_loss': 0.3271818832807465, 'epoch': 3.0})

In [104]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape, predictions.metrics)

(408, 2) (408,) {'test_loss': 0.6342515349388123, 'test_accuracy': 0.6838235294117647, 'test_f1': 0.8122270742358079, 'test_runtime': 0.8806, 'test_samples_per_second': 463.298, 'test_steps_per_second': 57.912}


In [88]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)
# pprint(preds)

In [117]:
import evaluate

preds = predictions.predictions
# pprint(preds)
pp = np.argmax(preds, axis=-1)
# pprint(pp)
label_ids = predictions.label_ids
# pprint(label_ids)

metric = evaluate.load("glue", "mrpc")
# print(metric.inputs_description)

metric.compute(predictions=pp, references=label_ids)

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

In [112]:
import datasets
metric = datasets.load_metric('sacrebleu')

print(metric.inputs_description)

  metric = datasets.load_metric('sacrebleu')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

ImportError: To be able to use sacrebleu, you need to install the following dependency: sacrebleu.
Please install it using 'pip install sacrebleu' for instance.

In [73]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [118]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
training_args.num_train_epochs = 5
# pprint(training_args)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [119]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.386535,0.821078,0.864564
2,0.511800,0.452088,0.862745,0.903114
3,0.271000,0.619117,0.865196,0.903678
4,0.125700,0.790341,0.857843,0.899654
5,0.054200,0.821337,0.857843,0.900344


TrainOutput(global_step=2295, training_loss=0.21403182742382706, metrics={'train_runtime': 122.7559, 'train_samples_per_second': 149.402, 'train_steps_per_second': 18.696, 'total_flos': 675891190117440.0, 'train_loss': 0.21403182742382706, 'epoch': 5.0})