# Processing the Data

In [2]:
import torch

from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

import os

checkpoint = "bert-base-uncased"

# Local directories that the next cell fills via `save_pretrained`.
tokenizer_dir = "tokenizers/" + checkpoint
model_dir = "models/" + checkpoint

# Prefer the local copies (no network access needed). On a first run the
# directories do not exist yet, so fall back to loading the checkpoint by name
# instead of failing with `local_files_only=True`.
if os.path.isdir(tokenizer_dir):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, local_files_only=True)
else:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

if os.path.isdir(model_dir):
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, local_files_only=True)
else:
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [3]:
# Write the tokenizer and model files to local directories so they can be
# reloaded from disk with `local_files_only=True`.
tokenizer.save_pretrained("tokenizers/" + checkpoint)
model.save_pretrained("models/" + checkpoint)

In [4]:
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

In [5]:
# Encode both sentences as a single padded/truncated batch of PyTorch tensors.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# Attach one label per sequence so the forward pass also returns a loss.
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())

# One optimisation step: forward pass, backpropagation, parameter update.
outputs = model(**batch)
loss = outputs.loss
loss.backward()
optimizer.step()


## Downloading and Caching the Data

In [6]:
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc") # Dataset name
raw_datasets

100%|██████████| 3/3 [00:00<00:00, 73.11it/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

Access each pair of sentences in our `raw_datasets` object by indexing like a dictionary.

In [7]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

Find out which integer corresponds to which label.

In [8]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),
 'idx': Value(dtype='int32', id=None)}

In [9]:
raw_train_dataset[15]

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
 'label': 0,
 'idx': 16}

In [10]:
raw_train_dataset[87]

{'sentence1': 'Tuition at four-year private colleges averaged $ 19,710 this year , up 6 percent from 2002 .',
 'sentence2': 'For the current academic year , tuition at public colleges averaged $ 4,694 , up almost $ 600 from the year before .',
 'label': 1,
 'idx': 100}

## Preprocessing Dataset

Tokenise data: can feed the tokeniser with one sentence or a list of sentences.

In [11]:
from transformers import AutoTokenizer

tokenized_sentences_1 = tokenizer(raw_train_dataset["sentence1"])
tokenized_sentences_2 = tokenizer(raw_train_dataset["sentence2"])

But we cannot simply tokenise the two sentences separately when predicting whether they are paraphrases. The two sequences must be handled as a pair, with the appropriate preprocessing.

In [12]:
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Separate tokenising

In [13]:
tokenizer(raw_train_dataset[15]["sentence1"])

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
tokenizer(raw_train_dataset[15]["sentence2"])

{'input_ids': [101, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Pair tokenising

In [15]:
tokenizer(raw_train_dataset[15]["sentence1"], raw_train_dataset[15]["sentence2"])

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

BERT is pretrained with token type IDs, and on top of the masked language modeling objective we talked about in Chapter 1, it has an additional objective called next sentence prediction. The goal with this task is to model the relationship between pairs of sentences.

In [17]:
# First sentence is 0, second sentence 1, for BERT
['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']
[      0,      0,    0,     0,       0,          0,   0,       0,      1,    1,     1,        1,     1,   1,       1]

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

In [18]:
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

# Disadvantages
# Returns a dictionary
# Only works if enough RAM is available to store the whole dataset

### Multi-processing

Using the `map()` function. This function applies a function to each element of a dataset.

In [19]:
def tokenize_function(example):
    """Tokenise the sentence pair(s) in `example` with the notebook's tokenizer.

    `example` is indexed with the keys "sentence1" and "sentence2"; the two
    values are passed to the tokenizer together so that they are encoded as a
    pair, with truncation enabled. No padding is applied here.

    Returns whatever the tokenizer returns (for the BERT tokenizer used in this
    notebook: input_ids, token_type_ids and attention_mask).
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

Padding all samples to the maximum length is not efficient. Pad the samples when building a batch (max length of that batch). The preprocessing function can be used to add new fields to all datasets or change data values for existing keys.

In [20]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) # The function is applied to multiple elements in the dataset at once (faster)
tokenized_datasets

100%|██████████| 4/4 [00:00<00:00, 11.34ba/s]
100%|██████████| 1/1 [00:00<00:00, 14.57ba/s]
100%|██████████| 2/2 [00:00<00:00,  6.02ba/s]


DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
        num_rows: 408
    })
    test: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
        num_rows: 1725
    })
})

### Dynamic Padding

The function that puts samples together inside a batch is called a **collate function**, an argument that can be passed to a `DataLoader`. The default function converts samples to PyTorch tensors and concatenates them, which does not work here because our samples have different lengths (padding was deliberately postponed so that each batch is only padded as much as it needs). (This can cause problems when training on TPUs, as they prefer fixed shapes; use extra padding there.) We need to define a collate function that applies an appropriate amount of padding.

In [21]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Test some samples. Remove `idx`, `sentence1` and `sentence2`: the sentence columns contain strings, which cannot be converted into tensors (we already have their tokens), and `idx` is not an input the model uses.

In [22]:
# Take the first eight training examples (one batch) as a dict of columns.
samples = tokenized_datasets["train"][:8]
# Keep every column except the index and the raw sentence columns.
samples = {
    column: values
    for column, values in samples.items()
    if column not in ["idx", "sentence1", "sentence2"]
}
# Token count of each example in the batch, before any padding.
[len(token_ids) for token_ids in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

In [23]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'attention_mask': torch.Size([8, 67]),
 'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'labels': torch.Size([8])}

# Fine-tuning With the Trainer API

## Training

Define training arguments (hyperparameters). Only mandatory argument is a directory to save the model and checkpoints. The rest has defaults for basic fine-tuning.

In [24]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

## Define Our Model

In [25]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [26]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # Optional here: when a tokenizer is passed, Trainer already defaults to
    # DataCollatorWithPadding. Passing it explicitly does no harm.
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Not set for this first run (the second Trainer further down uses both):
    #   evaluation_strategy="steps" or "epoch"  -> set on TrainingArguments
    #   compute_metrics=...                     -> function turning predictions into metrics
)

In [27]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1.
***** Running training *****
  Num examples = 3668
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1377
 36%|███▋      | 500/1377 [02:19<03:37,  4.04it/s]Saving model checkpoint to test-trainer\checkpoint-500
Configuration saved in test-trainer\checkpoint-500\config.json


{'loss': 0.5112, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


Model weights saved in test-trainer\checkpoint-500\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-500\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-500\special_tokens_map.json
 73%|███████▎  | 1000/1377 [04:29<01:18,  4.83it/s]Saving model checkpoint to test-trainer\checkpoint-1000
Configuration saved in test-trainer\checkpoint-1000\config.json


{'loss': 0.2934, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


Model weights saved in test-trainer\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-1000\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-1000\special_tokens_map.json
100%|██████████| 1377/1377 [05:54<00:00,  4.83it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1377/1377 [05:54<00:00,  3.88it/s]

{'train_runtime': 354.6807, 'train_samples_per_second': 31.025, 'train_steps_per_second': 3.882, 'train_loss': 0.32232549663202953, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.32232549663202953, metrics={'train_runtime': 354.6807, 'train_samples_per_second': 31.025, 'train_steps_per_second': 3.882, 'train_loss': 0.32232549663202953, 'epoch': 3.0})

## Evaluation

Building a `compute_metrics()`. The output of the predict() method is another named tuple with three fields: predictions, label_ids, and metrics. The metrics will contain loss on the dataset passed.

In [28]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1.
***** Running Prediction *****
  Num examples = 408
  Batch size = 8
 98%|█████████▊| 50/51 [00:02<00:00, 20.02it/s]

(408, 2) (408,)


All Transformer models return logits. Transform logits into prediction by taking the index with the maximum value on the second axis.

In [29]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

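Compare `preds` to the labels. The table in the BERT paper reported an F1 score of 88.9 for the base model. That figure is also for the uncased model, which is the checkpoint used in this notebook (`bert-base-uncased`), so the difference is not due to cased versus uncased weights; scores vary between runs (see the second training run below), partly because the classification head is randomly initialised.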

In [30]:
from datasets import load_metric

metric = load_metric("glue", "mrpc") # Benchmark name and dataset name
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8676470588235294, 'f1': 0.90625}

Combining everything together.

In [31]:
def compute_metrics(eval_preds):
    """Compute the GLUE/MRPC metrics for a `(logits, labels)` pair.

    The predicted class of each example is the index of its largest logit
    along the last axis; these predictions are scored against `labels` with
    the metric obtained from `load_metric("glue", "mrpc")`.
    """
    mrpc_metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    return mrpc_metric.compute(predictions=predicted_classes, references=labels)

In [32]:
# Same output directory as before, but now evaluating once per epoch.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# Start again from the pretrained checkpoint rather than continuing from the
# model trained above.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Metric function defined in the previous cell
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\liana/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads"

Note that we create a new TrainingArguments with its evaluation_strategy set to "epoch" and a new model — otherwise, we would just be continuing the training of the model we have already trained. The exact accuracy/F1 score you reach might be a bit different from what we found, because of the random head initialization of the model, but it should be in the same ballpark.

In [33]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1.
***** Running training *****
  Num examples = 3668
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1377
 33%|███▎      | 459/1377 [01:44<03:36,  4.25it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8

 33%|███▎      | 459/1377 [01:48<03:36,  4.25it/s]

{'eval_loss': 0.48459067940711975, 'eval_accuracy': 0.8137254901960784, 'eval_f1': 0.8694158075601375, 'eval_runtime': 4.0022, 'eval_samples_per_second': 101.944, 'eval_steps_per_second': 12.743, 'epoch': 1.0}


 36%|███▋      | 500/1377 [01:57<03:15,  4.48it/s]Saving model checkpoint to test-trainer\checkpoint-500
Configuration saved in test-trainer\checkpoint-500\config.json


{'loss': 0.5664, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


Model weights saved in test-trainer\checkpoint-500\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-500\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-500\special_tokens_map.json
 67%|██████▋   | 918/1377 [03:35<01:38,  4.68it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8

 67%|██████▋   | 918/1377 [03:39<01:38,  4.68it/s]

{'eval_loss': 0.545478343963623, 'eval_accuracy': 0.821078431372549, 'eval_f1': 0.8777219430485762, 'eval_runtime': 3.8601, 'eval_samples_per_second': 105.697, 'eval_steps_per_second': 13.212, 'epoch': 2.0}


 73%|███████▎  | 1000/1377 [03:57<01:22,  4.55it/s]Saving model checkpoint to test-trainer\checkpoint-1000
Configuration saved in test-trainer\checkpoint-1000\config.json


{'loss': 0.3728, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


Model weights saved in test-trainer\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-1000\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-1000\special_tokens_map.json
100%|██████████| 1377/1377 [05:50<00:00,  2.97it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8

100%|██████████| 1377/1377 [05:56<00:00,  2.97it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1377/1377 [05:57<00:00,  3.86it/s]

{'eval_loss': 0.7744005918502808, 'eval_accuracy': 0.8235294117647058, 'eval_f1': 0.8762886597938144, 'eval_runtime': 6.3088, 'eval_samples_per_second': 64.671, 'eval_steps_per_second': 8.084, 'epoch': 3.0}
{'train_runtime': 357.0206, 'train_samples_per_second': 30.822, 'train_steps_per_second': 3.857, 'train_loss': 0.40473202751785753, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.40473202751785753, metrics={'train_runtime': 357.0206, 'train_samples_per_second': 30.822, 'train_steps_per_second': 3.857, 'train_loss': 0.40473202751785753, 'epoch': 3.0})

The Trainer will work out of the box on multiple GPUs or TPUs and provides lots of options, like mixed-precision.

# A Full Fine-Tuning Without the Trainer API

Define variables first

In [34]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Dataset: the MRPC task of the GLUE benchmark.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
# Tokeniser, loaded from the local directory only.
tokenizer = AutoTokenizer.from_pretrained("tokenizers/" + checkpoint, local_files_only=True)


def tokenize_function(example):
    """Tokenise the "sentence1"/"sentence2" pair(s) in `example`, with truncation."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Input data: tokenise every split, several examples per call (batched=True).
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Collate function (not a data loader) that pads the samples of a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

100%|██████████| 3/3 [00:00<00:00, 83.41it/s]
Didn't find file tokenizers/bert-base-uncased\added_tokens.json. We won't load it.
loading file tokenizers/bert-base-uncased\vocab.txt
loading file tokenizers/bert-base-uncased\tokenizer.json
loading file None
loading file tokenizers/bert-base-uncased\special_tokens_map.json
loading file tokenizers/bert-base-uncased\tokenizer_config.json
100%|██████████| 4/4 [00:01<00:00,  3.85ba/s]
100%|██████████| 1/1 [00:00<00:00,  8.55ba/s]
100%|██████████| 2/2 [00:00<00:00,  4.67ba/s]


## Prepare for Training

Define a few objects first: dataloaders used to iterate over batches. But before that, we need to postprocess `tokenized_datasets` ourselves, to take care of some things the Trainer API did for us automatically.

1. Remove unexpected columns
2. Rename column label to labels (model expects `labels`)
3. Set dataset formats to return PyTorch tensors instead of lists

In [35]:
# Drop the raw text and index columns, and rename `label` to `labels`.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
# Make the datasets return PyTorch tensors instead of lists.
tokenized_datasets.set_format("torch")

# Check the result.
tokenized_datasets["train"].column_names

['attention_mask', 'input_ids', 'labels', 'token_type_ids']

In [78]:
from torch.utils.data import DataLoader

# Both loaders share the batch size and the padding collate function; only the
# training loader shuffles its examples.
loader_options = dict(batch_size=8, collate_fn=data_collator)

train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_options)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_options)

Inspect a batch to check there is no mistake in data processing. Shape might differ because `shuffle=True` and padding is set to batch maximum length.

In [72]:
# Draw a single batch from the training loader. `next(iter(...))` raises
# StopIteration if the loader is empty, instead of silently leaving a stale
# `batch` from an earlier cell in place as `for ...: break` would.
batch = next(iter(train_dataloader))
# Shape of every tensor in the batch.
{name: tensor.shape for name, tensor in batch.items()}

{'attention_mask': torch.Size([8, 70]),
 'input_ids': torch.Size([8, 70]),
 'labels': torch.Size([8]),
 'token_type_ids': torch.Size([8, 70])}

In [73]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("models/" + checkpoint, num_labels=2, local_files_only=True) # Model

loading configuration file models/bert-base-uncased\config.json
Model config BertConfig {
  "_name_or_path": "models/bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.10.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file models/bert-base-uncased\pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from th

All Transformers models will return the loss when labels are provided, and we also get the logits (two for each input in our batch, so a tensor of size 8 x 2).

In [74]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.2944, grad_fn=<NllLossBackward>) torch.Size([8, 2])


Two more things:

1. Optimiser (`AdamW`)
2. Learning rate scheduler (linear decay from the maximum value 5e-5 to 0)

All will be set at defaults to replicate the Trainer API.

In [75]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

To define the learning rate scheduler, we need to know the number of training steps that will be taken (number of epochs * number of training batches, which is the length of our dataloader). The Trainer API uses 3 epochs by default.

In [76]:
from transformers import get_scheduler

num_epochs = 3  # Trainer default
# One optimisation step per batch, for every epoch.
num_training_steps = len(train_dataloader) * num_epochs
# Linear schedule without warm-up over `num_training_steps` steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(num_training_steps)

1377


## Training Loop

Use GPU instead of CPU.

In [79]:
import torch

# Prefer the GPU when CUDA is available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the chosen device
device

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 2.29 GiB already allocated; 8.88 MiB free; 2.47 GiB reserved in total by PyTorch)

Add progress bar over number of training steps.

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimisation step.
progress_bar = tqdm(range(num_training_steps))

model.train() # Switch the model to training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()} # Move the batch to the same device as the model
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update the weights, advance the learning-rate schedule, then clear
        # the gradients so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/1377 [05:02<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 2.45 GiB already allocated; 10.88 MiB free; 2.47 GiB reserved in total by PyTorch)

## Evaluation Loop

Add evaluation loop to check how good the model is because we didn't ask for any reporting.

In [None]:
from datasets import load_metric

metric = load_metric("glue", "mrpc")

model.eval()
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Accumulate this batch; the final scores are computed after the loop.
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8308823529411765, 'f1': 0.880415944540728}

## Accelerate API

The training loop we defined earlier works fine on a single CPU or GPU. But using the Accelerate library, with just a few adjustments we can enable distributed training on multiple GPUs or TPUs.

In [None]:
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
from accelerate import Accelerator # New

accelerator = Accelerator() # New

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

# Manual device placement is no longer needed; `accelerator.prepare` replaces it.
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)

# New: let Accelerate wrap the dataloaders, model and optimizer.
# (Fixed: the unpacking target had a "." instead of "," and the call passed the
# undefined name `eval_loader` instead of `eval_dataloader`.)
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

# The number of steps is computed after `prepare`, from the prepared dataloader.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # No manual `.to(device)` on the batch here either.
        # batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # New: `accelerator.backward(loss)` replaces `loss.backward()`.
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\liana\AppData\Local\Programs\Python\Python38\lib\site-packages\IPython\core\interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\liana\AppData\Local\Temp/ipykernel_16344/3263909906.py", line 6, in <module>
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
NameError: name 'checkpoint' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\liana\AppData\Local\Programs\Python\Python38\lib\site-packages\IPython\core\interactiveshell.py", line 2061, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\liana\AppData\Local\Programs\Python\Python38\lib\site-packages\IPy

TypeError: object of type 'NoneType' has no len()