In [15]:
import numpy as np

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "mrpc")
# Single checkpoint name reused for the tokenizer here and for the models loaded in later cells.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the two sentence fields of ``example`` together as one paired input.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries. In this
            notebook it is called through ``map(..., batched=True)``, so the values
            are presumably lists of strings rather than single strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, with
        truncation enabled. No padding is requested here.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Apply tokenize_function across the dataset; batched=True hands it batches of examples
# instead of one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# tokenize_function does not pad, so padding is left to this collator
# (presumably applied per batch at collation time — confirm against the DataCollatorWithPadding docs).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Found cached dataset glue (/home/lklimkiewicz/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/lklimkiewicz/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-bb21e6423b980722.arrow
Loading cached processed dataset at /home/lklimkiewicz/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-d1e8c90b5d349f7a.arrow
Loading cached processed dataset at /home/lklimkiewicz/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-33304e37c309912f.arrow


In [2]:
from transformers import TrainingArguments

# Only the output directory is specified; every other training setting keeps its default.
training_args = TrainingArguments(output_dir="test-trainer")

In [3]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head sized for two labels.
# The load log printed below lists which checkpoint weights were unused and notes that
# some weights were not initialized from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

## Training without metrics

In [4]:
from transformers import Trainer

# Build the trainer from the objects created in earlier cells.
# No compute_metrics callback is passed in this first configuration.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [5]:
# Run fine-tuning; as the last expression of the cell, the returned TrainOutput is displayed.
trainer.train()



  0%|          | 0/1377 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.54, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}
{'loss': 0.2836, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}
{'train_runtime': 115.0839, 'train_samples_per_second': 95.617, 'train_steps_per_second': 11.965, 'train_loss': 0.3287043824261656, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.3287043824261656, metrics={'train_runtime': 115.0839, 'train_samples_per_second': 95.617, 'train_steps_per_second': 11.965, 'train_loss': 0.3287043824261656, 'epoch': 3.0})

In [13]:
# Run the fine-tuned model over the validation split and inspect the result:
# the model outputs (.predictions), the reference labels (.label_ids) and a metrics dict (.metrics).
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape)
print(predictions.label_ids.shape)
print(predictions.metrics)

  0%|          | 0/51 [00:00<?, ?it/s]

(408, 2)
(408,)
{'test_loss': 0.6350306272506714, 'test_runtime': 0.9678, 'test_samples_per_second': 421.574, 'test_steps_per_second': 52.697}


In [17]:
# Reduce the model outputs to one predicted class index per example (argmax over the last axis).
preds = np.argmax(predictions.predictions, axis=-1)

## Training with metrics

In [18]:
# Sanity-check the GLUE MRPC metric by scoring the class predictions computed above
# against the reference labels returned by trainer.predict.
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8725490196078431, 'f1': 0.9097222222222222}

In [19]:
def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE MRPC metric.

    Args:
        eval_preds: Object that unpacks into ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` call, using the argmax over the
        last axis of the logits as predictions and the labels as references.
    """
    # The metric is loaded on every call, exactly as before.
    glue_mrpc = evaluate.load("glue", "mrpc")
    raw_scores, gold_labels = eval_preds
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return glue_mrpc.compute(predictions=predicted_classes, references=gold_labels)

In [20]:
# Second configuration: same output directory, but evaluation is requested once per epoch
# and the checkpoint is loaded again instead of reusing the model object trained above.
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Same wiring as the first trainer, plus the compute_metrics callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [21]:
# Fine-tune again with the second trainer; the log below includes per-epoch eval metrics
# because evaluation_strategy="epoch" and compute_metrics were set on it.
trainer.train()



  0%|          | 0/1377 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.3468886613845825, 'eval_accuracy': 0.8651960784313726, 'eval_f1': 0.9043478260869565, 'eval_runtime': 1.9383, 'eval_samples_per_second': 210.492, 'eval_steps_per_second': 26.312, 'epoch': 1.0}
{'loss': 0.5233, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.8077418804168701, 'eval_accuracy': 0.8161764705882353, 'eval_f1': 0.8784440842787682, 'eval_runtime': 1.757, 'eval_samples_per_second': 232.21, 'eval_steps_per_second': 29.026, 'epoch': 2.0}
{'loss': 0.2832, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.8102775812149048, 'eval_accuracy': 0.8333333333333334, 'eval_f1': 0.8859060402684563, 'eval_runtime': 1.8005, 'eval_samples_per_second': 226.607, 'eval_steps_per_second': 28.326, 'epoch': 3.0}
{'train_runtime': 121.0947, 'train_samples_per_second': 90.871, 'train_steps_per_second': 11.371, 'train_loss': 0.3268706710250289, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.3268706710250289, metrics={'train_runtime': 121.0947, 'train_samples_per_second': 90.871, 'train_steps_per_second': 11.371, 'train_loss': 0.3268706710250289, 'epoch': 3.0})

In [22]:
# Predict on the validation split with the second trainer. Because it was built with
# compute_metrics, the printed metrics dict carries accuracy and F1 in addition to the loss.
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape)
print(predictions.label_ids.shape)
print(predictions.metrics)

  0%|          | 0/51 [00:00<?, ?it/s]

(408, 2)
(408,)
{'test_loss': 0.8102775812149048, 'test_accuracy': 0.8333333333333334, 'test_f1': 0.8859060402684563, 'test_runtime': 1.9706, 'test_samples_per_second': 207.049, 'test_steps_per_second': 25.881}


In [None]:
# Predicted class index per example from the second trainer's outputs (argmax over the last axis).
preds = np.argmax(predictions.predictions, axis=-1)