In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset glue (/home/jling/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 851.81it/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [2]:
# sample:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [3]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [4]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


Loading cached processed dataset at /home/jling/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-2286bfeed35ec98c.arrow
Loading cached processed dataset at /home/jling/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-d2871c4b8b165e62.arrow
Loading cached processed dataset at /home/jling/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-e4e2a9b2438b2f62.arrow


In [5]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer")

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [9]:
trainer.train()

Step,Training Loss
500,0.5736
1000,0.4031


TrainOutput(global_step=1377, training_loss=0.4316764283024408, metrics={'train_runtime': 138.9047, 'train_samples_per_second': 79.22, 'train_steps_per_second': 9.913, 'total_flos': 406183858377360.0, 'train_loss': 0.4316764283024408, 'epoch': 3.0})

In [10]:
import numpy as np
predictions = trainer.predict(tokenized_datasets["validation"])


preds = np.argmax(predictions.predictions, axis=-1)


In [17]:
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)


{'accuracy': 0.8382352941176471, 'f1': 0.8892617449664431}

In [18]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [20]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.966008,0.803922,0.873418
2,0.318200,0.860616,0.811275,0.868825
3,0.215400,0.767401,0.833333,0.881944


TrainOutput(global_step=1377, training_loss=0.22989421282112382, metrics={'train_runtime': 151.5197, 'train_samples_per_second': 72.624, 'train_steps_per_second': 9.088, 'total_flos': 406183858377360.0, 'train_loss': 0.22989421282112382, 'epoch': 3.0})

To use the Trainer instance you defined to predict on new data, you can create a new dataset containing your new data, tokenize it using the same tokenizer that was used to tokenize the training and validation datasets, and then call the predict() method of the Trainer instance to generate predictions.

Here is an example of how you can do this:

```makefile
Copy code
# Assuming you have some new data stored in a list called `new_data`

# Tokenize the new data using the same tokenizer as the training data
tokenized_new_data = tokenizer(new_data, padding=True, truncation=True, max_length=128)

# Create a Dataset object from the tokenized data
new_dataset = Dataset.from_dict(tokenized_new_data)

# Call the `predict()` method of the `Trainer` instance to generate predictions on the new data
predictions = trainer.predict(new_dataset)

# Extract the predicted labels from the predictions
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Print the predicted labels
print(predicted_labels)
```

This code will tokenize the new data using the same tokenizer that was used to tokenize the training and validation datasets, create a Dataset object from the tokenized data, and then call the predict() method of the Trainer instance to generate predictions on the new data. Finally, it extracts the predicted labels from the predictions using np.argmax() and prints them out. Note that the np.argmax() function is used to convert the predicted logits to predicted labels.