# Preprocessing


In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
raw_dataset = load_dataset('nyu-mll/glue', 'mrpc')

In [13]:
checkpoint = 'google-bert/bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [14]:
def tokenize_function(examples):
    return tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        truncation=True,
    )

In [15]:
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')
tokenized_dataset = tokenized_dataset.with_format('torch')
data_collator = DataCollatorWithPadding(tokenizer)
tokenized_dataset.column_names

Map: 100%|██████████| 3668/3668 [00:00<00:00, 27665.81 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 43217.32 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 47646.85 examples/s]


{'train': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 'validation': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 'test': ['labels', 'input_ids', 'token_type_ids', 'attention_mask']}

In [16]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset['train'],
    batch_size=16,
    collate_fn=data_collator,
    shuffle=True,
)

In [17]:
for step, batch in enumerate(train_dataloader):
    print(batch['input_ids'].shape)
    if step > 5:
        break

torch.Size([16, 105])
torch.Size([16, 73])
torch.Size([16, 73])
torch.Size([16, 78])
torch.Size([16, 78])
torch.Size([16, 87])
torch.Size([16, 73])


# Training upper parameters


In [18]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from transformers import TrainingArguments

training_agrs = TrainingArguments(
    'test-trainer3',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
)

In [26]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_agrs,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.141369
2,No log,1.450285
3,0.020000,1.272812
4,0.020000,1.199451
5,0.009100,1.238924


TrainOutput(global_step=1150, training_loss=0.013446055495220682, metrics={'train_runtime': 71.2727, 'train_samples_per_second': 257.322, 'train_steps_per_second': 16.135, 'total_flos': 742152009410640.0, 'train_loss': 0.013446055495220682, 'epoch': 5.0})

In [11]:
predictions = trainer.predict(tokenized_dataset['validation'])
print(predictions.predictions.shape, predictions.label_ids.shape)

(408, 2) (408,)


In [44]:
import numpy as np
from datasets import load_metric

metric = load_metric('glue', 'mrpc')
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=predictions.label_ids)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.8431372549019608, 'f1': 0.8904109589041096}

In [49]:
def compute_metrics(eval_preds):
    logits, label = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=label)

In [50]:
trainer = Trainer(
    model,
    training_agrs,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.105809,0.860294,0.900175
2,No log,1.36905,0.823529,0.869091
3,0.022600,1.259902,0.843137,0.891525
4,0.022600,1.374394,0.830882,0.878735
5,0.007500,1.297394,0.835784,0.885077


TrainOutput(global_step=1150, training_loss=0.013665472839189612, metrics={'train_runtime': 68.8517, 'train_samples_per_second': 266.369, 'train_steps_per_second': 16.703, 'total_flos': 714950848507680.0, 'train_loss': 0.013665472839189612, 'epoch': 5.0})