In [1]:
from datasets import load_dataset

# Download (or load from the local cache) the "mrpc" configuration of the
# "glue" dataset; each example is a pair of sentences with an
# equivalent / not_equivalent label.
raw_dataset = load_dataset(path="glue", name="mrpc")

# Bare trailing expression so the notebook renders the split summary.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [2]:
# Optional: uncomment the next line to preview training examples 1 and 2.
# raw_dataset['train'][1:3]

In [3]:
# Show the column schema (names and value types) of the training split.
raw_dataset["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [4]:
# Load a pre-trained BERT checkpoint with a sequence-classification head on top.
import torch
# `transformers.AdamW` is deprecated upstream in favour of the PyTorch
# implementation, so the optimizer class is imported from torch instead.
# NOTE(review): torch.optim.AdamW defaults to weight_decay=0.01 whereas the
# transformers version defaulted to 0.0 -- pass weight_decay explicitly where
# the optimizer is instantiated if the old default matters.
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The label column has two classes (not_equivalent / equivalent), so the head
# gets two outputs; spelled out here rather than relying on the default.
# The classifier weights are newly initialised (see the warning printed by
# this cell), so the model must be fine-tuned before it is used for inference.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize_function(example):
    """Tokenize the sentence pair(s) held in ``example``.

    Args:
        example: Mapping indexed by ``'sentence1'`` and ``'sentence2'``.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on the two
        sentence fields with truncation enabled.

    Note:
        No ``padding`` or ``max_length`` is requested here, so sequences are
        not padded at tokenization time.
    """
    first_sentence = example['sentence1']
    second_sentence = example['sentence2']
    return tokenizer(first_sentence, second_sentence, truncation=True)



In [None]:
# Sanity check: run the tokenizer on the first training example and display the result.
tokenize_function(raw_dataset["train"][0])

In [6]:
# Apply tokenize_function to every split; batched=True hands the function
# batches of examples instead of one example per call.
tokenized_dataset = raw_dataset.map(
    function=tokenize_function,
    batched=True,
)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

In [None]:
# Display a summary of the tokenized training split.
tokenized_dataset["train"]

In [7]:
# Drop the index and raw-text columns, keeping the label and the tokenizer outputs.
tokenized_datasets = tokenized_dataset.remove_columns(
    column_names=["idx", "sentence1", "sentence2"]
)

In [8]:
# Build the model-ready dataset in one idempotent step, starting from the
# output of the tokenization cell (`tokenized_dataset`).
# The previous form reassigned `tokenized_datasets` from itself, so running the
# cell a second time failed: the "label" column had already been renamed.
tokenized_datasets = (
    tokenized_dataset
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")  # transformers library requirement
    .with_format("torch")              # items are returned as torch tensors
)

# Display the resulting training split.
tokenized_datasets["train"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

## Dynamic padding

In [11]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer; it is used as the DataLoader's collate_fn
# so that padding is applied per batch rather than at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Shuffled training loader yielding batches of 16 examples.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)


In [12]:
# Print the input_ids shape of the first seven batches (steps 0 through 6)
# to show that the padded sequence length varies from batch to batch.
for step, batch in enumerate(train_dataloader):
    input_ids = batch["input_ids"]
    print(input_ids.shape)
    if step >= 6:  # stop after the batch at step 6 has been printed
        break

torch.Size([16, 73])
torch.Size([16, 82])
torch.Size([16, 69])
torch.Size([16, 81])
torch.Size([16, 68])
torch.Size([16, 73])
torch.Size([16, 75])
