In [2]:
from transformers import BertTokenizerFast
from torch.utils.data import Dataset
from datasets import load_dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Sample text used to inspect how the tokenizer splits words into pieces.
sentence = "This is an example sentence. The quick fox jumps over the lazy dog"

# Pretrained multilingual cased BERT tokenizer; also handed to MNLIDataset below.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")

# tokenize() returns the string pieces only (no ids, no special tokens added here).
tokens = tokenizer.tokenize(sentence)
print(tokens)


['This', 'is', 'an', 'example', 'sentence', '.', 'The', 'quick', 'f', '##ox', 'jump', '##s', 'over', 'the', 'la', '##zy', 'dog']


In [4]:
# Load the "multi_nli" dataset; the result is indexed by split name below (e.g. 'train').
mnli_dataset = load_dataset("multi_nli")

In [5]:
# Look at one raw record to see which fields are available before building the dataset.
first_train_example = mnli_dataset["train"][0]
print(first_train_example)

{'promptID': 31193, 'pairID': '31193n', 'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.', 'premise_binary_parse': '( ( Conceptually ( cream skimming ) ) ( ( has ( ( ( two ( basic dimensions ) ) - ) ( ( product and ) geography ) ) ) . ) )', 'premise_parse': '(ROOT (S (NP (JJ Conceptually) (NN cream) (NN skimming)) (VP (VBZ has) (NP (NP (CD two) (JJ basic) (NNS dimensions)) (: -) (NP (NN product) (CC and) (NN geography)))) (. .)))', 'hypothesis': 'Product and geography are what make cream skimming work. ', 'hypothesis_binary_parse': '( ( ( Product and ) geography ) ( ( are ( what ( make ( cream ( skimming work ) ) ) ) ) . ) )', 'hypothesis_parse': '(ROOT (S (NP (NN Product) (CC and) (NN geography)) (VP (VBP are) (SBAR (WHNP (WP what)) (S (VP (VBP make) (NP (NP (NN cream)) (VP (VBG skimming) (NP (NN work)))))))) (. .)))', 'genre': 'government', 'label': 1}


In [6]:
class MNLIDataset(Dataset):
    """Map-style dataset that tokenizes premise/hypothesis pairs on access.

    Each item is encoded as one sentence-pair sequence that is padded or
    truncated to exactly ``max_length`` tokens.

    Args:
        data: Indexable, sized collection whose items are mappings with the
            keys ``'premise'``, ``'hypothesis'`` and ``'label'``.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, text_pair, max_length=..., padding=...,
            truncation=..., return_tensors='pt')`` and returning a mapping
            that contains ``'input_ids'`` and ``'attention_mask'`` tensors
            with a leading batch dimension of size 1.
        max_length: Fixed sequence length of every encoded pair.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples in the wrapped data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the example at ``idx``.

        Returns:
            dict with 1-D tensors ``'input_ids'`` and ``'attention_mask'``
            (batch dimension removed), ``'token_type_ids'`` when the
            tokenizer provides them, and ``'labels'`` built from the
            example's ``'label'`` value.
        """
        example = self.data[idx]
        encoded_pair = self.tokenizer(
            example['premise'],
            example['hypothesis'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields tensors with a leading batch axis of
        # size 1; drop it so a DataLoader can stack items itself.
        item = {
            'input_ids': encoded_pair['input_ids'].squeeze(0),
            'attention_mask': encoded_pair['attention_mask'].squeeze(0),
        }

        # Segment ids tell a BERT-style model which tokens belong to the
        # second sequence of the pair. Forward them whenever the tokenizer
        # produces them; tokenizers that do not emit them are left untouched.
        if 'token_type_ids' in encoded_pair:
            item['token_type_ids'] = encoded_pair['token_type_ids'].squeeze(0)

        item['labels'] = torch.tensor(example['label'])
        return item


In [7]:
# Fixed token length handed to MNLIDataset as max_length.
max_seq_length = 128

# Wrap the 'train' split so that items are tokenized when they are indexed.
train_data = mnli_dataset["train"]
train_dataset = MNLIDataset(
    data=train_data,
    tokenizer=tokenizer,
    max_length=max_seq_length,
)
print(f"Size of training dataset: {len(train_dataset)}")

Size of training dataset: 392702


In [8]:
# Sanity-check one encoded item: tensor shapes first, then the label tensor.
sample = train_dataset[0]
for tensor_key in ("input_ids", "attention_mask"):
    print(sample[tensor_key].shape)
print(sample["labels"])

torch.Size([128])
torch.Size([128])
tensor(1)
