In [1]:
import torch
from transformers import BertTokenizer, BertModel
# Tokenizer and encoder are loaded from one checkpoint name so they cannot drift apart.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

In [2]:
from datasets import load_dataset

# The dataset is built by a local loading script; the path is relative, so it
# depends on the directory the notebook is run from.
loader_script = '../dataset_hf/dataset_hateful_memes.py'
dataset = load_dataset(loader_script)

Repo card metadata block was not found. Setting CardData to empty.


In [3]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'caption'],
        num_rows: 8439
    })
    validation: Dataset({
        features: ['image', 'caption'],
        num_rows: 500
    })
    test: Dataset({
        features: ['image', 'caption'],
        num_rows: 971
    })
})

In [4]:
def tokenization(example):
    """Run the notebook-level BERT tokenizer over the "caption" field.

    Over-long captions are truncated (``truncation=True``); no padding
    argument is passed here, so padding is left to a later step.

    Returns whatever the tokenizer produces for the given captions.
    """
    captions = example["caption"]
    encoded = tokenizer(captions, truncation=True)
    return encoded


# batched=True hands the function a batch of rows per call instead of a single row.
dataset = dataset.map(tokenization, batched=True)

Map:   0%|          | 0/8439 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/971 [00:00<?, ? examples/s]

In [5]:
# Re-inspect the splits: the tokenizer outputs now appear as additional columns.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'caption', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8439
    })
    validation: Dataset({
        features: ['image', 'caption', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 500
    })
    test: Dataset({
        features: ['image', 'caption', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 971
    })
})

In [6]:
# Return only the tokenizer outputs, as torch tensors, when rows are indexed.
tensor_columns = ["input_ids", "token_type_ids", "attention_mask"]
dataset.set_format(type="torch", columns=tensor_columns)

In [7]:
# Peek at the first training row; only the formatted tensor columns come back.
dataset['train'][0]

{'input_ids': tensor([ 101, 2049, 2037, 2839, 2025, 2037, 3609, 2008, 5609,  102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

In [8]:
# Length of the DatasetDict itself, i.e. the number of splits (not rows).
len(dataset)

3

In [9]:
# Row count of the training split.
len(dataset['train'])

8439

In [10]:
# Row count of the test split.
len(dataset['test'])

971

In [11]:
# Row count of the validation split.
len(dataset['validation'])

500

In [12]:
from transformers import DataCollatorWithPadding

# NOTE(review): padding='max_length' pads every batch to a fixed length (512 in
# the shapes recorded below) regardless of the longest caption in the batch.
# If fixed-size inputs are not required, dynamic padding would be much cheaper.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding='max_length',
)

In [13]:
# Collate the first five training rows into one padded batch to sanity-check the collator.
data_collator(dataset['train'][:5])

{'input_ids': tensor([[  101,  2049,  2037,  ...,     0,     0,     0],
        [  101,  2123,  1005,  ...,     0,     0,     0],
        [  101,  5128, 21207,  ...,     0,     0,     0],
        [  101,  1045,  2293,  ...,     0,     0,     0],
        [  101,  7955,  7459,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [14]:
# Encode a single caption and check the shape of the encoder output.
inputs = data_collator(dataset['train'][:1])
# Inference only: disable autograd so no graph or intermediate activations are
# retained for the padded sequence.
with torch.no_grad():
    output = model(**inputs)
output.last_hidden_state.shape

torch.Size([1, 512, 768])

In [15]:
# Encode a batch of ten captions and check the shape of the encoder output.
inputs = data_collator(dataset['train'][:10])
# Inference only: disable autograd so no graph or intermediate activations are
# retained for the whole padded batch.
with torch.no_grad():
    output = model(**inputs)
output.last_hidden_state.shape

torch.Size([10, 512, 768])