# How to load huggingface data?

In [1]:
import itertools
from typing import List

import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer

In [2]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint so their vocabularies match.
checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(checkpoint, return_dict=True)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Let's define a TextDataset.
* Input: list of texts + list of labels

In [22]:
# Load the training split of the IMDB dataset from the Hugging Face hub (or local cache).
train = load_dataset("imdb", split="train")

Reusing dataset imdb (/home/przemyslaw/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [4]:
# List the columns available in the loaded split.
train.column_names

['label', 'text']

In [5]:
# Peek at the first record to see the raw structure (integer label + review text).
train[0]

{'label': 1,
 'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'}

In [6]:
# Tokenize every review. NOTE: padding=True pads each example to the longest
# sequence of the batch it is tokenized with, which here ends up at 512 tokens
# per example (checked below) - far longer than most reviews need.
train = train.map(lambda batch: tokenizer(batch["text"], truncation=True, padding=True), batched=True)
# rename_column returns a new dataset; the in-place rename_column_ is deprecated
# and no longer exists in recent `datasets` releases.
train = train.rename_column("label", "labels")

Loading cached processed dataset at /home/przemyslaw/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-d0c943f97d38f28d.arrow


In [7]:
# Confirm that the tokenizer fields were added and the label column was renamed.
train[0].keys()

dict_keys(['attention_mask', 'input_ids', 'labels', 'text'])

In [8]:
# Length of the first tokenized example, padding included.
len(train[0]['input_ids'])

512

## Data Loading
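Note: the dataset-creation approach proposed by the authors of huggingface/transformers tokenizes with `padding=True` inside a batched `map`. `padding=True` pads every example to the longest sequence in the batch being tokenized, and since those batches are large and the long reviews are truncated at the model limit, in practice every example ends up padded to the maximum model length (512 for RoBERTa), as seen above. This is way too large and will cause too much memory usage. **Why?** Because every training batch then has shape (batch_size, 512), no matter how short its texts actually are.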

We need a different approach. A better way would be to set `padding='max_length', max_length=MAX_LENGTH`, where MAX_LENGTH is the maximum length after tokenization.

Even better, we would prefer to set max_length as a maximum size after tokenizing a batch. In order to achieve this, we would require DataLoader to handle it for us. To do that, we will use `collate_fn` as a DataLoader argument (https://pytorch.org/docs/stable/data.html). 

In [9]:
# Tokenize one short and one long sentence in a single call; with padding=True
# the shorter one is padded up to the length of the longer one.
sample_texts = [
    "I like you",
    "Even though I get lost sometimes, I push forward. Like Sysipuhus pushing the stone",
]
output = tokenizer(sample_texts, padding=True)
output

{'input_ids': [[0, 100, 101, 47, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 8170, 600, 38, 120, 685, 2128, 6, 38, 1920, 556, 4, 2011, 208, 2459, 1588, 2957, 687, 3784, 5, 7326, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [10]:
# Compare the token-id lengths of the two encoded sentences.
len(output["input_ids"][0]), len(output["input_ids"][1])

(22, 22)

In [11]:
# Same two sentences without padding: each sequence keeps its own length.
sample_texts = [
    "I like you",
    "Even though I get lost sometimes, I push forward. Like Sysipuhus pushing the stone",
]
output = tokenizer(sample_texts, padding=False)
short_ids, long_ids = output["input_ids"]
len(short_ids), len(long_ids)

(5, 22)

In [12]:
# Load a fresh, untokenized copy of the training split for the DataLoader experiments.
train_dataset = load_dataset("imdb", split="train")
# Number of training examples.
len(train_dataset)

Reusing dataset imdb (/home/przemyslaw/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


25000

### Plain DataLoader

In [13]:
# Default collation (no custom collate_fn): take one batch of 3 raw examples and display it.
trainloader = DataLoader(train_dataset, batch_size=3)
batch = next(iter(trainloader))
batch

{'label': tensor([1, 1, 1]),
 'text': ['Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!',
  'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once 

With a plain DataLoader we get an iterable whose batches are dicts. Each batch contains:
* a tensor of labels
* a list of texts

We want to transform the list of texts into a tensor of shape (batch_size, sequence_length).

In [14]:
# Whitespace-separated word count of the second review in the batch (a rough size estimate, not a token count).
len(batch['text'][1].split())

428

### Using collate_fn 

In [15]:
def collate_fn(batch):
    """Tokenize the texts of an already-collated batch, in place.

    ``batch`` is a dict whose 'text' entry holds the raw texts. That entry is
    replaced by the tokenizer output; no truncation or padding is requested,
    so every sequence keeps its own length. The same dict object is returned.

    Relies on the notebook-level ``tokenizer``.
    """
    raw_texts = batch['text']
    batch['text'] = tokenizer(raw_texts)
    return batch

In [16]:
# Keep the iterator in a variable so a later cell can pull the next batch from it.
trainloader_iter = iter(trainloader)
batch = next(trainloader_iter)
input_ids = collate_fn(batch)['text']['input_ids']
# Number of sequences first, then the token count of each one.
print(len(input_ids))
for n_tokens in map(len, input_ids):
    print(n_tokens)

Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors


3
180
535
191


In [17]:
def collate_fn(batch):
    """Tokenize the texts of an already-collated batch, in place.

    ``batch`` is a dict whose 'text' entry holds the raw texts. That entry is
    replaced by the tokenizer output, requested with truncation and padding
    enabled so that all sequences of the batch come back with one common
    length. The same dict object is returned.

    Relies on the notebook-level ``tokenizer``.
    """
    raw_texts = batch['text']
    batch['text'] = tokenizer(raw_texts, truncation=True, padding=True)
    return batch

In [18]:
# Pull the next batch and check that all sequences now share one length.
batch = next(trainloader_iter)
input_ids = collate_fn(batch)['text']['input_ids']
print(len(input_ids))
for n_tokens in map(len, input_ids):
    print(n_tokens)

3
247
247
247


### Using DataLoader with collate_fn 

In [19]:
# TODO: WARNING, tokenizer is a global variable here, be careful
def collate_fn(examples: List[dict]):
    """Collate raw dataset examples into one dynamically padded batch.

    Args:
        examples: Dataset rows, each a dict with a 'label' and a 'text' entry.

    Returns:
        A dict with 'labels' (long tensor, one entry per example) followed by
        the tokenizer fields, where 'input_ids' and 'attention_mask' are
        tensors padded to the longest sequence of this batch (after
        truncation).
    """
    # Labels become a tensor as well, so the whole batch can be passed to the
    # model without further conversion.
    labels = torch.tensor([example['label'] for example in examples], dtype=torch.long)
    texts = [example['text'] for example in examples]
    tokenizer_output = tokenizer(texts, truncation=True, padding=True)
    tokenizer_output['input_ids'] = torch.tensor(tokenizer_output['input_ids'])
    tokenizer_output['attention_mask'] = torch.tensor(tokenizer_output['attention_mask'])
    output_dict = dict(labels=labels, **tokenizer_output)
    return output_dict

# Shuffle with a seeded generator so the sampled batches (and the shapes
# printed from them) are the same on every Restart & Run All.
trainloader = DataLoader(
    train_dataset,
    batch_size=3,
    collate_fn=collate_fn,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)
batch = next(iter(trainloader))

In [20]:
# Check which fields the custom collate_fn put into the batch.
batch.keys()

dict_keys(['labels', 'input_ids', 'attention_mask'])

In [21]:
# Look at the first 10 batches only: the sequence dimension varies per batch.
first_batches = itertools.islice(trainloader, 10)
for batch in first_batches:
    ids_shape = batch['input_ids'].shape
    print(ids_shape)

torch.Size([3, 255])
torch.Size([3, 512])
torch.Size([3, 512])
torch.Size([3, 512])
torch.Size([3, 437])
torch.Size([3, 512])
torch.Size([3, 309])
torch.Size([3, 448])
torch.Size([3, 250])
torch.Size([3, 512])


Using this approach we will have a variable sequence_length across batches. Thus, training time will be less predictable (but hopefully not slower) and training will be more memory efficient. In particular, for datasets with short texts (e.g. fewer than 100 tokens), no sequence length will be bigger than 100.