In [1]:
from tqdm.notebook import tqdm
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import BertTokenizer, BertForMaskedLM, GPT2Tokenizer, GPT2Config, GPT2Model, GPT2LMHeadModel, GPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
import random

### Loading normal tokenized dataset from disk

In [2]:
tokenized_data_path = '../data/coca_spoken/tokens/'

In [3]:
encoded_datasets = load_from_disk(tokenized_data_path)

In [4]:
encoded_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 4802969
    })
    val: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 600371
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 600372
    })
})

In [5]:
encoded_datasets['train'][0]

{'text': 'He sees things very similar .',
 'input_ids': [1544, 7224, 1243, 845, 2092, 764],
 'attention_mask': [1, 1, 1, 1, 1, 1]}

## Reverse data collator

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [7]:
tokenized_datasets = encoded_datasets.remove_columns(['text'])

In [10]:
for i in range(5):
    print(encoded_datasets['train'][i])

{'text': 'He sees things very similar .', 'input_ids': [1544, 7224, 1243, 845, 2092, 764], 'attention_mask': [1, 1, 1, 1, 1, 1]}
{'text': 'He ran a very strong race in New Hampshire .', 'input_ids': [1544, 4966, 257, 845, 1913, 3234, 287, 968, 13910, 764], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'text': 'So what is the significance of all this Mark ?', 'input_ids': [2396, 644, 318, 262, 12085, 286, 477, 428, 2940, 5633], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'text': 'And for those who have placed their faith in Jesus Christ , and can claim God as their father , then heaven is going to be their eternal life .', 'input_ids': [1870, 329, 883, 508, 423, 4624, 511, 4562, 287, 5803, 1951, 837, 290, 460, 1624, 1793, 355, 511, 2988, 837, 788, 9538, 318, 1016, 284, 307, 511, 15851, 1204, 764], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'text': 'That is all we have time for today .', 'input_ids': [2504, 3

In [8]:
# Normal data collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [15]:
example_data_collation = data_collator([tokenized_datasets['train'][i] for i in range(3)])
for key in example_data_collation:
    print(f"{key} shape: {example_data_collation[key].shape}")
    print(example_data_collation[key])

input_ids shape: torch.Size([3, 10])
tensor([[ 1544,  7224,  1243,   845,  2092,   764, 50256, 50256, 50256, 50256],
        [ 1544,  4966,   257,   845,  1913,  3234,   287,   968, 13910,   764],
        [ 2396,   644,   318,   262, 12085,   286,   477,   428,  2940,  5633]])
attention_mask shape: torch.Size([3, 10])
tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
labels shape: torch.Size([3, 10])
tensor([[ 1544,  7224,  1243,   845,  2092,   764,  -100,  -100,  -100,  -100],
        [ 1544,  4966,   257,   845,  1913,  3234,   287,   968, 13910,   764],
        [ 2396,   644,   318,   262, 12085,   286,   477,   428,  2940,  5633]])


In [16]:
class ReverseSequenceDataCollator(DataCollatorForLanguageModeling):
    def __call__(self, features, return_tensors=None):
        for feature in features:
            feature['input_ids'] = feature['input_ids'][::-1]
        return super().__call__(features, return_tensors)

In [17]:
reverse_data_collator = ReverseSequenceDataCollator(tokenizer, mlm=False)

In [18]:
example_data_collation = reverse_data_collator([tokenized_datasets['train'][i] for i in range(3)])
for key in example_data_collation:
    print(f"{key} shape: {example_data_collation[key].shape}")
    print(example_data_collation[key])

input_ids shape: torch.Size([3, 10])
tensor([[  764,  2092,   845,  1243,  7224,  1544, 50256, 50256, 50256, 50256],
        [  764, 13910,   968,   287,  3234,  1913,   845,   257,  4966,  1544],
        [ 5633,  2940,   428,   477,   286, 12085,   262,   318,   644,  2396]])
attention_mask shape: torch.Size([3, 10])
tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
labels shape: torch.Size([3, 10])
tensor([[  764,  2092,   845,  1243,  7224,  1544,  -100,  -100,  -100,  -100],
        [  764, 13910,   968,   287,  3234,  1913,   845,   257,  4966,  1544],
        [ 5633,  2940,   428,   477,   286, 12085,   262,   318,   644,  2396]])


The tokens are correctly reversed, but padding is put at the end of the input. This should be still fine.

### Deprecated method below

Problem with reversing the tokens of the dataset (as below) is that the data is by default not loaded into memory.

I also would rather not make a clone of the dataset in reverse, since it would double the size.

In [6]:
def reverse_tokens(tokenized_dataset):
    """Reverses tokens INPLACE"""
    for split in tokenized_dataset:
        for i in tqdm(range(len(tokenized_dataset[split]))):
            tokenized_dataset[split][i]['input_ids'].reverse()


In [7]:
reversed_encoded_datasets = load_from_disk(tokenized_data_path)
reverse_tokens(reversed_encoded_datasets)

  0%|          | 0/4802969 [00:00<?, ?it/s]

  0%|          | 0/600371 [00:00<?, ?it/s]

  0%|          | 0/600372 [00:00<?, ?it/s]

In [8]:
reversed_encoded_datasets['train'][0]

{'text': 'He sees things very similar .',
 'input_ids': [1544, 7224, 1243, 845, 2092, 764],
 'attention_mask': [1, 1, 1, 1, 1, 1]}