In [80]:
from tqdm.notebook import tqdm
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import BertTokenizer, BertForMaskedLM, GPT2Tokenizer, GPT2Config, GPT2Model, GPT2LMHeadModel, GPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
import random
import torch

### Loading normal tokenized dataset from disk

In [22]:
tokenized_data_path = '../data/coca_spoken/tokens_sentence/'

In [23]:
encoded_datasets = load_from_disk(tokenized_data_path)

In [None]:
encoded_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 4802969
    })
    val: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 600371
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 600372
    })
})

In [24]:
encoded_datasets['train'][0]

{'text': 'He sees things very similar .',
 'input_ids': [1544, 7224, 1243, 845, 2092, 764],
 'attention_mask': [1, 1, 1, 1, 1, 1]}

## Reverse data collator

In [25]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [26]:
tokenized_datasets = encoded_datasets.remove_columns(['text'])

In [27]:
for i in range(5):
    print(encoded_datasets['train'][i])

{'text': 'He sees things very similar .', 'input_ids': [1544, 7224, 1243, 845, 2092, 764], 'attention_mask': [1, 1, 1, 1, 1, 1]}
{'text': 'He ran a very strong race in New Hampshire .', 'input_ids': [1544, 4966, 257, 845, 1913, 3234, 287, 968, 13910, 764], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'text': 'So what is the significance of all this Mark ?', 'input_ids': [2396, 644, 318, 262, 12085, 286, 477, 428, 2940, 5633], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'text': 'And for those who have placed their faith in Jesus Christ , and can claim God as their father , then heaven is going to be their eternal life .', 'input_ids': [1870, 329, 883, 508, 423, 4624, 511, 4562, 287, 5803, 1951, 837, 290, 460, 1624, 1793, 355, 511, 2988, 837, 788, 9538, 318, 1016, 284, 307, 511, 15851, 1204, 764], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'text': 'That is all we have time for today .', 'input_ids': [2504, 3

In [28]:
# Normal data collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [29]:
example_data_collation = data_collator([tokenized_datasets['train'][i] for i in range(3)])
for key in example_data_collation:
    print(f"{key} shape: {example_data_collation[key].shape}")
    print(example_data_collation[key])

input_ids shape: torch.Size([3, 10])
tensor([[ 1544,  7224,  1243,   845,  2092,   764, 50256, 50256, 50256, 50256],
        [ 1544,  4966,   257,   845,  1913,  3234,   287,   968, 13910,   764],
        [ 2396,   644,   318,   262, 12085,   286,   477,   428,  2940,  5633]])
attention_mask shape: torch.Size([3, 10])
tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
labels shape: torch.Size([3, 10])
tensor([[ 1544,  7224,  1243,   845,  2092,   764,  -100,  -100,  -100,  -100],
        [ 1544,  4966,   257,   845,  1913,  3234,   287,   968, 13910,   764],
        [ 2396,   644,   318,   262, 12085,   286,   477,   428,  2940,  5633]])


In [30]:
class ReverseSequenceDataCollator(DataCollatorForLanguageModeling):
    def __call__(self, features, return_tensors=None):
        for feature in features:
            feature['input_ids'] = feature['input_ids'][::-1]
        return super().__call__(features, return_tensors)

In [31]:
reverse_data_collator = ReverseSequenceDataCollator(tokenizer, mlm=False)

In [32]:
example_data_collation = reverse_data_collator([tokenized_datasets['train'][i] for i in range(3)])
for key in example_data_collation:
    print(f"{key} shape: {example_data_collation[key].shape}")
    print(example_data_collation[key])

input_ids shape: torch.Size([3, 10])
tensor([[  764,  2092,   845,  1243,  7224,  1544, 50256, 50256, 50256, 50256],
        [  764, 13910,   968,   287,  3234,  1913,   845,   257,  4966,  1544],
        [ 5633,  2940,   428,   477,   286, 12085,   262,   318,   644,  2396]])
attention_mask shape: torch.Size([3, 10])
tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
labels shape: torch.Size([3, 10])
tensor([[  764,  2092,   845,  1243,  7224,  1544,  -100,  -100,  -100,  -100],
        [  764, 13910,   968,   287,  3234,  1913,   845,   257,  4966,  1544],
        [ 5633,  2940,   428,   477,   286, 12085,   262,   318,   644,  2396]])


The tokens are correctly reversed, but padding is put at the end of the input. This should be still fine.

## Bidirectional Data Collator(?)
Can I simply modify the data collator like above to create the necessary input_ids, labels, and attention_mask for the Bidirectional model without having to modify anything internal to the GPT2LMHeadModel instance or Trainer training loop? let's see!

In [90]:
class BidirectionalInfillingDataCollator(DataCollatorForLanguageModeling):
    """
    Modifies the DataCollatorForLanguageModeling to return
    input_ids, labels, and attention_ids
    as per pqian11/fragment-completion code (Qian and Levy, 2022)
    suitable for bidirectional infilling task

    From a single input (token) sentence, I should be able to create a whole batch
    of bidirectional task inputs where each successive token is masked.
    """
    def __call__(self, features, return_tensors='pt', 
                 BLANK_id=-2000, SEP_id=-1000, FILLER_id=-3000):
        """
        Given:
            features := Dict{
                'input_ids': List, 
                'attention_mask': List
            }
        Returns:
            batch := transformers.tokenization_utils_base.BatchEncoding{
                'input_ids': Tensor,
                'attention_mask': Tensor,
                'labels': Tensor
            }

        Example:
            Given:
                input_ids = [1544, 7224, 1243, 845, 2092, 764]
                attention_mask = [1, 1, 1, 1, 1, 1]

            Return:
                bidi_input_ids = [1544, BLANK, 1243, 845, 2092, 764, SEP, FILL]
                bidi_attention_mask = [1, 1, 1, 1, 1, 1, 1, 1]
                bidi_labels = [BLANK, BLANK, BLANK, BLANK, BLANK, BLANK, BLANK, 7224]

            * Ensure automatic shifting of labels doesn't happen in the model

            (Note: Padding not necessary if we make the batch from the single sentence
            Otherwise, we need to additionally pad all inputs with zeros at the end)

        """
        # print(features)
        # features := {'input_ids': List, 'attention_mask': List}
        # for feature in features:
        #     for key, val in feature.items():
        #         # print(f"{key} shape: {len(val)}")
        #         print(key, val)

        # print(features)
        
        
        # batch = {}
        # for feature in features:
        # feature = features[0]
        assert isinstance(features, dict), f"bidirectional data collator input features should be a dict, not {type(features)}"
        assert return_tensors == 'pt', f"only supports return pytorch tensors"

        feature = features
        input_ids = feature['input_ids']
        # attention_mask = features['attention_mask']

        n_tokens = len(input_ids)

        bidi_input_ids = [input_ids[:i] + [BLANK_id] + input_ids[i+1:] + [SEP_id, FILLER_id] 
                        for i in range(n_tokens)]

        bidi_attention_mask = [[1 for _ in range(n_tokens + 2)] for _ in range(n_tokens)]

        bidi_labels = [[-100 for _ in range(n_tokens + 1)] + [answer_token] 
                    for answer_token in input_ids]

        batch = {
            'input_ids': torch.tensor(bidi_input_ids),
            'attention_mask': torch.tensor(bidi_attention_mask),
            'labels': torch.tensor(bidi_labels)
        }
        # print(batch)
        return batch
        # return super().__call__(features, return_tensors)



In [91]:
bidirectional_data_collator = BidirectionalInfillingDataCollator(tokenizer, mlm=False)

In [92]:
example_bidi_data_collation = bidirectional_data_collator(tokenized_datasets['train'][0])

In [93]:
example_bidi_data_collation

{'input_ids': tensor([[-2000,  7224,  1243,   845,  2092,   764, -1000, -3000],
         [ 1544, -2000,  1243,   845,  2092,   764, -1000, -3000],
         [ 1544,  7224, -2000,   845,  2092,   764, -1000, -3000],
         [ 1544,  7224,  1243, -2000,  2092,   764, -1000, -3000],
         [ 1544,  7224,  1243,   845, -2000,   764, -1000, -3000],
         [ 1544,  7224,  1243,   845,  2092, -2000, -1000, -3000]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1]]),
 'labels': tensor([[-100, -100, -100, -100, -100, -100, -100, 1544],
         [-100, -100, -100, -100, -100, -100, -100, 7224],
         [-100, -100, -100, -100, -100, -100, -100, 1243],
         [-100, -100, -100, -100, -100, -100, -100,  845],
         [-100, -100, -100, -100, -100, -100, -100, 2092],
         [-100, -100, -100, -100, -100, -100, -1

In [89]:
type(example_bidi_data_collation)

dict

In [None]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

In [None]:
import torch
torch.cuda.is_available()
torch.cuda.get_device_name(0)

In [None]:
training_output_dir = '../models/bidi/test1/'
args = TrainingArguments(
    training_output_dir,
    per_device_train_batch_size=128, # change to fit GPU specs
    per_device_eval_batch_size=128,
    group_by_length=True, # bucketing
)

In [None]:
args.device

In [None]:
configuration = GPT2Config()
model = GPT2LMHeadModel(configuration)

In [None]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
)

In [None]:
trainer.train(
    # resume_from_checkpoint=True
)

### Deprecated method below

Problem with reversing the tokens of the dataset (as below) is that the data is by default not loaded into memory.

I also would rather not make a clone of the dataset in reverse, since it would double the size.

In [6]:
def reverse_tokens(tokenized_dataset):
    """Reverses tokens INPLACE"""
    for split in tokenized_dataset:
        for i in tqdm(range(len(tokenized_dataset[split]))):
            tokenized_dataset[split][i]['input_ids'].reverse()


In [7]:
reversed_encoded_datasets = load_from_disk(tokenized_data_path)
reverse_tokens(reversed_encoded_datasets)

  0%|          | 0/4802969 [00:00<?, ?it/s]

  0%|          | 0/600371 [00:00<?, ?it/s]

  0%|          | 0/600372 [00:00<?, ?it/s]

In [8]:
reversed_encoded_datasets['train'][0]

{'text': 'He sees things very similar .',
 'input_ids': [1544, 7224, 1243, 845, 2092, 764],
 'attention_mask': [1, 1, 1, 1, 1, 1]}