In [31]:
from datasets import load_from_disk
from transformers import GPT2LMHeadModel, GPT2Config, GPT2TokenizerFast, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

In [2]:
from datasets import disable_caching
# Switch off the `datasets` library's caching for the rest of this kernel session.
disable_caching()

In [None]:
# model_dir = "models/script1/left_sentence/checkpoint-75047"
# # model_dir = "gpt2"
# tokenizer_name = "gpt2"

# model = load_pretrained_model(model_dir)
# tokenizer = load_pretrained_tokenizer(tokenizer_name)
# data_collator = init_data_collator(tokenizer, 'left')

In [3]:
# Load the already-tokenized test split from disk (path is relative to the
# notebook's working directory).
tokenized_testset_dir = "../data/coca_spoken/tokens_sentence/test"
test_set = load_from_disk(tokenized_testset_dir)
# The raw 'text' column is currently kept; the line below would drop it.
# test_set = test_set.remove_columns('text')

In [4]:
test_set

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 600372
})

In [5]:
def load_pretrained_tokenizer(pretrained_model_name_or_path, context=None, add_prefix_space=False):
    """Load a GPT-2 fast tokenizer, set its pad token, and print a summary.

    Args:
        pretrained_model_name_or_path: Passed unchanged to
            ``GPT2TokenizerFast.from_pretrained``.
        context: When equal to ``'bigram'``, the BOS/EOS tokens are replaced
            with ``'<s>'`` / ``'</s>'``; any other value leaves them as loaded.
        add_prefix_space: Passed through to ``from_pretrained``.

    Returns:
        The loaded tokenizer, with ``pad_token`` set to its EOS token.
    """
    print(f'Loading pretrained tokenizer from {pretrained_model_name_or_path}...')
    tok = GPT2TokenizerFast.from_pretrained(
        pretrained_model_name_or_path,
        add_prefix_space=add_prefix_space,
    )

    wants_sentence_markers = context == 'bigram'
    if wants_sentence_markers:
        tok.bos_token, tok.eos_token = '<s>', '</s>'

    # Padding reuses whichever EOS token is active after the optional override.
    tok.pad_token = tok.eos_token

    print("Vocabulary size:", tok.vocab_size)
    print("Max Model Input Sizes:", tok.model_max_length)

    # One summary line per special token: the token string followed by its id.
    special_token_rows = (
        ("BOS token:", "bos_token"),
        ("EOS token:", "eos_token"),
        ("PAD token:", "pad_token"),
        ("SEP token:", "sep_token"),
        ("UNK token:", "unk_token"),
    )
    for label, attr in special_token_rows:
        print(label, getattr(tok, attr), getattr(tok, attr + "_id"))

    print("Special tokens:", tok.all_special_tokens)
    print('...done')
    return tok

In [12]:
# Marker tokens that get added to the GPT-2 vocabulary and are used by
# expand_inputs to build the fill-in-the-blank sequences.
BLANK = '[BLANK]'  # stands in for the held-out token
FILLER = '[FILLER]'  # final input position, placed after SEP
SEP = '[SEP]'  # placed after EOS, before FILLER
BOS = '<s>'  # prepended to the sentence tokens
EOS = '</s>'  # appended after the sentence tokens

In [16]:
# Base GPT-2 tokenizer, extended with the five marker tokens defined above.
tokenizer = load_pretrained_tokenizer('gpt2')
num_added_tokens = tokenizer.add_tokens([BLANK, FILLER, SEP, BOS, EOS])

# Resolve all marker ids in a single batch lookup (same order as they were added).
BLANK_id, FILLER_id, SEP_id, BOS_id, EOS_id = tokenizer.convert_tokens_to_ids(
    [BLANK, FILLER, SEP, BOS, EOS]
)
print(BLANK_id, FILLER_id, SEP_id, BOS_id, EOS_id)

Loading pretrained tokenizer from gpt2...
Vocabulary size: 50257
Max Model Input Sizes: 1024
BOS token: <|endoftext|> 50256
EOS token: <|endoftext|> 50256
PAD token: <|endoftext|> 50256
SEP token: None None
UNK token: <|endoftext|> 50256
Special tokens: ['<|endoftext|>']
...done
50257 50258 50259 50260 50261


## Making an IterableDataset

In [15]:
from datasets import IterableDataset

In [25]:
def expand_inputs(example):
    """Yield one fill-in-the-blank training item per token of ``example``.

    For each position, the token at that position is swapped for the BLANK
    marker and the sequence is laid out as
    ``BOS, <tokens with one BLANK>, EOS, SEP, FILLER``. Only the final
    position carries a label (the held-out token id); every other label is
    -100. The example's own ``attention_mask`` is not read: each yielded mask
    is all ones.

    Args:
        example: Mapping with an ``'input_ids'`` list of token ids.

    Yields:
        Dicts with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
        lists, each of length ``len(input_ids) + 4``.
    """
    token_ids = example['input_ids']

    for position, held_out in enumerate(token_ids):
        left_context = token_ids[:position]
        right_context = token_ids[position + 1:]
        masked_sequence = (
            [BOS_id] + left_context + [BLANK_id] + right_context + [EOS_id, SEP_id, FILLER_id]
        )
        width = len(masked_sequence)

        yield {
            'input_ids': masked_sequence,
            'attention_mask': [1] * width,
            # Ignore every position except the last, which holds the target.
            'labels': [-100] * (width - 1) + [held_out],
        }

In [23]:
def gen_bidi_inputs(dataset):
    """Stream every item produced by ``expand_inputs`` for each example in ``dataset``.

    Items come out in dataset order, and within one example in the order
    ``expand_inputs`` yields them.
    """
    for record in dataset:
        for variant in expand_inputs(record):
            yield variant

In [30]:
# Wrap the generator so the expanded examples are produced lazily from test_set.
my_iterable_dataset = IterableDataset.from_generator(
    gen_bidi_inputs,
    gen_kwargs={"dataset": test_set},
)
# my_iterable_dataset = my_iterable_dataset.shuffle(seed=42, buffer_size=100)

# Sanity check: print the first 20 streamed examples, then stop.
i = 0
for i, example in enumerate(my_iterable_dataset, start=1):
    print(example)
    if i == 20:
        break

{'input_ids': [50260, 50257, 284, 477, 281, 3053, 764, 50261, 50259, 50258], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, 9690]}
{'input_ids': [50260, 9690, 50257, 477, 281, 3053, 764, 50261, 50259, 50258], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, 284]}
{'input_ids': [50260, 9690, 284, 50257, 281, 3053, 764, 50261, 50259, 50258], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, 477]}
{'input_ids': [50260, 9690, 284, 477, 50257, 3053, 764, 50261, 50259, 50258], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, 281]}
{'input_ids': [50260, 9690, 284, 477, 281, 50257, 764, 50261, 50259, 50258], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, 3053]}
{'inp

### Testing

In [None]:
# Randomly initialised GPT-2. GPT2LMHeadModel needs a config object; the
# vocabulary is sized to the notebook's tokenizer so the ids of the added
# marker tokens (BLANK/FILLER/SEP/BOS/EOS) fall inside the embedding table.
model = GPT2LMHeadModel(GPT2Config(vocab_size=len(tokenizer)))
device = "cuda" if torch.cuda.is_available() else "cpu"
# Only ask for the GPU name when a GPU is present; get_device_name raises otherwise.
print(torch.cuda.get_device_name(0) if device == "cuda" else device)
model.to(device)
# Reuse the tokenizer built earlier (pad token set, marker tokens added) instead
# of reloading one: 'gpt' is not a GPT-2 checkpoint id, and a freshly loaded
# 'gpt2' tokenizer would have neither a pad token nor the marker tokens.
# NOTE(review): with mlm=False this collator rebuilds `labels` from `input_ids`,
# so the single-target labels produced by expand_inputs would not be used as-is.
# Confirm this is the intended collator for that dataset.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Trainer configuration: a single epoch, with evaluation and checkpointing at
# epoch boundaries.
# NOTE(review): per the TrainingArguments docs, `eval_steps` and `save_steps`
# apply to the 'steps' strategies — confirm whether they have any effect here.
train_args = TrainingArguments(
    output_dir='../models/arisetnrst',
    num_train_epochs=1,
    # Batching (tune batch sizes to the available GPU memory).
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    group_by_length=True,  # bucket sequences of similar length together
    # Evaluation and logging cadence.
    evaluation_strategy='epoch',
    eval_steps=1,
    logging_steps=0.01,
    # Checkpointing.
    save_strategy='epoch',
    save_steps=0.25,
    save_total_limit=5,
    # Currently disabled options:
    # auto_find_batch_size=True,
    # load_best_model_at_end=True,
    # metric_for_best_model='loss',
    # greater_is_better=False,
)