In [1]:
from transformers import AutoTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
dberttokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
train_path = 'data/gpt2_train.txt'
val_path = 'data/gpt2_valid.txt'
test_path = 'data/gpt2_test.txt'


In [3]:
dberttokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [4]:
tokenizer.eos_token

'<|endoftext|>'

In [5]:
train_dataset_json = load_dataset('json', data_files='data/train_small_best.json')
valid_dataset_json = load_dataset('json', data_files='data/valid_small_best.json')
test_dataset_json = load_dataset('json', data_files='data/test_small_best.json')

gpt2_train_dataset_json = load_dataset('json', data_files='data/train_small_best.json')
gpt2_valid_dataset_json = load_dataset('json', data_files='data/valid_small_best.json')
gpt2_test_dataset_json = load_dataset('json', data_files='data/test_small_best.json')

Found cached dataset json (/home/jlunder/.cache/huggingface/datasets/json/default-701e01a9083cc2fd/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 387.00it/s]
Found cached dataset json (/home/jlunder/.cache/huggingface/datasets/json/default-3265e29a55220f2d/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 525.80it/s]
Found cached dataset json (/home/jlunder/.cache/huggingface/datasets/json/default-9865269cefb71641/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 531.87it/s]
Found cached dataset json (/home/jlunder/.cache/huggingface/datasets/json/default-701e01a9083cc2fd/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 482.38it/s]
Found cached dataset json (/home/jlunder/.cache/huggingface/datasets/json/default-3265e29a55220f2d/0.0.0/e347ab1c932092252e717ff

In [6]:

train_dataset = load_dataset("text", data_files=train_path)
val_dataset = load_dataset("text", data_files=val_path)
test_dataset = load_dataset("text", data_files=test_path)


Found cached dataset text (/home/jlunder/.cache/huggingface/datasets/text/default-0d2f3e943eaa858a/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)
100%|██████████| 1/1 [00:00<00:00, 473.93it/s]
Found cached dataset text (/home/jlunder/.cache/huggingface/datasets/text/default-5458949e5908317b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)
100%|██████████| 1/1 [00:00<00:00, 536.63it/s]
Found cached dataset text (/home/jlunder/.cache/huggingface/datasets/text/default-b098a2e17acbc7c0/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)
100%|██████████| 1/1 [00:00<00:00, 561.86it/s]


In [7]:

tokenizer.add_special_tokens({"pad_token":tokenizer.eos_token})
pad_token_id=tokenizer.get_vocab()[tokenizer.eos_token]
print(pad_token_id)
print(tokenizer.pad_token_id)

dberttokenizer.add_special_tokens({"pad_token":"<|endoftext|>"})
pad_token_id=dberttokenizer.get_vocab()['<|endoftext|>']
print(pad_token_id)
print(dberttokenizer.pad_token_id)



50256
50256
30522
30522


In [8]:


print(tokenizer.pad_token_id, tokenizer.pad_token)
print(dberttokenizer.pad_token_id, dberttokenizer.pad_token)
tokenizer.padding_side = "left"
tokenizer.truncation_side = 'left'
dberttokenizer.padding_side="left"
dberttokenizer.truncation_size = 'left'

50256 <|endoftext|>
30522 <|endoftext|>


In [9]:
train_dataset = train_dataset.map(lambda examples:tokenizer(examples['text']), num_proc=32)
val_dataset = val_dataset.map(lambda examples:tokenizer(examples['text']), num_proc=32)
test_dataset = test_dataset.map(lambda examples:tokenizer(examples['text']), num_proc=32)

Loading cached processed dataset at /home/jlunder/.cache/huggingface/datasets/text/default-0d2f3e943eaa858a/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-a0585a71eb005e6a_*_of_00032.arrow
Loading cached processed dataset at /home/jlunder/.cache/huggingface/datasets/text/default-5458949e5908317b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-366cce31146c048c_*_of_00032.arrow
Loading cached processed dataset at /home/jlunder/.cache/huggingface/datasets/text/default-b098a2e17acbc7c0/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-89fadf313cf2df23_*_of_00032.arrow


In [10]:
def tokenize_title_and_abstract(example):
    title_tokenized = tokenizer(example['title'], padding="max_length", truncation=True, return_tensors='pt', max_length=tokenizer.model_max_length)
    title_tokenized['titles_ids'] = title_tokenized['input_ids']
    del title_tokenized['input_ids']
    title_tokenized['titles_attention_masks'] = title_tokenized['attention_mask']
    del title_tokenized['attention_mask']
    example['titles'] = example['title']
    del example['title']
    example.update(title_tokenized)
    
    abstract_tokenized = tokenizer(example['abstract'], padding="max_length", truncation=True, return_tensors='pt', max_length=tokenizer.model_max_length)
    abstract_tokenized['abstracts_ids'] = abstract_tokenized['input_ids']
    del abstract_tokenized['input_ids']
    abstract_tokenized['abstracts_attention_masks'] = abstract_tokenized['attention_mask']
    del abstract_tokenized['attention_mask']
    example.update(abstract_tokenized)
    
    #abstract_tokenized_gpt = tokenizer(example['abstract'], padding="max_length", truncation=True, return_tensors='pt', max_length=dberttokenizer.model_max_length)
    #abstract_tokenized_gpt['abstracts_gpt_ids'] = abstract_tokenized_gpt['input_ids']
    #del abstract_tokenized_gpt['input_ids']
    #abstract_tokenized_gpt['abstracts_gpt_attention_masks'] = abstract_tokenized_gpt['attention_mask']
    #del abstract_tokenized_gpt['attention_mask']
    example['abstracts'] = example['abstract']
    del example['abstract']
    #example.update(abstract_tokenized_gpt)
    return example

def tokenize_for_gpt2_finetuning(example):
    title_tokenized = tokenizer(example['title'], padding="max_length", truncation=True, return_tensors='pt', max_length=dberttokenizer.model_max_length)
    example['titles'] = example['title']
    del example['title']
    example.update(title_tokenized)
    
    abstract_tokenized = tokenizer(example['abstract'], padding="max_length", truncation=True, return_tensors='pt', max_length=dberttokenizer.model_max_length)
    example['abstracts'] = example['abstract']
    del example['abstract']
    example['labels'] = abstract_tokenized['input_ids']
    example['labels_attention_mask'] = abstract_tokenized['attention_mask']
    
    return example



In [11]:
train_dataset_json = train_dataset_json.map(tokenize_title_and_abstract, num_proc=32)
valid_dataset_json = valid_dataset_json.map(tokenize_title_and_abstract, num_proc=32)
test_dataset_json = test_dataset_json.map(tokenize_title_and_abstract, num_proc=32)

gpt2_train_dataset_json = gpt2_train_dataset_json.map(tokenize_for_gpt2_finetuning, num_proc=32)
gpt2_valid_dataset_json = gpt2_valid_dataset_json.map(tokenize_for_gpt2_finetuning, num_proc=32)
gpt2_test_dataset_json = gpt2_test_dataset_json.map(tokenize_for_gpt2_finetuning, num_proc=32)


Loading cached processed dataset at /home/jlunder/.cache/huggingface/datasets/json/default-701e01a9083cc2fd/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-cfbb066deb8e63e4_*_of_00032.arrow
Loading cached processed dataset at /home/jlunder/.cache/huggingface/datasets/json/default-3265e29a55220f2d/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-755f2f8b3dceaf48_*_of_00032.arrow
Loading cached processed dataset at /home/jlunder/.cache/huggingface/datasets/json/default-9865269cefb71641/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-71a89babbbad3b4e_*_of_00032.arrow


In [12]:
train_dataset_json

DatasetDict({
    train: Dataset({
        features: ['titles', 'titles_ids', 'titles_attention_masks', 'abstracts_ids', 'abstracts_attention_masks', 'abstracts'],
        num_rows: 15564
    })
})

In [13]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 15564
    })
})

In [14]:
train_dataset_json.save_to_disk('data/train_tokenized_json_small_double_gpt')
valid_dataset_json.save_to_disk('data/valid_tokenized_json_small_double_gpt')
test_dataset_json.save_to_disk('data/test_tokenized_json_small_double_gpt')

gpt2_train_dataset_json.save_to_disk('data/gpt2_train_tokenized_json_small')
gpt2_valid_dataset_json.save_to_disk('data/gpt2_valid_tokenized_json_small')
gpt2_test_dataset_json.save_to_disk('data/gpt2_test_tokenized_json_small')


                                                                                                

In [15]:
train_dataset.save_to_disk('data/gpt2_train_tokenized_pad_small')
val_dataset.save_to_disk('data/gpt2_val_tokenized_pad_small')
test_dataset.save_to_disk('data/gpt2_test_tokenized_pad_small')

                                                                                                 

In [16]:
train_dataset = load_from_disk("data/train_tokenized_json_small")
val_dataset = load_from_disk("data/valid_tokenized_json_small")


In [17]:
train_dataset_json['train']['titles_ids'][0]

[[50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
 

In [18]:
tokenizer.save_pretrained('./gpt2_tokenizer/')
dberttokenizer.save_pretrained('./dberttokenizer/')

('./dberttokenizer/tokenizer_config.json',
 './dberttokenizer/special_tokens_map.json',
 './dberttokenizer/vocab.txt',
 './dberttokenizer/added_tokens.json',
 './dberttokenizer/tokenizer.json')

In [19]:
len(train_dataset_json['train']['abstracts_ids'][4][0])

512

In [20]:
from torch.utils.data import DataLoader
from dataloader import collate_fn_GPT2
dl = DataLoader(train_dataset_json['train'], collate_fn=collate_fn_GPT2, batch_size=4)

In [21]:
for i, item in enumerate(dl):
    print(i)
    print(item)

0
{'titles': ['Bulges', 'On the origin of randomness in quantum mechanics', 'Density Functional Theory for non-relativistic Fermions in the Unitarity\n  Limit', 'Quenched and Negative Hall Effect in Periodic Media: Application to\n  Antidot Superlattices'], 'abstracts': ['  We model the evolution of the galactic bulge and of the bulges of a selected\nsample of external spiral galaxies, via the multiphase multizone evolution\nmodel. We address a few questions concerning the role of the bulges within\ngalactic evolution schemes and the properties of bulge stellar populations. We\nprovide solutions to the problems of chemical abundances and spectral indices,\nthe two main observational constraints to bulge structure.\n', '  Quantum statistics originate from the physics of state preparation. It is\ntherefore wrong to think of quantum states as fundamental. In fact, quantum\nstates are merely summaries of dynamical processes that randomize the\nproperties of the system by drawing on the ine

KeyboardInterrupt: 