In [2]:
import argparse
import itertools
from datetime import datetime

import numpy as np
from datasets import load_from_disk
from transformers import (AutoTokenizer, AlbertForPreTraining, DefaultDataCollator, AutoConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling,
AlbertForMaskedLM, DataCollatorWithPadding)

In [3]:
def tokenize_function(example):
    '''
    Function to be used during the tokenization of the dataset.
    -------------------------------------------------------------
    Args:
        example: dictionary-like object with a "text" key. The notebook applies this
        function through dataset.map(..., batched=True), so "text" holds a list of
        strings (one batch of examples) rather than a single string.
    Returns:
        A dictionary with a single key "input_ids", whose value is a new list holding
        the "input_ids" entries produced by the tokenizer for example["text"].
    Notes:
        Reads the module-level names tokenizer and max_len at call time, so both
        must be defined before the function is called.
    '''

    #Only "input_ids" is kept from the tokenizer output; every other field it
    #returns is dropped.
    output = tokenizer(example["text"], truncation=True, max_length=max_len)
    return {"input_ids": list(output["input_ids"])}

#Load the pre-built cuneiform dataset from disk. The cells below index it with the
#"train" split, and the cached-file log lines also mention a "test" split.
dataset = load_from_disk('../data/datasets/cuneiform/')

#Load the tokenizer from a local directory and record its vocabulary size.
tokenizer = AutoTokenizer.from_pretrained('../tokenizers/tokenizer/')
vocab_size = tokenizer.vocab_size

#According to the original author's note, only 0.01% of the texts have more than 64
#cuneiform characters (2*64 - 1 symbols if the white spaces between them are counted),
#so max_len is set to 2*64.
#NOTE(review): the sample encodings displayed further down appear to contain one
#separator id before every character plus the [CLS]/[SEP] ids, i.e. 2*n + 2 ids for
#n characters. If so, texts with 64 characters would be truncated - confirm.
max_len = 2*64

#Tokenize the dataset in batches (batched=True) and drop the columns listed for the
#"train" split, leaving only the "input_ids" returned by tokenize_function.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

Loading cached processed dataset at ../data/datasets/cuneiform/train\cache-2630f9322d2eae4d.arrow


  0%|          | 0/10 [00:00<?, ?ba/s]

Loading cached processed dataset at ../data/datasets/cuneiform/test\cache-ee5c050a6ad78515.arrow


In [4]:
#Encode the first two training texts as a sentence pair to inspect how the tokenizer
#joins them. Only the first two rows are read instead of loading the whole "text"
#column twice.
first_text, second_text = dataset['train'][:2]['text']
x = tokenizer(first_text, second_text)

In [5]:
#Decode the pair encoding back to text to check where the tokenizer placed [CLS] and [SEP].
tokenizer.decode(x['input_ids'])

'[CLS] 𒂔 𒈾 𒆠[SEP] 𒊭 𒈗 𒁁 𒉌 𒋫 𒇽 𒄷 𒌒 𒋾[SEP]'

In [6]:
#Show the first four tokenized training rows. Slicing the rows before selecting the
#column avoids loading the whole "input_ids" column just to display four entries.
tokenized_dataset['train'][:4]['input_ids']

[[2, 5, 220, 5, 9, 5, 20, 3],
 [2, 5, 40, 5, 46, 5, 57, 5, 15, 5, 26, 5, 27, 5, 70, 5, 111, 5, 30, 3],
 [2, 5, 156, 5, 24, 5, 12, 5, 62, 5, 44, 5, 143, 5, 98, 5, 12, 3],
 [2, 5, 31, 5, 319, 5, 85, 5, 142, 3]]

In [7]:
#Collator for masked language modelling with a masking probability of 0.15. It gets
#its own name so that it is no longer discarded by the assignment below.
mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
#Padding-only collator; this is the object bound to data_collator, exactly as before.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
#Small batch with the first four tokenized training rows. The rows are sliced before
#the column is selected so the whole "input_ids" column is not loaded.
samples = {'input_ids': tokenized_dataset['train'][:4]['input_ids']}

In [9]:
#Display the sample batch that is passed to the collator in the next cell.
samples

{'input_ids': [[2, 5, 220, 5, 9, 5, 20, 3],
  [2, 5, 40, 5, 46, 5, 57, 5, 15, 5, 26, 5, 27, 5, 70, 5, 111, 5, 30, 3],
  [2, 5, 156, 5, 24, 5, 12, 5, 62, 5, 44, 5, 143, 5, 98, 5, 12, 3],
  [2, 5, 31, 5, 319, 5, 85, 5, 142, 3]]}

In [10]:
#Collate the four samples into a single batch.
y = data_collator(samples)
#Map every field of the collated batch to its shape.
{field: np.shape(values) for field, values in y.items()}

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([4, 20]), 'attention_mask': torch.Size([4, 20])}

In [11]:
#Take the first three tokenized training rows as a small example batch; slicing the
#split returns a dictionary of columns (see the "input_ids" lists displayed below).
exemplo = tokenized_dataset['train'][:3]

In [12]:
#Flatten every column of the example batch into one long list.
#chain.from_iterable runs in linear time, whereas sum(list_of_lists, []) copies the
#accumulated list again on every addition.
concatenated_examples = {k: list(itertools.chain.from_iterable(exemplo[k])) for k in exemplo.keys()}

In [13]:
#Display the un-flattened "input_ids" of the example batch for comparison.
exemplo['input_ids']

[[2, 5, 220, 5, 9, 5, 20, 3],
 [2, 5, 40, 5, 46, 5, 57, 5, 15, 5, 26, 5, 27, 5, 70, 5, 111, 5, 30, 3],
 [2, 5, 156, 5, 24, 5, 12, 5, 62, 5, 44, 5, 143, 5, 98, 5, 12, 3]]

In [15]:
#Flatten all tokenized training sequences into one flat list of ids.
#chain.from_iterable iterates over the column directly instead of unpacking every
#row as a separate call argument.
#NOTE(review): x was bound to the pair encoding in an earlier cell and is rebound here.
x = list(itertools.chain.from_iterable(tokenized_dataset['train']['input_ids']))

In [54]:
def createChunks(examples, chunk_size=None):
    '''
    Creates chunks of data.
    -------
    Args:
        examples (dict): Dictionary-like object, whose values are lists of lists.
        It must contain the key "input_ids".
        chunk_size (int, optional): Number of elements in each chunk. When omitted,
        the module-level max_len is used, which is the original behaviour.
    Returns:
        dict_chunks (dict): A dictionary with the same keys plus the key "labels",
        whose values are lists of chunks (lists) of size chunk_size. "labels" holds
        independent copies of the "input_ids" chunks.
    Raises:
        ValueError: If chunk_size is not a positive number.
    '''
    #Fall back to the notebook-wide max_len so that existing one-argument calls
    #(for instance through dataset.map) behave as before.
    if chunk_size is None:
        chunk_size = max_len
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    #For each key (inputs_ids, attention_mask, ...), we concatenate all its lists.
    concatenated_examples = {k: list(itertools.chain.from_iterable(examples[k])) for k in examples.keys()}

    #We calculate the maximum number of chunks that can be formed from concatenated_examples.
    #The length of the first key is used for every key, so all keys are expected to
    #have the same total length.
    first_key = list(examples.keys())[0]
    total_length = len(concatenated_examples[first_key])
    max_num_chunks = total_length // chunk_size

    #We create chunks from concatenated_examples.
    #If total_length is not a multiple of chunk_size, then the remainder will be discarded.
    dict_chunks = {
        k: [values[start:start + chunk_size] for start in range(0, max_num_chunks * chunk_size, chunk_size)]
        for k, values in concatenated_examples.items()
    }

    #We create a copy of input_ids that will be used as a reference during the training.
    #Every chunk is copied individually: a shallow copy of the outer list would share
    #the inner lists, so changing an input chunk in place would also change its label.
    dict_chunks["labels"] = [list(chunk) for chunk in dict_chunks["input_ids"]]
    return dict_chunks