In [1]:
import argparse
import itertools
from datetime import datetime

from evaluate import load
from datasets import load_from_disk
from transformers import (AutoTokenizer, AlbertForPreTraining,
                          DefaultDataCollator, AutoConfig,
                          TrainingArguments, Trainer,
                          DataCollatorForLanguageModeling,
                          AlbertForMaskedLM)


def tokenize_function(example):
    '''
    This function should be given to map for dataset tokenization.
    ------------------------------------------------------------------
    Arg:
        example: Dictionary-like object containing examples of the 
        dataset (Ex: {'text':..., 'label':...}). If the option batch 
        in map is set to False, each key of the dictionary is attached
        to one single example. Otherwise, each keys is attached to
        list of examples.
    Returns:
        output: A dictionary with keys input_ids and attention_mask.
        Its values may be a single list (for map's batch option set to
        False) or a list of lists (batch set to True).
    '''
    output = tokenizer(example["text"], truncation=True, max_length=max_len)
    del output['token_type_ids']
    return output


def create_chunks(examples):
    '''
    Creates a dataset of chunks of data.
    ------------------------------------------------------------------
    Args:
        examples (dict): Dictionary-like object, whose values are 
        lists of lists.
    Returns:
        dict_chunks (dict): A dictionary with the same keys (plus a 
        copy of the key "input_ids"), but whose values are lists of 
        chunks (lists) of same size.
    '''
    chunk_size = max_len

    #For each key (inputs_ids, attention_mask, ...), we concatenate
    #all its lists
    concatenated_examples = {k: list(itertools.chain(*examples[k])) 
                            for k in examples.keys()}
    
    #We calculate the maximum number of chunks that can be formed from
    #concatenated_examples.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    max_num_chunks = total_length // chunk_size

    #We create chunks from concatenated_examples. If total_length is
    #not a multiple of chunk_size, then the remainder will be
    #discarded.
    dict_chunks = {k:[concatenated_examples[k][chunk_size*i:(i+1)*chunk_size]
                   for i in range(max_num_chunks)]
                   for k in concatenated_examples.keys()}

    #We create a copy of input_ids that will be used as a reference
    #for masked language modeling during training.
    dict_chunks["labels"] = dict_chunks["input_ids"].copy()
    return dict_chunks


def compute_perplexity(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return perplexity.compute(predictions=predictions, references=labels)

#Loading dataset
dataset = load_from_disk('../data/datasets/cuneiform/')

#Loading tokenizer
tokenizer = AutoTokenizer.from_pretrained('../tokenizers/tokenizer/')
vocab_size = tokenizer.vocab_size

#Since only 0,01% of all data has more than 64 cuneiform characters
#(or 2*64 -1, if we count the white spaces), we set max_len to this
#value
max_len = 2*64

#Tokenized dataset that has a "input_ids" key (containg a list of
#tokens ids) and "attention_mask" key (indicating which characters 
#should be attended).
tokenized_dataset = dataset.map(tokenize_function, batched=True,
                                remove_columns=dataset["train"].column_names)

#Dataset with examples organized in chunks of size max_len
chuked_dataset = tokenized_dataset.map(create_chunks, batched=True)

#Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True,
                                                mlm_probability=0.15)

perplexity = load("perplexity", module_type="metric")

config = AutoConfig.from_pretrained('albert-xlarge-v2')
model = AlbertForMaskedLM(config)

dia = datetime.today().strftime("%Y-%M-%d")
hora = datetime.now().strftime("%H-%M")


Loading cached processed dataset at ../data/datasets/cuneiform/train\cache-a615edd2ff1aaf10.arrow
Loading cached processed dataset at ../data/datasets/cuneiform/val\cache-177b1de2c1bef4c2.arrow
Loading cached processed dataset at ../data/datasets/cuneiform/test\cache-44006bcc82ef534d.arrow
Loading cached processed dataset at ../data/datasets/cuneiform/train\cache-d80c26604b24b48d.arrow
Loading cached processed dataset at ../data/datasets/cuneiform/val\cache-7309d4f149fb1b31.arrow
Loading cached processed dataset at ../data/datasets/cuneiform/test\cache-247eff21687e41df.arrow


In [2]:
training_args = TrainingArguments(
    output_dir=f'../checkpoints/pretraining/meu_output_dia_{dia}_hora_{hora}',
    overwrite_output_dir=True,
    evaluation_strategy='steps',
    max_steps=100,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=1e-5,
    #warmup_steps=100,
    #logging_dir='log_tf',
    logging_strategy='steps',
    logging_steps=1,
    eval_steps=3,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    compute_metrics=compute_perplexity
)

trainer.train()

max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 82017
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 54821040
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33migorruys[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/100 [00:00<?, ?it/s]

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 10.2812, 'learning_rate': 9.9e-06, 'epoch': 0.0}
{'loss': 10.1616, 'learning_rate': 9.800000000000001e-06, 'epoch': 0.0}


***** Running Evaluation *****
  Num examples = 9113
  Batch size = 1


{'loss': 0.0, 'learning_rate': 9.7e-06, 'epoch': 0.0}


  0%|          | 0/9113 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 1.32 GiB (GPU 0; 4.00 GiB total capacity; 2.13 GiB already allocated; 442.35 MiB free; 2.17 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [95]:
inputs = tokenizer(dataset['train']['text'][0], return_tensors='pt')

In [96]:
inputs

{'input_ids': tensor([[  2,   5, 220,   5,   9,   5,  20,   3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [86]:
inputs['input_ids']

[2, 5, 220, 5, 9, 5, 20, 3]

In [100]:
model.to("cpu")
output = model(**inputs)

In [104]:
output.logits

tensor([[[-0.0008, -0.1967, -0.1836,  ...,  0.2964,  0.3753, -0.0686],
         [-0.0008, -0.1967, -0.1836,  ...,  0.2964,  0.3753, -0.0685],
         [-0.0008, -0.1967, -0.1836,  ...,  0.2964,  0.3752, -0.0685],
         ...,
         [-0.0008, -0.1966, -0.1836,  ...,  0.2963,  0.3753, -0.0686],
         [-0.0008, -0.1967, -0.1836,  ...,  0.2964,  0.3752, -0.0685],
         [-0.0008, -0.1968, -0.1836,  ...,  0.2964,  0.3753, -0.0685]]],
       grad_fn=<AddBackward0>)