In [5]:
from task.SpvMaskedLanguageModelingTask import SpvMaskedLanguageModelingTask
from dataset.FragmentPerformanceSnapshotDataset import FragmentPerformanceSnapshotDataset
import transformers
import tokenizers, tokenizers.models, tokenizers.decoders
from transformers import PreTrainedTokenizer, DataCollatorForLanguageModeling
from dataset.MapDataset import MapDataset

In [8]:
config = transformers.RobertaConfig.from_pretrained("roberta-base")
model = transformers.RobertaForMaskedLM(config)

training_args = transformers.TrainingArguments(
    output_dir="model_output",  # output directory
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=3,  # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    save_steps=10_000,  # after # steps model is saved
    save_total_limit=2,  # limit the total amount of checkpoints. Deletes the older checkpoints.
)

# tokenizer = tokenizers.Tokenizer.from_file("./SpvBpeTokenizer.json")
# Define the special tokens
special_tokens = {"pad_token": "<PAD>", "unk_token": "<UNK>", "cls_token": "<CLS>", \
                "sep_token": "<SEP>", "mask_token": "<MASK>"}

# Load the tokenizer using PreTrainedTokenizerFast
tokOrig = tokenizers.Tokenizer.from_file("SpvBpeTokenizer.json")

tokenizer = PreTrainedTokenizer(tokenizer_object=tokOrig, **special_tokens)
spvlmTask = SpvMaskedLanguageModelingTask(model)

dataset = FragmentPerformanceSnapshotDataset("FragPerfSnapshotDataset.json", "train")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)


In [13]:
dir(tokOrig)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getnewargs__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'add_special_tokens',
 'add_tokens',
 'decode',
 'decode_batch',
 'decoder',
 'enable_padding',
 'enable_truncation',
 'encode',
 'encode_batch',
 'from_buffer',
 'from_file',
 'from_pretrained',
 'from_str',
 'get_vocab',
 'get_vocab_size',
 'id_to_token',
 'model',
 'no_padding',
 'no_truncation',
 'normalizer',
 'num_special_tokens_to_add',
 'padding',
 'post_process',
 'post_processor',
 'pre_tokenizer',
 'save',
 'to_str',
 'token_to_id',
 'train',
 'train_from_iterator',
 'truncation']

In [21]:
tokOrig.token_to_id("[BOS]")

In [50]:
tokens = tokOrig.encode("OpAdd %1 %2 %3").tokens
tokens

['ĠOpA', 'dd', 'Ġ%', '1', 'Ġ%', '2', 'Ġ%', '3']

In [49]:
tokOrig.token_to_id("1")

12

In [42]:
tokOrig.decoder = tokenizers.decoders.ByteLevel()
tokOrig.token_to_id(tokens[1])

AttributeError: 'tokenizers.Tokenizer' object has no attribute 'convert_tokens_to_ids'

In [22]:
tokenizer.encode("OpAdd %1 %2 %3")

NotImplementedError: 

In [None]:

def dataset_postprocess_fn(elem):
    text = elem["spvText"]
    encoded_input = tokenizer(text, return_tensors='pt')
    return encoded_input

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=MapDataset(dataset, lambda elem: dataset_postprocess_fn(elem))
)

trainer.train()
