# Training RoBERTa Tokenizer

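This notebook trains a byte-level Byte-Pair Encoding (BPE) tokenizer for a RoBERTa model. Background on Byte-Pair Encoding tokenization: https://youtu.be/HEikzVL-lZU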

# 01 Setup

In [1]:
# Restore the pickled list of corpus file paths used to train the tokenizer.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
import pickle

with open('data/all_dataset_path', 'rb') as path_list_file:
    all_dataset_path = pickle.load(path_list_file)

In [2]:
# Number of corpus file paths restored from the pickle
len(all_dataset_path)

1363

In [3]:
# Preview the first ten corpus file paths
all_dataset_path[:10]

['data/id_oscar/text_543.txt',
 'data/id_oscar/text_155.txt',
 'data/id_oscar/text_528.txt',
 'data/id_oscar/text_582.txt',
 'data/id_oscar/text_983.txt',
 'data/id_oscar/text_919.txt',
 'data/id_oscar/text_729.txt',
 'data/id_oscar/text_857.txt',
 'data/id_oscar/text_246.txt',
 'data/id_oscar/text_222.txt']

# 02 Build a Tokenizer

In [4]:
%%time

from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
# https://huggingface.co/docs/tokenizers/api/trainers#tokenizers.trainers.BpeTrainer

# RoBERTa's default vocabulary size is 50_265 (compare this to BERT's uncased 30_522)
tokenizer.train(
    files=all_dataset_path, 
    vocab_size=50_265,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>']
  )




CPU times: user 5h 48min 37s, sys: 24min 56s, total: 6h 13min 33s
Wall time: 24min 56s


# 03 Save Tokenizer

In [5]:
import os

# save_model writes vocab.json and merges.txt into an existing directory, so
# create it first; otherwise a fresh Restart & Run All fails when the folder
# is missing.
os.makedirs('feel-in', exist_ok=True)
tokenizer.save_model('feel-in')

['feel-in/vocab.json', 'feel-in/merges.txt']

# 04 Load Tokenizer (Example Usage)

In [6]:
from transformers import RobertaTokenizer

# Directory holding the vocab.json / merges.txt files written by the save step
vocab_file_dir = 'feel-in'
# Rebuild a transformers-compatible tokenizer from the saved files
roberta_tokenizer = RobertaTokenizer.from_pretrained(vocab_file_dir)

In [7]:
# Sample sentence used to sanity-check the trained tokenizer
sentence = 'Hidup tidak selamanya berjalan dengan mulus'

# Full encoding: input ids plus attention mask
enc = roberta_tokenizer(sentence)
# Sub-word pieces for the same sentence
token = roberta_tokenizer.tokenize(sentence)

In [8]:
# Sub-word tokens produced for the sample sentence
token

['H', 'idup', 'Ġtidak', 'Ġselamanya', 'Ġberjalan', 'Ġdengan', 'Ġmulus']

In [9]:
# Encoding of the sample sentence returned by calling roberta_tokenizer
enc

{'input_ids': [0, 44, 720, 365, 7601, 1675, 326, 8924, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}