In [21]:
import os
import json
import ijson

from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Split, ByteLevel
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import BertNormalizer
from tokenizers.decoders import WordPiece

In [22]:
def load_examples(file_path):
    """Parse the UTF-8 encoded JSON document at ``file_path`` and return it.

    The whole file is read into memory at once; ``data_loader`` is the
    streaming alternative for large corpora.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
    
def data_loader(file_path):
    """Lazily yield the elements of the top-level JSON array in ``file_path``.

    The file is parsed incrementally with ``ijson`` so the full document is
    never materialised in memory.

    Args:
        file_path: Path to a UTF-8 JSON file whose root value is an array.

    Yields:
        Each array element, in file order, as parsed by ``ijson``.
    """
    # ijson consumes bytes; a text-mode handle makes it re-encode every chunk
    # and emits a deprecation warning, so open the file in binary mode.
    with open(file_path, "rb") as f_in:
        # The 'item' prefix selects the elements of the root array.
        yield from ijson.items(f_in, 'item')

In [23]:
# Lower-case the text and strip accents before any splitting happens.
normalizer = BertNormalizer(strip_accents=True, lowercase=True)

# Split pattern, in order of alternatives: English contraction suffixes
# ('s, 'd, 'm, 't, 'll, 've, 're), runs of letters, runs of digits, runs of
# other non-space symbols (each optionally preceded by one space), and
# whitespace runs.
# Written as a raw string so that \p, \s and \S reach the regex engine
# untouched instead of being parsed as (invalid) Python string escapes.
pattern = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# "isolated" keeps every regex match as its own pre-token.
pre_tokenizer = Split(Regex(pattern), behavior="isolated")

In [24]:
# Every encoded sequence is padded up to this many tokens.
MAX_SEQ_LEN = 256

# BPE model that marks word-internal pieces with "##" and maps unknown
# symbols to "[UNK]".
tokenizer = Tokenizer(BPE(
    unk_token="[UNK]",
    continuing_subword_prefix="##",
))
tokenizer.normalizer = normalizer
tokenizer.pre_tokenizer = pre_tokenizer
# "[PAD]" is the third entry of the trainer's special_tokens list, so it is
# assigned id 2 (id 0 is "[UNK]"). Padding with id 0 would fill sequences
# with the unknown-token id instead of the padding id.
tokenizer.enable_padding(pad_id=2, pad_token="[PAD]", length=MAX_SEQ_LEN)
# Decoder that strips the "##" continuation prefix when joining tokens.
tokenizer.decoder = WordPiece()

In [25]:
# Reserved tokens handed to the trainer; the order here is significant because
# it fixes their ids ("[UNK]" first, "[PAD]" last).
SPECIAL_TOKENS = ["[UNK]", "<|endoftext|>", "[PAD]"]

token_trainer = BpeTrainer(
    # Vocabulary contents
    special_tokens=SPECIAL_TOKENS,
    initial_alphabet=ByteLevel.alphabet(),
    continuing_subword_prefix="##",
    # Vocabulary size and merge threshold
    vocab_size=5000,
    min_frequency=2,
    # Reporting
    show_progress=True,
)

# Training corpus. It is streamed with data_loader at training time;
# load_examples(train_file) is the in-memory alternative.
train_file = "data/train.json"

In [26]:
# Train on examples streamed from disk rather than a fully loaded list.
tokenizer.train_from_iterator(data_loader(train_file), token_trainer)

# Token ids are only known once training has built the vocabulary, so point
# the padding configuration at the id actually assigned to "[PAD]" before the
# tokenizer is serialised.
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id("[PAD]"),
    pad_token="[PAD]",
    length=MAX_SEQ_LEN,
)

# Make sure the output directory exists so saving works on a fresh checkout.
os.makedirs("models", exist_ok=True)
tokenizer.save("models/tokenizer.json")

In [None]:
import json
from transformers import PreTrainedTokenizerFast

MAX_SEQ_LEN = 256
# Wrap the serialised tokenizer in the transformers fast-tokenizer interface
# and declare which of its tokens play the pad / unk / eos roles.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="models/tokenizer.json",
    pad_token="[PAD]",
    unk_token="[UNK]",
    # `model_max_length` is the documented keyword for the maximum sequence
    # length; `max_len` is only a deprecated fallback alias for it.
    model_max_length=MAX_SEQ_LEN,
    add_prefix_space=False,
    eos_token="<|endoftext|>"
)

# with open("data/train.json", "r", encoding="utf-8") as f:
#     data = json.load(f)

# data = list(map(lambda x: len(
#     tokenizer(
#         x,
#         #padding="max_length",
#         return_attention_mask=False,
#         return_token_type_ids=False,
#     )["input_ids"]
#     ), data
# ))

In [19]:
# Display the ids of the EOS, PAD and UNK tokens, in that order.
tokenizer.eos_token_id, tokenizer.pad_token_id, tokenizer.unk_token_id

(1, 2, 0)

In [20]:
# Display every special token registered on the loaded tokenizer.
tokenizer.all_special_tokens

['<|endoftext|>', '[UNK]', '[PAD]']