In [1]:
import os
import json

from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Split, ByteLevel
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import BertNormalizer
from tokenizers.decoders import WordPiece

In [2]:
def load_examples(file_path):
    """Parse a UTF-8 encoded JSON file and return the decoded object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever Python object the JSON document decodes to.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        examples = json.load(handle)
    return examples

In [3]:
# Text clean-up applied before splitting: lowercase everything and strip accents.
normalizer = BertNormalizer(strip_accents=True, lowercase=True)

# Pre-tokenisation regex. Alternatives, in order: English contraction suffixes
# ('s, 'd, 'm, 't, 'll, 've, 're), runs of letters, runs of digits, runs of other
# non-space symbols (each optionally preceded by one space), and whitespace runs.
# Written as a raw string so that \p, \s and \S reach the regex engine untouched
# instead of being parsed (and warned about) as Python string escapes.
pattern = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
pre_tokenizer = Split(Regex(pattern), behavior="isolated")

In [4]:
# Fixed length every encoding is padded to.
MAX_SEQ_LEN = 128 + 1

# Untrained BPE model: unknown input maps to "[UNK]" and word-continuation
# pieces carry the "##" prefix, which the WordPiece decoder below strips again.
tokenizer = Tokenizer(BPE(
    unk_token="[UNK]",
    continuing_subword_prefix="##",
))
tokenizer.normalizer = normalizer
tokenizer.pre_tokenizer = pre_tokenizer
# pad_id has to be the vocabulary id of "[PAD]". The trainer is given the special
# tokens as ["[UNK]", "<|endoftext|>", "[PAD]"], which receive ids 0, 1 and 2 in
# that order, so "[PAD]" is 2 (pad_id=0 would pad with the "[UNK]" id).
tokenizer.enable_padding(pad_id=2, pad_token="[PAD]", length=MAX_SEQ_LEN)
tokenizer.decoder = WordPiece()

In [5]:
# Location of the sampled training texts, read with the notebook's JSON helper.
train_file = "data/train-sampled.json"
data = load_examples(train_file)

# Settings for the BPE trainer, collected in one place so they are easy to scan.
# NOTE(review): the initial alphabet comes from ByteLevel although the
# pre-tokenizer in use is a regex Split, not ByteLevel — confirm this is intended.
trainer_options = dict(
    vocab_size=1500,
    min_frequency=2,
    max_token_length=100,
    special_tokens=["[UNK]", "<|endoftext|>", "[PAD]"],
    initial_alphabet=ByteLevel.alphabet(),
    continuing_subword_prefix="##",
    show_progress=True,
)
token_trainer = BpeTrainer(**trainer_options)

In [6]:
# Learn the BPE vocabulary and merges from the in-memory training texts.
tokenizer.train_from_iterator(data, token_trainer)

# Token ids only exist once training has built the vocabulary, so point the
# padding configuration at the id actually assigned to "[PAD]" before the
# tokenizer is serialised.
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id("[PAD]"),
    pad_token="[PAD]",
    length=MAX_SEQ_LEN,
)

# Make sure the output directory exists so saving works on a fresh checkout.
os.makedirs("models", exist_ok=True)
tokenizer.save("models/tokenizer.json")

In [3]:
import json
from transformers import PreTrainedTokenizerFast

MAX_SEQ_LEN = 128 + 1

# Wrap the saved `tokenizers` file in the transformers fast-tokenizer interface.
# `model_max_length` is the documented keyword for the length limit; `max_len`
# is only accepted as a legacy alias.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="models/tokenizer.json",
    pad_token="[PAD]",
    unk_token="[UNK]",
    model_max_length=MAX_SEQ_LEN,
    add_prefix_space=False,
)

with open("data/train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Replace every example with its token count. Neither padding nor truncation is
# requested, so the counts describe the full texts; the "longer than the
# specified maximum sequence length" warning from transformers is therefore
# expected here.
data = [
    len(
        tokenizer(
            text,
            return_attention_mask=False,
            return_token_type_ids=False,
        )["input_ids"]
    )
    for text in data
]

Token indices sequence length is longer than the specified maximum sequence length for this model (201 > 129). Running this sequence through the model will result in indexing errors


In [5]:
import numpy as np

# Summary statistics over the per-example token counts stored in `data`.
longest = np.max(data)
shortest = np.min(data)
average = np.mean(data)
spread = np.std(data)

report_lines = [
    f"Max Length: {longest}",
    "----------------------",
    f"Min Length: {shortest}",
    "---------------------------------",
    f"Mean Length: {average:.2f} +/- {spread:.2f}",
]
print("\n".join(report_lines))

Max Length: 2158
----------------------
Min Length: 1
---------------------------------
Mean Length: 227.72 +/- 100.87
