# Tokenizer library

Train a tokenizer in a few seconds.  
To use it, wrap it in a transformers library tokenizer object for usability.


Tokenizer quick tour document: https://huggingface.co/docs/tokenizers/quicktour  
Post about byte pair encoding: https://towardsdatascience.com/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0

In [None]:
# install the required libraries
# !pip install datasets
# !pip install transformers  
# !pip install tokenizers   

In [102]:
from datasets import load_dataset
import os

# Initial data directory. This value is reassigned to "data/nsmc" before it is
# first used, in the cell that writes the corpus text files.
data_dir = './data'
# Load the 'nsmc' dataset through the Hugging Face `datasets` library; its
# splits are later iterated with dataset.keys().
dataset = load_dataset('nsmc')

### Tokenizers example

In [103]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# Untrained BPE tokenizer; "[UNK]" is the token used for unknown input.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Trainer that registers the special tokens with the vocabulary. The order
# matters: the ids printed after training follow this list (0..6).
trainer = BpeTrainer(
    special_tokens=[
        "[UNK]",
        "[CLS]",
        "[SEP]",
        "[PAD]",
        "[MASK]",
        "[BOS]",
        "[EOS]",
    ]
)


Pre-tokenization: set the pre-tokenizer that splits the raw text before the BPE model is applied

In [104]:
from tokenizers.pre_tokenizers import Whitespace
# Attach the Whitespace pre-tokenizer to the tokenizer created above, so the
# raw text is split before the BPE model sees it.
tokenizer.pre_tokenizer = Whitespace()

In [105]:
import time

# Dump every split of the dataset to its own text file, one document per line,
# so the tokenizer can be trained directly from files on disk.
data_dir = "data/nsmc"
os.makedirs(data_dir, exist_ok=True)
files = []
for split_key in dataset.keys():
    doc_path = f"{data_dir}/{split_key}.txt"
    # Write explicitly as UTF-8 so the Korean text is stored the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(doc_path, 'w', encoding='utf-8') as f:
        f.writelines(doc + '\n' for doc in dataset[split_key]['document'])
    files.append(doc_path)

# Time only the training step; perf_counter is a monotonic clock meant for
# measuring elapsed intervals.
start = time.perf_counter()
tokenizer.train(files, trainer)
end = time.perf_counter()

# Elapsed training time in seconds.
print(end-start)




22.020955324172974


Special tokens

In [106]:
# Print the vocabulary id of each special token, space-separated on one line.
print(*map(
    tokenizer.token_to_id,
    ("[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[BOS]", "[EOS]"),
))

0 1 2 3 4 5 6


In [107]:
from tokenizers.processors import TemplateProcessing

# Wrap encoded sequences in [CLS] ... [SEP]. For a pair, the second sequence
# and its closing [SEP] are tagged with type id 1 (the ":1" suffix).
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    # Each special token used in the templates is paired with its trained id.
    special_tokens=[
        (name, tokenizer.token_to_id(name)) for name in ("[CLS]", "[SEP]")
    ],
)

In [108]:
# Show the id of the padding token.
print(tokenizer.token_to_id("[PAD]"))
# Pad every encoding with "[PAD]" up to a fixed length of 512 tokens.
tokenizer.enable_padding(pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length= 512)


3


In [111]:
text_case_2 = "제가 제일 좋아하는 과일은 오렌지입니다." # korean
# Encode one sentence and print its tokens, then the matching ids.
output = tokenizer.encode(text_case_2)
print(output.tokens)
print() 
print(output.ids)

# How to get the RoBERTaTokenizer vocab.json and merges file
# https://github.com/huggingface/transformers/issues/1083

# Exporting BPE file to vocab/merges files used by Huggingface tokenizers
# https://github.com/openai/tiktoken/issues/60

# The purpose of files merges.txt, special_tokens_map.json, training_args.bin and add_tokens.json
# https://github.com/huggingface/transformers/issues/4777

# HuggingFace ByteLevelBPETokenizer encoding issue in merge.txt file
# https://stackoverflow.com/questions/61599157/huggingface-bytelevelbpetokenizer-encoding-issue-in-merge-txt-file
# Still searching for a solution: there is an issue where the tokenizers library cannot save merge.txt

['[CLS]', '제가', '제일', '좋아하는', '과', '일은', '오렌', '지', '입니다', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 

In [112]:

# Print the original sentence and the tokens produced for it.
print("original text: ", text_case_2)
print("토큰화 결과:\n", output.tokens)

# Offsets give each token's (start, end) character span in the input string.
# NOTE(review): in the printed token list, index 10 is '[SEP]', which is added
# by the post-processor rather than taken from the input — confirm that this
# is the intended token of interest.
print("입력 스트링에서의 인덱스 위치:", output.offsets[10]) 
print("원본 입력에서 관심 토큰에 해당하는 부분: ", text_case_2[output.offsets[10][0]:output.offsets[10][1]]) # the original input can be sliced with the offsets to check the match


original text:  제가 제일 좋아하는 과일은 오렌지입니다.
토큰화 결과:
 ['[CLS]', '제가', '제일', '좋아하는', '과', '일은', '오렌', '지', '입니다', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

In [113]:
# Two sentences encoded together as one batch.
text_case_3 = [
    "제가 제일 좋아하는 과일은 오렌지입니다.",
    "오렌지를 좋아하는 사람은 없습니다.",
]
output = tokenizer.encode_batch(text_case_3)

# One token list per sentence, a blank line, then one id list per sentence.
print([encoding.tokens for encoding in output])
print()
print([encoding.ids for encoding in output])

[['[CLS]', '제가', '제일', '좋아하는', '과', '일은', '오렌', '지', '입니다', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',

### wrapping with transformers library
option 1. Loading directly from the tokenizer object  

Let’s see how to leverage this tokenizer object in the 🤗 Transformers library.   
The PreTrainedTokenizerFast class allows for easy instantiation,   
by accepting the instantiated tokenizer object as an argument:  
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers!   
Head to the tokenizer page for more information.  

In [114]:
from transformers import PreTrainedTokenizerFast
# Option 1: wrap the in-memory `tokenizers` Tokenizer object directly.
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)

In [115]:
# Option 2: load from file.
# First save the trained `tokenizers` Tokenizer object in JSON format.
tokenizer.save("BPE-nsmc.json")

In [116]:
from transformers import PreTrainedTokenizerFast
# Build the transformers wrapper from the JSON file saved in the previous cell.
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="BPE-nsmc.json")

In [117]:
text_case_2 = "제가 제일 좋아하는 과일은 오렌지입니다." # korean
# Encode with the transformers wrapper; the result printed below is a list of token ids.
output = fast_tokenizer.encode(text_case_2)

In [118]:
# Map the ids back to their token strings, then show the raw ids.
print(fast_tokenizer.convert_ids_to_tokens(output))
print(output)

['[CLS]', '제가', '제일', '좋아하는', '과', '일은', '오렌', '지', '입니다', '.', '[SEP]']
[1, 4073, 3662, 3732, 664, 7416, 29262, 2514, 3337, 20, 2]


In [119]:
text_case_3 = ["제가 제일 좋아하는 과일은 오렌지입니다.",
                  "오렌지를 좋아하는 사람은 없습니다."]
# Calling the wrapper on a list of sentences returns input_ids, token_type_ids
# and attention_mask; as the printed output shows, the sequences are not padded here.
output = fast_tokenizer(text_case_3)
print(output)

{'input_ids': [[1, 4073, 3662, 3732, 664, 7416, 29262, 2514, 3337, 20, 2], [1, 29262, 4158, 3732, 4059, 5617, 20, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}


When a `tokenizers` tokenizer is wrapped as a transformers tokenizer object,  
the (1) padding token, (2) max length and (3) special tokens must be specified again on the wrapper.


In [120]:
# Load the transformers tokenizer with additional settings.
from transformers import PreTrainedTokenizerFast

# Declare the maximum length and every special token on the wrapper.
# [CLS], [SEP] and [MASK] are registered with the trainer (and [CLS]/[SEP] are
# used by the post-processor template), so they are declared here as well;
# otherwise the wrapper's cls_token / sep_token / mask_token stay unset and
# are missing from the saved special-tokens map.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="BPE-nsmc.json",
    model_max_length=512,
    pad_token='[PAD]',
    unk_token='[UNK]',
    cls_token='[CLS]',
    sep_token='[SEP]',
    mask_token='[MASK]',
    bos_token='[BOS]',
    eos_token='[EOS]',
)

In [121]:
# Tokens stored at ids 0..6, space-separated on one line.
print(*(fast_tokenizer.convert_ids_to_tokens(token_id) for token_id in range(7)))

# Reverse lookup: the id of each special token, in the order listed.
print(*map(
    fast_tokenizer.convert_tokens_to_ids,
    ("[MASK]", "[BOS]", "[EOS]", "[PAD]", "[UNK]", "[CLS]", "[SEP]"),
))


[UNK] [CLS] [SEP] [PAD] [MASK] [BOS] [EOS]
4 5 6 3 0 1 2


Save tokenizer for later use

In [122]:
# Save the wrapped tokenizer into the "BPE-nsmc.model" directory; the returned
# tuple (shown below) lists the files that were written.
fast_tokenizer.save_pretrained("BPE-nsmc.model")

('BPE-nsmc.model/tokenizer_config.json',
 'BPE-nsmc.model/special_tokens_map.json',
 'BPE-nsmc.model/tokenizer.json')