# Tokenization using Hugging Face Tokenizers

In [1]:
import os

from tokenizers import ByteLevelBPETokenizer
from transformers import (
    T5TokenizerFast,
    AutoTokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Extract the second tab-separated column of the ratings TSV into a plain-text
# file; that file is what the tokenizer is trained on.
# NOTE(review): `cut` copies every line, so if the TSV starts with a header row
# its column name ends up in the training text as well — confirm against the data.
!cut -f2 ./data/ratings_train.tsv > ./data/ratings_train.content.txt

In [3]:
# Plain-text corpus file(s) handed to the tokenizer trainer.
train_files = ["./data/ratings_train.content.txt"]

# Trainer settings: target vocabulary size and the `min_frequency` threshold
# passed to `tokenizer.train`.
vocab_size = 48000
min_frequency = 2

# The trained tokenizer is written to <output_dir>/<output_name>/tokenizer.json.
output_dir = "./tokenizers"
output_name = f"nsmc_bbpe_{vocab_size}"

In [4]:
# Placeholder tokens "<unused_0>" ... "<unused_99>" (100 in total).
unused_tokens = list(f"<unused_{i}>" for i in range(100))

In [5]:
# Special tokens, in the order they are handed to the trainer: the four control
# tokens first, then the <unused_N> placeholders.
bbpe_special_tokens = [
    "<pad>",  # padding
    "<s>",    # start of sentence
    "</s>",   # end of sentence
    "<unk>",  # unknown
    *unused_tokens,
]

# Train a byte-level BPE tokenizer on the corpus files.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=train_files,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    special_tokens=bbpe_special_tokens,
)

# NOTE(review): these look like plain attribute assignments on the Python
# wrapper object; whether `save()` persists them into tokenizer.json is not
# visible here — confirm before relying on them downstream.
tokenizer.bos_token = "<s>"
tokenizer.eos_token = "</s>"
tokenizer.pad_token = "<pad>"

# Write the trained tokenizer to <output_dir>/<output_name>/tokenizer.json.
tokenizer_dir = os.path.join(output_dir, output_name)
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer.save(os.path.join(tokenizer_dir, "tokenizer.json"))






In [6]:
def test_tokenizer(tokenizer, skip_special_tokens=True):
    ko_sentence = "<s>이것은 테스트 문장입니다. <unused_0>어떻게 보이나요?<unused_1> 고유명사 \"파이썬 파이토치 허깅페이스\"는 어떻게 되나요?</s>"
    en_sentence = "<s>This is a test sentence. <unused_0>How does it look?<unused_1> Proper nouns \"Python PyTorch HuggingFace\" how does it go?</s>"

    print(ko_sentence)
    if isinstance(tokenizer, ByteLevelBPETokenizer):
        print(">>>", tokenizer.encode(ko_sentence).ids)
        print(">>>", tokenizer.encode(ko_sentence).tokens)
        print(">>>", tokenizer.decode(tokenizer.encode(ko_sentence).ids, skip_special_tokens=skip_special_tokens))
    else:
        print(">>>", tokenizer.encode(ko_sentence))
        print(">>>", tokenizer.tokenize(ko_sentence))
        print(">>>", tokenizer.decode(tokenizer.encode(ko_sentence), skip_special_tokens=skip_special_tokens))
    print(en_sentence)
    if isinstance(tokenizer, ByteLevelBPETokenizer):
        print(">>>", tokenizer.encode(en_sentence).ids)
        print(">>>", tokenizer.encode(en_sentence).tokens)
        print(">>>", tokenizer.decode(tokenizer.encode(en_sentence).ids, skip_special_tokens=skip_special_tokens))
    else:
        print(">>>", tokenizer.encode(en_sentence))
        print(">>>", tokenizer.tokenize(en_sentence))
        print(">>>", tokenizer.decode(tokenizer.encode(en_sentence), skip_special_tokens=skip_special_tokens))

#### Decode with Special Tokens Skipped (`skip_special_tokens=True`)

In [7]:
# skip_special_tokens=True is forwarded to decode(): the decoded text printed
# below no longer contains <s>, </s> or the <unused_N> markers.
test_tokenizer(tokenizer, skip_special_tokens=True)

<s>이것은 테스트 문장입니다. <unused_0>어떻게 보이나요?<unused_1> 고유명사 "파이썬 파이토치 허깅페이스"는 어떻게 되나요?</s>
>>> [1, 13158, 17621, 2372, 519, 932, 117, 324, 4, 3392, 20154, 423, 134, 5, 900, 14334, 635, 324, 105, 2845, 13796, 6164, 618, 604, 1255, 462, 331, 17556, 105, 373, 1704, 41246, 134, 2]
>>> ['<s>', 'ìĿ´ê²ĥìĿĢ', 'ĠíħĮìĬ¤íĬ¸', 'Ġë¬¸', 'ìŀ¥', 'ìŀħëĭĪëĭ¤', '.', 'Ġ', '<unused_0>', 'ìĸ´ëĸ»ê²Į', 'Ġë³´ìĿ´ëĤĺ', 'ìļĶ', '?', '<unused_1>', 'Ġê³ł', 'ìľłëªħ', 'ìĤ¬', 'Ġ', '"', 'íĮĮìĿ´', 'ìį¬', 'ĠíĮĮìĿ´', 'íĨł', 'ì¹ĺ', 'ĠíĹĪ', 'ê¹', 'ħ', 'íİĺìĿ´ìĬ¤', '"', 'ëĬĶ', 'Ġìĸ´ëĸ»ê²Į', 'ĠëĲĺëĤĺìļĶ', '?', '</s>']
>>> 이것은 테스트 문장입니다. 어떻게 보이나요? 고유명사 "파이썬 파이토치 허깅페이스"는 어떻게 되나요?
<s>This is a test sentence. <unused_0>How does it look?<unused_1> Proper nouns "Python PyTorch HuggingFace" how does it go?</s>
>>> [1, 30227, 8451, 3454, 2320, 6679, 2771, 13113, 5650, 10287, 117, 324, 4, 143, 7308, 21844, 7940, 12827, 6683, 42645, 134, 5, 6682, 185, 8336, 2526, 6917, 3977, 181, 186, 324, 105, 151, 192, 14234, 4440, 6682, 192, 155, 3676, 9945

#### Decode with Special Tokens Kept (`skip_special_tokens=False`)

In [8]:
# skip_special_tokens=False is forwarded to decode(): the decoded text printed
# below keeps <s>, </s> and the <unused_N> markers.
test_tokenizer(tokenizer, skip_special_tokens=False)

<s>이것은 테스트 문장입니다. <unused_0>어떻게 보이나요?<unused_1> 고유명사 "파이썬 파이토치 허깅페이스"는 어떻게 되나요?</s>
>>> [1, 13158, 17621, 2372, 519, 932, 117, 324, 4, 3392, 20154, 423, 134, 5, 900, 14334, 635, 324, 105, 2845, 13796, 6164, 618, 604, 1255, 462, 331, 17556, 105, 373, 1704, 41246, 134, 2]
>>> ['<s>', 'ìĿ´ê²ĥìĿĢ', 'ĠíħĮìĬ¤íĬ¸', 'Ġë¬¸', 'ìŀ¥', 'ìŀħëĭĪëĭ¤', '.', 'Ġ', '<unused_0>', 'ìĸ´ëĸ»ê²Į', 'Ġë³´ìĿ´ëĤĺ', 'ìļĶ', '?', '<unused_1>', 'Ġê³ł', 'ìľłëªħ', 'ìĤ¬', 'Ġ', '"', 'íĮĮìĿ´', 'ìį¬', 'ĠíĮĮìĿ´', 'íĨł', 'ì¹ĺ', 'ĠíĹĪ', 'ê¹', 'ħ', 'íİĺìĿ´ìĬ¤', '"', 'ëĬĶ', 'Ġìĸ´ëĸ»ê²Į', 'ĠëĲĺëĤĺìļĶ', '?', '</s>']
>>> <s>이것은 테스트 문장입니다. <unused_0>어떻게 보이나요?<unused_1> 고유명사 "파이썬 파이토치 허깅페이스"는 어떻게 되나요?</s>
<s>This is a test sentence. <unused_0>How does it look?<unused_1> Proper nouns "Python PyTorch HuggingFace" how does it go?</s>
>>> [1, 30227, 8451, 3454, 2320, 6679, 2771, 13113, 5650, 10287, 117, 324, 4, 143, 7308, 21844, 7940, 12827, 6683, 42645, 134, 5, 6682, 185, 8336, 2526, 6917, 3977, 181, 186, 324, 105, 151, 192, 14234, 4440,

#### Tokenize with the Saved Tokenizer Reloaded via `T5TokenizerFast`

In [9]:
# Load the directory that holds the saved tokenizer.json through the
# transformers fast-tokenizer wrapper.
tokenizer_dir = os.path.join(output_dir, output_name)
tokenizer = T5TokenizerFast.from_pretrained(tokenizer_dir)

# Same check as before; the default skip_special_tokens=True is spelled out.
test_tokenizer(tokenizer, skip_special_tokens=True)

<s>이것은 테스트 문장입니다. <unused_0>어떻게 보이나요?<unused_1> 고유명사 "파이썬 파이토치 허깅페이스"는 어떻게 되나요?</s>
>>> [1, 13158, 17621, 2372, 519, 932, 117, 324, 4, 3392, 20154, 423, 134, 5, 900, 14334, 635, 324, 105, 2845, 13796, 6164, 618, 604, 1255, 462, 331, 17556, 105, 373, 1704, 41246, 134, 2]
>>> ['<s>', 'ìĿ´ê²ĥìĿĢ', 'ĠíħĮìĬ¤íĬ¸', 'Ġë¬¸', 'ìŀ¥', 'ìŀħëĭĪëĭ¤', '.', 'Ġ', '<unused_0>', 'ìĸ´ëĸ»ê²Į', 'Ġë³´ìĿ´ëĤĺ', 'ìļĶ', '?', '<unused_1>', 'Ġê³ł', 'ìľłëªħ', 'ìĤ¬', 'Ġ', '"', 'íĮĮìĿ´', 'ìį¬', 'ĠíĮĮìĿ´', 'íĨł', 'ì¹ĺ', 'ĠíĹĪ', 'ê¹', 'ħ', 'íİĺìĿ´ìĬ¤', '"', 'ëĬĶ', 'Ġìĸ´ëĸ»ê²Į', 'ĠëĲĺëĤĺìļĶ', '?', '</s>']
>>> 이것은 테스트 문장입니다. 어떻게 보이나요? 고유명사 "파이썬 파이토치 허깅페이스"는 어떻게 되나요?
<s>This is a test sentence. <unused_0>How does it look?<unused_1> Proper nouns "Python PyTorch HuggingFace" how does it go?</s>
>>> [1, 30227, 8451, 3454, 2320, 6679, 2771, 13113, 5650, 10287, 117, 324, 4, 143, 7308, 21844, 7940, 12827, 6683, 42645, 134, 5, 6682, 185, 8336, 2526, 6917, 3977, 181, 186, 324, 105, 151, 192, 14234, 4440, 6682, 192, 155, 3676, 9945

In [10]:
!head -n 300 ./tokenizers/nsmc_bbpe_48000/tokenizer.json

{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 0,
      "content": "<pad>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 1,
      "content": "<s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 2,
      "content": "</s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 3,
      "content": "<unk>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 4,
      "content": "<unused_0>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 5,
      "co

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Tokenize with Existing Tokenizer

Before using an existing tokenizer, check its configuration.

You can find the tokenizer's configuration files on the Hugging Face Hub:
https://huggingface.co/klue/bert-base/tree/main

or in your local Hugging Face cache directory.

In [11]:
# Model identifier whose published tokenizer is loaded for comparison.
plm_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(plm_name)

# Run the same sample sentences through this tokenizer. In the recorded output
# its vocabulary splits "<s>" and "<unused_0>" into several pieces
# ('<', 's', '>', ...) instead of keeping them as single tokens.
test_tokenizer(tokenizer)

tokenizer_config.json: 100%|██████████| 289/289 [00:00<00:00, 31.9kB/s]
config.json: 100%|██████████| 425/425 [00:00<00:00, 226kB/s]
vocab.txt: 100%|██████████| 248k/248k [00:00<00:00, 505kB/s]
tokenizer.json: 100%|██████████| 495k/495k [00:00<00:00, 950kB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 60.1kB/s]

<s>이것은 테스트 문장입니다. <unused_0>어떻게 보이나요?<unused_1> 고유명사 "파이썬 파이토치 허깅페이스"는 어떻게 되나요?</s>
>>> [2, 32, 86, 34, 3982, 2073, 7453, 6265, 12190, 18, 32, 15818, 5722, 4948, 66, 20, 34, 3842, 3783, 2075, 2182, 35, 32, 15818, 5722, 4948, 66, 21, 34, 6870, 2211, 2063, 6, 6440, 3495, 6440, 2386, 2225, 1905, 2186, 15092, 6, 793, 3842, 859, 2075, 2182, 35, 32, 19, 86, 34, 3]
>>> ['<', 's', '>', '이것', '##은', '테스트', '문장', '##입니다', '.', '<', 'un', '##us', '##ed', '_', '0', '>', '어떻게', '보이', '##나', '##요', '?', '<', 'un', '##us', '##ed', '_', '1', '>', '고유', '##명', '##사', '"', '파이', '##썬', '파이', '##토', '##치', '허', '##깅', '##페이스', '"', '는', '어떻게', '되', '##나', '##요', '?', '<', '/', 's', '>']
>>> < s > 이것은 테스트 문장입니다. < unused _ 0 > 어떻게 보이나요? < unused _ 1 > 고유명사 " 파이썬 파이토치 허깅페이스 " 는 어떻게 되나요? < / s >
<s>This is a test sentence. <unused_0>How does it look?<unused_1> Proper nouns "Python PyTorch HuggingFace" how does it go?</s>
>>> [2, 32, 86, 34, 9796, 4641, 11376, 68, 87, 8119, 17219, 30062, 9963, 18, 32, 15818,


