In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(type(tokenizer.backend_tokenizer))

<class 'tokenizers.Tokenizer'>


In [2]:
print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


In [3]:
print("Héllò hôw are ü?")

Héllò hôw are ü?


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))


Héllò hôw are ü?


In [5]:
tokens = tokenizer("Héllò hôw are ü?")
print(tokens.tokens())

['[CLS]', 'H', '##é', '##ll', '##ò', 'h', '##ô', '##w', 'are', 'ü', '?', '[SEP]']


In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer("Héllò hôw are ü?").tokens()

['[CLS]', 'hello', 'how', 'are', 'u', '?', '[SEP]']

In [7]:
# BERT Tokenizer
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('how', (7, 10)),
 ('are', (11, 14)),
 ('you', (16, 19)),
 ('?', (19, 20))]

In [8]:
tokenizer("Hello, how are  you?").tokens()

['[CLS]', 'hello', ',', 'how', 'are', 'you', '?', '[SEP]']

In [9]:
# BERT (cased) Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('how', (7, 10)),
 ('are', (11, 14)),
 ('you', (16, 19)),
 ('?', (19, 20))]

In [10]:
tokenizer("Hello, how are  you?").tokens()

['[CLS]', 'Hello', ',', 'how', 'are', 'you', '?', '[SEP]']

In [11]:
# GPT2 Tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('Ġhow', (6, 10)),
 ('Ġare', (10, 14)),
 ('Ġ', (14, 15)),
 ('Ġyou', (15, 19)),
 ('?', (19, 20))]

In [12]:
tokenizer("Hello, how are  you?").tokens()

['Hello', ',', 'Ġhow', 'Ġare', 'Ġ', 'Ġyou', '?']

In [13]:
#T5 Tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

[('▁Hello,', (0, 6)),
 ('▁how', (7, 10)),
 ('▁are', (11, 14)),
 ('▁you?', (16, 20))]

In [14]:
tokenizer("Hello, how are  you?").tokens()

['▁Hello', ',', '▁how', '▁are', '▁you', '?', '</s>']

In [15]:
s ='a가 짧'
print(len(s))
b = s.encode('utf-8')
print(b)
print(len(b))
print(b.decode('utf-8'))

4
b'a\xea\xb0\x80 \xec\xa7\xa7'
8
a가 짧


In [16]:
s ='ㄱ'
print(len(s))
b = s.encode('utf-8')
print(b)
print(len(b))
print(b.decode('utf-8'))

1
b'\xe3\x84\xb1'
3
ㄱ


In [17]:
s ='가'
print(len(s))
b = s.encode('utf-8')
print(b)
print(len(b))
print(b.decode('utf-8'))

1
b'\xea\xb0\x80'
3
가


#### BPE 토큰화

BPE(Byte-Pair Encoding)는 초기에 텍스트를 압축하는 알고리즘으로 개발된 후, GPT 모델을 사전 학습할 때 토큰화를 위해 OpenAI에서 사용되었습니다. GPT, GPT-2, RoBERTa, BART 및 DeBERTa를 포함한 많은 트랜스포머 모델에서 사용됩니다.

In [18]:
corpus = [
    "This is the Hugging Face course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [19]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")


In [20]:
from collections import defaultdict

# Tally how many times each pre-tokenized word appears across the corpus.
word_freqs = defaultdict(int)

for sentence in corpus:
    # pre_tokenize_str yields (word, offsets) pairs; only the word is counted.
    for word, _offsets in tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(sentence):
        word_freqs[word] += 1

print(word_freqs)


defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'Ġcourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})


In [21]:
# Base alphabet: every distinct character seen in any counted word, sorted.
alphabet = sorted({letter for word in word_freqs for letter in word})

print(alphabet)


[',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']


In [22]:
vocab = ["<|endoftext|>"] + alphabet.copy()
print(vocab)

['<|endoftext|>', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']


In [23]:
splits = {word: [c for c in word] for word in word_freqs.keys()}
print(splits)

{'This': ['T', 'h', 'i', 's'], 'Ġis': ['Ġ', 'i', 's'], 'Ġthe': ['Ġ', 't', 'h', 'e'], 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'], 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'], 'Ġcourse': ['Ġ', 'c', 'o', 'u', 'r', 's', 'e'], '.': ['.'], 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'], 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'], 'Ġtokenization': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n'], 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'], 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'], 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'], 'Ġtokenizer': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'], 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'], 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'], ',': [','], 'Ġyou': ['Ġ', 'y', 'o', 'u'], 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'], 'Ġbe': ['Ġ', 'b', 'e'], 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'], 'Ġto': ['Ġ', 't', 'o'], 'Ġunderstand': ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'], 'Ġh

In [24]:
def compute_pair_freqs(splits):
    """Count corpus-weighted occurrences of each adjacent symbol pair.

    Args:
        splits: mapping from word to its current list of symbols.

    Returns:
        A ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples. Each
        occurrence of a pair inside a word adds that word's frequency, which
        is read from the notebook-level ``word_freqs`` mapping.
    """
    counts = defaultdict(int)
    for token, occurrences in word_freqs.items():
        symbols = splits[token]
        # Pairing the list with itself shifted by one walks neighbouring
        # symbols; a single-symbol word yields no pairs at all.
        for left, right in zip(symbols, symbols[1:]):
            counts[(left, right)] += occurrences
    return counts


In [25]:
pair_freqs = compute_pair_freqs(splits)

for i, key in enumerate(pair_freqs.keys()):
    print(f"{key}: {pair_freqs[key]}")
    if i > 5:
        break


('T', 'h'): 3
('h', 'i'): 3
('i', 's'): 5
('Ġ', 'i'): 2
('Ġ', 't'): 7
('t', 'h'): 3
('h', 'e'): 2


In [26]:
print(pair_freqs)

defaultdict(<class 'int'>, {('T', 'h'): 3, ('h', 'i'): 3, ('i', 's'): 5, ('Ġ', 'i'): 2, ('Ġ', 't'): 7, ('t', 'h'): 3, ('h', 'e'): 2, ('Ġ', 'H'): 1, ('H', 'u'): 1, ('u', 'g'): 1, ('g', 'g'): 1, ('g', 'i'): 1, ('i', 'n'): 2, ('n', 'g'): 1, ('Ġ', 'F'): 1, ('F', 'a'): 1, ('a', 'c'): 1, ('c', 'e'): 1, ('Ġ', 'c'): 2, ('c', 'o'): 1, ('o', 'u'): 3, ('u', 'r'): 1, ('r', 's'): 2, ('s', 'e'): 3, ('c', 'h'): 1, ('h', 'a'): 1, ('a', 'p'): 1, ('p', 't'): 1, ('t', 'e'): 2, ('e', 'r'): 5, ('Ġ', 'a'): 5, ('a', 'b'): 2, ('b', 'o'): 1, ('u', 't'): 1, ('t', 'o'): 4, ('o', 'k'): 3, ('k', 'e'): 3, ('e', 'n'): 4, ('n', 'i'): 2, ('i', 'z'): 2, ('z', 'a'): 1, ('a', 't'): 2, ('t', 'i'): 2, ('i', 'o'): 2, ('o', 'n'): 2, ('Ġ', 's'): 3, ('e', 'c'): 1, ('c', 't'): 1, ('s', 'h'): 1, ('h', 'o'): 2, ('o', 'w'): 2, ('w', 's'): 1, ('e', 'v'): 1, ('v', 'e'): 1, ('r', 'a'): 3, ('a', 'l'): 2, ('z', 'e'): 1, ('l', 'g'): 1, ('g', 'o'): 1, ('o', 'r'): 1, ('r', 'i'): 1, ('i', 't'): 1, ('h', 'm'): 1, ('m', 's'): 1, ('H', 'o'): 

In [27]:
# Linear scan for the most frequent pair; on ties the first pair seen wins.
best_pair, max_freq = "", None

for candidate, count in pair_freqs.items():
    if max_freq is not None and count <= max_freq:
        continue
    best_pair, max_freq = candidate, count

print(best_pair, max_freq)


('Ġ', 't') 7


In [28]:
merges = {("Ġ", "t"): "Ġt"}
vocab.append("Ġt")


In [29]:
print(vocab)

['<|endoftext|>', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ', 'Ġt']


In [30]:
print(merges)

{('Ġ', 't'): 'Ġt'}


In [31]:
def merge_pair(a, b, splits):
    """Fuse every adjacent ``a``, ``b`` occurrence into the symbol ``a + b``.

    Args:
        a: left symbol of the pair to merge.
        b: right symbol of the pair to merge.
        splits: mapping from word to its list of symbols; updated in place.

    Returns:
        The same ``splits`` mapping, after updating every word listed in the
        notebook-level ``word_freqs``.
    """
    for word in word_freqs:
        symbols = splits[word]
        if len(symbols) == 1:
            # A lone symbol has no neighbour to merge with.
            continue

        pos = 0
        while pos + 1 < len(symbols):
            if symbols[pos] != a or symbols[pos + 1] != b:
                pos += 1
                continue
            # Splice in the fused symbol and re-examine the same position.
            symbols = symbols[:pos] + [a + b] + symbols[pos + 2:]
        splits[word] = symbols
    return splits


In [32]:
splits = merge_pair("Ġ", "t", splits)
print(splits)
print(splits["Ġtrained"])

{'This': ['T', 'h', 'i', 's'], 'Ġis': ['Ġ', 'i', 's'], 'Ġthe': ['Ġt', 'h', 'e'], 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'], 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'], 'Ġcourse': ['Ġ', 'c', 'o', 'u', 'r', 's', 'e'], '.': ['.'], 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'], 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'], 'Ġtokenization': ['Ġt', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n'], 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'], 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'], 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'], 'Ġtokenizer': ['Ġt', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'], 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'], 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'], ',': [','], 'Ġyou': ['Ġ', 'y', 'o', 'u'], 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'], 'Ġbe': ['Ġ', 'b', 'e'], 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'], 'Ġto': ['Ġt', 'o'], 'Ġunderstand': ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'], 'Ġhow': ['Ġ', 'h', 

In [33]:
len(vocab)

31

In [34]:
vocab_size = 50

# Greedily learn merges until the vocabulary reaches the target size.
while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # Every word has collapsed into a single symbol, so no pair is left
        # to merge; stop here instead of unpacking an empty best pair.
        break
    # max() returns the first key with the highest count, i.e. the same
    # first-wins tie-breaking as a strict "<" scan over the items.
    best_pair = max(pair_freqs, key=pair_freqs.get)
    max_freq = pair_freqs[best_pair]
    new_token = best_pair[0] + best_pair[1]
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = new_token
    vocab.append(new_token)


In [35]:
print(len(vocab))

50


In [36]:
print(vocab)

['<|endoftext|>', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ', 'Ġt', 'is', 'er', 'Ġa', 'Ġto', 'en', 'Th', 'This', 'ou', 'se', 'Ġtok', 'Ġtoken', 'nd', 'Ġis', 'Ġth', 'Ġthe', 'in', 'Ġc', 'Ġab', 'Ġtokeni']


In [37]:
print(merges)

{('Ġ', 't'): 'Ġt', ('i', 's'): 'is', ('e', 'r'): 'er', ('Ġ', 'a'): 'Ġa', ('Ġt', 'o'): 'Ġto', ('e', 'n'): 'en', ('T', 'h'): 'Th', ('Th', 'is'): 'This', ('o', 'u'): 'ou', ('s', 'e'): 'se', ('Ġto', 'k'): 'Ġtok', ('Ġtok', 'en'): 'Ġtoken', ('n', 'd'): 'nd', ('Ġ', 'is'): 'Ġis', ('Ġt', 'h'): 'Ġth', ('Ġth', 'e'): 'Ġthe', ('i', 'n'): 'in', ('Ġ', 'c'): 'Ġc', ('Ġa', 'b'): 'Ġab', ('Ġtoken', 'i'): 'Ġtokeni'}


In [38]:
def tokenize(text):
    """Tokenize ``text`` with the BPE merges learned in this notebook.

    The text is pre-tokenized with the notebook-level ``tokenizer``, each word
    is split into single characters, and then every rule in the notebook-level
    ``merges`` mapping is applied in its insertion order.

    Args:
        text: the string to tokenize.

    Returns:
        A flat list of token strings for the whole text.
    """
    # Use the public accessor, like the rest of the notebook, rather than the
    # private ``_tokenizer`` attribute.
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    splits = [list(word) for word, _offset in pre_tokenize_result]

    for (left, right), merged in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == left and split[i + 1] == right:
                    # Stay on the same index so the merged symbol is re-checked.
                    split = split[:i] + [merged] + split[i + 2:]
                else:
                    i += 1
            splits[idx] = split

    # Flatten in one pass; sum(splits, []) would copy the growing list each step.
    return [token for split in splits for token in split]


In [39]:
tokenize("This is not a token.")


['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']

In [40]:
text = "This is not a token."
pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
pre_tokenize_result

[('This', (0, 4)),
 ('Ġis', (4, 7)),
 ('Ġnot', (7, 11)),
 ('Ġa', (11, 13)),
 ('Ġtoken', (13, 19)),
 ('.', (19, 20))]

In [41]:
pre_tokenized_text = [word for word, offset in pre_tokenize_result]
pre_tokenized_text

['This', 'Ġis', 'Ġnot', 'Ġa', 'Ġtoken', '.']

In [42]:
splits = [[l for l in word] for word in pre_tokenized_text]
splits

[['T', 'h', 'i', 's'],
 ['Ġ', 'i', 's'],
 ['Ġ', 'n', 'o', 't'],
 ['Ġ', 'a'],
 ['Ġ', 't', 'o', 'k', 'e', 'n'],
 ['.']]

In [43]:
# Replay each learned merge, in the order it was learned, on every word's symbols.
for (left, right), merged in merges.items():
    for idx in range(len(splits)):
        symbols = splits[idx]
        pos = 0
        while pos + 1 < len(symbols):
            if symbols[pos] == left and symbols[pos + 1] == right:
                # Splice in the merged token and re-check the same position.
                symbols = symbols[:pos] + [merged] + symbols[pos + 2:]
            else:
                pos += 1
        splits[idx] = symbols


In [44]:
splits

[['This'], ['Ġis'], ['Ġ', 'n', 'o', 't'], ['Ġa'], ['Ġtoken'], ['.']]

In [45]:
sum(splits, [])

['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']

In [46]:
tokenize("Tis")  # no learned merge produces "Tis", so it falls back to the known pieces 'T' + 'is'; this BPE tokenize() never emits [UNK]

['T', 'is']

In [47]:
print(corpus)

['This is the Hugging Face course.', 'This chapter is about tokenization.', 'This section shows several tokenizer algorithms.', 'Hopefully, you will be able to understand how they are trained and generate tokens.']


In [48]:
new_tokenizer = tokenizer.train_new_from_iterator(corpus, 50)






In [49]:
print(new_tokenizer)

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


In [50]:
print(new_tokenizer.vocab)

{'U': 53, 'F': 38, 'ö': 179, '¤': 98, '.': 14, 'Þ': 155, '¹': 118, 'Ć': 195, 'J': 42, 'o': 79, 'Ð': 141, 'Ï': 140, 'r': 82, 'ĥ': 226, 'Ô': 145, 'ķ': 244, 'c': 67, 's': 83, 'ę': 214, 'Ĺ': 246, '^': 62, 'f': 70, 'Û': 152, 'í': 170, '7': 23, '|': 92, 'Ò': 143, 'ĕ': 210, 'ă': 192, 'Ã': 128, 'Ú': 151, 'N': 46, '«': 105, '¾': 123, '!': 1, 'P': 48, 'Á': 126, 'Ü': 153, 'µ': 114, 'á': 158, '3': 19, 'ä': 161, 'ğ': 220, 'Ą': 193, 'n': 78, 'k': 75, 'g': 71, '¦': 100, 'Ī': 231, 'Į': 235, 'Ċ': 199, '±': 110, 'ð': 173, '´': 113, 'ĩ': 230, '[': 59, '¼': 121, '<|endoftext|>': 0, '£': 97, '½': 122, 'ā': 190, 'O': 47, 'Õ': 146, 'ø': 181, '\\': 60, 'æ': 163, '&': 6, '³': 112, '"': 2, '×': 148, 'Ń': 256, '4': 20, 'ċ': 200, 'ģ': 224, 'Ê': 135, ']': 61, '-': 13, 'Ĝ': 217, 'x': 88, 'ã': 160, '÷': 180, 'È': 133, 'ô': 177, 'Ē': 207, '6': 22, 'Ĥ': 225, 'ĳ': 240, 'G': 39, ',': 12, 'ě': 216, 'M': 45, '_': 63, 'ò': 175, 'ĸ': 245, 'd': 68, 'E': 37, 'Đ': 205, 'ı': 238, 'ß': 156, '2': 18, ';': 27, '`': 64, '9': 25, 'Ğ

In [51]:
new_tokenizer("This is not a token.").tokens()

['T',
 'h',
 'i',
 's',
 'Ġ',
 'i',
 's',
 'Ġ',
 'n',
 'o',
 't',
 'Ġ',
 'a',
 'Ġ',
 't',
 'o',
 'k',
 'e',
 'n',
 '.']

#### WordPiece 토큰화

WordPiece는 Google이 BERT를 사전 학습하기 위해 개발한 토큰화 알고리즘입니다. 그 이후로 DistilBERT, MobileBERT, Funnel Transformers 및 MPNet과 같은 BERT 기반의 상당히 많은 Transformer 모델에서 재사용되었습니다. 학습 측면에서 BPE와 매우 유사하지만 실제 토큰화는 다르게 수행됩니다.

In [52]:
corpus = [
    "This is the Hugging Face course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [53]:
len(corpus)

4

In [54]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [55]:
print([tokenizer(corpus).tokens(i) for i in range(len(corpus))])

[['[CLS]', 'This', 'is', 'the', 'Hu', '##gging', 'Face', 'course', '.', '[SEP]'], ['[CLS]', 'This', 'chapter', 'is', 'about', 'token', '##ization', '.', '[SEP]'], ['[CLS]', 'This', 'section', 'shows', 'several', 'token', '##izer', 'algorithms', '.', '[SEP]'], ['[CLS]', 'Hopefully', ',', 'you', 'will', 'be', 'able', 'to', 'understand', 'how', 'they', 'are', 'trained', 'and', 'generate', 'token', '##s', '.', '[SEP]']]


In [56]:
from collections import defaultdict

# Tally how many times each pre-tokenized word appears across the corpus.
word_freqs = defaultdict(int)
for sentence in corpus:
    # pre_tokenize_str yields (word, offsets) pairs; only the word is counted.
    for word, _offsets in tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(sentence):
        word_freqs[word] += 1

word_freqs

defaultdict(int,
            {'This': 3,
             'is': 2,
             'the': 1,
             'Hugging': 1,
             'Face': 1,
             'course': 1,
             '.': 4,
             'chapter': 1,
             'about': 1,
             'tokenization': 1,
             'section': 1,
             'shows': 1,
             'several': 1,
             'tokenizer': 1,
             'algorithms': 1,
             'Hopefully': 1,
             ',': 1,
             'you': 1,
             'will': 1,
             'be': 1,
             'able': 1,
             'to': 1,
             'understand': 1,
             'how': 1,
             'they': 1,
             'are': 1,
             'trained': 1,
             'and': 1,
             'generate': 1,
             'tokens': 1})

In [57]:
[tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text) for text in corpus]

[[('This', (0, 4)),
  ('is', (5, 7)),
  ('the', (8, 11)),
  ('Hugging', (12, 19)),
  ('Face', (20, 24)),
  ('course', (25, 31)),
  ('.', (31, 32))],
 [('This', (0, 4)),
  ('chapter', (5, 12)),
  ('is', (13, 15)),
  ('about', (16, 21)),
  ('tokenization', (22, 34)),
  ('.', (34, 35))],
 [('This', (0, 4)),
  ('section', (5, 12)),
  ('shows', (13, 18)),
  ('several', (19, 26)),
  ('tokenizer', (27, 36)),
  ('algorithms', (37, 47)),
  ('.', (47, 48))],
 [('Hopefully', (0, 9)),
  (',', (9, 10)),
  ('you', (11, 14)),
  ('will', (15, 19)),
  ('be', (20, 22)),
  ('able', (23, 27)),
  ('to', (28, 30)),
  ('understand', (31, 41)),
  ('how', (42, 45)),
  ('they', (46, 50)),
  ('are', (51, 54)),
  ('trained', (55, 62)),
  ('and', (63, 66)),
  ('generate', (67, 75)),
  ('tokens', (76, 82)),
  ('.', (82, 83))]]

In [58]:
# Initial WordPiece alphabet: the first character of each word as-is, and every
# later character with the "##" continuation prefix. Sets deduplicate in one
# pass instead of scanning a growing list for each character.
alphabet = sorted(
    {word[0] for word in word_freqs}
    | {f"##{letter}" for word in word_freqs for letter in word[1:]}
)

print(alphabet)


['##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y']


In [59]:
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()

In [60]:
print(vocab)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y']


In [61]:
splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}
print(splits)

{'This': ['T', '##h', '##i', '##s'], 'is': ['i', '##s'], 'the': ['t', '##h', '##e'], 'Hugging': ['H', '##u', '##g', '##g', '##i', '##n', '##g'], 'Face': ['F', '##a', '##c', '##e'], 'course': ['c', '##o', '##u', '##r', '##s', '##e'], '.': ['.'], 'chapter': ['c', '##h', '##a', '##p', '##t', '##e', '##r'], 'about': ['a', '##b', '##o', '##u', '##t'], 'tokenization': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##a', '##t', '##i', '##o', '##n'], 'section': ['s', '##e', '##c', '##t', '##i', '##o', '##n'], 'shows': ['s', '##h', '##o', '##w', '##s'], 'several': ['s', '##e', '##v', '##e', '##r', '##a', '##l'], 'tokenizer': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##e', '##r'], 'algorithms': ['a', '##l', '##g', '##o', '##r', '##i', '##t', '##h', '##m', '##s'], 'Hopefully': ['H', '##o', '##p', '##e', '##f', '##u', '##l', '##l', '##y'], ',': [','], 'you': ['y', '##o', '##u'], 'will': ['w', '##i', '##l', '##l'], 'be': ['b', '##e'], 'able': ['a', '##b', '##l', '##e'], 'to': ['t', '##o'],

In [62]:
def compute_pair_scores(splits):
    """Score each adjacent symbol pair as pair_freq / (left_freq * right_freq).

    Args:
        splits: mapping from word to its current list of symbols.

    Returns:
        A plain dict keyed by ``(left, right)`` symbol tuples. All counts are
        weighted by the word frequencies in the notebook-level ``word_freqs``.
    """
    symbol_counts = defaultdict(int)
    pair_counts = defaultdict(int)
    for word, freq in word_freqs.items():
        symbols = splits[word]
        for symbol in symbols[:-1]:
            symbol_counts[symbol] += freq
        # The last symbol is indexed explicitly, so an empty split still
        # raises IndexError here.
        symbol_counts[symbols[-1]] += freq
        for pair in zip(symbols, symbols[1:]):
            pair_counts[pair] += freq

    return {
        pair: count / (symbol_counts[pair[0]] * symbol_counts[pair[1]])
        for pair, count in pair_counts.items()
    }


In [63]:
pair_scores = compute_pair_scores(splits)
for i, key in enumerate(pair_scores.keys()):
    print(f"{key}: {pair_scores[key]}")
    if i >= 5:
        break


('T', '##h'): 0.125
('##h', '##i'): 0.03409090909090909
('##i', '##s'): 0.02727272727272727
('i', '##s'): 0.1
('t', '##h'): 0.03571428571428571
('##h', '##e'): 0.011904761904761904


In [64]:
print(pair_scores)

{('T', '##h'): 0.125, ('##h', '##i'): 0.03409090909090909, ('##i', '##s'): 0.02727272727272727, ('i', '##s'): 0.1, ('t', '##h'): 0.03571428571428571, ('##h', '##e'): 0.011904761904761904, ('H', '##u'): 0.1, ('##u', '##g'): 0.05, ('##g', '##g'): 0.0625, ('##g', '##i'): 0.022727272727272728, ('##i', '##n'): 0.01652892561983471, ('##n', '##g'): 0.022727272727272728, ('F', '##a'): 0.14285714285714285, ('##a', '##c'): 0.07142857142857142, ('##c', '##e'): 0.023809523809523808, ('c', '##o'): 0.038461538461538464, ('##o', '##u'): 0.046153846153846156, ('##u', '##r'): 0.022222222222222223, ('##r', '##s'): 0.022222222222222223, ('##s', '##e'): 0.004761904761904762, ('c', '##h'): 0.0625, ('##h', '##a'): 0.017857142857142856, ('##a', '##p'): 0.07142857142857142, ('##p', '##t'): 0.07142857142857142, ('##t', '##e'): 0.013605442176870748, ('##e', '##r'): 0.026455026455026454, ('a', '##b'): 0.2, ('##b', '##o'): 0.038461538461538464, ('##u', '##t'): 0.02857142857142857, ('t', '##o'): 0.0439560439560439

In [65]:
best_pair = ""
max_score = None
for pair, score in pair_scores.items():
    if max_score is None or max_score < score:
        best_pair = pair
        max_score = score

print(best_pair, max_score)

('a', '##b') 0.2


In [66]:
vocab.append("ab")
print(vocab)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y', 'ab']


In [67]:
def merge_pair(a, b, splits):
    """Fuse every adjacent ``a``, ``b`` occurrence into one WordPiece symbol.

    A leading ``##`` on ``b`` is dropped before concatenation, so merging
    ``"a"`` with ``"##b"`` gives ``"ab"``.

    Args:
        a: left symbol of the pair to merge.
        b: right symbol of the pair to merge.
        splits: mapping from word to its list of symbols; updated in place.

    Returns:
        The same ``splits`` mapping, after updating every word listed in the
        notebook-level ``word_freqs``.
    """
    for word in word_freqs:
        pieces = splits[word]
        if len(pieces) == 1:
            # A lone piece has no neighbour to merge with.
            continue
        pos = 0
        while pos + 1 < len(pieces):
            if pieces[pos] != a or pieces[pos + 1] != b:
                pos += 1
                continue
            tail = b[2:] if b.startswith("##") else b
            # Splice in the fused piece and re-examine the same position.
            pieces = pieces[:pos] + [a + tail] + pieces[pos + 2:]
        splits[word] = pieces
    return splits


In [68]:
splits = merge_pair("a", "##b", splits)
print(splits)
print(splits["about"])

{'This': ['T', '##h', '##i', '##s'], 'is': ['i', '##s'], 'the': ['t', '##h', '##e'], 'Hugging': ['H', '##u', '##g', '##g', '##i', '##n', '##g'], 'Face': ['F', '##a', '##c', '##e'], 'course': ['c', '##o', '##u', '##r', '##s', '##e'], '.': ['.'], 'chapter': ['c', '##h', '##a', '##p', '##t', '##e', '##r'], 'about': ['ab', '##o', '##u', '##t'], 'tokenization': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##a', '##t', '##i', '##o', '##n'], 'section': ['s', '##e', '##c', '##t', '##i', '##o', '##n'], 'shows': ['s', '##h', '##o', '##w', '##s'], 'several': ['s', '##e', '##v', '##e', '##r', '##a', '##l'], 'tokenizer': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##e', '##r'], 'algorithms': ['a', '##l', '##g', '##o', '##r', '##i', '##t', '##h', '##m', '##s'], 'Hopefully': ['H', '##o', '##p', '##e', '##f', '##u', '##l', '##l', '##y'], ',': [','], 'you': ['y', '##o', '##u'], 'will': ['w', '##i', '##l', '##l'], 'be': ['b', '##e'], 'able': ['ab', '##l', '##e'], 'to': ['t', '##o'], 'understand

In [69]:
vocab_size = 70

# Greedily merge the best-scoring pair until the vocabulary reaches the target size.
while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    if not scores:
        # Every word is already a single piece, so no pair is left to merge;
        # stop here instead of unpacking an empty best pair.
        break
    # max() returns the first key with the highest score, i.e. the same
    # first-wins tie-breaking as a strict "<" scan over the items.
    best_pair = max(scores, key=scores.get)
    max_score = scores[best_pair]
    splits = merge_pair(*best_pair, splits)
    # Drop the "##" continuation marker of the right piece before joining.
    new_token = best_pair[0] + (
        best_pair[1][2:] if best_pair[1].startswith("##") else best_pair[1]
    )
    vocab.append(new_token)


In [70]:
print(vocab)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y', 'ab', '##fu', 'Fa', 'Fac', '##ct', '##ful', '##full', '##fully', 'Th', '##hm', '##thm', 'Hu', 'Hug', 'Hugg', 'ch', 'cha', 'chap', 'chapt', 'sh', 'th', 'is', '##thms', '##za', '##zat', '##ut', '##ta']


In [71]:
print(splits)

{'This': ['Th', '##i', '##s'], 'is': ['is'], 'the': ['th', '##e'], 'Hugging': ['Hugg', '##i', '##n', '##g'], 'Face': ['Fac', '##e'], 'course': ['c', '##o', '##u', '##r', '##s', '##e'], '.': ['.'], 'chapter': ['chapt', '##e', '##r'], 'about': ['ab', '##o', '##ut'], 'tokenization': ['t', '##o', '##k', '##e', '##n', '##i', '##zat', '##i', '##o', '##n'], 'section': ['s', '##e', '##ct', '##i', '##o', '##n'], 'shows': ['sh', '##o', '##w', '##s'], 'several': ['s', '##e', '##v', '##e', '##r', '##a', '##l'], 'tokenizer': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##e', '##r'], 'algorithms': ['a', '##l', '##g', '##o', '##r', '##i', '##thms'], 'Hopefully': ['H', '##o', '##p', '##e', '##fully'], ',': [','], 'you': ['y', '##o', '##u'], 'will': ['w', '##i', '##l', '##l'], 'be': ['b', '##e'], 'able': ['ab', '##l', '##e'], 'to': ['t', '##o'], 'understand': ['u', '##n', '##d', '##e', '##r', '##s', '##ta', '##n', '##d'], 'how': ['h', '##o', '##w'], 'they': ['th', '##e', '##y'], 'are': ['a', '##r',

In [72]:
def encode_word(word):
    """Split ``word`` greedily into the longest pieces found in ``vocab``.

    After the first piece, the remainder is prefixed with ``##`` before the
    next lookup. ``vocab`` is the notebook-level vocabulary.

    Args:
        word: a single pre-tokenized word.

    Returns:
        The list of pieces, or ``["[UNK]"]`` as soon as no prefix of the
        remaining text is in the vocabulary.
    """
    pieces = []
    remaining = word
    while remaining:
        # Longest matching prefix length, or 0 when nothing matches.
        end = next(
            (k for k in range(len(remaining), 0, -1) if remaining[:k] in vocab),
            0,
        )
        if not end:
            return ["[UNK]"]
        pieces.append(remaining[:end])
        remaining = remaining[end:]
        if remaining:
            remaining = "##" + remaining
    return pieces


In [73]:
print(encode_word("Hugging"))
print(encode_word("HOgging"))

['Hugg', '##i', '##n', '##g']
['[UNK]']


In [74]:
def tokenize(text):
    """Tokenize ``text`` with the WordPiece vocabulary built in this notebook.

    The text is pre-tokenized with the notebook-level ``tokenizer`` and each
    resulting word is encoded with ``encode_word``.

    Args:
        text: the string to tokenize.

    Returns:
        A flat list of token strings for the whole text.
    """
    # Use the public accessor, like the rest of the notebook, rather than the
    # private ``_tokenizer`` attribute.
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    encoded_words = [encode_word(word) for word, _offset in pre_tokenize_result]
    # Flatten in one pass; sum(encoded_words, []) would copy the growing list each step.
    return [token for pieces in encoded_words for token in pieces]


In [75]:
print(tokenize("This is the Hugging Face course!"))

['Th', '##i', '##s', 'is', 'th', '##e', 'Hugg', '##i', '##n', '##g', 'Fac', '##e', 'c', '##o', '##u', '##r', '##s', '##e', '[UNK]']


#### Unigram 토큰화

Unigram 알고리즘은 ALBERT, T5, mBART, Big Bird 및 XLNet과 같은 모델에서 사용되는 토큰화 알고리즘인 SentencePiece에서 자주 사용됩니다.

In [76]:
corpus = [
    "This is the Hugging Face course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [77]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

In [78]:
from collections import defaultdict

# Tally how many times each pre-tokenized word appears across the corpus.
word_freqs = defaultdict(int)
for sentence in corpus:
    # pre_tokenize_str yields (word, offsets) pairs; only the word is counted.
    for word, _offsets in tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(sentence):
        word_freqs[word] += 1

word_freqs

defaultdict(int,
            {'▁This': 3,
             '▁is': 2,
             '▁the': 1,
             '▁Hugging': 1,
             '▁Face': 1,
             '▁course.': 1,
             '▁chapter': 1,
             '▁about': 1,
             '▁tokenization.': 1,
             '▁section': 1,
             '▁shows': 1,
             '▁several': 1,
             '▁tokenizer': 1,
             '▁algorithms.': 1,
             '▁Hopefully,': 1,
             '▁you': 1,
             '▁will': 1,
             '▁be': 1,
             '▁able': 1,
             '▁to': 1,
             '▁understand': 1,
             '▁how': 1,
             '▁they': 1,
             '▁are': 1,
             '▁trained': 1,
             '▁and': 1,
             '▁generate': 1,
             '▁tokens.': 1})

In [79]:
char_freqs = defaultdict(int)
subwords_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    for start, char in enumerate(word):
        char_freqs[char] += freq
        # Register every substring of length >= 2 that begins at this position.
        for stop in range(start + 2, len(word) + 1):
            subwords_freqs[word[start:stop]] += freq

# Order the sub-words from most to least frequent.
sorted_subwords = sorted(
    subwords_freqs.items(), key=lambda item: item[1], reverse=True
)
sorted_subwords[:10]

[('▁t', 7),
 ('is', 5),
 ('er', 5),
 ('▁a', 5),
 ('▁to', 4),
 ('to', 4),
 ('en', 4),
 ('▁T', 3),
 ('▁Th', 3),
 ('▁Thi', 3)]

In [80]:
print(sorted_subwords)

[('▁t', 7), ('is', 5), ('er', 5), ('▁a', 5), ('▁to', 4), ('to', 4), ('en', 4), ('▁T', 3), ('▁Th', 3), ('▁Thi', 3), ('▁This', 3), ('Th', 3), ('Thi', 3), ('This', 3), ('hi', 3), ('his', 3), ('th', 3), ('ou', 3), ('se', 3), ('▁tok', 3), ('▁toke', 3), ('▁token', 3), ('tok', 3), ('toke', 3), ('token', 3), ('ok', 3), ('oke', 3), ('oken', 3), ('ke', 3), ('ken', 3), ('▁s', 3), ('ra', 3), ('nd', 3), ('▁i', 2), ('▁is', 2), ('▁th', 2), ('▁the', 2), ('the', 2), ('he', 2), ('▁H', 2), ('in', 2), ('▁c', 2), ('rs', 2), ('te', 2), ('▁ab', 2), ('ab', 2), ('▁tokeni', 2), ('▁tokeniz', 2), ('tokeni', 2), ('tokeniz', 2), ('okeni', 2), ('okeniz', 2), ('keni', 2), ('keniz', 2), ('eni', 2), ('eniz', 2), ('ni', 2), ('niz', 2), ('iz', 2), ('at', 2), ('ti', 2), ('tio', 2), ('tion', 2), ('io', 2), ('ion', 2), ('on', 2), ('▁se', 2), ('ho', 2), ('how', 2), ('ow', 2), ('era', 2), ('al', 2), ('s.', 2), ('ll', 2), ('an', 2), ('and', 2), ('ne', 2), ('▁Hu', 1), ('▁Hug', 1), ('▁Hugg', 1), ('▁Huggi', 1), ('▁Huggin', 1), ('

In [81]:
# Seed vocabulary: every single character, topped up with the most frequent
# sub-words so that at most 300 entries are taken in total.
token_freqs = dict(char_freqs)
token_freqs.update(sorted_subwords[: 300 - len(char_freqs)])

In [82]:
print(token_freqs)

{'▁': 31, 'T': 3, 'h': 9, 'i': 13, 's': 13, 't': 14, 'e': 21, 'H': 2, 'u': 6, 'g': 5, 'n': 11, 'F': 1, 'a': 12, 'c': 4, 'o': 13, 'r': 9, '.': 4, 'p': 2, 'b': 3, 'k': 3, 'z': 2, 'w': 3, 'v': 1, 'l': 7, 'm': 1, 'f': 1, 'y': 3, ',': 1, 'd': 4, '▁t': 7, 'is': 5, 'er': 5, '▁a': 5, '▁to': 4, 'to': 4, 'en': 4, '▁T': 3, '▁Th': 3, '▁Thi': 3, '▁This': 3, 'Th': 3, 'Thi': 3, 'This': 3, 'hi': 3, 'his': 3, 'th': 3, 'ou': 3, 'se': 3, '▁tok': 3, '▁toke': 3, '▁token': 3, 'tok': 3, 'toke': 3, 'token': 3, 'ok': 3, 'oke': 3, 'oken': 3, 'ke': 3, 'ken': 3, '▁s': 3, 'ra': 3, 'nd': 3, '▁i': 2, '▁is': 2, '▁th': 2, '▁the': 2, 'the': 2, 'he': 2, '▁H': 2, 'in': 2, '▁c': 2, 'rs': 2, 'te': 2, '▁ab': 2, 'ab': 2, '▁tokeni': 2, '▁tokeniz': 2, 'tokeni': 2, 'tokeniz': 2, 'okeni': 2, 'okeniz': 2, 'keni': 2, 'keniz': 2, 'eni': 2, 'eniz': 2, 'ni': 2, 'niz': 2, 'iz': 2, 'at': 2, 'ti': 2, 'tio': 2, 'tion': 2, 'io': 2, 'ion': 2, 'on': 2, '▁se': 2, 'ho': 2, 'how': 2, 'ow': 2, 'era': 2, 'al': 2, 's.': 2, 'll': 2, 'an': 2, 'and'

In [83]:
from math import log

# Turn the raw counts into negative log-probabilities (lower means more likely).
total_sum = sum(token_freqs.values())
model = {
    token: -log(count / total_sum)
    for token, count in token_freqs.items()
}

In [84]:
total_sum

596

In [85]:
print(model)

{'▁': 2.9562534625802033, 'T': 5.29162837839724, 'h': 4.19301608972913, 'i': 3.8252913096038133, 's': 3.8252913096038133, 't': 3.751183337450091, 'e': 3.345718229341927, 'H': 5.697093486505405, 'u': 4.598481197837295, 'g': 4.7808027546312495, 'n': 3.9923453942669793, 'F': 6.39024066706535, 'a': 3.9053340172773496, 'c': 5.003946305945459, 'o': 3.8252913096038133, 'r': 4.19301608972913, '.': 5.003946305945459, 'p': 5.697093486505405, 'b': 5.29162837839724, 'k': 5.29162837839724, 'z': 5.697093486505405, 'w': 5.29162837839724, 'v': 6.39024066706535, 'l': 4.4443305180100365, 'm': 6.39024066706535, 'f': 6.39024066706535, 'y': 5.29162837839724, ',': 6.39024066706535, 'd': 5.003946305945459, '▁t': 4.4443305180100365, 'is': 4.7808027546312495, 'er': 4.7808027546312495, '▁a': 4.7808027546312495, '▁to': 5.003946305945459, 'to': 5.003946305945459, 'en': 5.003946305945459, '▁T': 5.29162837839724, '▁Th': 5.29162837839724, '▁Thi': 5.29162837839724, '▁This': 5.29162837839724, 'Th': 5.29162837839724, '

In [86]:
def encode_word(word, model):
    """Split ``word`` into the cheapest sequence of tokens found in ``model``.

    Args:
        word: The string to segment.
        model: Mapping from token string to its cost; lower totals win.

    Returns:
        A ``(tokens, score)`` tuple. ``score`` is the sum of the chosen token
        costs plus the seed cost of 1 stored for the empty prefix. When no
        segmentation covers the whole word, returns ``(["<unk>"], None)``.
    """
    word_len = len(word)
    # best[k] describes the cheapest known segmentation of word[:k]: where its
    # last token starts and what the running cost is. Index 0 is the seed.
    best = [{"start": 0, "score": 1}]
    best.extend({"start": None, "score": None} for _ in range(word_len))

    for begin in range(word_len):
        prefix_cost = best[begin]["score"]
        if prefix_cost is None:
            # word[:begin] has no segmentation, so no token can start here.
            continue
        for stop in range(begin + 1, word_len + 1):
            piece = word[begin:stop]
            if piece not in model:
                continue
            candidate = model[piece] + prefix_cost
            current = best[stop]["score"]
            # Only a strictly cheaper candidate replaces an existing entry.
            if current is None or current > candidate:
                best[stop] = {"start": begin, "score": candidate}

    final = best[-1]
    if final["score"] is None:
        # We did not find a tokenization of the word -> unknown
        return ["<unk>"], None

    # Follow the stored start positions from the end of the word back to 0,
    # collecting tokens right-to-left, then flip them into reading order.
    pieces = []
    stop = word_len
    begin = final["start"]
    while begin != 0:
        pieces.append(word[begin:stop])
        stop, begin = begin, best[begin]["start"]
    pieces.append(word[begin:stop])
    pieces.reverse()
    return pieces, final["score"]

In [87]:
print(encode_word("Hopefully", model))
print(encode_word("This", model))

(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.54264024176184)
(['This'], 6.29162837839724)


In [88]:
def compute_loss(model):
    """Return the corpus loss under ``model``.

    Every word in ``word_freqs`` is segmented with ``encode_word``; the
    segmentation score is weighted by the word's frequency and the weighted
    scores are added up in iteration order.
    """
    total = 0
    for word in word_freqs:
        segmentation_score = encode_word(word, model)[1]
        total += word_freqs[word] * segmentation_score
    return total


In [89]:
compute_loss(model)

413.362600202517

In [90]:
import copy

def compute_scores(model):
    """Measure how much the corpus loss changes when each token is removed.

    Args:
        model: Mapping from token string to its cost.

    Returns:
        Dict mapping every token longer than one character to
        ``compute_loss(model without that token) - compute_loss(model)``.
        Single-character tokens are never scored, so they are never
        candidates for removal.
    """
    scores = {}
    model_loss = compute_loss(model)
    for token in model:
        # We always keep tokens of length 1
        if len(token) == 1:
            continue
        # The model is a flat dict of token -> number, so a filtered shallow
        # copy gives the same reduced model as deepcopy + pop without paying
        # for a recursive copy on every candidate token.
        model_without_token = {
            other: cost for other, cost in model.items() if other != token
        }
        scores[token] = compute_loss(model_without_token) - model_loss
    return scores

In [91]:
scores = compute_scores(model)
print(scores["ll"])
print(scores["his"])

6.383135099029346
0.0


In [92]:
percent_to_remove = 0.1
while len(model) > 100:
    scores = compute_scores(model)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1])
    # compute_scores never scores single-character tokens, so there can be
    # fewer removable candidates than the requested share of the vocabulary.
    # Cap the count so indexing cannot run past the end of sorted_scores.
    num_to_remove = min(int(len(model) * percent_to_remove), len(sorted_scores))
    if num_to_remove == 0:
        # Nothing removable is left; stop instead of looping forever.
        break
    # Remove the tokens with the lowest scores.
    for token, _score in sorted_scores[:num_to_remove]:
        del token_freqs[token]

    # Rebuild the costs from the surviving counts.
    total_sum = sum(token_freqs.values())
    model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}

In [93]:
def tokenize(text, model):
    """Tokenize ``text`` with the toy unigram ``model``.

    The text is pre-tokenized with the pre-tokenizer of the notebook-level
    ``tokenizer``; each resulting word is segmented by ``encode_word`` and the
    per-word token lists are concatenated in order.

    Args:
        text: Raw input string.
        model: Mapping from token string to its cost.

    Returns:
        Flat list of token strings.
    """
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # Flatten in a single pass: sum(list_of_lists, []) re-copies the growing
    # result for every word, which is quadratic in the number of tokens.
    return [
        token
        for word, _offset in words_with_offsets
        for token in encode_word(word, model)[0]
    ]

tokenize("This is the Hugging Face course.", model)


['▁This', '▁is', '▁the', '▁Hugging', '▁Face', '▁course.']

### 블록 단위로 토크나이저 빌딩하기

In [94]:
from datasets import load_dataset


dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")


def get_training_corpus(batch_size=1000):
    """Yield the ``"text"`` column of ``dataset`` in consecutive batches.

    Args:
        batch_size: Number of rows per batch. Defaults to 1000, the value
            that was previously hard-coded. A value of 0 makes ``range``
            raise ``ValueError``.

    Yields:
        ``dataset[start : start + batch_size]["text"]`` for each batch start;
        the final batch may be shorter.
    """
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size]["text"]


# Dump the corpus to disk, one dataset row per line, so the tokenizers can
# also be trained from a file path.
with open("wikitext-2.txt", "w", encoding="utf-8") as f:
    f.writelines(row["text"] + "\n" for row in dataset)


#### WordPiece 토크나이저를 처음부터 빌딩하기

In [95]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [96]:
tokenizer

<tokenizers.Tokenizer at 0x24fe350>

In [97]:
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

In [98]:
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


In [99]:
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [100]:
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [101]:
# Stand-alone pre-tokenizer chaining WhitespaceSplit and Punctuation. It is
# only called here for comparison and is not assigned to
# tokenizer.pre_tokenizer; on this sentence its output matches the
# Whitespace() pre-tokenizer configured above.
pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [102]:
print(tokenizer)

<tokenizers.Tokenizer object at 0x24fe350>


In [103]:
# Special tokens handed to the trainer. After training, token_to_id() in this
# notebook reports 2 for "[CLS]" and 3 for "[SEP]", which matches their
# positions in this list.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

In [104]:
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)






In [105]:
encoding = tokenizer.encode("Let's test this tokenizer")
print(encoding.tokens)

['let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer']


In [106]:
encoding

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [107]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


In [108]:
# BERT-style template: [CLS] opens the input and [SEP] closes each sequence.
# The number after each colon is the token type id (0 for the first sequence,
# 1 for the second). The templates contain no placeholders, so they are plain
# strings rather than f-strings.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [109]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']


In [110]:
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '...', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


In [111]:
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [112]:
encoding.ids

[2,
 2817,
 11,
 61,
 3409,
 1317,
 24117,
 18701,
 6411,
 2180,
 3,
 1167,
 43,
 3952,
 1143,
 9250,
 18,
 3]

In [113]:
tokenizer.decode(encoding.ids)

"let ' s test this tokenizer... on a pair of sentences."

In [114]:
tokenizer.save("tokenizer.json")

In [115]:
new_tokenizer = Tokenizer.from_file("tokenizer.json")

In [116]:
new_tokenizer

<tokenizers.Tokenizer at 0x9cc1150>

In [117]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)


In [118]:
inputs = wrapped_tokenizer("Let's test my pre-tokenizer.")
print(inputs)
print(inputs.tokens())

{'input_ids': [2, 2817, 11, 61, 3409, 2031, 1435, 17, 24117, 18701, 6411, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'let', "'", 's', 'test', 'my', 'pre', '-', 'tok', '##eni', '##zer', '.', '[SEP]']


In [119]:
from transformers import BertTokenizerFast

wrapped_tokenizer2 = BertTokenizerFast(tokenizer_object=tokenizer)

In [120]:
inputs2 = wrapped_tokenizer2("Let's test my pre-tokenizer.")
print(inputs2)
print(inputs2.tokens())

{'input_ids': [2, 2817, 11, 61, 3409, 2031, 1435, 17, 24117, 18701, 6411, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'let', "'", 's', 'test', 'my', 'pre', '-', 'tok', '##eni', '##zer', '.', '[SEP]']


#### BPE 토크나이저를 처음부터 빌딩하기

In [121]:
tokenizer = Tokenizer(models.BPE())

In [122]:
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [123]:
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

[('Let', (0, 3)),
 ("'s", (3, 5)),
 ('Ġtest', (5, 10)),
 ('Ġpre', (10, 14)),
 ('-', (14, 15)),
 ('tokenization', (15, 27)),
 ('!', (27, 28))]

In [124]:
trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)






In [125]:
# Replace the trained model with a newly constructed BPE model, then train
# again with the same trainer, this time reading the corpus from the
# "wikitext-2.txt" file instead of the in-memory batch iterator.
tokenizer.model = models.BPE()
tokenizer.train(["wikitext-2.txt"], trainer=trainer)






In [126]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['L', 'et', "'", 's', 'Ġtest', 'Ġthis', 'Ġto', 'ken', 'izer', '.']


In [127]:
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [128]:
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
# Index 4 is the 'Ġtest' token in the token list printed earlier. Slicing the
# sentence with its offsets yields ' test', leading space included
# (presumably because the post-processor uses trim_offsets=False).
start, end = encoding.offsets[4]
sentence[start:end]

' test'

In [129]:
tokenizer.decoder = decoders.ByteLevel()

In [130]:
tokenizer.decode(encoding.ids)

"Let's test this tokenizer."

In [131]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
)


In [132]:
from transformers import GPT2TokenizerFast

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)


#### Unigram 토크나이저를 처음부터 빌딩하기

In [133]:
tokenizer = Tokenizer(models.Unigram())

In [134]:
from tokenizers import Regex

# Normalization pipeline; the steps are listed in the order given to Sequence.
tokenizer.normalizer = normalizers.Sequence(
    [
        # Rewrite both `` and '' as a plain double quote.
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
        # Unicode NFKD normalization followed by accent stripping.
        normalizers.NFKD(),
        normalizers.StripAccents(),
        # Collapse any run of two or more spaces into a single space.
        normalizers.Replace(Regex(" {2,}"), " "),
    ]
)


In [135]:
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()

In [136]:
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test the pre-tokenizer!")

[("▁Let's", (0, 5)),
 ('▁test', (5, 10)),
 ('▁the', (10, 14)),
 ('▁pre-tokenizer!', (14, 29))]

In [137]:
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
trainer = trainers.UnigramTrainer(
    vocab_size=25000, special_tokens=special_tokens, unk_token="<unk>"
)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)





In [138]:
# Replace the trained model with a newly constructed Unigram model, then train
# again with the same trainer, this time reading the corpus from the
# "wikitext-2.txt" file instead of the in-memory batch iterator.
tokenizer.model = models.Unigram()
tokenizer.train(["wikitext-2.txt"], trainer=trainer)





In [139]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.']


In [140]:
cls_token_id = tokenizer.token_to_id("<cls>")
sep_token_id = tokenizer.token_to_id("<sep>")
print(cls_token_id, sep_token_id)

0 1


In [141]:
# Template with the special tokens at the end: <sep> closes each sequence and
# <cls> comes last. The number after each colon is the token type id, so
# <cls> gets its own type id 2 (visible in the type_ids printed by the
# following cells).
tokenizer.post_processor = processors.TemplateProcessing(
    single="$A:0 <sep>:0 <cls>:2",
    pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
    special_tokens=[("<sep>", sep_token_id), ("<cls>", cls_token_id)],
)

In [142]:
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences!")
print(encoding.tokens)
print(encoding.type_ids)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.', '.', '.', '<sep>', '▁', 'on', '▁', 'a', '▁pair', '▁of', '▁sentence', 's', '!', '<sep>', '<cls>']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]


In [143]:
encoding = tokenizer.encode("Let's test this tokenizer...")
print(encoding.tokens)
print(encoding.type_ids)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.', '.', '.', '<sep>', '<cls>']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]


In [144]:
tokenizer.decoder = decoders.Metaspace()

In [145]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained Unigram tokenizer for use with transformers. Each keyword
# names the string that plays that special-token role; all of them were
# included in the trainer's special_tokens list. Padding is set to the left.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>",
    padding_side="left",
)


In [146]:
from transformers import XLNetTokenizerFast

wrapped_tokenizer = XLNetTokenizerFast(tokenizer_object=tokenizer)
