In [1]:
import os
import json
from transformers import AutoTokenizer,LlamaTokenizer 
# Load the API keys from key.json and expose the Hugging Face key
# to this process as the HF_KEY environment variable.
with open("key.json") as f:
    keys = json.loads(f.read())
os.environ["HF_KEY"] = keys["hf_key"]

import tiktoken
from tiktoken.load import load_tiktoken_bpe

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Hugging Face version -- only the fast tokenizer is available for this model.
hf_tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=keys["hf_key"],
)

tokenizer_config.json: 100%|██████████| 51.0k/51.0k [00:00<00:00, 358kB/s]
tokenizer.json: 100%|██████████| 9.09M/9.09M [00:00<00:00, 13.5MB/s]
special_tokens_map.json: 100%|██████████| 73.0/73.0 [00:00<00:00, 344kB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# tiktoken version
# The loading logic mirrors the reference tokenizer in the llama3 repo:
# https://github.com/meta-llama/llama3/blob/d6e09315954d1a547bf45e37269978c049e73d33/llama/tokenizer.py#L38
# 'tokenizer.model' is the file shipped in the HF repo.
mergeable_ranks = load_tiktoken_bpe('tokenizer.model')
num_base_tokens = len(mergeable_ranks)
num_reserved_special_tokens = 256
# Pre-tokenization split pattern handed to tiktoken unchanged.
pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501

# The ten explicitly listed special tokens come first, in this exact order...
special_token_names = [
    "<|begin_of_text|>",
    "<|end_of_text|>",
    "<|reserved_special_token_0|>",
    "<|reserved_special_token_1|>",
    "<|reserved_special_token_2|>",
    "<|reserved_special_token_3|>",
    "<|start_header_id|>",
    "<|end_header_id|>",
    "<|reserved_special_token_4|>",
    "<|eot_id|>",  # end of turn
]
# ...followed by the remaining reserved placeholders, numbered 5 through 250.
special_token_names.extend(
    f"<|reserved_special_token_{i}|>"
    for i in range(5, num_reserved_special_tokens - 5)
)

# Special-token ids continue immediately after the last mergeable BPE rank.
special_tokens = {
    name: token_id
    for token_id, name in enumerate(special_token_names, start=num_base_tokens)
}

tiktoken_tokenizer = tiktoken.Encoding(
    name="llama3-8b",
    pat_str=pat_str,
    mergeable_ranks=mergeable_ranks,
    special_tokens=special_tokens,
)

In [11]:
## Test Tokenization
# Korean sample sentence used by the tokenizer comparison cells.
sample = "안녕하세요 저는 사람입니다."
utf8_bytes = sample.encode('utf-8')

print(sample)
# Show the raw UTF-8 bytes of the sample for reference.
print("UTF-8 ENCODED:", utf8_bytes)
print("\n\n")

안녕하세요 저는 사람입니다.
UTF-8 ENCODED: b'\xec\x95\x88\xeb\x85\x95\xed\x95\x98\xec\x84\xb8\xec\x9a\x94 \xec\xa0\x80\xeb\x8a\x94 \xec\x82\xac\xeb\x9e\x8c\xec\x9e\x85\xeb\x8b\x88\xeb\x8b\xa4.'





# HF 쪽 tokenize 시 스트링이 깨져서 나옴
* "안녕하세요" -> "안", "녕하세요" 로 토큰화되어야 함
* hf_tokenizer.tokenize 로 반환받을 경우 'ìķĪ', 'ëħķíķĺìĦ¸ìļĶ' 로 받아짐 (byte-level BPE가 UTF-8 바이트를 출력 가능한 문자로 치환해 표시하기 때문으로 보임)
	* hf 쪽 vocab 파일 보면 해당 토큰에 매핑되는 값은 맞음
	* tiktoken 버전과 ID 값 같음
	* encode -> id -> decode로 하면 제대로 나오기는 함

In [17]:
## HF Ver
# Encode the sample without special tokens. encode_plus returns a
# BatchEncoding that carries both the ids and the underlying fast-tokenizer
# Encoding objects, so a separate encode() call is not needed.
encoded = hf_tokenizer.encode_plus(sample, add_special_tokens=False)
print("HF ENCODED:", encoded)
print(type(encoded[0]))  # integer indexing yields the tokenizers.Encoding
print(type(encoded))
print(encoded.tokens())
# Use the public `encodings` property rather than the private `_encodings`.
print(encoded.encodings)
print(encoded.encodings[0], type(encoded.encodings[0]))

# tokenize() returns the token strings in their vocabulary (byte-level) form.
tokenized = hf_tokenizer.tokenize(sample, add_special_tokens=False)
print("HF TOKENIZED:", tokenized)

# Decode the first two ids one at a time to show that id -> text is readable
# even though the token strings printed above are not.
for token_id in encoded["input_ids"][:2]:
    decoded = hf_tokenizer.decode([token_id])
    print(decoded)

HF ENCODED: {'input_ids': [101193, 124409, 102678, 16969, 102745, 80052, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
<class 'tokenizers.Encoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>
['ìķĪ', 'ëħķíķĺìĦ¸ìļĶ', 'ĠìłĢ', 'ëĬĶ', 'ĠìĤ¬ëŀĮ', 'ìŀħëĭĪëĭ¤', '.']
[Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]
Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]) <class 'tokenizers.Encoding'>
HF TOKENIZED: ['ìķĪ', 'ëħķíķĺìĦ¸ìļĶ', 'ĠìłĢ', 'ëĬĶ', 'ĠìĤ¬ëŀĮ', 'ìŀħëĭĪëĭ¤', '.']
안
녕하세요


In [14]:
## TIKTOKEN VER
# Encode the same sample with the tiktoken-based tokenizer so the ids can be
# compared with the Hugging Face output.
encoded = tiktoken_tokenizer.encode(sample)
print("TIKTOKEN ENCODED:", encoded)
# Decode the first two tokens individually.
print("TIKTOKEN DECODE T0:", tiktoken_tokenizer.decode([encoded[0]]))
print("TIKTOKEN DECODE T1:", tiktoken_tokenizer.decode([encoded[1]]))
# Use the public Encoding.decode_bytes API instead of reaching into the
# private _core_bpe object; it returns the raw bytes for the id sequence.
decoded = tiktoken_tokenizer.decode_bytes(encoded)
print("TIKTOKEN DECODED:", decoded)
print("TIKTOKEN DECODED DECODE UTF-8:", decoded.decode("utf-8"))

TIKTOKEN ENCODED: [101193, 124409, 102678, 16969, 102745, 80052, 13]
TIKTOKEN DECODE T0: 안
TIKTOKEN DECODE T1: 녕하세요
TIKTOKEN DECODED: b'\xec\x95\x88\xeb\x85\x95\xed\x95\x98\xec\x84\xb8\xec\x9a\x94 \xec\xa0\x80\xeb\x8a\x94 \xec\x82\xac\xeb\x9e\x8c\xec\x9e\x85\xeb\x8b\x88\xeb\x8b\xa4.'
TIKTOKEN DECODED DECODE UTF-8: 안녕하세요 저는 사람입니다.
