<a href="https://colab.research.google.com/github/harryypham/MyMLPractice/blob/main/nlp/hf_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install transformers datasets tiktoken

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [3]:
from datasets import load_dataset

# Train split of the 2023-11-01 Vietnamese Wikipedia dump.
vi_data = load_dataset('wikimedia/wikipedia', '20231101.vi', split='train')

# Write the text of at most the first 10**5 examples to a plain-text corpus file,
# appending a newline after each one.
n_articles = min(len(vi_data), 10**5)
with open('wiki-vietnamese.txt', 'w', encoding='utf-8') as f:
  f.writelines(vi_data[i]['text'] + '\n' for i in range(n_articles))


In [4]:
import os

# Report how large the dumped corpus file is on disk (in bytes).
corpus_bytes = os.stat('wiki-vietnamese.txt').st_size
print('dataset size:', corpus_bytes)

dataset size: 381641196


In [1]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [None]:
# The unknown token configured on the WordPiece model must be the same string
# the trainer inserts into the vocabulary as a special token; otherwise the
# model's unknown token is absent from the vocabulary and encoding a word that
# cannot be segmented raises an error.
UNK_TOKEN = "<|unk|>"

wp_tokenizer = Tokenizer(models.WordPiece(unk_token=UNK_TOKEN))

# Lowercase the input, then split it with the Whitespace pre-tokenizer.
wp_tokenizer.normalizer = normalizers.Sequence([normalizers.Lowercase()])
wp_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

special_tokens = [UNK_TOKEN]
wp_trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

# Train on the corpus file written by the dataset cell.
wp_tokenizer.train(['wiki-vietnamese.txt'], trainer=wp_trainer)

# Decoder configured with the "##" continuation prefix.
wp_tokenizer.decoder = decoders.WordPiece(prefix="##")

In [2]:
# Byte-level BPE tokenizer: the ByteLevel pre-tokenizer hands the BPE model
# byte-level symbols instead of raw characters.
bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Seed the trainer with the full byte-level alphabet so that every byte symbol
# is in the vocabulary even if it never occurs in the training corpus. Without
# it, a byte unseen during training has no token id and is dropped on encode
# (this BPE model has no unknown token).
bpe_trainer = trainers.BpeTrainer(
    vocab_size=25000,
    special_tokens=["<|endoftext|>"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
bpe_tokenizer.train(["wiki-vietnamese.txt"], trainer=bpe_trainer)

In [11]:
# Configure both ends of the byte-level pipeline before using the tokenizer:
# the post-processor (offsets are left untrimmed) and the matching decoder.
bpe_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
bpe_tokenizer.decoder = decoders.ByteLevel()

text = 'Em vào đời bằng đại lộ còn anh vào đời bằng lối nhỏ. Anh nhớ mình đã từng thổ lộ anh nhớ rằng em đã chối bỏ'
toks = bpe_tokenizer.encode(text)

# Show the token strings, the number of ids, and the offsets of the second token,
# one per line.
print(toks.tokens, len(toks.ids), toks.offsets[1], sep='\n')

# Round-trip the ids back to text.
print(bpe_tokenizer.decode(toks.ids))

['ĠEm', 'ĠvÃło', 'ĠÄĳá»Ŀi', 'Ġbáº±ng', 'ĠÄĳáº¡i', 'Ġlá»Ļ', 'ĠcÃ²n', 'Ġanh', 'ĠvÃło', 'ĠÄĳá»Ŀi', 'Ġbáº±ng', 'Ġlá»ĳi', 'Ġnhá»ı', '.', 'ĠAnh', 'Ġnhá»Ľ', 'ĠmÃ¬nh', 'ĠÄĳÃ£', 'Ġtá»«ng', 'Ġthá»ķ', 'Ġlá»Ļ', 'Ġanh', 'Ġnhá»Ľ', 'Ġráº±ng', 'Ġem', 'ĠÄĳÃ£', 'Ġchá»ĳi', 'Ġbá»ı']
28
(2, 6)
 Em vào đời bằng đại lộ còn anh vào đời bằng lối nhỏ. Anh nhớ mình đã từng thổ lộ anh nhớ rằng em đã chối bỏ


In [10]:
import tiktoken

# Look up the encoding tiktoken associates with the "gpt-4o" model name and
# count how many tokens it produces for the same sample `text` encoded above.
enc = tiktoken.encoding_for_model("gpt-4o")
gpt4o_token_ids = enc.encode(text)
print(len(gpt4o_token_ids))

33
