In [None]:
# 2025/7/12
# zhangzhong


In [None]:
from tokenizers import ByteLevelBPETokenizer

# Initialize the tokenizer
tokenizer = ByteLevelBPETokenizer()

# Train using a file path
tokenizer.train(
    files=["my_corpus.txt"],
    vocab_size=5000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)

# Save the trained tokenizer
tokenizer.save_model("my_tokenizer")






['my_tokenizer/vocab.json', 'my_tokenizer/merges.txt']

In [None]:
tokens = tokenizer.encode("Hello, world! This is a test.")
print(tokens.tokens)  # Output the tokens
print(tokens.ids)       # Output the token IDs

['H', 'el', 'l', 'o', ',', 'Ġ', 'w', 'or', 'l', 'd', '!', 'Ġ', 'T', 'h', 'is', 'Ġ', 'is', 'Ġa', 'Ġ', 't', 'e', 's', 't', '.']
[44, 261, 80, 83, 16, 225, 91, 263, 80, 72, 5, 225, 56, 76, 262, 225, 262, 266, 225, 88, 73, 87, 88, 18]


In [9]:
# how to do decode
decoded = tokenizer.decode(tokens.ids)
print(decoded)  # Output the decoded string

Hello, world! This is a test.


In [11]:
# how to load the tokenizer from saved files
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("my_tokenizer")
# Now you can use the tokenizer as before
tokens = tokenizer.encode("Hello, world! This is a test.")
print(tokens.tokens)  # Output the tokens
print(tokens.ids)       # Output the token IDs
# how to do decode
decoded = tokenizer.decode(tokens.ids)
print(decoded)  # Output the decoded string

Exception: Is a directory (os error 21)

In [23]:
# 这种才是更好的方式！
# 就用这种吧，hugging face的教程里面也是这种

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers import normalizers

tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = None  # No normalization, default for GPT2
# tokenizer.normalizer = normalizers.Sequence([
#     normalizers.NFKC(),  
# ])
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
pre_tokenizers.ByteLevel(use_regex=False)
]) 
tokenizer.decoder = decoders.Sequence([
decoders.ByteLevel()
]) 

trainer = trainers.BpeTrainer(
    vocab_size=5000,
    # min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    initial_alphabet=ByteLevel.alphabet()  # ← key point: includes all 256 bytes
)

tokenizer.train(["my_corpus.txt"], trainer)
tokenizer.save("my_tokenizer/tokenizer.json")






In [24]:
# 语料库里面没有这种基本的字符还不行。。。
tokens = tokenizer.encode("Hello, world! This is a test.")
print(tokens.tokens)  # Output the tokens
print(tokens.ids)       # Output the token IDs
# how to do decode
decoded = tokenizer.decode(tokens.ids)
print(decoded)  # Output the decoded string

# 我们还真不能全都把字符都变成小写，因为大模型的输出就是要区分大小写的，看看chatgpt的输出就知道了呀。

['ĠHello', ',', 'Ġwor', 'l', 'd', '!', 'Ġ', 'T', 'h', 'isĠ', 'isĠ', 'a', 'Ġ', 't', 'es', 't', '.']
[338, 16, 277, 80, 72, 5, 225, 56, 76, 267, 267, 69, 225, 88, 292, 88, 18]
 Hello, world! This is a test.


# Normalization

Great question!

In the context of Hugging Face’s tokenizers and NLP in general:

⸻

🧠 What Is a Normalizer?

A normalizer is a preprocessing step in tokenization that modifies the raw input text before it is split into tokens.

📦 Typical Normalization Tasks:
	•	Lowercasing ("Hello" → "hello")
	•	Unicode normalization (e.g., NFC/NFD/NFKC/NFKD)
	•	Stripping accents ("café" → "cafe")
	•	Removing or standardizing whitespace
	•	Replacing characters or patterns (e.g., quotes, emojis, punctuations)

⸻

✅ Why Is Normalization Important?

Because:
	•	Different text formats may represent the same character differently (e.g., é vs e + ´)
	•	Helps standardize input for better token consistency
	•	Reduces vocabulary size and data sparsity

⸻

💡 Examples

Without normalization:

"Hello" → ['H', 'e', 'l', 'l', 'o']
"hello" → ['h', 'e', 'l', 'l', 'o']

These result in different tokens, unless normalization is applied.

⸻

✅ In Hugging Face tokenizers

You can set a normalizer like this:

from tokenizers import Tokenizer, normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents, Sequence

# Build a normalization pipeline
normalizer = Sequence([
    NFD(),             # Unicode decomposition (Normalization Form D)
    Lowercase(),       # Convert to lowercase
    StripAccents(),    # Remove diacritics like é → e
])

tokenizer.normalizer = normalizer

You can also use built-in ones like:

from tokenizers.normalizers import BertNormalizer

tokenizer.normalizer = BertNormalizer(lowercase=True, strip_accents=True)


⸻

🔍 Unicode Normalization Forms

Name	Description
NFC	Canonical Composition (default in most systems)
NFD	Canonical Decomposition
NFKC	Compatibility Composition
NFKD	Compatibility Decomposition


⸻

🚫 Byte-Level Tokenizers (e.g., GPT-2)

For Byte-Level BPE like GPT-2:

⚠ No normalization is applied, because they operate on raw bytes.

tokenizer = ByteLevelBPETokenizer()
# tokenizer.normalizer = None (by default)

This ensures:
	•	Every possible character (even rare ones like emoji or Japanese kanji) is preserved
	•	Unicode differences are preserved too (which is why GPT uses byte-level)

⸻

✅ Summary

Term	Meaning
Normalizer	A text preprocessor before tokenization
Used For	Lowercasing, stripping accents, Unicode handling
In BERT	Often used (e.g., lowercase + NFD + accent strip)
In GPT-2	Not used (raw byte-level input)


⸻

Let me know if you want to visualize the effect of different normalizers on a sentence like "Café Déjà Vu – HELLO"!