In [1]:
# 2025/7/11
# zhangzhong
# 1. [ ] https://huggingface.co/docs/transformers/fast_tokenizers
# 2. [ ] https://huggingface.co/docs/transformers/tokenizer_summary
# 3. [ ] read all three tokenizer papers; for the implementation we will probably only do tiktoken (i.e. BPE), since the other algorithms are not used here, so skip them for now to save time

In [2]:
# Tokenizers convert text into an array of numbers known as tensors, the inputs to a text model. 
# Split text into smaller words or subwords (tokens) according to some rules, and convert them into numbers (input ids)
# A Transformers tokenizer also returns an attention mask to indicate which tokens should be attended to.

In [3]:
# The pretrained tokenizer is saved in a tokenizer.model file with all its associated vocabulary files.
# Pass a string of text to the tokenizer to return the input ids and attention mask, 
# and set the framework tensor type to return with the return_tensors parameter.
# Whichever tokenizer you use, make sure the tokenizer vocabulary is the same as the pretrained model's tokenizer vocabulary.
from transformers import AutoTokenizer

In [4]:
# https://huggingface.co/docs/transformers/fast_tokenizers?tokenizer-classes=AutoTokenizer#tokenizer-classes
# There are two main tokenizer classes that build on top of the base class.
# - PreTrainedTokenizer is a Python implementation, for example LlamaTokenizer.
# - PreTrainedTokenizerFast is a fast Rust-based implementation from the Tokenizers library, for example LlamaTokenizerFast.
# There are two ways you can load a tokenizer, with AutoTokenizer or a model-specific tokenizer.
# AutoTokenizer: By default, AutoTokenizer tries to load a fast tokenizer if it’s available, otherwise, it loads the Python implementation.
from transformers import AutoTokenizer, GemmaTokenizer

In [5]:
# You could also load your own tokenizer by passing its vocab file to the vocab_file parameter.
# from transformers import GemmaTokenizerFast
# tokenizer = GemmaTokenizerFast(vocab_file="my_vocab_file.txt")

In [6]:
# https://huggingface.co/docs/transformers/fast_tokenizers?tokenizer-classes=model-specific+tokenizer#multimodal-tokenizers
# Multimodal tokenizers
# TODO: I do not understand multimodal tokenizers yet; get the text tokenizer figured out first.
# Load the tokenizer of the LLaVA checkpoint and pass extra named special tokens via extra_special_tokens.
vision_tokenizer = AutoTokenizer.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    extra_special_tokens={"image_token": "<image>", "boi_token": "<image_start>", "eoi_token": "<image_end>"}
)
# The "image_token" entry is read back here as an attribute, together with its id.
print(vision_tokenizer.image_token, vision_tokenizer.image_token_id)

<image> 32000


In [7]:
# how to train a fast tokenizer and reuse it in Transformers

# 1. To train a Byte-Pair Encoding (BPE) tokenizer, 
# create a Tokenizer and BpeTrainer class and define the unknown token and special tokens.

# from tokenizers import Tokenizer
# from tokenizers.models import BPE
# from tokenizers.trainers import BpeTrainer 

# # define the unknown token
# tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# # define the special tokens
# trainer = BpeTrainer(
#     special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
# )

# # Split the tokens on Whitespace to create tokens that don’t overlap with each other.
# from tokenizers.pre_tokenizers import Whitespace 
# tokenizer.pre_tokenizer = Whitespace()

# # Call train on the text files and trainer to start training.
# files = [...]
# tokenizer.train(files, trainer)

# # Use save to save the tokenizer's configuration and vocabulary to a JSON file.
# tokenizer.save("tokenizer.json")

# # Now you can load and reuse the tokenizer object in Transformers by passing it to the tokenizer_object parameter in PreTrainedTokenizerFast.
# from transformers import PreTrainedTokenizerFast
# fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)

# # To load a saved tokenizer from its JSON file, pass the file path to the tokenizer_file parameter in PreTrainedTokenizerFast.
# fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")

In [8]:
# tiktoken, BPE
# https://github.com/huggingface/tokenizers/issues/437
# https://github.com/huggingface/tokenizers/pull/1433
# There is actually a PR that fixes this numeric overflow in the Rust code, so I want to try it out;
# presumably only several hundred GB of data would trigger the problem.

from transformers import AutoTokenizer

# Add the subfolder parameter to from_pretrained() to specify where the tokenizer.model tiktoken file is located.
# need protobuf library
# tokenizer = AutoTokenizer.from_pretrained("gpt2", subfolder="original") 
# The active call below does not pass subfolder; it loads the tokenizer of the "gpt2" checkpoint as-is.
# `tokenizer` is the object used by all the following cells.
tokenizer = AutoTokenizer.from_pretrained("gpt2") 

In [9]:
# A Transformers model expects the input to be a PyTorch, TensorFlow, or NumPy tensor.
# A tokenizer's job is to preprocess text into those tensors.
# No return_tensors argument is passed here, so the displayed result holds plain Python lists
# under the keys 'input_ids' and 'attention_mask'.
tokenizer("We are very happy to show you the 🤗 Transformers library.")

{'input_ids': [1135, 389, 845, 3772, 284, 905, 345, 262, 12520, 97, 245, 39185, 5888, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# Converting text into input ids is completed in two steps (tokenize, then map tokens to ids);
# decoding is shown afterwards as the way back from ids to text.
# tokenizer playground (did not know this existed, very cool): https://xenova-the-tokenizer-playground.static.hf.space/index.html

# 1. tokenize: split the text into tokens
tokens = tokenizer.tokenize("We are very happy to show you the 🤗 Transformers library.")
# 2. convert tokens to ids
ids = tokenizer.convert_tokens_to_ids(tokens)
# 3. decode ids to text
decoded_string = tokenizer.decode(ids)

# Print each stage on its own line, in pipeline order: tokens, ids, decoded text.
print(tokens, ids, decoded_string, sep="\n")

['We', 'Ġare', 'Ġvery', 'Ġhappy', 'Ġto', 'Ġshow', 'Ġyou', 'Ġthe', 'ĠðŁ', '¤', 'Ĺ', 'ĠTransformers', 'Ġlibrary', '.']
[1135, 389, 845, 3772, 284, 905, 345, 262, 12520, 97, 245, 39185, 5888, 13]
We are very happy to show you the 🤗 Transformers library.


In [11]:
# special tokens
# Compare the ids produced by calling the tokenizer directly with the `ids` obtained
# earlier through tokenize() + convert_tokens_to_ids().
model_inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")

# First the two id sequences, one per line.
print(model_inputs["input_ids"], ids, sep="\n")
# gpt2 does not seem to add any special tokens (both sequences print the same);
# the Hugging Face tutorial does show them, so refer to that for an example.
print(
    *(tokenizer.decode(id_sequence) for id_sequence in (model_inputs["input_ids"], ids)),
    sep="\n",
)
# Because the tokenizer handles special tokens for you, just use the first approach
# (calling the tokenizer directly): it is simple and correct.

[1135, 389, 845, 3772, 284, 905, 345, 262, 12520, 97, 245, 39185, 5888, 13]
[1135, 389, 845, 3772, 284, 905, 345, 262, 12520, 97, 245, 39185, 5888, 13]
We are very happy to show you the 🤗 Transformers library.
We are very happy to show you the 🤗 Transformers library.


In [12]:
# batch tokenization
# It is faster and more efficient to preprocess batches of text instead of a single sentence at a time.

# Example batch; `batch_sentences` is reused by the padding and truncation cells that follow.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]

# Passing a list of strings yields one list of input_ids and one attention_mask per sentence;
# without padding, their lengths differ from sentence to sentence (see the printed output).
encoded_inputs = tokenizer(batch_sentences)
print(encoded_inputs)

{'input_ids': [[1537, 644, 546, 1218, 12607, 30], [3987, 470, 892, 339, 4206, 546, 1218, 12607, 11, 25149, 13], [2061, 546, 22216, 82, 444, 30]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


In [13]:
# padding and truncation
# In the output above, the input_ids have different lengths.
# This is an issue because Transformers expects them to have the same lengths so it can pack them into a batch.

# Padding adds a special padding token to ensure all sequences have the same length.
# Set padding=True to pad the sequences to the longest sequence length in the batch.

# Message shown when padding is requested without a pad token
# (presumably hit here before the assignment below was added):
#   Asking to pad but the tokenizer does not have a padding token.
#   Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)`
#   or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`
# Following the first suggestion, reuse the end-of-sentence token as the pad token.
# Note this modifies the shared `tokenizer` object for every later cell.
tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to the end of sentence token
encoded_inputs = tokenizer(batch_sentences, padding=True)
# In the printed result, the shorter sequences are filled up with id 50256 and
# their attention_mask is 0 at those padded positions.
print(encoded_inputs)


{'input_ids': [[1537, 644, 546, 1218, 12607, 30, 50256, 50256, 50256, 50256, 50256], [3987, 470, 892, 339, 4206, 546, 1218, 12607, 11, 25149, 13], [2061, 546, 22216, 82, 444, 30, 50256, 50256, 50256, 50256, 50256]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]}


In [14]:
from pprint import pprint 

# Models are only able to process sequences up to a certain length.
# Truncation removes tokens from a sequence to ensure it doesn’t exceed the maximum length.
# With max_length=8 and truncation=True, the output shows only the 11-token sentence cut down to 8 ids;
# the two 6-token sentences are unchanged, and no padding is added because padding is not requested.
encoded_inputs = tokenizer(batch_sentences, max_length=8, truncation=True)
# Nice, this is really handy!
pprint(encoded_inputs)

{'attention_mask': [[1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1]],
 'input_ids': [[1537, 644, 546, 1218, 12607, 30],
               [3987, 470, 892, 339, 4206, 546, 1218, 12607],
               [2061, 546, 22216, 82, 444, 30]]}
