In [None]:
# Toy example: train a tiny BPE tokenizer on a short string and round-trip it.
from bpe import Tokenizer

tokenizer = Tokenizer()
text = "aaabdaaabac"

# Target vocabulary size: 256 are the byte tokens, then do 3 merges.
vocab_size = 256 + 3
tokenizer.train(text, vocab_size)

print(tokenizer.encode(text))
# expected output: [258, 100, 258, 97, 99]

toy_ids = [258, 100, 258, 97, 99]
print(tokenizer.decode(toy_ids))
# expected output: aaabdaaabac

# Encoding and then decoding should give back the original string,
# in which case this prints True.
round_trip = tokenizer.decode(tokenizer.encode(text))
print(text == round_trip)

# tokenizer.save("toy")
# writes two files: toy.model (for loading) and toy.vocab (for viewing)

In [7]:
# Verify that encoding and then decoding with the Tokenizer reproduces the original text.
from bpe import Tokenizer

tokenizer = Tokenizer()
tokenizer.load("manual.model")

# Read the whole corpus as UTF-8 text.
with open('./manual.txt', 'r', encoding='utf-8') as source_file:
    text = source_file.read()

# Round trip: text -> token ids -> text.
round_trip = tokenizer.decode(tokenizer.encode(text))
print(f"encode and decode match: {text == round_trip}")

encode and decode match: True


In [5]:
# Encode the sample sentences with the GPT-2 tokenizer.
from transformers import AutoTokenizer
import numpy as np

# Load the GPT-2 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Sample texts used to test the tokenizer: text1 is English, text2 is Chinese.
text1 = "Originated as the Imperial University of Peking in 1898, Peking University was China’s first national comprehensive university and the supreme education authority at the time. Since the founding of the People’s Republic of China in 1949, it has developed into a comprehensive university with fundamental education and research in both humanities and science. The reform and opening-up of China in 1978 has ushered in a new era for the University unseen in history. And its merger with Beijing Medical University in 2000 has geared itself up for all-round and vibrant growth in such fields as science, engineering, medicine, agriculture, humanities and social sciences. Supported by the “211 Project” and the “985 Project”, the University has made remarkable achievements, such as optimizing disciplines, cultivating talents, recruiting high-caliber teachers, as well as teaching and scientific research, which paves the way for a world-class university."
text2 = "博士学位论文应当表明作者具有独立从事科学研究工作的能力，并在科学或专门技术上做出创造性的成果。博士学位论文或摘要，应当在答辩前三个月印送有关单位，并经同行评议。学位授予单位应当聘请两位与论文有关学科的专家评阅论文，其中一位应当是外单位的专家。评阅人应当对论文写详细的学术评语，供论文答辩委员会参考。"

# Token ids for text1: show the first ten and the total count.
encoded_input = tokenizer(text1)["input_ids"]
print("first 10 tokens of text1", encoded_input[:10])
print("length:", len(encoded_input))

# Same report for text2.
encoded_input = tokenizer(text2)["input_ids"]
print("first 10 tokens of text2", encoded_input[:10])
print("length:", len(encoded_input))

# Decode two leading groups of text2's token ids to show how many GPT-2 tokens
# one Chinese word takes: three ids for the first word, nine for the second.
first_word_ids = [39355, 248, 18803]
second_word_ids = [27764, 99, 19526, 235, 164, 106, 118, 23877, 229]
print(tokenizer.decode(first_word_ids))
print(tokenizer.decode(second_word_ids))


first 10 tokens of text1 [11610, 3898, 355, 262, 11773, 2059, 286, 350, 18754, 287]
length: 185
first 10 tokens of text2 [39355, 248, 18803, 27764, 99, 19526, 235, 164, 106, 118]
length: 306
博士
学位论文


In [6]:
# Encode the sample sentences with the tokenizer I trained.
from bpe import Tokenizer

tokenizer = Tokenizer()
tokenizer.load("manual.model")

# English sample: compare the raw UTF-8 bytes with the tokenizer's output.
text1 = "Originated as the Imperial University of Peking in 1898, Peking University was China’s first national comprehensive university and the supreme education authority at the time. Since the founding of the People’s Republic of China in 1949, it has developed into a comprehensive university with fundamental education and research in both humanities and science. The reform and opening-up of China in 1978 has ushered in a new era for the University unseen in history. And its merger with Beijing Medical University in 2000 has geared itself up for all-round and vibrant growth in such fields as science, engineering, medicine, agriculture, humanities and social sciences. Supported by the “211 Project” and the “985 Project”, the University has made remarkable achievements, such as optimizing disciplines, cultivating talents, recruiting high-caliber teachers, as well as teaching and scientific research, which paves the way for a world-class university."
utf_encoded_text1 = list(text1.encode('utf-8'))  # one int per UTF-8 byte
encoded_text1 = tokenizer.encode(text1)
print("first 10 Bytes of utf-8:", utf_encoded_text1[:10])
print("length:", len(utf_encoded_text1))
print("first 10 tokens of text1", encoded_text1[:10])
print("length:", len(encoded_text1))

# Blank line between the two reports.
print("")

# Chinese sample: same comparison.
text2 = "博士学位论文应当表明作者具有独立从事科学研究工作的能力，并在科学或专门技术上做出创造性的成果。博士学位论文或摘要，应当在答辩前三个月印送有关单位，并经同行评议。学位授予单位应当聘请两位与论文有关学科的专家评阅论文，其中一位应当是外单位的专家。评阅人应当对论文写详细的学术评语，供论文答辩委员会参考。"
utf_encoded_text2 = list(text2.encode('utf-8'))  # one int per UTF-8 byte
encoded_text2 = tokenizer.encode(text2)
print("first 10 Bytes of utf-8:", utf_encoded_text2[:10])
print("length:", len(utf_encoded_text2))
print("first 10 tokens of text2", encoded_text2[:10])
print("length:", len(encoded_text2))

# Decode the first two token ids of text2 individually to see what each one covers.
first_token = [457]
second_token = [512]
print(tokenizer.decode(first_token))
print(tokenizer.decode(second_token))

first 10 Bytes of utf-8: [79, 114, 105, 103, 105, 110, 97, 116, 101, 100]
length: 965
first 10 tokens of text1 [79, 114, 105, 103, 105, 110, 97, 116, 101, 100]
length: 947

first 10 Bytes of utf-8: [229, 141, 154, 229, 163, 171, 229, 173, 166, 228]
length: 447
first 10 tokens of text2 [457, 512, 524, 711, 642, 341, 456, 675, 353, 231]
length: 119
博士
学位论文


简要解释长度上和具体token上不同的原因是什么? 

因为使用的训练语料不同。所以在合并的时候，byte character合并的顺序和程度不同。\
对于text1（英文文本），因为我们的tokenizer的训练语料基本没有英文，所以encode出来的token序列与UTF-8编码的字节序列几乎相同（947个token对965个字节）；而gpt2的tokenizer编码得到的token sequence则短得多（185个token），表示其对英语的合并比我们的tokenizer更高效。\
对于text2（中文文本），gpt2 tokenizer用三个token表示“博士”，九个token表示“学位论文”，而我们的tokenizer用一个token表示“博士”，一个token表示“学位论文”。这是因为训练我们的tokenizer使用的语料与text2高度相关，其中很多词语在manual.txt中出现频率较高，进而被合并。而gpt2的训练语料中中文语料相对较少，text2中的相关词语在其中出现的频率较低，更不容易被合并，所以gpt2 tokenizer在text2上的编码效率更低（306个token，而我们的tokenizer只需119个token）。

In [None]:
# Train the tokenizer on the manual.txt corpus.
from bpe import Tokenizer

tokenizer = Tokenizer()
with open('./manual.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Target vocabulary size 1024; by the toy example's convention (256 byte tokens
# plus N merges) this would mean 768 merges -- TODO confirm against bpe.Tokenizer.train.
tokenizer.train(text, 1024)

# Encode the corpus once and reuse the result: encoding the whole file is the
# expensive step, and printing every token id would flood the cell output.
encoded_text = tokenizer.encode(text)
print("first 10 tokens of text", encoded_text[:10])
print("length:", len(encoded_text))

# Round-trip check: decoding the token ids should give back the original text.
print(text == tokenizer.decode(encoded_text))

# NOTE(review): the toy example calls save("toy") with a file prefix, and the other
# cells load "manual.model"; confirm that save() without arguments writes that file.
tokenizer.save()