BPE原理：

先把原始文本编码为 UTF-8 字节序列，然后在其中找出出现频率最高的相邻符号对，把它们合并成一个新的编码符号。循环迭代这个合并过程，每次都把当前频率最高的相邻符号对合并起来，直到词表达到我们预设的大小为止，最终就得到了一个大小有限的词表。
流程：
用 BPE 训练 tokenizer 的时候，先把原始文本编码为 UTF-8 字节序列，然后反复执行 BPE 的合并操作；等训练结束，就得到了一个固定大小的词表，里面的每个 token 是单个字节或者若干字节的组合（可能对应一个字符的一部分、一个完整字符或多个字符）。
<!-- 之后再进行编码的时候，tokenizer 就会按照词表里的内容去尽量匹配输入文本中最长的子词单元，把它们转换成 token id 输出。 -->

<!-- 读取训练文本-初始化词汇表-统计字符对频率-合并频率最高的字符对-更新词汇表并重复上一步-构建token，id映射-编码，解码 -->

In [None]:
class Tokenizer:
    """Minimal byte-level BPE (byte pair encoding) tokenizer.

    The training text is turned into its UTF-8 byte values (ids 0-255).
    ``train()`` then repeatedly replaces the most frequent adjacent pair of
    ids with a new id, starting at 256, until ``vocab_size`` ids exist or no
    pair is left to merge.

    Attributes:
        vocab_size: Requested total number of token ids (256 byte ids plus
            the learned merges).
        num_merges: Maximum number of merges to learn (``vocab_size - 256``).
        ids: The training text as a list of ids; rewritten in place by
            ``train()`` as merges are applied.
        merges: Maps an id pair ``(left, right)`` to the id that replaces it,
            in the order the merges were learned.
        vocab: Maps every known id to the bytes it stands for.
    """

    def __init__(self, content, vocab_size=1024):
        """Store the UTF-8 bytes of ``content`` and the base 256-byte vocab.

        No training happens here; call ``train()`` to learn the merges.

        Args:
            content: Training text (``str``).
            vocab_size: Target vocabulary size. Values of 256 or less result
                in no merges being learned.
        """
        self.vocab_size = vocab_size
        # Ids 0-255 are reserved for raw bytes; each remaining slot is one merge.
        self.num_merges = vocab_size - 256
        self.ids = list(content.encode("utf-8"))
        self.merges = {}
        self.vocab = {i: bytes([i]) for i in range(256)}

    def get_stats(self, ids):
        """Count how often each adjacent pair occurs in ``ids``.

        Args:
            ids: Sequence of token ids.

        Returns:
            Dict mapping ``(ids[k], ids[k + 1])`` to its number of
            occurrences, keyed in order of first appearance. Empty when
            ``ids`` has fewer than two elements.
        """
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with every occurrence of ``pair`` replaced.

        The scan runs left to right and matches do not overlap, so with
        ``pair == (a, a)`` the input ``[a, a, a]`` becomes ``[idx, a]``.

        Args:
            ids: Sequence of token ids (not modified).
            pair: The ``(left, right)`` ids to replace.
            idx: The id written in place of each matched pair.

        Returns:
            A new list of ids.
        """
        new_ids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                new_ids.append(idx)
                i += 2
            else:
                new_ids.append(ids[i])
                i += 1
        return new_ids

    def train(self):
        """Learn up to ``num_merges`` merges from the stored training ids.

        Each round picks the most frequent adjacent pair (ties go to the pair
        seen first), assigns it the next free id, and rewrites ``self.ids``.
        ``self.merges`` and ``self.vocab`` are updated as merges are learned.

        Training stops early when the text has no adjacent pair left (empty
        or very short input), so fewer than ``num_merges`` merges may be
        learned. Calling ``train()`` again continues from the merges already
        learned instead of reusing ids.
        """
        first_free_id = 256 + len(self.merges)
        for idx in range(first_free_id, 256 + self.num_merges):
            stats = self.get_stats(self.ids)
            if not stats:
                # Fewer than two ids remain: nothing left to merge, and
                # max() would raise ValueError on the empty dict.
                break
            pair = max(stats, key=stats.get)
            self.ids = self.merge(self.ids, pair, idx)
            self.merges[pair] = idx
            # Both halves already have vocab entries: they are either raw
            # bytes or ids created by an earlier merge.
            self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]]

    def decode(self, ids):
        """Convert token ids back to text.

        Args:
            ids: Iterable of token ids present in ``self.vocab``.

        Returns:
            The decoded string. Byte sequences that are not valid UTF-8 are
            replaced with U+FFFD rather than raising.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        tokens = b"".join([self.vocab[idx] for idx in ids])
        text = tokens.decode('utf-8', errors='replace')
        return text

    def encode(self, text):
        """Convert text to token ids using the learned merges.

        Starting from the UTF-8 bytes, the merge learned earliest among the
        pairs currently present is applied repeatedly, until no present pair
        has a learned merge.

        Args:
            text: String to tokenize.

        Returns:
            List of token ids (raw byte values if nothing was learned).
        """
        tokens = list(text.encode('utf-8'))
        while len(tokens) >= 2:
            stats = self.get_stats(tokens)
            # Lower merge id means learned earlier; unknown pairs rank last.
            pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                break
            idx = self.merges[pair]
            tokens = self.merge(tokens, pair, idx)
        return tokens


In [2]:
# Read the training corpus from disk as UTF-8 text.
with open('manual.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Create the tokenizer with the default vocab_size; the constructor does not train.
tokenizer = Tokenizer(content)

# Call train() explicitly to learn the BPE merges from the corpus.
tokenizer.train()

# Encode the training text itself into token ids.
encoded = tokenizer.encode(content)
print("Encoded:", encoded)

# Decode the ids back into text so the round trip can be inspected.
decoded = tokenizer.decode(encoded)
print("Decoded:", decoded)


Encoded: [423, 32, 447, 32, 381, 765, 601, 618, 10, 423, 32, 447, 32, 226, 188, 164, 765, 32, 288, 32, 293, 32, 226, 189, 163, 32, 226, 188, 191, 32, 316, 140, 370, 684, 32, 48, 684, 849, 32, 672, 136, 32, 302, 10, 423, 32, 447, 32, 226, 188, 164, 765, 32, 288, 32, 293, 32, 226, 189, 163, 32, 336, 410, 32, 48, 684, 849, 662, 32, 56, 32, 226, 189, 137, 10, 512, 298, 145, 783, 433, 308, 519, 419, 341, 431, 517, 404, 673, 843, 899, 671, 10, 319, 130, 334, 1018, 309, 663, 550, 369, 868, 766, 404, 10, 469, 854, 818, 381, 594, 847, 991, 262, 188, 844, 267, 590, 463, 130, 859, 314, 991, 370, 50, 48, 50, 48, 662, 32, 55, 882, 302, 10, 385, 816, 385, 279, 174, 480, 187, 489, 714, 271, 663, 488, 387, 173, 271, 385, 279, 174, 316, 155, 393, 488, 387, 173, 512, 298, 145, 783, 818, 308, 519, 419, 341, 431, 517, 404, 673, 843, 673, 431, 261, 961, 10, 884, 798, 178, 726, 962, 427, 509, 581, 468, 799, 261, 283, 179, 739, 326, 606, 536, 156, 548, 678, 599, 354, 986, 300, 183, 726, 271, 606, 285, 152, 2

In [3]:
import difflib
# Compare the original text with the text obtained from encode -> decode.
if content == decoded:
    print("完全相同")
else:
    # Only run the character-level diff when the texts actually differ; for
    # identical inputs every ndiff line starts with ' ' and nothing is printed.
    diff = difflib.ndiff(content, decoded)
    for d in diff:
        # Lines starting with ' ' are characters common to both texts; skip them.
        if d[0] != ' ':
            print(d)

完全相同


In [None]:
from transformers import GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# English sample sentence to encode.
sentence = "Originated as the Imperial University of Peking in 1898, Peking University was China’s first national comprehensive university and the supreme education authority at the time. Since the founding of the People’s Republic of China in 1949, it has developed into a comprehensive university with fundamental education and research in both humanities and science. The reform and opening-up of China in 1978 has ushered in a new era for the University unseen in history. And its merger with Beijing Medical University in 2000 has geared itself up for all-round and vibrant growth in such fields as science, engineering, medicine, agriculture, humanities and social sciences. Supported by the “211 Project” and the “985 Project”, the University has made remarkable achievements, such as optimizing disciplines, cultivating talents, recruiting high-caliber teachers, as well as teaching and scientific research, which paves the way for a world-class university."

# Encode the same sentence with GPT-2 and with the custom tokenizer, and print both id lists.
gpt2_encoded = gpt2_tokenizer.encode(sentence)
print("GPT-2 编码结果:", gpt2_encoded)
my_encoded = tokenizer.encode(sentence)
print("自定义编码结果:", my_encoded)

  from .autonotebook import tqdm as notebook_tqdm


GPT-2 编码结果: [11610, 3898, 355, 262, 11773, 2059, 286, 350, 18754, 287, 46244, 11, 350, 18754, 2059, 373, 2807, 447, 247, 82, 717, 2260, 9815, 6403, 290, 262, 17700, 3707, 4934, 379, 262, 640, 13, 4619, 262, 16636, 286, 262, 4380, 447, 247, 82, 2066, 286, 2807, 287, 24977, 11, 340, 468, 4166, 656, 257, 9815, 6403, 351, 7531, 3707, 290, 2267, 287, 1111, 47824, 290, 3783, 13, 383, 4975, 290, 4756, 12, 929, 286, 2807, 287, 15524, 468, 47098, 287, 257, 649, 6980, 329, 262, 2059, 29587, 287, 2106, 13, 843, 663, 24589, 351, 11618, 8366, 2059, 287, 4751, 468, 31394, 2346, 510, 329, 477, 12, 744, 290, 21266, 3349, 287, 884, 7032, 355, 3783, 11, 8705, 11, 9007, 11, 14510, 11, 47824, 290, 1919, 19838, 13, 36848, 416, 262, 564, 250, 21895, 4935, 447, 251, 290, 262, 564, 250, 42250, 4935, 447, 251, 11, 262, 2059, 468, 925, 11004, 16970, 11, 884, 355, 45780, 29861, 11, 45414, 18054, 11, 16517, 1029, 12, 43288, 7799, 11, 355, 880, 355, 7743, 290, 5654, 2267, 11, 543, 279, 3080, 262, 835, 329, 257, 99

In [5]:
# Chinese sample sentence to encode.
sentence = "博士学位论文应当表明作者具有独立从事科学研究工作的能力，并在科学或专门技术上做出创造性的成果。博士学位论文或摘要，应当在答辩前三个月印送有关单位，并经同行评议。学位授予单位应当聘请两位与论文有关学科的专家评阅论文，其中一位应当是外单位的专家。评阅人应当对论文写详细的学术评语，供论文答辩委员会参考。"

# Encode the same sentence with GPT-2 and with the custom tokenizer, and print both id lists.
gpt2_encoded = gpt2_tokenizer.encode(sentence)
print("GPT-2 编码结果:", gpt2_encoded)
my_encoded = tokenizer.encode(sentence)
print("自定义编码结果:", my_encoded)

GPT-2 编码结果: [39355, 248, 18803, 27764, 99, 19526, 235, 164, 106, 118, 23877, 229, 41753, 242, 37605, 241, 26193, 101, 23626, 236, 43291, 38519, 17739, 115, 17312, 231, 45379, 105, 44165, 233, 20015, 236, 12859, 233, 163, 100, 239, 27764, 99, 163, 254, 242, 163, 102, 114, 32432, 98, 43291, 21410, 47797, 121, 27950, 249, 171, 120, 234, 33176, 114, 28839, 101, 163, 100, 239, 27764, 99, 22755, 244, 10310, 241, 29785, 101, 162, 232, 222, 17312, 107, 41468, 161, 223, 248, 49035, 118, 26344, 249, 34460, 254, 45250, 100, 21410, 22755, 238, 162, 252, 250, 16764, 39355, 248, 18803, 27764, 99, 19526, 235, 164, 106, 118, 23877, 229, 22755, 244, 162, 239, 246, 17358, 223, 171, 120, 234, 41753, 242, 37605, 241, 28839, 101, 163, 18433, 164, 122, 102, 30298, 235, 49011, 10310, 103, 17312, 230, 39355, 108, 34460, 223, 17312, 231, 17739, 111, 39355, 243, 19526, 235, 171, 120, 234, 33176, 114, 163, 119, 237, 28938, 234, 26193, 234, 46237, 226, 164, 106, 106, 16764, 27764, 99, 19526, 235, 162, 236, 230, 1