# GPT训练2-文本分词
## https://github.com/openai/tiktoken

# 常用分词方法：
## 1 Byte Pair Encoding (BPE)：BPE是一种基于频率的统计方法，通过逐步合并最频繁出现的字符对来构建词汇表。
## 2 Byte-Level BPE：将文本转换为字节序列，然后应用BPE算法进行子词分割。
## 3 WordPiece：类似于BPE，但通过最大化似然性来选择子词
## 4 SentencePiece：由Google开发，是一种无监督的文本分词方法，可以直接从原始文本中学习词汇表。
## 5 Unigram Language Model
## 6 Word-level Tokenization
## 7 Character-level Tokenization

# 1 加载预训练模型分词器

In [89]:
from datasets import load_dataset

# 定义数据集的路径
data_path_root="/Users/wangaijun/pythoncode/github/data/text"
data_files = {
    'train': f'{data_path_root}/chinese-poetry-collection/train.csv',
    'test': f'{data_path_root}/chinese-poetry-collection/test.csv'
}
# 加载数据集
dataset = load_dataset('csv', data_files=data_files)
# 打印数据集信息
print(dataset)
# 查看训练集的前几条数据
print(dataset['train'][:1])
# 查看测试集的前几条数据
print(dataset['test'][:2])

# 测试tokenizer
text = "举头望明月，低头思故乡。"

DatasetDict({
    train: Dataset({
        features: ['text1'],
        num_rows: 388599
    })
    test: Dataset({
        features: ['text1'],
        num_rows: 1710
    })
})
{'text1': ['半生长以客为家，罢直初来瀚海槎。始信人间行不尽，天涯更复有天涯。']}
{'text1': ['云髻高梳鬓不分，扫除虚室事元君。新糊白纸屏风上，尽画蓬莱五色云。', '山色摇光入袖凉，松阴十丈印回廊。老僧读罢楞严咒，一殿神风柏子香。']}


In [90]:
from transformers import BertTokenizer
# 加载预训练的 BERT 分词器
tokenizer = BertTokenizer.from_pretrained('/Users/wangaijun/pythoncode/github/model/bert-base-chinese')
tokenizer.tokenize(text)

['举', '头', '望', '明', '月', '，', '低', '头', '思', '故', '乡', '。']

In [91]:
data=dataset['train'][:1]["text1"]
print(data)
tokenizer(data)

['半生长以客为家，罢直初来瀚海槎。始信人间行不尽，天涯更复有天涯。']


{'input_ids': [[101, 1288, 4495, 7270, 809, 2145, 711, 2157, 8024, 5387, 4684, 1159, 3341, 4108, 3862, 3542, 511, 1993, 928, 782, 7313, 6121, 679, 2226, 8024, 1921, 3889, 3291, 1908, 3300, 1921, 3889, 511, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [93]:

loaded_tokenizer=tokenizer
# 验证分词器
print(loaded_tokenizer.tokenize(text))
print(loaded_tokenizer(text))
# 检查词汇表大小
print(f'Vocabulary size: {loaded_tokenizer.vocab_size}')

# 查看返回字段

tokenized_example = loaded_tokenizer(text, return_tensors='pt')
print(f'Return fields: {tokenized_example.keys()}')

# 查看特殊标记
special_tokens = {
    'cls_token': loaded_tokenizer.cls_token,
    'sep_token': loaded_tokenizer.sep_token,
    'pad_token': loaded_tokenizer.pad_token,
    'unk_token': loaded_tokenizer.unk_token,
    'mask_token': loaded_tokenizer.mask_token,
}
print(f'Special tokens: {special_tokens}')

# 查看最大序列长度
max_length = loaded_tokenizer.model_max_length
print(f'Maximum sequence length: {max_length}')

# 查看分词器配置
config = loaded_tokenizer.init_kwargs
print(f'Tokenizer configuration: {config}')

['举', '头', '望', '明', '月', '，', '低', '头', '思', '故', '乡', '。']
{'input_ids': [101, 715, 1928, 3307, 3209, 3299, 8024, 856, 1928, 2590, 3125, 740, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Vocabulary size: 21128
Return fields: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Special tokens: {'cls_token': '[CLS]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'unk_token': '[UNK]', 'mask_token': '[MASK]'}
Maximum sequence length: 1000000000000000019884624838656
Tokenizer configuration: {'do_lower_case': False, 'do_basic_tokenize': True, 'never_split': None, 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'tokenize_chinese_chars': True, 'strip_accents': None, 'clean_up_tokenization_spaces': True, 'tokenizer_file': '/Users/wangaijun/pythoncode/github/model/bert-base-chinese/tokenizer.json', 'name_or_path': '/Users/wangaijun/pyth

# 2 预训练模型添加新词//

In [96]:
from transformers import BertTokenizer
# 加载预训练的 BERT 分词器
tokenizer = BertTokenizer.from_pretrained('/Users/wangaijun/pythoncode/github/model/bert-base-chinese')
print("tokenizer 长度：",len(tokenizer))
print("tokenizer ",tokenizer.tokenize(text))
# 新词列表
new_tokens = ['新词1', '新词2', '新词3',"头望",'举', '头', '望', '明', '月']
# 添加新词
num_added_toks = tokenizer.add_tokens(new_tokens)
print(f'Number of tokens added: {num_added_toks}')

# 保存新的分词器到指定路径
new_tokenizer_path = f'{data_path_root}/custom_tokenizer3'
tokenizer.save_pretrained(new_tokenizer_path)
new_tokenizer = BertTokenizer.from_pretrained(new_tokenizer_path)

print("new_tokenizer 长度：",len(new_tokenizer))
print("new_tokenizer",new_tokenizer.tokenize(text))

tokenizer 长度： 21128
tokenizer  ['举', '头', '望', '明', '月', '，', '低', '头', '思', '故', '乡', '。']
Number of tokens added: 4
new_tokenizer 长度： 21132
new_tokenizer ['举', '头望', '明', '月', '，', '低', '头', '思', '故', '乡', '。']


# 3 自定义分词

## 3.1 sentencepiece模型训练

In [97]:
import sentencepiece as spm
from datasets import load_dataset

# 加载数据集
data_path_root = "/Users/wangaijun/pythoncode/github/data/text"
data_files = {
    'train': f'{data_path_root}/chinese-poetry-collection/train.csv',
    'test': f'{data_path_root}/chinese-poetry-collection/test.csv'
}
dataset = load_dataset('csv', data_files=data_files)

# 准备用于训练的数据文件
with open(f'{data_path_root}/chinese-poetry.txt', 'w', encoding='utf-8') as f:
    for split in ['train','test']:
        for item in dataset[split]:
            # 假设你的CSV中有一个列叫做"text"包含了文本内容
            text = item['text1']
            f.write(text + '\n')
tokenizer_save_path="tokenizer/spm_poerty_tokenizer"
# 训练SentencePiece模型
spm.SentencePieceTrainer.Train(
    '--input={} --model_prefix={} --vocab_size={}  --character_coverage={} --model_type=bpe --num_threads=8'.format(
        f'{data_path_root}/chinese-poetry.txt',
        tokenizer_save_path,
        10000,  # 你可以根据需要调整词汇表大小
        1.0  # 字符覆盖率，通常设置为1.0以覆盖所有字符
    )
)

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=/Users/wangaijun/pythoncode/github/data/text/chinese-poetry.txt --model_prefix=tokenizer/spm_poerty_tokenizer --vocab_size=10000  --character_coverage=1.0 --model_type=bpe --num_threads=8
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /Users/wangaijun/pythoncode/github/data/text/chinese-poetry.txt
  input_format: 
  model_prefix: tokenizer/spm_poerty_tokenizer
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 8
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  v

## 3.2 使用支持sentencepiece的分词器加载和保存分词模型

In [98]:
from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast

# 创建一个基于SentencePiece的Tokenizer
model_path="tokenizer/spm_poerty_tokenizer.model"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_path, keep_accents=True)
# 保存tokenizer到指定目录
output_dir = f'{data_path_root}/custom_tokenizer'
tokenizer.save_pretrained(output_dir)




('/Users/wangaijun/pythoncode/github/data/text/custom_tokenizer/tokenizer_config.json',
 '/Users/wangaijun/pythoncode/github/data/text/custom_tokenizer/special_tokens_map.json',
 '/Users/wangaijun/pythoncode/github/data/text/custom_tokenizer/sentencepiece.bpe.model',
 '/Users/wangaijun/pythoncode/github/data/text/custom_tokenizer/added_tokens.json')

In [99]:
# 加载已保存的tokenizer
tokenizer = XLMRobertaTokenizerFast.from_pretrained(output_dir)
# 测试tokenizer
text = "举头望明月，低头思故乡。"
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)
print("tokenizer:",tokenizer.tokenize(text))
print("Encoded:  ", encoded)
print("Decoded:", decoded)

tokenizer: ['▁', '举', '头', '望', '明月', ',', '低', '头', '思', '故乡', '。']
Encoded:   [0, 268, 1023, 347, 429, 12, 267, 755, 347, 392, 187, 266, 2]
Decoded: <s> 举头望明月,低头思故乡。</s>


In [9]:
model_path="tokenizer/spm_poerty_tokenizer.model"

## 2.2 使用自定义的分词器（兼容transfomer）

In [100]:
from transformers import PreTrainedTokenizer, AddedToken
import os
import json
import sentencepiece as spm
import shutil
from typing import Optional,Dict,Tuple


class CustomSentencePieceTokenizer(PreTrainedTokenizer):
    def __init__(self, model_file, *args, **kwargs):
        
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.load(model_file)
        self.model_file=model_file
        super().__init__(*args, **kwargs)
        # 设置特殊标记
        self.pad_token_id = self.sp_model.piece_to_id("<pad>")
        self.unk_token_id = self.sp_model.piece_to_id("<unk>")
        self.bos_token_id = self.sp_model.piece_to_id("<s>")
        self.eos_token_id = self.sp_model.piece_to_id("</s>")
        self.mask_token_id = self.sp_model.piece_to_id("<mask>")
        self.cls_token_id = self.sp_model.piece_to_id("<s>")   # 使用<s>作为CLS token
        self.sep_token_id = self.sp_model.piece_to_id("</s>")  # 使用</s>作为SEP token

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    def _tokenize(self, text, **kwargs):
        return self.sp_model.encode_as_pieces(text)

    def _convert_token_to_id(self, token):
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        return self.sp_model.id_to_piece(index)

    def encode(self, text, **kwargs):
        return self.sp_model.encode_as_ids(text)

    def decode(self, ids, **kwargs):
        return self.sp_model.decode(ids)

    def get_vocab(self):
        # 返回词汇表字典
        return {self._convert_id_to_token(i): i for i in range(self.vocab_size)}

    def save_pretrained(self, save_directory, **kwargs):
        # 复制SentencePiece模型
        shutil.copyfile(self.model_file, os.path.join(save_directory, 'spiece.model'))
        
        # 保存其他必要的文件
        with open(os.path.join(save_directory, 'tokenizer_config.json'), 'w') as f:
            f.write(json.dumps({
                "model_max_length": 512,
                "padding_side": "right",
                "truncation_side": "right"
            }))
        
        # 保存词汇表文件
        with open(os.path.join(save_directory, 'vocab.json'), 'w') as f:
            vocab = self.get_vocab()
            json.dump(vocab, f)

        # 保存特殊标记
        special_tokens_map = {
            "unk_token": self.sp_model.id_to_piece(self.unk_token_id),
            "sep_token": self.sp_model.id_to_piece(self.sep_token_id),
            "pad_token": self.sp_model.id_to_piece(self.pad_token_id),
            "cls_token": self.sp_model.id_to_piece(self.cls_token_id),
            "mask_token": self.sp_model.id_to_piece(self.mask_token_id),
            "bos_token": self.sp_model.id_to_piece(self.bos_token_id),
            "eos_token": self.sp_model.id_to_piece(self.eos_token_id)
        }
        with open(os.path.join(save_directory, 'special_tokens_map.json'), 'w') as f:
            json.dump(special_tokens_map, f)

        super().save_pretrained(save_directory, **kwargs)
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if filename_prefix is None:
            filename_prefix = ""
        
        # 保存词汇表到纯文本文件
        vocab_filename = os.path.join(save_directory, f"{filename_prefix}vocab.txt")
        with open(vocab_filename, 'w', encoding='utf-8') as f:
            for i in range(self.vocab_size):
                token = self._convert_id_to_token(i)
                f.write(f"{token}\n")

        # 返回保存的文件路径
        return (vocab_filename,)
        
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        if isinstance(pretrained_model_name_or_path, str) and os.path.isdir(pretrained_model_name_or_path):
            model_file = os.path.join(pretrained_model_name_or_path, 'spiece.model')
        else:
            model_file = pretrained_model_name_or_path
        return cls(model_file, *args, **kwargs)


# 创建自定义的tokenizer
tokenizer = CustomSentencePieceTokenizer(model_file=model_path)

# 保存tokenizer到指定目录
output_dir = f'{data_path_root}/custom_tokenizer2'
tokenizer.save_pretrained(output_dir)

# 加载已保存的tokenizer
tokenizer = CustomSentencePieceTokenizer.from_pretrained(output_dir)


encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)

print("Encoded:", encoded)
print("Decoded:", decoded)

Encoded: [267, 1022, 346, 428, 11, 266, 754, 346, 391, 186, 265]
Decoded: 举头望明月,低头思故乡。


In [11]:
tokenizer.tokenize(text)

['▁', '举', '头', '望', '明月', ',', '低', '头', '思', '故乡', '。']

In [12]:
len(tokenizer)

10000

# 4 tiktoken

In [102]:
from tiktoken._educational import *

# # Train a BPE tokeniser on a small amount of text
# enc = train_simple_encoding()

# Visualise how the GPT-4 encoder encodes text
enc = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
enc.encode("tokenization")

enc.encode("举头望")# 9 字符

[48;5;167mt[48;5;179mo[48;5;185mk[48;5;77me[48;5;80mn[48;5;68mi[48;5;134mz[48;5;167ma[48;5;179mt[48;5;185mi[48;5;77mo[48;5;80mn[0m
[48;5;167mt[48;5;179mo[48;5;185mk[48;5;77me[48;5;80mn[48;5;68mi[48;5;134mz[48;5;167ma[48;5;179mt[48;5;185mi[48;5;77mon[0m
[48;5;167mt[48;5;179mo[48;5;185mk[48;5;77me[48;5;80mn[48;5;68mi[48;5;134mz[48;5;167mat[48;5;185mi[48;5;77mon[0m
[48;5;167mt[48;5;179mo[48;5;185mk[48;5;77men[48;5;68mi[48;5;134mz[48;5;167mat[48;5;185mi[48;5;77mon[0m
[48;5;167mt[48;5;179mo[48;5;185mk[48;5;77men[48;5;68mi[48;5;134mz[48;5;167mat[48;5;185mion[0m
[48;5;167mt[48;5;179mo[48;5;185mk[48;5;77men[48;5;68mi[48;5;134mz[48;5;167mation[0m
[48;5;167mt[48;5;179mo[48;5;185mk[48;5;77men[48;5;68miz[48;5;167mation[0m
[48;5;167mt[48;5;179mok[48;5;77men[48;5;68miz[48;5;167mation[0m
[48;5;167mt[48;5;179moken[48;5;68miz[48;5;167mation[0m
[48;5;167mt[48;5;179moken[48;5;68mization[0m
[48;5;167mtoken[48;5;68mizati

[3574, 122, 65455, 4916, 249]

In [107]:
import tiktoken

# 创建一个编码器实例
encoder = tiktoken.get_encoding("cl100k_base")  # 使用特定的编码方案
# 编码文本
text = "举头望"
tokens = encoder.encode(text)
print("编码",tokens)  # 输出 token ID 列表
# 解码 token
decoded_text = encoder.decode(tokens)
print("解码",decoded_text)  # 输出原始文本
encoder.encode("tokenization")

编码 [3574, 122, 65455, 4916, 249]
解码 举头望


[5963, 2065]

In [105]:
import tiktoken

# 创建一个编码器实例
encoder = tiktoken.get_encoding("gpt2")  # 使用特定的编码方案
# 编码文本
text = "举头望"
tokens = encoder.encode(text)
print("编码",tokens)  # 输出 token ID 列表
# 解码 token
decoded_text = encoder.decode(tokens)
print("解码",decoded_text)  # 输出原始文本
encoder.encode("tokenization")

编码 [10310, 122, 13783, 112, 17312, 249]
解码 举头望


[30001, 1634]

In [109]:
# 获取词汇表大小
vocab_size = encoder.n_vocab
print(f"Vocabulary size: {vocab_size}")

# 获取单个 token
token_id = 65455
token_str = encoder.decode([token_id])
print(f"Token ID {token_id} corresponds to: {token_str}")

Vocabulary size: 100277
Token ID 3574 corresponds to: �


In [111]:
from transformers import PreTrainedTokenizer
import tiktoken

class CustomTiktokenTokenizer(PreTrainedTokenizer):
    def __init__(self, *args, **kwargs):
        
        self.encoder = tiktoken.get_encoding("cl100k_base")
        super().__init__(*args, **kwargs)

    @property
    def vocab_size(self):
        return self.encoder.n_vocab

    def _tokenize(self, text, **kwargs):
        return [str(token) for token in self.encoder.encode(text)]

    def _convert_token_to_id(self, token):
        if isinstance(token, str):
            token = int(token)
        return token

    def _convert_id_to_token(self, index):
        return str(index)

    def encode(self, text, **kwargs):
        return self.encoder.encode(text)

    def decode(self, ids, **kwargs):
        return self.encoder.decode(ids)

    def get_vocab(self):
        # 返回词汇表字典
        return {str(i): i for i in range(self.vocab_size)}

    def save_pretrained(self, save_directory, **kwargs):
        # 保存词汇表文件
        with open(os.path.join(save_directory, 'vocab.json'), 'w') as f:
            json.dump(self.get_vocab(), f)

        # 保存特殊标记
        special_tokens_map = {
            "unk_token": str(self.encoder.eot_token),
            "sep_token": str(self.encoder.eot_token),
            "pad_token": str(self.encoder.eot_token),
            "cls_token": str(self.encoder.eot_token),
            "mask_token": str(self.encoder.eot_token),
            "bos_token": str('<|startoftext|>'),
            "eos_token": str(self.encoder.eot_token)
        }

        
        with open(os.path.join(save_directory, 'special_tokens_map.json'), 'w') as f:
            json.dump(special_tokens_map, f)

        # 保存配置文件
        with open(os.path.join(save_directory, 'tokenizer_config.json'), 'w') as f:
            f.write(json.dumps({
                "model_max_length": 2048,
                "padding_side": "right",
                "truncation_side": "right"
            }))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        return cls(*args, **kwargs)

# 示例使用
tokenizer = CustomTiktokenTokenizer()


# 保存tokenizer到指定目录
output_dir = f'{data_path_root}/custom_tokenizer4'
tokenizer.save_pretrained(output_dir)

# 加载 tokenizer
loaded_tokenizer = CustomTiktokenTokenizer.from_pretrained(output_dir)
encoded_loaded = loaded_tokenizer.encode(text)
decoded_loaded = loaded_tokenizer.decode(encoded_loaded)

print("Loaded and Encoded:", encoded_loaded)
print("Loaded and Decoded:", decoded_loaded)



Loaded and Encoded: [3574, 122, 65455, 4916, 249]
Loaded and Decoded: 举头望


#### 4.3 Extending tiktoken
添加新的token

In [112]:
cl100k_base = tiktoken.get_encoding("cl100k_base")

# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
enc = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|im_start|>": 100264,
        "<|im_end|>": 100269,
    }
)

In [114]:
# 需要传参数 ：allowed_special
print(enc.encode("<|im_end|>",allowed_special=set(enc._special_tokens.keys())))
enc._special_tokens

[100269]


{'<|endoftext|>': 100257,
 '<|fim_prefix|>': 100258,
 '<|fim_middle|>': 100259,
 '<|fim_suffix|>': 100260,
 '<|endofprompt|>': 100276,
 '<|im_start|>': 100264,
 '<|im_end|>': 100269}

In [78]:
enc.max_token_value
encoder_config = {
    "name": enc.name,
    "pat_str": enc._pat_str,
    "mergeable_ranks": enc._mergeable_ranks,
    "special_tokens": enc._special_tokens
}
encoder_config

{'name': 'cl100k_im',
 'pat_str': "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}++|\\p{N}{1,3}+| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*+|\\s++$|\\s*[\\r\\n]|\\s+(?!\\S)|\\s",
 'mergeable_ranks': {b'!': 0,
  b'"': 1,
  b'#': 2,
  b'$': 3,
  b'%': 4,
  b'&': 5,
  b"'": 6,
  b'(': 7,
  b')': 8,
  b'*': 9,
  b'+': 10,
  b',': 11,
  b'-': 12,
  b'.': 13,
  b'/': 14,
  b'0': 15,
  b'1': 16,
  b'2': 17,
  b'3': 18,
  b'4': 19,
  b'5': 20,
  b'6': 21,
  b'7': 22,
  b'8': 23,
  b'9': 24,
  b':': 25,
  b';': 26,
  b'<': 27,
  b'=': 28,
  b'>': 29,
  b'?': 30,
  b'@': 31,
  b'A': 32,
  b'B': 33,
  b'C': 34,
  b'D': 35,
  b'E': 36,
  b'F': 37,
  b'G': 38,
  b'H': 39,
  b'I': 40,
  b'J': 41,
  b'K': 42,
  b'L': 43,
  b'M': 44,
  b'N': 45,
  b'O': 46,
  b'P': 47,
  b'Q': 48,
  b'R': 49,
  b'S': 50,
  b'T': 51,
  b'U': 52,
  b'V': 53,
  b'W': 54,
  b'X': 55,
  b'Y': 56,
  b'Z': 57,
  b'[': 58,
  b'\\': 59,
  b']': 60,
  b'^': 61,
  b'_': 62,
  b'`': 63,
  b'a': 64,
  b'b': 65,
  b'c': 66,
  b'd': 67,
 

In [86]:
# 重新创建编码器
loaded_enc = tiktoken.Encoding(
    name=encoder_config["name"],
    pat_str=encoder_config["pat_str"],
    mergeable_ranks=encoder_config["mergeable_ranks"],
    special_tokens=encoder_config["special_tokens"]
)

# # 测试加载的编码器
# test_text = "举头望明月"
encoded_test = [loaded_enc._special_tokens["<|im_end|>"]] + loaded_enc.encode(test_text) + [loaded_enc._special_tokens["<|endoftext|>"]]
print("Encoded text with special tokens:", encoded_test)

decoded_test = loaded_enc.decode(encoded_test)
print("Decoded text:", decoded_test)

Encoded text with special tokens: [100269, 3574, 122, 65455, 4916, 249, 31958, 9953, 100257]
Decoded text: <|im_end|>举头望明月<|endoftext|>


In [84]:
loaded_enc._special_tokens

{'<|endoftext|>': 100257,
 '<|fim_prefix|>': 100258,
 '<|fim_middle|>': 100259,
 '<|fim_suffix|>': 100260,
 '<|endofprompt|>': 100276,
 '<|im_start|>': 100264,
 '<|im_end|>': 100269}

In [36]:
!ls /Users/wangaijun/pythoncode/github/data/text/custom_tokenizer

sentencepiece.bpe.model special_tokens_map.json tokenizer_config.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [37]:
!ls /Users/wangaijun/pythoncode/github/data/text/custom_tokenizer2

special_tokens_map.json tokenizer_config.json   vocab.txt
spiece.model            vocab.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [34]:
!ls /Users/wangaijun/pythoncode/github/data/text/custom_tokenizer3

added_tokens.json       tokenizer_config.json
special_tokens_map.json vocab.txt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [35]:
!ls /Users/wangaijun/pythoncode/github/data/text/custom_tokenizer4

special_tokens_map.json tokenizer_config.json   vocab.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
