# 词表扩充

在中文语料库上训练一个中文 tokenizer 模型，然后将其与 LLaMA 原生的 tokenizer 合并：通过组合两者的词汇表，最终获得一个合并后的 tokenizer 模型。

今天的工作是获得一个中文的 BPE 分词模型。

## 数据预处理

In [3]:
# Build the training corpus: read the raw novel text, keep only the useful
# lines, and write them out one sentence per line.
with open("./data/轮回乐园.txt", "r", encoding="utf-8") as fp:
    data = fp.read().strip().split("\n")

# Trim each line, then drop separator lines (those containing "---") and
# lines shorter than two characters. After stripping, a line can never be a
# lone space, so no separate check for that is needed.
trimmed_lines = (raw_line.strip() for raw_line in data)
sentences = [
    line for line in trimmed_lines if len(line) >= 2 and "---" not in line
]

with open("data/corpus.txt", "w", encoding="utf-8") as fp:
    fp.write("\n".join(sentences))

In [1]:
import sentencepiece as spm

In [2]:
# Train the Chinese SentencePiece model on the preprocessed corpus.
train_options = dict(
    input='./data/corpus.txt',
    model_prefix='tokenizer_spm_model',
    vocab_size=5000,
    # Languages with rich character sets (Chinese, Japanese) use 0.9995.
    character_coverage=0.9995,
    # One of: unigram, bpe, word, char.
    model_type='bpe',
    # A Chinese character takes 3 bytes in UTF-8; the limit is 2048 bytes.
    max_sentence_length=2048,
)
spm.SentencePieceTrainer.train(**train_options)

## 合并 LLaMA2 词表和中文词表

In [3]:
import os

# Request the pure-Python protobuf implementation.
# NOTE(review): presumably this has to be set before the imports that follow,
# so that it is already in place when protobuf is first loaded — confirm.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from transformers import LlamaTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm

In [None]:
# Locations of the original LLaMA2 SentencePiece model and of the Chinese
# model (file name matches the `tokenizer_spm_model` training prefix).
# NOTE(review): `llama2_tokenizer_dir` is named like a directory but points
# at a single model file — confirm this is what `from_pretrained` should get.
llama2_tokenizer_dir = "llama2_tokenizer/tokenizer.model"
chinese_sp_model_file = "tokenizer_spm_model.model"

# Original LLaMA2 tokenizer, loaded through the transformers wrapper.
llama2_tokenizer = LlamaTokenizer.from_pretrained(llama2_tokenizer_dir)

# Chinese tokenizer, loaded as a raw SentencePiece processor.
chinese_sp_model = spm.SentencePieceProcessor()
chinese_sp_model.Load(chinese_sp_model_file)

In [None]:
# Deserialize both SentencePiece models into ModelProto messages so their
# piece lists can be read and modified directly.
llama2_spm = sp_pb2_model.ModelProto()
chinese_spm = sp_pb2_model.ModelProto()

llama2_spm.ParseFromString(llama2_tokenizer.sp_model.serialized_model_proto())
chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())

In [None]:
# Report the vocabulary sizes and the special-token setup of the original
# LLaMA2 tokenizer. These are printed explicitly: as bare expressions their
# values would be discarded (a notebook only echoes a cell's last expression).
print(len(llama2_tokenizer), len(chinese_sp_model))
print(llama2_tokenizer.all_special_tokens)
print(llama2_tokenizer.all_special_ids)
print(llama2_tokenizer.special_tokens_map)

# Add Chinese tokens to the LLaMA2 tokenizer.
# Collect the existing pieces in a set so each membership test is O(1).
llama_spm_tokens_set = {p.piece for p in llama2_spm.pieces}
print(f"Before: {len(llama_spm_tokens_set)}")

for p in chinese_spm.pieces:
    piece = p.piece
    if piece not in llama_spm_tokens_set:
        # SentencePiece is a nested message type of ModelProto; reach it via
        # the class instead of building a throwaway ModelProto each time.
        new_p = sp_pb2_model.ModelProto.SentencePiece()
        new_p.piece = piece
        # Every newly added piece is given a score of 0.
        new_p.score = 0
        llama2_spm.pieces.append(new_p)
print(f"New model pieces: {len(llama2_spm.pieces)}")

In [None]:
## save
output_sp_dir = 'llama2_chinese'
os.makedirs(output_sp_dir, exist_ok=True)
with open(output_sp_dir + '/chinese_llama2.model', 'wb') as f:
    f.write(llama2_spm.SerializeToString())
tokenizer = LlamaTokenizer(vocab_file=output_sp_dir + '/chinese_llama2.model')

In [None]:
# Export the merged tokenizer in Hugging Face format.
output_hf_dir = 'llama2_chinese'

# Create the target directory if it does not exist yet.
os.makedirs(output_hf_dir, exist_ok=True)
tokenizer.save_pretrained(save_directory=output_hf_dir)

# Status message, shown as the cell's result.
f"Chinese-LLaMA tokenizer has been saved to {output_hf_dir}"

In [None]:
# test
llama_tokenizer = LlamaTokenizer.from_pretrained(llama2_tokenizer_dir)
chinese_llama_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)
tokenizer.all_special_ids
tokenizer.all_special_tokens
tokenizer.special_tokens_map
text = "白日依山尽，黄河入海流。欲穷千里目，更上一层楼"
f"test text: {text}"
f"tokenized by llama2 tokenizer: {llama_tokenizer.tokenize(text)}"
f"tokenized by chinese llama tokenizer: {chinese_llama_tokenizer.tokenize(text)}"