In [4]:
# encoding=utf-8
from pathlib import Path
import jieba
import pickle
import numpy as np

In [6]:
# Locate the raw novel text and the directory that will receive the tokenized
# dataset. Both paths are relative to the current working directory.
input_file_path = (Path().resolve() / '../data/txt/修订版天龙八部.txt').resolve()
output_dir_path = (Path().resolve() / '../data/TianLong_jieba').resolve()

# Fail early, and report which path was looked up, if the corpus is missing.
assert input_file_path.exists(), f"input file not found: {input_file_path}"

# exist_ok=True already makes this a no-op for an existing directory, so a
# separate exists() check is unnecessary. It still raises FileExistsError if
# the path exists but is not a directory.
output_dir_path.mkdir(parents=True, exist_ok=True)

In [15]:
# Load the whole novel into memory as a single UTF-8 string.
data = input_file_path.read_text(encoding='utf-8')
print(f"length of dataset in characters: {len(data):,}")

# Segment the text with jieba; the sorted set of distinct segments (words,
# not single characters) forms the vocabulary.
seg_list = jieba.lcut(data)
tokens = sorted(set(seg_list))
vocab_size = len(tokens)
print(f"vocab size: {vocab_size:,}")
print(f"length of sentences in tokens: {len(seg_list):,}")

length of dataset in characters: 1,258,853
vocab size: 50,912
length of sentences in tokens: 839,971


In [14]:
# Lookup tables between vocabulary tokens (jieba words, not single characters)
# and their integer ids. Ids follow the order of `tokens`.
stoi = {tok: idx for idx, tok in enumerate(tokens)}
itos = {idx: tok for idx, tok in enumerate(tokens)}


def encode(s):
    """Encode text into a list of vocabulary ids.

    Args:
        s: Either a raw string, which is first segmented with ``jieba.lcut``
            so that it is split into words like the corpus was, or an
            iterable of already-segmented tokens.

    Returns:
        list[int]: One id per token.

    Raises:
        KeyError: If a token is not present in ``stoi``.
    """
    # Iterating a str directly would yield single characters, which do not
    # match the word-level vocabulary, so segment strings first.
    if isinstance(s, str):
        s = jieba.lcut(s)
    return [stoi[tok] for tok in s]


def decode(l):
    """Decode an iterable of vocabulary ids back into a string.

    Args:
        l: Iterable of integer ids that are keys of ``itos``.

    Returns:
        str: The corresponding tokens concatenated without separators.

    Raises:
        KeyError: If an id is not present in ``itos``.
    """
    return ''.join(itos[idx] for idx in l)

In [16]:
# Hold out the tail of the token stream for validation: everything before the
# 90% mark is training data, the remainder is validation data.
n = len(seg_list)
split_idx = int(n * 0.9)
train_data, val_data = seg_list[:split_idx], seg_list[split_idx:]

# Map each split from tokens to integer ids.
train_ids, val_ids = encode(train_data), encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

train has 755,973 tokens
val has 83,998 tokens


In [17]:
# Ids are stored as uint16, which can only represent values up to 65535.
# Check that the largest id (vocab_size - 1) fits, so that a larger vocabulary
# fails here with a clear message instead of producing corrupted ids or an
# obscure conversion error.
max_id = np.iinfo(np.uint16).max
assert vocab_size - 1 <= max_id, (
    f"vocab size {vocab_size:,} does not fit in uint16 (max id {max_id:,})"
)

# Export both splits to bin files as raw uint16 values.
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(output_dir_path / 'train.bin')
val_ids.tofile(output_dir_path / 'val.bin')

# Save the vocabulary size and both lookup tables alongside the data so the
# ids can be encoded/decoded later.
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(output_dir_path / 'meta.pkl', 'wb') as f:
    pickle.dump(meta, f)

# Test: check which sample tokens are missing from the vocabulary

In [None]:
# Sample tokens supplied by ChatGPT (a hard-coded list, not produced by the
# jieba segmentation above).
test_tokens = [
    "左子穆", "道", "是", "跟", "神农帮", "动了手", "么",
    "凌霄子", "道", "是啊", "他们", "把守", "了",
    "各处", "要道", "说是", "不到", "明日", "天亮",
    "谁也", "不许", "下山", "梁上", "那", "少女",
    "口里", "咬着", "瓜子", "两只", "脚", "一荡",
    "一荡", "的", "忽然", "将", "一粒", "瓜子",
    "往", "段誉", "头上", "掷去", "正中", "他的",
    "额头", "笑道", "喂", "你", "吃不吃", "瓜子",
]

# Report every sample token that has no entry in the vocabulary, in list
# order and including repeats.
missing_tokens = [tok for tok in test_tokens if tok not in stoi]
for token in missing_tokens:
    print(f"token '{token}' not in vocab")


token '神农帮' not in vocab
token '动了手' not in vocab
token '凌霄子' not in vocab
token '是啊' not in vocab
token '要道' not in vocab
token '说是' not in vocab
token '谁也' not in vocab
token '咬着' not in vocab
token '掷去' not in vocab
token '他的' not in vocab
token '笑道' not in vocab
token '吃不吃' not in vocab
