In [1]:
# Configuration for building the poetry dataset and vocabulary.
data_path = "./data/poetry.txt"  # path of the raw corpus file; make_dataset reads it with one poem per line
max_length = 48   # poems whose cleaned content is longer than this many characters are dropped

In [2]:
from datasets import Dataset, DatasetDict
# 制作数据集
def make_dataset(data_path, max_length, save_path="./data/data"):
    """Parse the raw poetry corpus, keep the short poems and save them as a Dataset.

    Every input line is expected to look like ``《title》content``. A line is
    skipped when splitting it on "》" does not yield exactly two fields, or
    when its cleaned content is longer than ``max_length`` characters.

    Args:
        data_path: Path of the UTF-8 text file that holds one poem per line.
        max_length: Maximum number of characters allowed in a poem's content
            (measured after spaces and newlines have been removed).
        save_path: Directory handed to ``Dataset.save_to_disk``. Defaults to
            "./data/data", the location that used to be hard-coded.

    Returns:
        The list of cleaned poem contents, in file order. The matching titles
        are stored only in the saved dataset (columns 'titles' and 'poetry').
    """
    poetry = []  # one cleaned poem per element
    titles = []  # title of the poem at the same index in `poetry`

    # Iterate over the file handle so the corpus is streamed line by line
    # instead of being loaded into memory all at once.
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Plain string split (not a regex): the closing title mark "》"
            # separates the title from the poem body.
            fields = line.split("》")
            if len(fields) != 2:  # malformed line: no "》" or more than one
                continue

            # Remove spaces and the trailing newline from the poem body.
            content = fields[1].replace(' ', '').replace('\n', '')
            # Remove the opening title mark and any spaces from the title.
            title = fields[0].replace("《", '').replace(' ', '')

            # Drop poems that are longer than the configured limit.
            if len(content) > max_length:
                continue

            poetry.append(content)
            titles.append(title)

    length = len(titles)
    print(f"总共有{length}首古诗")
    print(f"数据集大小: {length}")

    # Build the Dataset object and persist it to disk.
    dataset = Dataset.from_dict({'titles': titles, 'poetry': poetry})
    dataset.save_to_disk(save_path)
    print(f"数据集已保存到 {save_path}")
    return poetry

In [3]:
# 制作字典
import json
def make_vocab(poetry, vocab_path="./data/vocab.json"):
    """Build a character-level vocabulary from the poems and save it as JSON.

    IDs 0-3 are reserved for the special tokens <PAD>, <UNK>, <SOS> and <EOS>;
    every other character receives the next free ID in order of first
    appearance.

    Args:
        poetry: Iterable of poem strings; each string is scanned character by
            character.
        vocab_path: Destination JSON file. Defaults to "./data/vocab.json",
            the location that used to be hard-coded. Missing parent
            directories are created.

    Returns:
        None. The vocabulary is written to ``vocab_path`` as a JSON object
        with the keys 'char_to_id' and 'id_to_char'.
    """
    from pathlib import Path  # local import: only needed to create the output directory

    # Character -> ID mapping, seeded with the special tokens.
    char_to_id = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
    for sentence in poetry:
        for char in sentence:
            # len() is evaluated before insertion, so a new character gets the
            # next unused ID; characters already present are left untouched.
            char_to_id.setdefault(char, len(char_to_id))

    # Reverse mapping, ID -> character.
    id_to_char = {v: k for k, v in char_to_id.items()}
    print("字典长度: %d" % (len(char_to_id)))

    # NOTE: JSON object keys are always strings, so the integer keys of
    # id_to_char are written as "0", "1", ...; code that loads this file has
    # to convert them back with int() if it needs integer lookups.
    vocab = {'char_to_id': char_to_id, 'id_to_char': id_to_char}
    out_file = Path(vocab_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)

    print(f"词汇表已保存到 {vocab_path}")

In [4]:
# Run the pipeline: build and save the dataset, then build the vocabulary
# from the list of poems that make_dataset returns.
poetry = make_dataset(data_path, max_length)
make_vocab(poetry)

总共有182384首古诗
数据集大小: 182384


Saving the dataset (0/1 shards):   0%|          | 0/182384 [00:00<?, ? examples/s]

数据集已保存到 ./data/data
字典长度: 9090
词汇表已保存到 ./data/vocab.json
