https://mp.weixin.qq.com/s?__biz=MzUyMzg0ODY0Ng==&mid=2247483839&idx=1&sn=e3a1e7753a283f9dcc8ecdf5354230eb&chksm=fa371f16cd409600b2de38a95376bd1f1d26919a67a804b30c53dfc96a066763ba2036315440&mpshare=1&scene=1&srcid=0330XnX8Q7x1BsPQCGHecsFv&sharer_sharetime=1585548755034&sharer_shareid=d88df2ec93a2e5e478eaa39ef5a82e2f&key=fa0f4cfef200c085620487de7d82882718619ab9893ba6f4b3100366149d2e86a8943803d07a5e6012ad4f5d6c64ccdf38ecc4d80ef93dc6f5a8742dff9388bba18d9b211bdd1a94a05f2fd7d897de7a&ascene=1&uin=MjA1MjAyODkxNg%3D%3D&devicetype=Windows+10&version=62080079&lang=zh_CN&exportkey=Ab9j2hoNaEcCHzzhs101Hxo%3D&pass_ticket=oQV%2B4QgocE%2B79igKS84ByQiBvNr7zSd0fGMluYqPBYLpNaLxouEPfg16iqZpY1vp

In [2]:
# 导包
import math
import re
import numpy as np
import tensorflow as tf
from collections import Counter
import re

In [4]:
# Data preprocessing
# Location of the raw corpus file
DATA_PATH = '../dataset/poetry.txt'
# Maximum length of a single poem line
MAX_LEN = 64
# Forbidden markers: a poem containing any of these is dropped
DISALLOWED_WORDS = ['（', '）', '(', ')', '__', '《', '》', '【', '】', '[', ']']
# Each element of this list holds one poem (one line of the corpus)
poetry = []
# Load every line of poetry.txt, then filter line by line
with open(DATA_PATH,'r',encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        # Split on ':'; only lines that yield exactly two fields are kept
        fields = re.split(r'[:]',line)
        if len(fields) == 2:
            # Keep the second field (the poem body); the first field is unused
            content = fields[1]
            # NOTE(review): the length is measured before the newline is
            # stripped below, so a trailing '\n' counts toward the limit.
            # The "- 2" presumably reserves room for start/end markers — confirm.
            if len(content) <= MAX_LEN - 2:
                # Discard poems that contain any forbidden marker
                if not any(word in content for word in DISALLOWED_WORDS):
                    # Store the poem without its newline character
                    poetry.append(content.replace('\n',''))

In [5]:
# Preview up to the first five poems. Slicing (instead of indexing 0..4)
# avoids an IndexError when fewer than five poems survived the filtering.
for poem in poetry[:5]:
    print(poem)

寒随穷律变，春逐鸟声开。初风飘带柳，晚雪间花梅。碧林青旧竹，绿沼翠新苔。芝田初雁去，绮树巧莺来。
晚霞聊自怡，初晴弥可喜。日晃百花色，风动千林翠。池鱼跃不同，园鸟声还异。寄言博通者，知予物外志。
夏律昨留灰，秋箭今移晷。峨嵋岫初出，洞庭波渐起。桂白发幽岩，菊黄开灞涘。运流方可叹，含毫属微理。
寒惊蓟门叶，秋发小山枝。松阴背日转，竹影避风移。提壶菊花岸，高兴芙蓉池。欲知凉气早，巢空燕不窥。
山亭秋色满，岩牖凉风度。疏兰尚染烟，残菊犹承露。古石衣新苔，新巢封古树。历览情无极，咫尺轮光暮。


In [6]:
# Count character frequencies and drop the characters that occur too rarely
# Minimum number of occurrences required for a character to be kept
MIN_WORD_FREQUENCY = 8
# Counter tallies individual characters when fed every character of every poem
counter = Counter(char for poem in poetry for char in poem)
# Keep only the characters that reach the minimum frequency
tokens = [
    char
    for char, freq in counter.items()
    if freq >= MIN_WORD_FREQUENCY
]

In [7]:
# Show the first five (character, count) pairs held by the counter
for i, (token, count) in enumerate(counter.items()):
    if i >= 5:
        break
    print(token, "->",count)

寒 -> 2628
随 -> 1040
穷 -> 487
律 -> 119
变 -> 288


In [8]:
# Prepend the special markers: padding, unknown token, start and end
tokens = ["[PAD]", "[NONE]", "[START]", "[END]", *tokens]

In [9]:
# Number every token by its position in the vocabulary list
# Mapping: token -> index
word_idx = {word: idx for idx, word in enumerate(tokens)}
# Mapping: index -> token
idx_word = dict(enumerate(tokens))

In [None]:
# 构建Tokenizer
class Tokenizer:
    """
    分词器
    """
    def __init__(self,tokens):
        # 词汇表大小
        self.dict_size = len(tokens)
        # 生成映射关系
        self.token_id = {} # 映射：词 -> 编号
        self.id_token = {} # 映射：编号 -> 词
        # 各个特殊标记的编号id,方便其他地方使用
        self.start_id = self.token_id['[START]']
        self.end_id = self.token_id["[END]"]
        self.none_id = self.token_id["[NONE]"]
        self.pad_id = self.token_id["[PAD]"]
    def id_to_token(self,token_id):
        """
        编号 -> 词
        """
        return self.id_token.get(token_id)
    def token_to_id(self,token):
        """
        词 -> 编号
        """
        return self.token_id.get(token,self.none_id)
    def encode(self,tokens):
        """
        词列表 -> [START]编号 + 编号列表 + [END]编号
        """
        token_ids = [self.start_id,] # 起始标记
        # 遍历，词转编号
        for token in tokens:
            token_ids.append(self.token_to_id(token))
        token_ids.append(self.end_id)
        return token_ids
    def decode(self,token_ids):
        """
        编号列表 -> 词列表(去掉起始，结束标记)
        """
        