https://mp.weixin.qq.com/s?__biz=MzUyMzg0ODY0Ng==&mid=2247483839&idx=1&sn=e3a1e7753a283f9dcc8ecdf5354230eb&chksm=fa371f16cd409600b2de38a95376bd1f1d26919a67a804b30c53dfc96a066763ba2036315440&mpshare=1&scene=1&srcid=0330XnX8Q7x1BsPQCGHecsFv&sharer_sharetime=1585548755034&sharer_shareid=d88df2ec93a2e5e478eaa39ef5a82e2f&key=fa0f4cfef200c085620487de7d82882718619ab9893ba6f4b3100366149d2e86a8943803d07a5e6012ad4f5d6c64ccdf38ecc4d80ef93dc6f5a8742dff9388bba18d9b211bdd1a94a05f2fd7d897de7a&ascene=1&uin=MjA1MjAyODkxNg%3D%3D&devicetype=Windows+10&version=62080079&lang=zh_CN&exportkey=Ab9j2hoNaEcCHzzhs101Hxo%3D&pass_ticket=oQV%2B4QgocE%2B79igKS84ByQiBvNr7zSd0fGMluYqPBYLpNaLxouEPfg16iqZpY1vp

In [2]:
# 导包
import math
import re
import numpy as np
import tensorflow as tf
from collections import Counter
import re

In [4]:
# Data preprocessing
# Path to the corpus; each usable line has the form "title:content"
DATA_PATH = '../dataset/poetry.txt'
# Maximum length of a single poem
MAX_LEN = 64
# Forbidden substrings: poems containing any of them are ignored
DISALLOWED_WORDS = ['（', '）', '(', ')', '__', '《', '》', '【', '】', '[', ']']
# One list element per poem (one poem per line of the file)
poetry = []
# Stream the file line by line instead of materialising it with readlines()
with open(DATA_PATH,'r',encoding='utf-8') as f:
    for line in f:
        # Split the title from the content
        fields = re.split(r'[:]',line)
        # Skip malformed lines (no colon, or more than one)
        if len(fields) != 2:
            continue
        # Keep only the content (the title is not needed later) and drop the
        # line break *before* measuring, so the limit applies to the poem text
        # itself rather than to text + '\n'
        content = fields[1].replace('\n','')
        # Skip poems that are too long
        # (the "- 2" presumably reserves room for the START/END markers - confirm)
        if len(content) > MAX_LEN - 2:
            continue
        # Skip poems containing a forbidden substring
        if any(word in content for word in DISALLOWED_WORDS):
            continue
        poetry.append(content)

In [5]:
# Preview up to five poems; slicing avoids an IndexError when fewer were loaded
for sample in poetry[:5]:
    print(sample)

寒随穷律变，春逐鸟声开。初风飘带柳，晚雪间花梅。碧林青旧竹，绿沼翠新苔。芝田初雁去，绮树巧莺来。
晚霞聊自怡，初晴弥可喜。日晃百花色，风动千林翠。池鱼跃不同，园鸟声还异。寄言博通者，知予物外志。
夏律昨留灰，秋箭今移晷。峨嵋岫初出，洞庭波渐起。桂白发幽岩，菊黄开灞涘。运流方可叹，含毫属微理。
寒惊蓟门叶，秋发小山枝。松阴背日转，竹影避风移。提壶菊花岸，高兴芙蓉池。欲知凉气早，巢空燕不窥。
山亭秋色满，岩牖凉风度。疏兰尚染烟，残菊犹承露。古石衣新苔，新巢封古树。历览情无极，咫尺轮光暮。


In [6]:
# Character frequency statistics: rarely used characters are left out of the vocabulary
# Minimum number of occurrences a character needs in order to be kept
MIN_WORD_FREQUENCY = 8
# Counter tallies individual characters when fed the characters of every poem
counter = Counter(char for poem in poetry for char in poem)
# Keep only the characters that are frequent enough (first-seen order is preserved)
tokens = [token for token in counter if counter[token] >= MIN_WORD_FREQUENCY]

In [7]:
# Peek at the first five (character, count) pairs in the counter's insertion order
for i, (token, count) in enumerate(counter.items()):
    if i >= 5:
        break
    print(token, "->",count)

寒 -> 2628
随 -> 1040
穷 -> 487
律 -> 119
变 -> 288


In [8]:
# Prepend the special markers: padding, unknown token, start marker, end marker
tokens = ["[PAD]", "[NONE]", "[START]", "[END]", *tokens]

In [9]:
# Number every vocabulary entry by its position in `tokens`
# Mapping: index -> token
idx_word = dict(enumerate(tokens))
# Mapping: token -> index (built by inverting the table above)
word_idx = {word: idx for idx, word in idx_word.items()}

In [19]:
# Tokenizer definition
class Tokenizer:
    """
    Tokenizer: converts between vocabulary entries and their integer ids.
    """

    def __init__(self, tokens):
        # Vocabulary size
        self.dict_size = len(tokens)
        # Lookup tables in both directions, numbered by position in `tokens`
        self.token_id = {word: idx for idx, word in enumerate(tokens)}  # token -> id
        self.id_token = dict(enumerate(tokens))  # id -> token
        # Ids of the special markers, exposed for use elsewhere
        self.start_id, self.end_id, self.none_id, self.pad_id = [
            self.token_id[marker]
            for marker in ("[START]", "[END]", "[NONE]", "[PAD]")
        ]

    def id_to_token(self, token_id):
        """
        id -> token (None when the id is not in the vocabulary)
        """
        try:
            return self.id_token[token_id]
        except KeyError:
            return None

    def token_to_id(self, token):
        """
        token -> id (the [NONE] id when the token is not in the vocabulary)
        """
        if token in self.token_id:
            return self.token_id[token]
        return self.none_id

    def encode(self, tokens):
        """
        token sequence -> [START] id + ids of the tokens + [END] id
        """
        body_ids = [self.token_to_id(token) for token in tokens]
        return [self.start_id, *body_ids, self.end_id]

    def decode(self, token_ids):
        """
        id list -> token list with the start and end markers removed
        """
        # Only these two markers are filtered out; every other token is kept
        boundary_markers = {"[START]", "[END]"}
        return [
            token
            for token in map(self.id_to_token, token_ids)
            if token not in boundary_markers
        ]

In [20]:
# Instantiate the tokenizer over the vocabulary (special markers followed by the frequent characters)
tokenizer = Tokenizer(tokens)

In [21]:
# PoetryDataSet definition
class PoetryDataSet:
    """
    Batch generator over a collection of poems.

    Every batch is encoded with the tokenizer, right-padded to the longest
    poem of that batch and yielded as a ``(features, labels)`` pair of 2-D
    arrays, where ``labels`` is ``features`` shifted one position to the left.

    Parameters:
        data: indexable collection of poems (one string per poem).
        tokenizer: object providing ``encode(line)`` and ``pad_id``.
        batch_size: number of poems per batch.
    """

    def __init__(self, data, tokenizer, batch_size):
        # The poems
        self.data = data
        self.total_size = len(self.data)
        # Tokenizer used to turn characters into ids
        self.tokenizer = tokenizer
        # Number of poems per batch
        self.batch_size = batch_size
        # Batches per epoch. __iter__ also yields the final, possibly smaller
        # batch, so round up to keep len(self) equal to the number of batches
        # one pass actually produces.
        self.steps = int(math.ceil(self.total_size / self.batch_size))

    def pad_line(self, line, length, padding=None):
        """
        Bring one encoded line to exactly ``length`` items.

        Shorter lines are right-padded with ``padding`` (the tokenizer's pad
        id when None); longer lines are truncated.
        """
        if padding is None:
            padding = self.tokenizer.pad_id
        padding_length = length - len(line)
        if padding_length > 0:
            return line + [padding] * padding_length
        return line[:length]

    def __len__(self):
        return self.steps

    def __iter__(self):
        # Draw a fresh random order for this pass. Shuffling indices instead
        # of self.data keeps the caller's collection untouched.
        order = np.random.permutation(self.total_size)
        # One pass over the data, yielding one batch at a time
        for start in range(0, self.total_size, self.batch_size):
            batch_lines = [self.data[i] for i in order[start:start + self.batch_size]]
            # +2 because tokenizer.encode adds the START and END markers
            padded_length = max(map(len, batch_lines)) + 2
            batch_data = np.array([
                self.pad_line(self.tokenizer.encode(str_line), padded_length)
                for str_line in batch_lines
            ])
            # features: everything but the last column; labels: everything but the first
            yield batch_data[:, :-1], batch_data[:, 1:]

    def generator(self):
        """
        Endless stream of batches: restarts a freshly shuffled pass whenever one ends.
        """
        while True:
            yield from self.__iter__()

In [22]:
# Number of poems per training batch
BATCH_SIZE = 16
# Batch generator over the filtered poems, encoded with the tokenizer
dataset = PoetryDataSet(poetry,tokenizer,BATCH_SIZE)

In [24]:
# Build the model layer by layer
model = tf.keras.Sequential()
# Embedding layer: one 150-dimensional vector per vocabulary id
model.add(tf.keras.layers.Embedding(input_dim=tokenizer.dict_size, output_dim=150))
# First LSTM layer (returns the full sequence so the next layer sees every step)
model.add(tf.keras.layers.LSTM(150, dropout=0.5, return_sequences=True))
# Second LSTM layer
model.add(tf.keras.layers.LSTM(150, dropout=0.5, return_sequences=True))
# TimeDistributed applies the softmax Dense layer to the output of every time step
model.add(
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(tokenizer.dict_size, activation='softmax')
    )
)

In [25]:
# Print the layer-by-layer summary (output shapes and parameter counts)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 150)         515250    
_________________________________________________________________
lstm (LSTM)                  (None, None, 150)         180600    
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 150)         180600    
_________________________________________________________________
time_distributed (TimeDistri (None, None, 3435)        518685    
Total params: 1,395,135
Trainable params: 1,395,135
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Compile the model
model.compile(
    # Adam with its default settings
    optimizer = tf.keras.optimizers.Adam(),
    # Sparse variant: the labels yielded by the dataset are integer token ids, not one-hot vectors
    loss = tf.keras.losses.sparse_categorical_crossentropy
)

In [30]:
# Train the model
model.fit(
    # generator() never terminates, so steps_per_epoch tells Keras how many
    # batches make up one epoch
    dataset.generator(),
    steps_per_epoch=dataset.steps,
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1acc91e7898>

In [31]:
# Prediction
# The words have to be converted to ids first
token_ids = [tokenizer.token_to_id(word) for word in ["月", "光", "静", "谧"]]
# Run the prediction on an explicit (1, seq_len) array: a bare Python list is
# the form Keras documents for multi-input models, so a nested list is ambiguous
result = model.predict(np.array([token_ids]))

In [32]:
# Show the raw prediction: one probability distribution over the vocabulary per input position
print(result)

[[[4.8102291e-05 1.0659970e-02 1.9756358e-06 ... 6.4293868e-06
   9.1382490e-06 2.0419389e-05]
  [1.0790689e-06 1.4589122e-02 1.2457634e-08 ... 4.3525708e-07
   5.6293044e-07 6.2695214e-08]
  [6.8445984e-06 8.2051912e-03 8.4110994e-09 ... 5.7726442e-07
   3.0468429e-06 4.0382113e-07]
  [1.1436045e-04 1.1967261e-02 6.7134884e-08 ... 1.8703339e-08
   1.1075260e-07 1.0855107e-06]]]


In [33]:
# For varied output, sample from the predicted probability distribution
def predict(model, token_ids, top_k=100, num_reserved=3):
    """
    Sample the id of the next token from the model's prediction.

    The candidate set is the ``top_k`` most probable tokens for the last
    position; one of them is drawn at random, weighted by probability.

    Parameters:
        model: object whose ``predict(batch)`` returns an array indexed as
            [sample, time step, token id].
        token_ids: ids of the sequence generated so far.
        top_k: number of most probable candidates to sample from (default 100).
        num_reserved: number of leading vocabulary ids that must never be
            produced (default 3, i.e. [PAD], [NONE] and [START]).

    Returns:
        The id of one token; never one of the first ``num_reserved`` ids.

    Raises:
        ValueError: if ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        # [-0:] would silently select the whole vocabulary
        raise ValueError("top_k must be at least 1")
    # Explicit (1, seq_len) batch; a nested Python list is ambiguous for Keras
    batch = np.asarray([token_ids])
    # [0, -1] keeps only the prediction for the newest position of the single
    # sample; the slice drops the reserved ids
    probas = model.predict(batch)[0, -1, num_reserved:]
    # Indices of the top_k probabilities, most probable first
    top_args = probas.argsort()[-top_k:][::-1]
    # Renormalise in float64 so the probabilities sum to 1 accurately
    top_p = probas[top_args].astype(np.float64)
    top_p = top_p / top_p.sum()
    # Draw one candidate according to its probability
    target_index = np.random.choice(len(top_p), p=top_p)
    # The reserved ids were sliced off above, so shift back to real vocabulary ids
    return top_args[target_index] + num_reserved

In [34]:
# Seed the sequence with the opening words; the trailing [END] id added by encode is dropped
token_ids = tokenizer.encode("清风明月")[:-1]
while True:
    # Stop once the sequence holds 13 ids
    if len(token_ids) >= 13:
        break
    # Sample the id of the next token and record it
    target = predict(model, token_ids)
    token_ids.append(target)
    # Stop as soon as the model emits the end marker
    if target == tokenizer.end_id:
        break
# decode strips the start/end markers before the text is joined
print("".join(tokenizer.decode(token_ids)))

W0408 12:09:38.875421 16608 def_function.py:597] 5 out of the last 5 calls to <function Model.make_predict_function.<locals>.predict_function at 0x000001ACC79700D0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
W0408 12:09:39.472723 16608 def_function.py:597] 6 out of the last 6 calls to <function Model.make_predict_function.<locals>.predict_function at 0x000001ACC79700D0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option tha

清风明月接，雨水夜山还。
