In [7]:
# Load the full corpus into memory as a single string
file = "tangshi300.txt"
with open(file, mode='r', encoding='utf-8') as f:
    text = f.read()
# Show the corpus length, then a 180-character preview, one per line
print(len(text), text[:180], sep='\n')

29405
唐诗300首 1-50

010杜甫：佳人

绝代有佳人，幽居在空谷。
自云良家子，零落依草木。
关中昔丧乱，兄弟遭杀戮。
官高何足论，不得收骨肉。
世情恶衰歇，万事随转烛。
夫婿轻薄儿，新人美如玉。
合昏尚知时，鸳鸯不独宿。
但见新人笑，那闻旧人哭！
在山泉水清，出山泉水浊。
侍婢卖珠回，牵萝补茅屋。
摘花不插发，采柏动盈掬。
天寒翠袖薄，日暮倚修竹。




In [8]:
# Build the lookup tables between characters and their integer indices
words = sorted(set(text))
print("字和符号数量：{}".format(len(words)))

# index -> character first, then invert it for character -> index
idx_word = dict(enumerate(words))
word_idx = {ch: pos for pos, ch in idx_word.items()}

字和符号数量：2590


In [9]:
# Cut the corpus into fixed-length windows, each paired with the character
# that immediately follows it (the prediction target)
sample_maxlen = 40
n_windows = len(text) - sample_maxlen
sentences = [text[start:start + sample_maxlen] for start in range(n_windows)]
next_word = [text[start + sample_maxlen] for start in range(n_windows)]
print("样本数量：{}".format(len(sentences)))

样本数量：29365


In [10]:
# Turn the text windows into numeric tensors; this is simply a one-hot encoding.
import numpy as np

# X[i, t, k] is True when character t of window i is vocabulary entry k.
# y[i, k] is True when the character following window i is vocabulary entry k.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# Note: X stores len(sentences) * sample_maxlen * len(words) one-byte cells,
# roughly 3 GB with the counts printed by the earlier cells.
X = np.zeros((len(sentences), sample_maxlen, len(words)), dtype=bool)
y = np.zeros((len(sentences), len(words)), dtype=bool)

for i, (window, target) in enumerate(zip(sentences, next_word)):
    for t, w in enumerate(window):
        X[i, t, word_idx[w]] = True
    y[i, word_idx[target]] = True

In [11]:
# Build the network: one GRU layer feeding a softmax over the vocabulary
from keras.models import Sequential
from keras.layers import GRU, Dense
from keras.optimizers import Adam

model = Sequential([
    GRU(units=128, input_shape=(sample_maxlen, len(words))),
    Dense(units=len(words), activation='softmax'),
])

optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Training and saving are left disabled; a later cell loads the saved .h5 file instead.
# history = model.fit(X, y, batch_size=128,epochs=500)
# model.save("tangshi_generator_model.h5")

# Disabled plot of the training history (requires `history` from the fit call above).
# import pandas as pd
# import matplotlib.pyplot as plt
# pd.DataFrame(history.history).plot(figsize=(8, 5))
# plt.grid(True)
# plt.gca().set_ylim(0, 1) # set the vertical range to [0-1]
# plt.show()

In [12]:
def sampling(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature re-scaling.

    The probabilities are raised to the power 1/temperature (done in log
    space) and re-normalised, then a single multinomial draw picks the index.
    Temperatures below 1 sharpen the distribution; temperatures above 1
    flatten it.

    Args:
        preds: 1-D sequence of probabilities (e.g. a softmax output).
        temperature: re-scaling factor. A value of 0 selects the most
            probable index deterministically instead of dividing by zero.

    Returns:
        The sampled index, as returned by `np.argmax`.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the re-scaled distribution as temperature -> 0: pure argmax.
        return np.argmax(preds)
    # log(0) is expected for zero-probability entries; it yields -inf, which
    # maps back to exactly 0 after exp, so the warning is only noise.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. This leaves the normalised result
    # unchanged but keeps exp() from underflowing every entry to 0 (which
    # would make the division below 0/0) when the temperature is very small.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probs = np.random.multinomial(1, preds, 1)
    return np.argmax(probs)

# Load the previously saved model from disk. This rebinds `model`, replacing
# the network compiled in the earlier cell (whose fit call is commented out).
from keras.models import load_model
import random
model = load_model("tangshi_generator_model.h5")

def generate_tangshi(model, generate_len=200):
    """Generate text character by character, seeded from a random corpus window.

    A window of `sample_maxlen` characters is cut from `text` at a random
    offset and printed as the seed. On every step the current window is
    one-hot encoded, `model.predict` scores the vocabulary, `sampling` draws
    the next character at temperature 1, and the window slides forward by one.

    Args:
        model: object whose `predict` accepts an array of shape
            (1, sample_maxlen, len(words)) and returns a batch of score vectors.
        generate_len: number of characters to generate after the seed.

    Returns:
        The seed followed by the generated characters, as one string.
    """
    first = random.randint(0, len(text) - sample_maxlen - 1)
    window = text[first:first + sample_maxlen]
    print("随机选取的开始句子为：{}".format(window))
    pieces = [window]
    for _ in range(generate_len):
        # Fresh one-hot tensor for the current window
        x_pred = np.zeros((1, sample_maxlen, len(words)))
        for pos, ch in enumerate(window):
            x_pred[0, pos, word_idx[ch]] = 1
        scores = model.predict(x_pred)[0]
        ch_next = idx_word[sampling(scores, 1)]
        pieces.append(ch_next)
        # Drop the oldest character and append the new one
        window = window[1:] + ch_next
    return "".join(pieces)

# Generate 100 characters after a random seed window; the returned string is the cell result
generate_tangshi(model, 100)

随机选取的开始句子为：送客，枫叶荻花秋瑟瑟。
主人下马客在船，举酒欲饮无管弦。
醉不成欢惨将别，别时茫


'送客，枫叶荻花秋瑟瑟。\n主人下马客在船，举酒欲饮无管弦。\n醉不成欢惨将别，别时茫茫江浸月。\n忽闻水上琵琶声，主人忘归客不发。\n寻声暗问弹者谁，琵琶声停欲语迟。\n移船相近邀相见，添酒回灯重开宴。\n千呼万唤始出来，犹抱琵琶半遮面。\n转轴拨弦三两声，未成曲调先有情。\n弦弦掩抑声声思，似'

In [13]:
# Read the whole of test.txt; its leading characters seed generate_tangshi_test
with open('test.txt','r',encoding='utf-8') as f:
    test_text = f.read()
    
def generate_tangshi_test(model, generate_len=60):
    """Generate text character by character, seeded from the start of `test_text`.

    The first `sample_maxlen` characters of `test_text` are printed and used
    as the seed window. On every step the window is one-hot encoded,
    `model.predict` scores the vocabulary, `sampling` draws the next character
    at temperature 1, and the window advances by one character.

    Unlike a seed cut from the training corpus, the test text may contain
    characters that are not in `word_idx`; those positions are left as
    all-zero rows instead of raising KeyError.

    Args:
        model: object whose `predict` accepts an array of shape
            (1, sample_maxlen, len(words)) and returns a batch of score vectors.
        generate_len: number of characters to generate after the seed.

    Returns:
        The seed followed by the generated characters, as one string.
    """
    window = test_text[0:sample_maxlen]
    print("测试文本开始句子为：{}".format(window))
    pieces = [window]
    for _ in range(generate_len):
        x_pred = np.zeros((1, sample_maxlen, len(words)))
        for pos, ch in enumerate(window):
            k = word_idx.get(ch)
            if k is not None:  # skip characters outside the training vocabulary
                x_pred[0, pos, k] = 1
        scores = model.predict(x_pred)[0]
        ch_next = idx_word[sampling(scores, 1)]
        pieces.append(ch_next)
        # Keep the most recent sample_maxlen characters. For a full-length
        # window this equals window[1:] + ch_next; a seed shorter than
        # sample_maxlen is allowed to grow until it fills the window.
        window = (window + ch_next)[-sample_maxlen:]
    return "".join(pieces)

# Generate 100 characters from the test-file seed.
# NOTE(review): the saved output for this cell begins with the prompt string that
# generate_tangshi prints, not the one printed by generate_tangshi_test, so the
# output looks stale. Re-run the cell to confirm.
generate_tangshi_test(model, 100)

随机选取的开始句子为：住彼此无消息。
人生有情泪沾臆，江水江花岂终极？
黄昏胡骑尘满城，欲往城南望城北


'住彼此无消息。\n人生有情泪沾臆，江水江花岂终极？\n黄昏胡骑尘满城，欲往城南望城北。\n\n089杜甫：哀王孙\n\n长安城头头白乌，夜飞延秋门上呼。\n又向人家啄大屋，屋底达官走避胡。\n金鞭断折九马死，骨肉不待同驰驱。\n腰下宝［“决”换王旁］青珊瑚，可怜王孙泣路隅！\n问之不肯道姓名，但道困'