In [1]:
import tensorflow.keras as keras
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import os
import sys
import time
import tensorflow as tf

In [2]:
print(tf.__version__)
print("GPU : ", tf.test.is_gpu_available())
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.3.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU :  True
sys.version_info(major=3, minor=7, micro=11, releaselevel='final', serial=0)
matplotlib 3.4.3
numpy 1.18.5
pandas 1.3.2
sklearn 0.24.2
tensorflow 2.3.0
tensorflow.keras 2.4.0


### 1. 数据处理

#### 1.1 读取数据

In [3]:
text = open("./data/ptb.train.txt", "r").read()
print(len(text))
print(text[:100])

5101618
 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memote


#### 1.2 构建词表

In [4]:
# 构建词表
vocab = sorted(set(text))
print(len(vocab))
print(vocab)

50
['\n', ' ', '#', '$', '&', "'", '*', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '<', '>', 'N', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
# 添加 <unk>
vocab.append("unk")
id2char = np.array(vocab)
print(id2char)

['\n' ' ' '#' '$' '&' "'" '*' '-' '.' '/' '0' '1' '2' '3' '4' '5' '6' '7'
 '8' '9' '<' '>' 'N' '\\' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l'
 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' 'unk']


In [6]:
# 构建词表索引
char2index = {char: index for index, char in enumerate(vocab)}
# 添加新的 <unk> 词表
print(char2index)

{'\n': 0, ' ': 1, '#': 2, '$': 3, '&': 4, "'": 5, '*': 6, '-': 7, '.': 8, '/': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, '<': 20, '>': 21, 'N': 22, '\\': 23, 'a': 24, 'b': 25, 'c': 26, 'd': 27, 'e': 28, 'f': 29, 'g': 30, 'h': 31, 'i': 32, 'j': 33, 'k': 34, 'l': 35, 'm': 36, 'n': 37, 'o': 38, 'p': 39, 'q': 40, 'r': 41, 's': 42, 't': 43, 'u': 44, 'v': 45, 'w': 46, 'x': 47, 'y': 48, 'z': 49, 'unk': 50}


In [7]:
# 将字符转换为词表索引
text_as_int = np.array([char2index[c] for c in text])
print(text[:10])
print(text_as_int[:10])

 aer bankn
[ 1 24 28 41  1 25 24 37 34 37]


In [8]:
def split_input_target(id_index):
    """
        abcde -> abcd , bcde
    :param id_index:
    :return:
    """
    return id_index[:-1], id_index[1:]


# 将词表转换为数据集
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)  # text_as_int 是转换为索引的 text

# 对数据设置batch, 将字符集的dataset转换为句子级的dataset
seq_length = 100  # 设置句子长度
# 加一的原因是 split_input_target 返回的长度比输入的长度少1
# drop_remainder " 如果数据到最后无法组成一个 batch 时丢弃该数据
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)

# 打印数据
# char_dataset 每一个元素都是字符
for ch_id in char_dataset.take(2):  # 获取两个数据
    print(ch_id, id2char[ch_id.numpy()])

# seq_dataset ,每一个元素都是句子
for seq_id in seq_dataset.take(2):
    print(seq_id)
    print(repr(" ".join(id2char[seq_id.numpy()])))

tf.Tensor(1, shape=(), dtype=int32)  
tf.Tensor(24, shape=(), dtype=int32) a
tf.Tensor(
[ 1 24 28 41  1 25 24 37 34 37 38 43 28  1 25 28 41 35 32 43 49  1 26 24
 35 35 38 46 24 48  1 26 28 37 43 41 44 42 43  1 26 35 44 28 43 43  1 29
 41 38 36 42 43 28 32 37  1 30 32 43 24 37 38  1 30 44 43 28 41 36 24 37
  1 31 48 27 41 38  7 40 44 28 25 28 26  1 32 39 38  1 34 32 24  1 36 28
 36 38 43 28 26], shape=(101,), dtype=int32)
'  a e r   b a n k n o t e   b e r l i t z   c a l l o w a y   c e n t r u s t   c l u e t t   f r o m s t e i n   g i t a n o   g u t e r m a n   h y d r o - q u e b e c   i p o   k i a   m e m o t e c'
tf.Tensor(
[ 1 36 35 47  1 37 24 31 25  1 39 44 37 43 42  1 41 24 34 28  1 41 28 30
 24 43 43 24  1 41 44 25 28 37 42  1 42 32 36  1 42 37 24 26 34  7 29 38
 38 27  1 42 42 24 37 30 48 38 37 30  1 42 46 24 39 38  1 46 24 26 31 43
 28 41  1  0  1 39 32 28 41 41 28  1 20 44 37 34 21  1 22  1 48 28 24 41
 42  1 38 35 27], shape=(101,), dtype=int32)
'  m l x   n a h b   p 

In [9]:
# 使用 split_input_target 对句子进行处理
seq_dataset = seq_dataset.map(split_input_target)

for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy(), item_output.numpy())

[ 1 24 28 41  1 25 24 37 34 37 38 43 28  1 25 28 41 35 32 43 49  1 26 24
 35 35 38 46 24 48  1 26 28 37 43 41 44 42 43  1 26 35 44 28 43 43  1 29
 41 38 36 42 43 28 32 37  1 30 32 43 24 37 38  1 30 44 43 28 41 36 24 37
  1 31 48 27 41 38  7 40 44 28 25 28 26  1 32 39 38  1 34 32 24  1 36 28
 36 38 43 28] [24 28 41  1 25 24 37 34 37 38 43 28  1 25 28 41 35 32 43 49  1 26 24 35
 35 38 46 24 48  1 26 28 37 43 41 44 42 43  1 26 35 44 28 43 43  1 29 41
 38 36 42 43 28 32 37  1 30 32 43 24 37 38  1 30 44 43 28 41 36 24 37  1
 31 48 27 41 38  7 40 44 28 25 28 26  1 32 39 38  1 34 32 24  1 36 28 36
 38 43 28 26]
[ 1 36 35 47  1 37 24 31 25  1 39 44 37 43 42  1 41 24 34 28  1 41 28 30
 24 43 43 24  1 41 44 25 28 37 42  1 42 32 36  1 42 37 24 26 34  7 29 38
 38 27  1 42 42 24 37 30 48 38 37 30  1 42 46 24 39 38  1 46 24 26 31 43
 28 41  1  0  1 39 32 28 41 41 28  1 20 44 37 34 21  1 22  1 48 28 24 41
 42  1 38 35] [36 35 47  1 37 24 31 25  1 39 44 37 43 42  1 41 24 34 28  1 41 28 30 24
 43 43 24

In [10]:
batch_size = 64
buffer_size = 10000
seq_dataset = seq_dataset.shuffle(buffer_size).batch(batch_size=batch_size, drop_remainder=True)

### 2. 构建模型

In [11]:
# 词表大小
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024  # rnn 输入大小

In [12]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """
        构建模型
    :param vocab_size:
    :param embedding_dim:
    :param rnn_units:
    :param batch_size:
    :return:
    """
    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        keras.layers.SimpleRNN(units=rnn_units, return_sequences=True),
        keras.layers.Dense(vocab_size),
    ])
    return model

- 注意 : Cannot convert a symbolic Tensor (simple_rnn_5/strided_slice:0) to a numpy array
- 如果出现以上报错, 是numpy不兼容导致, 可以删除numpy重新下载 1.18.5版本
- tensorflow 2.3.0 requires 1.16.0<=numpy<1.19.0,

In [13]:
model = build_model(vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units,
                    batch_size=batch_size)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           13056     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 51)            52275     
Total params: 1,377,075
Trainable params: 1,377,075
Non-trainable params: 0
_________________________________________________________________


In [14]:
# 可以直接吧model当做函数来用, 直接输出 (batch_size, seq_length, vocab_size)
# 输出的相当于概率分布
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)
    print(example_batch_predictions[:2])

(64, 100, 51)
tf.Tensor(
[[[-1.21233929e-02  2.45017782e-02 -4.84458171e-03 ... -3.48218679e-02
   -9.49429441e-03 -1.18255494e-02]
  [-4.64975499e-02  3.83330844e-02  6.68234006e-03 ...  2.86842994e-02
    3.79523262e-04 -7.01316306e-03]
  [-4.76897471e-02  2.53242105e-02 -2.27514021e-02 ...  5.49932197e-02
   -4.40298542e-02  4.44192588e-02]
  ...
  [ 1.12278201e-01  2.07473427e-01  9.15031135e-02 ...  4.53041419e-02
   -2.88752645e-01  1.07287735e-01]
  [ 1.34028643e-01 -3.34416211e-01 -1.09147966e-01 ...  5.76466918e-02
   -3.04285660e-02  2.71576166e-01]
  [-1.60031885e-01  1.39257520e-01  1.28557354e-01 ... -2.11957917e-01
    1.84943303e-01 -5.75979464e-02]]

 [[-4.07187361e-03  3.08551490e-02  1.35564189e-02 ... -9.01789032e-03
    1.92681439e-02 -3.65005340e-03]
  [-3.84491496e-03  6.42496794e-02  3.03850211e-02 ...  5.58115244e-02
   -6.27528578e-02 -1.30651481e-02]
  [ 1.99169554e-02  2.51578130e-02 -2.91016027e-02 ...  1.28472634e-02
   -6.59139603e-02  2.40724683e-02]
  ..

In [15]:
# 1. 随机采样
# 2. 贪心
# 3. logits : 在计算分类问题总, softmax 之前的值就是logits
# num_samples 指定产生多个序列
sample_indices = tf.random.categorical(logits=example_batch_predictions[0], num_samples=1)
print(sample_indices)
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)

tf.Tensor(
[[ 1]
 [42]
 [38]
 [19]
 [44]
 [35]
 [15]
 [46]
 [50]
 [14]
 [19]
 [20]
 [19]
 [ 2]
 [ 3]
 [ 7]
 [ 1]
 [48]
 [ 9]
 [46]
 [18]
 [ 6]
 [ 4]
 [ 2]
 [31]
 [49]
 [ 3]
 [16]
 [45]
 [35]
 [16]
 [ 7]
 [15]
 [32]
 [ 7]
 [26]
 [34]
 [42]
 [41]
 [50]
 [50]
 [15]
 [33]
 [19]
 [35]
 [ 9]
 [10]
 [37]
 [50]
 [26]
 [35]
 [27]
 [ 6]
 [31]
 [33]
 [ 9]
 [ 1]
 [ 4]
 [15]
 [15]
 [ 5]
 [38]
 [ 8]
 [ 5]
 [26]
 [50]
 [10]
 [ 8]
 [ 5]
 [ 7]
 [11]
 [44]
 [24]
 [30]
 [10]
 [17]
 [29]
 [34]
 [31]
 [11]
 [ 4]
 [13]
 [11]
 [37]
 [10]
 [19]
 [10]
 [ 8]
 [48]
 [36]
 [33]
 [ 8]
 [21]
 [20]
 [ 2]
 [45]
 [34]
 [42]
 [ 6]
 [21]], shape=(100, 1), dtype=int64)
tf.Tensor(
[ 1 42 38 19 44 35 15 46 50 14 19 20 19  2  3  7  1 48  9 46 18  6  4  2
 31 49  3 16 45 35 16  7 15 32  7 26 34 42 41 50 50 15 33 19 35  9 10 37
 50 26 35 27  6 31 33  9  1  4 15 15  5 38  8  5 26 50 10  8  5  7 11 44
 24 30 10 17 29 34 31 11  4 13 11 37 10 19 10  8 48 36 33  8 21 20  2 45
 34 42  6 21], shape=(100,), dtype=int64)


In [16]:
# 打印输入和输出
print("Input ", repr(" ".join(id2char[input_example_batch[0]])))
print("Output ", repr(" ".join(id2char[target_example_batch[0]])))
print("Prediction  ", repr(" ".join(id2char[sample_indices])))

Input  '  t h e   o f f e r   w h i c h   v a l u e s   t h e   w h o l e   o f   c o a t e s   a t   #   N   m i l l i o n   h a s   a l r e a d y   b e e n   a c c e p t e d   b y   c o a t e s   e x e c u'
Output  't h e   o f f e r   w h i c h   v a l u e s   t h e   w h o l e   o f   c o a t e s   a t   #   N   m i l l i o n   h a s   a l r e a d y   b e e n   a c c e p t e d   b y   c o a t e s   e x e c u t'
Prediction   "  s o 9 u l 5 w unk 4 9 < 9 # $ -   y / w 8 * & # h z $ 6 v l 6 - 5 i - c k s r unk unk 5 j 9 l / 0 n unk c l d * h j /   & 5 5 ' o . ' c unk 0 . ' - 1 u a g 0 7 f k h 1 & 3 1 n 0 9 0 . y m j . > < # v k s * >"


### 3. 定义损失函数

In [17]:
def loss(labels, logits):
    return keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True # 因为预测值是 logits , 所以需要设置为 true
    )

In [18]:
model.compile(optimizer="adam", loss=loss)

# 计算loss
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())

(64, 100)
3.9476264


### 4. 训练模型

In [None]:
# 定义文件夹保存模型
output_dir = "./text_generation_checkpoints"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
# 定义回调函数
callbacks = [
    keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True),
    keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3)
]

epochs = 40
history = model.fit(seq_dataset, epochs=epochs, callbacks=callbacks)

Epoch 1/40

In [21]:
history.history

{'loss': [1.7963497638702393,
  1.2682101726531982,
  1.1677029132843018,
  1.1214680671691895,
  1.0925670862197876,
  1.0723893642425537,
  1.0574251413345337,
  1.0453972816467285,
  1.0356159210205078,
  1.0284011363983154,
  1.0222655534744263,
  1.0172481536865234,
  1.0133591890335083,
  1.0100384950637817,
  1.0072368383407593,
  1.0053337812423706,
  1.0036914348602295,
  1.0030094385147095,
  1.0029821395874023,
  1.002568006515503,
  1.0027647018432617,
  1.0036383867263794,
  1.0043693780899048,
  1.0054091215133667,
  1.0073111057281494,
  1.008745551109314,
  1.0116959810256958,
  1.0137498378753662,
  1.0165047645568848,
  1.0189013481140137,
  1.0228221416473389,
  1.0262104272842407,
  1.031156301498413,
  1.0362390279769897,
  1.04008948802948,
  1.0501790046691895,
  1.0583056211471558,
  1.0672812461853027,
  1.0777320861816406,
  1.1291507482528687]}

In [24]:
# 计算困惑度
loss =  np.array(history.history['loss'])

# 计算困惑度 : exp(loss.mean())
print("当前模型的困惑度 : ", np.exp(loss.mean()))

当前模型的困惑度 :  2.886769057926001


### 5. 模型载入

In [20]:
tf.train.latest_checkpoint(output_dir)

'./text_generation_checkpoints\\ckpt_40'

In [None]:
new_model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
# 载入权重
new_model.load_weights(tf.train.latest_checkpoint(output_dir))
# None 指的是输入可以是变长的序列
new_model.build(tf.TensorShape([1, None]))