In [3]:
import tensorflow.keras as keras
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
% matplotlib inline
import numpy as np
import sklearn
import os
import sys
import time
import tensorflow as tf

In [4]:
# Record the interpreter and library versions this notebook was run with.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

2.3.0
sys.version_info(major=3, minor=7, micro=11, releaselevel='final', serial=0)
matplotlib 3.4.2
numpy 1.18.5
pandas 1.3.3
sklearn 1.0
tensorflow 2.3.0
tensorflow.keras 2.4.0


### 1. 数据处理

#### 1.1 读取数据

In [5]:
# Read the whole training corpus into memory as one string.
# The context manager guarantees the file handle is closed, and the explicit
# encoding keeps decoding independent of the platform default.
with open("./data/ptb.train.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
print(len(text))
print(text[:100])

5101618
 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memote


#### 1.2 构建词表

In [6]:
# Build the character-level vocabulary: every distinct character that occurs
# in the corpus, in sorted order so that ids are deterministic.
unique_chars = set(text)
vocab = sorted(unique_chars)
print(len(vocab))
print(vocab)

50
['\n', ' ', '#', '$', '&', "'", '*', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '<', '>', 'N', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [7]:
# Append a placeholder token for unknown characters. The literal token is
# "unk" (no angle brackets). The membership check keeps this cell idempotent:
# re-running it must not append a second "unk" and silently grow the vocabulary.
if "unk" not in vocab:
    vocab.append("unk")
# id -> character lookup table; the position in the array is the id.
id2char = np.array(vocab)
print(id2char)

['\n' ' ' '#' '$' '&' "'" '*' '-' '.' '/' '0' '1' '2' '3' '4' '5' '6' '7'
 '8' '9' '<' '>' 'N' '\\' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l'
 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' 'unk']


In [8]:
# Character -> id lookup: each vocabulary entry maps to its position in `vocab`.
char2index = dict(zip(vocab, range(len(vocab))))
print(char2index)

{'\n': 0, ' ': 1, '#': 2, '$': 3, '&': 4, "'": 5, '*': 6, '-': 7, '.': 8, '/': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, '<': 20, '>': 21, 'N': 22, '\\': 23, 'a': 24, 'b': 25, 'c': 26, 'd': 27, 'e': 28, 'f': 29, 'g': 30, 'h': 31, 'i': 32, 'j': 33, 'k': 34, 'l': 35, 'm': 36, 'n': 37, 'o': 38, 'p': 39, 'q': 40, 'r': 41, 's': 42, 't': 43, 'u': 44, 'v': 45, 'w': 46, 'x': 47, 'y': 48, 'z': 49, 'unk': 50}


In [9]:
# Encode the corpus as an array of vocabulary ids, one id per character.
text_as_int = np.array(list(map(char2index.__getitem__, text)))
# Show the first ten characters next to their ids as a sanity check.
print(text[:10])
print(text_as_int[:10])

 aer bankn
[ 1 24 28 41  1 25 24 37 34 37]


In [10]:
def split_input_target(id_index):
    """Split one sequence into an (input, target) pair shifted by one step.

    Example: ``abcde`` -> (``abcd``, ``bcde``).

    :param id_index: sliceable sequence (here, a chunk of character ids)
    :return: tuple ``(inputs, targets)`` where ``inputs`` is everything except
        the last element and ``targets`` is everything except the first
    """
    inputs = id_index[:-1]
    targets = id_index[1:]
    return inputs, targets


# Character-level dataset: every element is a single character id taken from
# text_as_int (the corpus encoded as vocabulary ids).
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Group consecutive characters into fixed-size chunks, turning the
# character-level dataset into a sequence-level one.
seq_length = 100  # length of each model input sequence
# Chunks hold seq_length + 1 ids because split_input_target returns an input
# and a target that are each one element shorter than the chunk.
# drop_remainder=True discards a final chunk that cannot be filled completely.
chunk_length = seq_length + 1
seq_dataset = char_dataset.batch(chunk_length, drop_remainder=True)

# Inspect the data.
# Each element of char_dataset is a single character id.
for ch_id in char_dataset.take(2):  # first two elements only
    print(ch_id, id2char[ch_id.numpy()])

# Each element of seq_dataset is a whole chunk of ids.
for seq_id in seq_dataset.take(2):
    print(seq_id)
    seq_chars = id2char[seq_id.numpy()]
    print(repr(" ".join(seq_chars)))

tf.Tensor(1, shape=(), dtype=int32)  
tf.Tensor(24, shape=(), dtype=int32) a
tf.Tensor(
[ 1 24 28 41  1 25 24 37 34 37 38 43 28  1 25 28 41 35 32 43 49  1 26 24
 35 35 38 46 24 48  1 26 28 37 43 41 44 42 43  1 26 35 44 28 43 43  1 29
 41 38 36 42 43 28 32 37  1 30 32 43 24 37 38  1 30 44 43 28 41 36 24 37
  1 31 48 27 41 38  7 40 44 28 25 28 26  1 32 39 38  1 34 32 24  1 36 28
 36 38 43 28 26], shape=(101,), dtype=int32)
'  a e r   b a n k n o t e   b e r l i t z   c a l l o w a y   c e n t r u s t   c l u e t t   f r o m s t e i n   g i t a n o   g u t e r m a n   h y d r o - q u e b e c   i p o   k i a   m e m o t e c'
tf.Tensor(
[ 1 36 35 47  1 37 24 31 25  1 39 44 37 43 42  1 41 24 34 28  1 41 28 30
 24 43 43 24  1 41 44 25 28 37 42  1 42 32 36  1 42 37 24 26 34  7 29 38
 38 27  1 42 42 24 37 30 48 38 37 30  1 42 46 24 39 38  1 46 24 26 31 43
 28 41  1  0  1 39 32 28 41 41 28  1 20 44 37 34 21  1 22  1 48 28 24 41
 42  1 38 35 27], shape=(101,), dtype=int32)
'  m l x   n a h b   p 

In [11]:
# Convert every chunk into an (input, target) pair via split_input_target.
# NOTE: this rebinds seq_dataset, so the cell is not idempotent; rebuild
# seq_dataset from char_dataset before executing this cell a second time.
seq_dataset = seq_dataset.map(map_func=split_input_target)

# Show the first two (input, target) pairs.
for example_pair in seq_dataset.take(2):
    item_input, item_output = example_pair
    print(item_input.numpy(), item_output.numpy())

[ 1 24 28 41  1 25 24 37 34 37 38 43 28  1 25 28 41 35 32 43 49  1 26 24
 35 35 38 46 24 48  1 26 28 37 43 41 44 42 43  1 26 35 44 28 43 43  1 29
 41 38 36 42 43 28 32 37  1 30 32 43 24 37 38  1 30 44 43 28 41 36 24 37
  1 31 48 27 41 38  7 40 44 28 25 28 26  1 32 39 38  1 34 32 24  1 36 28
 36 38 43 28] [24 28 41  1 25 24 37 34 37 38 43 28  1 25 28 41 35 32 43 49  1 26 24 35
 35 38 46 24 48  1 26 28 37 43 41 44 42 43  1 26 35 44 28 43 43  1 29 41
 38 36 42 43 28 32 37  1 30 32 43 24 37 38  1 30 44 43 28 41 36 24 37  1
 31 48 27 41 38  7 40 44 28 25 28 26  1 32 39 38  1 34 32 24  1 36 28 36
 38 43 28 26]
[ 1 36 35 47  1 37 24 31 25  1 39 44 37 43 42  1 41 24 34 28  1 41 28 30
 24 43 43 24  1 41 44 25 28 37 42  1 42 32 36  1 42 37 24 26 34  7 29 38
 38 27  1 42 42 24 37 30 48 38 37 30  1 42 46 24 39 38  1 46 24 26 31 43
 28 41  1  0  1 39 32 28 41 41 28  1 20 44 37 34 21  1 22  1 48 28 24 41
 42  1 38 35] [36 35 47  1 37 24 31 25  1 39 44 37 43 42  1 41 24 34 28  1 41 28 30 24
 43 43 24

In [12]:
batch_size = 64  # number of sequences per training batch
buffer_size = 10000  # size of the shuffle buffer
# Shuffle the (input, target) pairs, then group them into full batches only.
# NOTE: this rebinds seq_dataset, so re-running the cell batches it again.
seq_dataset = (
    seq_dataset
    .shuffle(buffer_size)
    .batch(batch_size=batch_size, drop_remainder=True)
)

### 2. 构建模型

In [13]:
# Vocabulary size: the number of entries in `vocab`.
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024  # number of units in the RNN layer (passed as `units`), not its input size

In [14]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level language model.

    The network is Embedding -> SimpleRNN -> Dense. The Dense layer has no
    activation, so the model emits one raw score per vocabulary entry for
    every time step.

    :param vocab_size: number of ids in the vocabulary; used as both the
        embedding input dimension and the Dense output width
    :param embedding_dim: width of each embedding vector
    :param rnn_units: number of units in the SimpleRNN layer
    :param batch_size: fixed batch dimension of the model input; the sequence
        dimension is left as None
    :return: an uncompiled ``keras.models.Sequential`` model
    """
    embedding = keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None]
    )
    # return_sequences=True yields an output for every time step, not only the last.
    recurrent = keras.layers.SimpleRNN(units=rnn_units, return_sequences=True)
    projection = keras.layers.Dense(vocab_size)
    return keras.models.Sequential([embedding, recurrent, projection])

- 注意：如果出现报错 `Cannot convert a symbolic Tensor (simple_rnn_5/strided_slice:0) to a numpy array`，
- 原因是 numpy 版本与 TensorFlow 不兼容，可以卸载 numpy 后重新安装 1.18.5 版本。
- tensorflow 2.3.0 要求 1.16.0 <= numpy < 1.19.0。

In [16]:
# Instantiate the network with the hyper-parameters defined above and show
# its layers, output shapes and parameter counts.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 256)           13056     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (64, None, 1024)          1311744   
_________________________________________________________________
dense_1 (Dense)              (64, None, 51)            52275     
Total params: 1,377,075
Trainable params: 1,377,075
Non-trainable params: 0
_________________________________________________________________


In [20]:
# A Keras model can be called like a function. For one batch it returns a
# tensor of shape (batch_size, seq_length, vocab_size).
# These values are logits (unnormalized scores), not probabilities.
input_example_batch, target_example_batch = next(iter(seq_dataset.take(1)))
example_batch_predictions = model(input_example_batch)
print(example_batch_predictions.shape)
print(example_batch_predictions[:2])

(64, 100, 51)
tf.Tensor(
[[[ 1.01326741e-02 -4.24239598e-02 -3.46165262e-02 ...  6.72607962e-03
    7.18212267e-03 -1.71390101e-02]
  [ 1.63018480e-02 -5.63163087e-02  4.08628136e-02 ...  3.66522111e-02
   -4.00835164e-02 -2.56150290e-02]
  [-4.51371539e-03 -1.34219415e-02  5.82773685e-02 ...  6.69930875e-02
   -3.11735608e-02  6.81254640e-02]
  ...
  [-6.03021532e-02 -4.67936061e-02 -1.62060633e-01 ...  2.74062306e-01
   -5.31082526e-02 -4.72390652e-02]
  [-3.61921079e-02  2.10722134e-01  3.67143005e-03 ...  2.85651833e-02
    1.89557046e-01 -6.77676722e-02]
  [-7.47772530e-02  5.70588671e-02 -4.16901819e-02 ... -1.79651141e-01
   -6.75115585e-02  8.55469629e-02]]

 [[-6.20439053e-02  3.82665545e-02  7.30050653e-02 ... -1.77581403e-02
   -1.16553176e-02 -8.24929215e-03]
  [-2.47679185e-04 -1.44483494e-02 -2.65385509e-02 ...  5.31017259e-02
    2.15319078e-02  1.83974132e-02]
  [ 5.23698628e-02 -5.71615957e-02 -5.80306984e-02 ...  6.97944239e-02
    2.71769222e-02 -1.25371153e-05]
  ..

In [23]:
# Two ways to pick the next character from the model output:
#   1. random sampling (used here)
#   2. greedy selection
# Logits are the values a classifier produces before softmax.
# num_samples is the number of samples drawn for each row of logits.
first_example_logits = example_batch_predictions[0]
sample_indices = tf.random.categorical(logits=first_example_logits, num_samples=1)
print(sample_indices)
# Drop the trailing axis of size 1 so that there is one id per time step.
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)

tf.Tensor(
[[26]
 [24]
 [11]
 [15]
 [31]
 [23]
 [ 8]
 [44]
 [30]
 [12]
 [ 9]
 [31]
 [48]
 [35]
 [36]
 [28]
 [16]
 [29]
 [27]
 [15]
 [47]
 [32]
 [10]
 [37]
 [28]
 [24]
 [47]
 [21]
 [29]
 [19]
 [38]
 [50]
 [38]
 [41]
 [22]
 [36]
 [41]
 [46]
 [ 3]
 [48]
 [ 1]
 [ 6]
 [23]
 [28]
 [49]
 [11]
 [ 4]
 [ 0]
 [28]
 [36]
 [ 6]
 [41]
 [13]
 [36]
 [27]
 [24]
 [39]
 [19]
 [ 4]
 [ 9]
 [22]
 [25]
 [ 3]
 [35]
 [43]
 [44]
 [33]
 [48]
 [ 9]
 [ 0]
 [ 8]
 [ 4]
 [26]
 [46]
 [22]
 [50]
 [42]
 [36]
 [47]
 [44]
 [23]
 [16]
 [ 6]
 [38]
 [ 7]
 [20]
 [11]
 [36]
 [35]
 [22]
 [15]
 [35]
 [27]
 [19]
 [ 9]
 [12]
 [26]
 [18]
 [38]
 [12]], shape=(100, 1), dtype=int64)
tf.Tensor(
[26 24 11 15 31 23  8 44 30 12  9 31 48 35 36 28 16 29 27 15 47 32 10 37
 28 24 47 21 29 19 38 50 38 41 22 36 41 46  3 48  1  6 23 28 49 11  4  0
 28 36  6 41 13 36 27 24 39 19  4  9 22 25  3 35 43 44 33 48  9  0  8  4
 26 46 22 50 42 36 47 44 23 16  6 38  7 20 11 36 35 22 15 35 27 19  9 12
 26 18 38 12], shape=(100,), dtype=int64)


In [24]:
# Decode the first example of the batch back to characters and show the
# input, the expected target and the sampled prediction side by side.
# Tensors are converted with .numpy() before indexing id2char, matching the
# dataset-inspection cell, instead of relying on implicit Tensor -> ndarray
# conversion inside NumPy indexing.
input_ids = input_example_batch[0].numpy()
target_ids = target_example_batch[0].numpy()
sampled_ids = sample_indices.numpy()
print("Input ", repr(" ".join(id2char[input_ids])))
print("Output ", repr(" ".join(id2char[target_ids])))
print("Prediction  ", repr(" ".join(id2char[sampled_ids])))

Input  't e d   t o   b e g i n   i n   m a r c h   \n   u . s .   l a w   r e q u i r e s   c r i m i n a l   d e f e n d a n t s   t o   t u r n   o v e r   f o r e i g n   d o c u m e n t s   s u c h   a s'
Output  'e d   t o   b e g i n   i n   m a r c h   \n   u . s .   l a w   r e q u i r e s   c r i m i n a l   d e f e n d a n t s   t o   t u r n   o v e r   f o r e i g n   d o c u m e n t s   s u c h   a s  '
Prediction   'c a 1 5 h \\ . u g 2 / h y l m e 6 f d 5 x i 0 n e a x > f 9 o unk o r N m r w $ y   * \\ e z 1 & \n e m * r 3 m d a p 9 & / N b $ l t u j y / \n . & c w N unk s m x u \\ 6 * o - < 1 m l N 5 l d 9 / 2 c 8 o 2'


### 3. 定义损失函数

In [25]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed directly on logits.

    :param labels: integer class ids (the target character ids)
    :param logits: unnormalized scores predicted for each class
    :return: the value returned by
        ``keras.losses.sparse_categorical_crossentropy``
    """
    # from_logits=True because the predictions are raw logits, not probabilities.
    per_element_loss = keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True
    )
    return per_element_loss

In [27]:
# Attach the optimizer and the custom loss to the model.
model.compile(optimizer="adam", loss=loss)

# Sanity check: evaluate the loss on the example batch before any training.
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
mean_example_loss = example_loss.numpy().mean()
print(mean_example_loss)

(64, 100)
3.9577596


### 4. 训练模型

In [None]:
# Directory where the training checkpoints are written.
output_dir = "./text_generation_checkpoints"

# exist_ok=True creates the directory only when it is missing, without the
# separate existence check (and the race window that check leaves open).
os.makedirs(output_dir, exist_ok=True)

# "{epoch}" is left as a placeholder in the checkpoint file name.
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
# Callback that saves the model weights (not the full model) during training.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, save_weights_only=True
)

epochs = 5
history = model.fit(seq_dataset, epochs=epochs, callbacks=[checkpoint_callback])

Epoch 1/5
 31/789 [>.............................] - ETA: 16:13 - loss: 2.4958