In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Report the TensorFlow and interpreter versions, followed by the version of
# every library used in this notebook.
print(tf.__version__)
print(sys.version_info)
libraries = (mpl, np, pd, sklearn, tf, keras)
for module in libraries:
    print(module.__name__, module.__version__)

2.2.0
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.2.1
numpy 1.18.5
pandas 1.0.4
sklearn 0.23.1
tensorflow 2.2.0
tensorflow.keras 2.3.0-tf


In [3]:
# Source: https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
# The file is expected to have been downloaded next to this notebook already.
input_filepath = "./shakespeare.txt"
# Read through a context manager so the file handle is closed as soon as the
# text is loaded, and pin the encoding so the result does not depend on the
# platform's default locale.
with open(input_filepath, 'r', encoding='utf-8') as text_file:
    text = text_file.read()

# Sanity check: corpus size in characters and its first 100 characters.
print(len(text))
print(text[0:100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# Plan:
# 1. generate vocab
# 2. build mapping char -> id
# 3. data -> id_data (convert the whole text to ids)
# 4. abcd -> bcd<eos> (next-character prediction: when the input is "a",
#    the expected output is "b")

# Deduplicate the characters of the corpus, then sort them.
unique_chars = set(text)
vocab = sorted(unique_chars)
print(len(vocab))
print(vocab)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
# Give every character an id: its position in the sorted vocab.
# Pairing vocab with 0..len(vocab)-1 yields the same char -> index mapping as
# enumerating the list.
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [6]:
# Convert vocab from a list to an ndarray. It serves as the id -> char lookup:
# later cells index it with arrays of ids to decode them back to characters.
idx2char = np.array(vocab)
print(idx2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [8]:
# Convert every character of the corpus to its id.
encoded_chars = [char2idx[c] for c in text]
text_as_int = np.array(encoded_chars)
print(text_as_int.shape)
print(len(text_as_int))
# The first ten ids next to the ten characters they encode.
print(text_as_int[:10])
print(text[:10])

(1115394,)
1115394
[18 47 56 57 58  1 15 47 58 47]
First Citi


In [14]:
# Split a sequence of ids into the model input and the expected output.
def split_input_target(id_text):
    """Return ``(input, target)``, where target is the input shifted by one.

    Example: abcde -> (abcd, bcde); the input is abcd and the output is bcde.
    """
    model_input = id_text[:-1]   # everything except the last element
    target = id_text[1:]         # everything except the first element
    return model_input, target

# Turn the id-encoded text into a dataset whose elements are single ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 100
# This batch() groups characters into "sentences" of seq_length + 1 ids each.
# The extra id is there because every sentence is later split into an input
# and a target that is shifted by one position. drop_remainder discards the
# final group when it is shorter than the others.
chars_per_sequence = seq_length + 1
seq_dataset = char_dataset.batch(chars_per_sequence, drop_remainder=True)

# Preview the first two characters of the corpus.
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])

# Every element of seq_dataset is one sentence of ids; show two of them
# together with their decoded text.
for seq_id in seq_dataset.take(2):
    print(seq_id)
    decoded_chars = idx2char[seq_id.numpy()]
    print(repr(''.join(decoded_chars)))

tf.Tensor(18, shape=(), dtype=int64) F
tf.Tensor(47, shape=(), dtype=int64) i
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int64)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int64)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [15]:
# Map split_input_target over seq_dataset to obtain (input, target) pairs.
# NOTE: this rebinds seq_dataset, so the cell is not idempotent - running it a
# second time would apply split_input_target to already-split pairs.
seq_dataset = seq_dataset.map(split_input_target)

# Preview two (input, target) pairs.
preview = seq_dataset.take(2)
for item_input, item_output in preview:
    print(item_input.numpy())
    print(item_output.numpy())
print(seq_dataset)

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1]
[56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1 58
 53  1 42

In [18]:
batch_size = 64
buffer_size = 10000
# This batch() forms the actual training batches (the earlier one only grouped
# characters into sentences). buffer_size is the number of elements the
# shuffle buffer draws from the dataset.
shuffled_sequences = seq_dataset.shuffle(buffer_size)
seq_dataset = shuffled_sequences.batch(batch_size, drop_remainder=True)
print(seq_dataset)

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>


In [19]:
# Model hyper-parameters.
vocab_size = len(vocab)  # one id per distinct character in the corpus
embedding_dim = 256  # the data is fairly small, so the dim can be set somewhat larger
rnn_units = 1024  # number of units in the recurrent layer

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> SimpleRNN -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding's
            input dimension and as the width of the output layer.
        embedding_dim: size of each character embedding vector.
        rnn_units: number of units in the SimpleRNN layer.
        batch_size: fixed batch size of the model input.

    Returns:
        An uncompiled keras Sequential model whose output has one value per
        vocabulary entry at every time step.
    """
    # The input shape is [batch_size, None]: the batch size is fixed while
    # the sequence length is left open.
    embedding = keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    recurrent = keras.layers.SimpleRNN(
        units=rnn_units,
        # return_sequences=True returns the whole output sequence, i.e. the
        # output of every time step rather than only the last one.
        return_sequences=True,
        # stateful=True reuses the final state of sample i in one batch as
        # the initial state of sample i in the next batch (until
        # reset_states() is called).
        stateful=True,
        recurrent_initializer='glorot_uniform')
    # Fully connected layer with vocab_size outputs: one score per character
    # of the vocabulary. No activation is applied, so these are raw logits.
    projection = keras.layers.Dense(vocab_size)
    return keras.models.Sequential([embedding, recurrent, projection])

# Build the training model with the hyper-parameters defined above and print
# its layer-by-layer summary.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Display the model's variables (name, shape, dtype and current values).
model.variables

[<tf.Variable 'embedding/embeddings:0' shape=(65, 256) dtype=float32, numpy=
 array([[ 0.01461799, -0.03977693, -0.04230822, ..., -0.01583923,
         -0.02077497,  0.01428571],
        [-0.00529009,  0.0111338 ,  0.02122502, ..., -0.04261643,
          0.04254115,  0.04439788],
        [ 0.04624132, -0.02808242,  0.01049563, ..., -0.00015792,
         -0.04482141, -0.04595819],
        ...,
        [-0.02983973,  0.03895411,  0.01342774, ...,  0.01684859,
          0.03617629,  0.04942316],
        [-0.00170964,  0.00859882,  0.04555153, ...,  0.01196524,
         -0.03625675,  0.0153789 ],
        [ 0.00301895, -0.03737794,  0.04797571, ...,  0.04297391,
          0.03179053, -0.02238963]], dtype=float32)>,
 <tf.Variable 'simple_rnn/simple_rnn_cell/kernel:0' shape=(256, 1024) dtype=float32, numpy=
 array([[ 0.01676732, -0.05598472,  0.0068666 , ..., -0.06483845,
          0.03262391, -0.02083575],
        [-0.03811319, -0.01477782, -0.00621065, ..., -0.0495558 ,
         -0.02096335

In [21]:
# Run a single batch through the untrained model to check the output shape.
first_batch = seq_dataset.take(1)
for input_example_batch, target_example_batch in first_batch:
    # Using the model like a function actually invokes the class's call method.
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

(64, 100, 65)


In [27]:
# Sampling strategies: greedy, random. Random sampling is used here.
# Logits are the values of a classification model before softmax has been
# applied. Only the first sample of the batch is fed in below.
# tf.random.categorical draws samples from a categorical distribution.
first_example_logits = example_batch_predictions[0]
print(first_example_logits[0])
sample_indices = tf.random.categorical(
    logits=first_example_logits, num_samples=1)
print(sample_indices)  # a tensor of shape (100, 1)
# (100, 65) -> (100, 1); squeeze removes the size-1 axis, leaving a vector of
# 100 ids.
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)

tf.Tensor(
[-4.2078055e-02  4.1275684e-02  3.1370195e-03 -8.0881137e-03
 -2.1282559e-02 -1.8037211e-02 -8.6421650e-03 -9.0390649e-03
 -1.6477457e-03  2.1680370e-02  1.1185886e-02  2.4152845e-03
 -1.3603989e-02  6.2687164e-03 -1.7555147e-02  3.6703121e-02
  7.0068697e-03  3.4238521e-03  3.6724196e-03  1.8432064e-02
 -5.1002633e-03 -2.8535241e-02  1.3469462e-02  4.4259755e-03
 -2.6338033e-03  3.7121256e-03 -8.7161567e-03  1.2343137e-02
 -3.7566673e-02  6.2624969e-02  4.8011363e-02  3.3326942e-02
  4.0204689e-02 -3.4551062e-02 -8.6261742e-03 -7.6738736e-03
 -9.0767425e-03  9.5303394e-03  2.6973680e-02  9.8239314e-03
 -1.0401940e-02 -1.9955432e-02  6.3198553e-03 -2.1352254e-02
  6.0630674e-03 -2.6769707e-02  9.3907872e-03 -4.4523865e-02
  2.4203200e-02  2.3874879e-02 -7.3043862e-04 -3.2102685e-02
 -4.6126842e-03  2.0446023e-02  5.8889017e-04  1.0843616e-02
 -3.9171753e-03  3.1115474e-02 -9.4053019e-03 -5.2452749e-03
 -1.7959600e-02  6.4359777e-02  7.2192401e-05 -6.8674713e-02
  2.5222789e-

In [31]:
# Decode the first example of the batch, its target and the ids sampled from
# the untrained model back into text. The tensors are converted with .numpy()
# before indexing idx2char, as the earlier cells do, instead of relying on
# NumPy accepting a TensorFlow tensor as an index.
input_text = "".join(idx2char[input_example_batch[0].numpy()])
target_text = "".join(idx2char[target_example_batch[0].numpy()])
predicted_text = "".join(idx2char[sample_indices.numpy()])

print("Input: ", repr(input_text))
print()
print("Output: ", repr(target_text))
print()
print("Predictions: ", repr(predicted_text))

Input:  'h his forces,\nAnd do expect him here some two hours hence.\n\nWARWICK:\nThen Clarence is at hand, I hea'

Output:  ' his forces,\nAnd do expect him here some two hours hence.\n\nWARWICK:\nThen Clarence is at hand, I hear'

Predictions:  "smu.u-nvitzlzISVfH3k,nwZIY'D:twal!;!l$wUcUDHK&MxmUCZsHkQlub\nfZMEShKaEFZG3OXTWHpOSxnk;WQyYD!iZDcjqcFK"


In [28]:
# from_logits states whether the predictions are expected to be logits.
# Logits are the direct output of the network, i.e. values that have not been
# turned into probabilities by sigmoid or softmax. By default
# (from_logits=False) the loss assumes the predictions already encode a
# probability distribution; the model here ends in a plain Dense layer, so
# from_logits=True is passed.
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    return keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)

# Evaluate the loss on the example batch before any training.
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
mean_example_loss = example_loss.numpy().mean()
print(mean_example_loss)  # mean loss of the example batch

(64, 100)
4.182874


In [None]:
# Directory in which the model checkpoints are saved.
output_dir = "./text_generation_checkpoints"
# makedirs(exist_ok=True) creates the directory when it is missing and is a
# no-op when it already exists, without the check-then-create race of
# os.path.exists followed by os.mkdir.
os.makedirs(output_dir, exist_ok=True)
# '{epoch}' is filled in by the callback, giving one checkpoint per epoch.
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    # Save only the weight values, not the whole model.
    save_weights_only = True)

epochs = 100
history = model.fit(seq_dataset, epochs = epochs,
                    callbacks = [checkpoint_callback])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Display the most recent checkpoint found in output_dir.
tf.train.latest_checkpoint(output_dir)

In [None]:
output_dir = "./text_generation_checkpoints"
# Rebuild the same architecture with a batch size of 1 for text generation.
model2 = build_model(vocab_size,
                     embedding_dim,
                     rnn_units,
                     batch_size = 1)
# tf.train.latest_checkpoint returns None when no checkpoint is found; fail
# with an explicit message instead of passing None on to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(output_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; train the model first.".format(output_dir))
model2.load_weights(latest_checkpoint)
# 1 means a single sample; None allows variable-length sequences.
model2.build(tf.TensorShape([1, None]))
# The text generation procedure is:
# start ch sequence A,
# A -> model -> b  (feeding A to the model yields b)
# A.append(b) -> B
# B(Ab) -> model -> c
# B.append(c) -> C
# C(Abc) -> model -> ...
model2.summary()

In [None]:
# Autoregressive text generation: sample one character at a time and feed it
# back into the model.
def generate_text(model, start_string, num_generate = 1000, temperature = 1.0):
    """Generate text by repeatedly sampling the next character from `model`.

    The model is first fed the whole `start_string`; after that only the
    newly sampled character is fed back. The generated prefix is not
    re-appended to the input because that would be less efficient: the
    recurrence s, x -> rnn -> s', y relies on the model carrying its state
    from one call to the next (the models in this notebook are built with
    stateful=True).

    Args:
        model: callable model mapping a [1, length] batch of character ids to
            predictions of shape [1, length, vocab_size]; must provide
            reset_states().
        start_string: non-empty seed text. Every character must be a key of
            char2idx.
        num_generate: number of characters to generate.
        temperature: positive divisor applied to the logits before sampling.
            The default of 1.0 leaves the model's distribution unchanged;
            smaller values make the output more predictable, larger values
            more random.

    Returns:
        `start_string` followed by the generated characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not
            positive.
        KeyError: if `start_string` contains a character missing from
            char2idx.
    """
    # Validate up front: an empty seed would give the model a zero-length
    # sequence and leave no last time step to sample from.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode the seed as a 1-D list of ids, then add a leading batch axis so
    # the input has shape [1, len(start_string)].
    input_eval = tf.expand_dims([char2idx[ch] for ch in start_string], 0)
    text_generated = []
    # Reset the model's recurrent state before starting a new generation.
    model.reset_states()

    for _ in range(num_generate):
        # 1. model inference -> predictions
        # 2. sample -> ch -> text_generated
        # 3. update input_eval

        # predictions: [batch_size, input_eval_len, vocab_size]
        predictions = model(input_eval)
        # squeeze removes the batch axis -> [input_eval_len, vocab_size]
        predictions = tf.squeeze(predictions, 0)
        # Only the last time step predicts the next character, so sample from
        # that row alone: [1, vocab_size] -> [1, 1] -> scalar id.
        last_step_logits = predictions[-1:, :] / temperature
        predicted_id = tf.random.categorical(
            last_step_logits, num_samples = 1)[0, 0].numpy()
        # Store the sampled character and use its id as the next input.
        text_generated.append(idx2char[predicted_id])
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

# Generate text with the restored model, seeded with "All: ", and print it.
new_text = generate_text(model2, start_string="All: ")
print(new_text)
