In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Install TensorFlow
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
import matplotlib.pylab as plt
import numpy as np
import tensorflow.keras as keras
import os

In [4]:
# Source corpus:
# https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
input_filepath = "./shakespeare.txt"

# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes, and the explicit encoding keeps the
# decoded text independent of the platform's default locale.
with open(input_filepath, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Quick sanity check: total number of characters and the first 100 of them.
print(len(text))
print(text[0:100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


步骤：  
1. 生成词表
2. 建立映射 char -> id
3. 数据转成id data -> id
4. 定义输入输出 abcde -> 输入 abcd, 目标 bcde（目标是输入右移一个字符）

In [5]:
# Build the vocabulary: set() collects the distinct characters of the corpus
# (unordered, no duplicates); sorting gives every character a stable position.
vocab = list(set(text))
vocab.sort()
print(len(vocab))
print(vocab)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
# Map each vocabulary character to its integer id (its position in vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [7]:
# Reverse lookup table: position i holds the character whose id is i.
idx2char = np.asarray(vocab)
print(idx2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [8]:
# Encode the whole corpus as an array of character ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
# Show the first 20 ids next to the 20 characters they encode.
print(text_as_int[:20])
print(text[:20])

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56]
First Citizen:
Befor


In [9]:
def split_input_target(id_text):
    """
    Split one sequence into an (input, target) pair shifted by one position.

    The input is the sequence without its last element and the target is the
    sequence without its first element, e.g. abcde -> (abcd, bcde).
    """
    inputs = id_text[:-1]
    targets = id_text[1:]
    return inputs, targets

# Dataset that yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Group the ids into chunks of seq_length + 1 elements.
seq_length = 100
# drop_remainder=True discards the final chunk when it is too short.
seq_dataset = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

# Peek at the first two character ids and the characters they stand for.
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])

# Peek at the first two chunks, first as raw ids and then as decoded text.
for seq_id in seq_dataset.take(2):
    print(seq_id.numpy(), repr(''.join(idx2char[seq_id.numpy()])), sep='\n')

tf.Tensor(18, shape=(), dtype=int32) F
tf.Tensor(47, shape=(), dtype=int32) i
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1]
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49]
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [10]:
# Turn every chunk into an (input, target) pair via split_input_target.
# NOTE: this rebinds seq_dataset, so the cell is meant to be run only once
# per kernel session.
seq_dataset = seq_dataset.map(map_func=split_input_target)

# Show the first two pairs: input ids first, then the target ids.
for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy(), item_output.numpy(), sep='\n')

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1]
[56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1 58
 53  1 42

In [11]:
# Number of sequences per training batch.
batch_size = 64
# Size of the buffer that shuffle() draws elements from.
buffer_size = 10000

# Shuffle the (input, target) pairs, then group them into fixed-size batches;
# an incomplete final batch is dropped.
seq_dataset = seq_dataset.shuffle(buffer_size)
seq_dataset = seq_dataset.batch(batch_size, drop_remainder=True)

In [12]:
# Model hyperparameters passed to build_model below.
vocab_size = len(vocab)    # number of distinct characters in the corpus
embedding_dim = 256        # size of each character embedding vector
rnn_units = 1024           # number of units in the LSTM layer

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """
    Assemble the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding's
            input size and as the width of the final Dense layer.
        embedding_dim: size of each character embedding vector.
        rnn_units: number of units in the LSTM layer.
        batch_size: fixed batch size baked into the input shape
            ([batch_size, None]); the sequence length is left unspecified.

    Returns:
        An uncompiled keras.Sequential model. The Dense layer is created
        without an activation argument.
    """
    embedding = keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    lstm = keras.layers.LSTM(
        units=rnn_units,
        # Emit the output of every time step, not only the last one.
        return_sequences=True,
        # When True, the final state of sample i in a batch is used as the
        # initial state of sample i in the next batch.
        stateful=True,
        # Initializer for recurrent_kernel; the Keras default is 'orthogonal'.
        recurrent_initializer='glorot_uniform',
    )
    dense = keras.layers.Dense(vocab_size)
    return keras.Sequential([embedding, lstm, dense])

# Build the training model with the training batch size and list its layers.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Run a single batch through the untrained model to inspect the output shape.
# input_example_batch, target_example_batch and example_batch_predictions stay
# bound after the loop and are reused by the cells that follow.
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

(64, 100, 65)


In [12]:
# Random sampling: draw one character id per time step from the logits of the
# first sequence in the example batch.
# (100, 65) -> (100, 1)
sample_indices = tf.random.categorical(example_batch_predictions[0], 1)
print(sample_indices)
# Drop the trailing axis: (100, 1) -> (100,)
sample_indices = tf.squeeze(sample_indices, -1)
print(sample_indices)

tf.Tensor(
[[31]
 [56]
 [47]
 [23]
 [41]
 [59]
 [17]
 [62]
 [25]
 [22]
 [ 5]
 [62]
 [ 3]
 [28]
 [14]
 [38]
 [34]
 [ 7]
 [42]
 [14]
 [40]
 [20]
 [45]
 [14]
 [26]
 [27]
 [60]
 [30]
 [24]
 [ 6]
 [42]
 [41]
 [17]
 [64]
 [24]
 [41]
 [44]
 [13]
 [47]
 [36]
 [25]
 [ 3]
 [39]
 [47]
 [24]
 [12]
 [ 5]
 [27]
 [ 8]
 [32]
 [29]
 [26]
 [37]
 [59]
 [34]
 [47]
 [15]
 [52]
 [60]
 [31]
 [26]
 [15]
 [17]
 [30]
 [ 6]
 [28]
 [56]
 [49]
 [12]
 [42]
 [49]
 [14]
 [32]
 [45]
 [14]
 [58]
 [55]
 [27]
 [26]
 [29]
 [23]
 [41]
 [19]
 [48]
 [64]
 [50]
 [35]
 [48]
 [44]
 [40]
 [12]
 [25]
 [ 7]
 [51]
 [22]
 [ 9]
 [12]
 [31]
 [44]
 [44]], shape=(100, 1), dtype=int64)
tf.Tensor(
[31 56 47 23 41 59 17 62 25 22  5 62  3 28 14 38 34  7 42 14 40 20 45 14
 26 27 60 30 24  6 42 41 17 64 24 41 44 13 47 36 25  3 39 47 24 12  5 27
  8 32 29 26 37 59 34 47 15 52 60 31 26 15 17 30  6 28 56 49 12 42 49 14
 32 45 14 58 55 27 26 29 23 41 19 48 64 50 35 48 44 40 12 25  7 51 22  9
 12 31 44 44], shape=(100,), dtype=int64)


In [13]:
# Output demo: decode the first example's input, its target, and the
# characters sampled from the model's predictions, separated by blank lines.
print("Input: ", repr("".join(idx2char[input_example_batch[0]])), end="\n\n")
print("Output: ", repr("".join(idx2char[target_example_batch[0]])), end="\n\n")
print("Predictions: ", repr("".join(idx2char[sample_indices])))

Input:  'broad?\n\nLADY CAPULET:\nThe people in the street cry Romeo,\nSome Juliet, and some Paris; and all run,\n'

Output:  'road?\n\nLADY CAPULET:\nThe people in the street cry Romeo,\nSome Juliet, and some Paris; and all run,\nW'

Predictions:  "SriKcuExMJ'x$PBZV-dBbHgBNOvRL,dcEzLcfAiXM$aiL?'O.TQNYuViCnvSNCER,Prk?dkBTgBtqONQKcGjzlWjfb?M-mJ3?Sff"


In [18]:
# Custom loss function
def loss(labels, logits):
    """
    Sparse categorical cross-entropy between integer labels and raw logits.

    from_logits=True tells Keras that `logits` have not been passed through
    a softmax.
    """
    return keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)

model.compile(loss=loss, optimizer='adam')

# Evaluate the loss on the example batch: print the shape of the per-element
# loss tensor and then its mean.
example_loss = loss(labels=target_example_batch, logits=example_batch_predictions)
print(example_loss.shape)
print(np.mean(example_loss.numpy()))

(64, 100)
4.1741858


In [19]:
# Checkpoint file pattern; {epoch:04d} is filled in by the ModelCheckpoint
# callback with the epoch number.
checkpoint_path = "training_text_generation_LSTM_1/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Save weights only (not the full model) and log every save (verbose=1).
cp_callback_mc = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                        save_weights_only=True,
                                                        #save_best_only=True,
                                                        #monitor='val_loss',
                                                        mode='min',
                                                        verbose=1)

# Resume from the most recent checkpoint when one exists.
# latest_checkpoint() returns None if the directory holds no checkpoint, so
# compare by identity rather than with `!=`.
latest = tf.train.latest_checkpoint(checkpoint_dir)
if latest is not None:
    # expect_partial() silences warnings about checkpoint values that are
    # not matched during the restore.
    model.load_weights(latest).expect_partial()

# NOTE(review): fit() is called without initial_epoch, so a resumed run
# presumably numbers its epochs from 1 again and overwrites the earlier
# cp-0001, cp-0002, ... files - confirm this is intended.
epochs = 100
history = model.fit(seq_dataset, epochs = epochs, callbacks = [cp_callback_mc])

Train for 172 steps
Epoch 1/100
Epoch 00001: saving model to training_text_generation_2/cp-0001.ckpt
Epoch 2/100
Epoch 00002: saving model to training_text_generation_2/cp-0002.ckpt
Epoch 3/100
Epoch 00003: saving model to training_text_generation_2/cp-0003.ckpt
Epoch 4/100
Epoch 00004: saving model to training_text_generation_2/cp-0004.ckpt
Epoch 5/100
Epoch 00005: saving model to training_text_generation_2/cp-0005.ckpt
Epoch 6/100
Epoch 00006: saving model to training_text_generation_2/cp-0006.ckpt
Epoch 7/100
Epoch 00007: saving model to training_text_generation_2/cp-0007.ckpt
Epoch 8/100
Epoch 00008: saving model to training_text_generation_2/cp-0008.ckpt
Epoch 9/100
Epoch 00009: saving model to training_text_generation_2/cp-0009.ckpt
Epoch 10/100
Epoch 00010: saving model to training_text_generation_2/cp-0010.ckpt
Epoch 11/100
Epoch 00011: saving model to training_text_generation_2/cp-0011.ckpt
Epoch 12/100
Epoch 00012: saving model to training_text_generation_2/cp-0012.ckpt
Epoch

In [13]:
# Rebuild the same architecture with batch_size=1 so that a single sequence
# can be fed to the model at generation time.
model2 = build_model(vocab_size, embedding_dim, rnn_units, batch_size = 1)

checkpoint_path = "training_text_generation_LSTM_1/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# latest_checkpoint() returns None when no checkpoint is found; fail with a
# clear message instead of passing None on to load_weights().
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in '{}'; train the model first.".format(checkpoint_dir))

# expect_partial() silences warnings about checkpoint values that are not
# matched during the restore, consistent with how the training cell loads
# its weights.
model2.load_weights(latest_checkpoint).expect_partial()
model2.build(tf.TensorShape([1, None]))

model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Text generation procedure:
# start from a character sequence A,
# A -> model -> b
# A.append(b) -> B
# B(Ab) -> model -> c
# B.append(c) -> C
# C(Abc) -> model -> ...
# In the implementation below only the newly sampled character is fed back
# on each step, not the whole sequence generated so far.

def generate_text(model, start_string, num_generate = 1000, temperature = 0.5):
    """
    Generate text one character at a time, starting from `start_string`.

    Args:
        model: model called as model(ids) on a [1, length] batch of character
            ids; it must provide reset_states() and return per-step logits.
        start_string: non-empty seed text; every character is looked up in
            char2idx.
        num_generate: number of characters to generate.
        temperature: positive scaling factor applied to the logits before
            sampling. temperature > 1 flattens the logits (more random
            output); temperature < 1 sharpens them (output leans towards the
            most likely character).

    Returns:
        start_string followed by the generated characters.

    Raises:
        ValueError: if start_string is empty or temperature is not positive.
        KeyError: if start_string contains a character missing from char2idx.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode the seed text as ids and add a batch axis: [1, len(start_string)].
    input_eval = [char2idx[ch] for ch in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []    # characters generated so far
    model.reset_states()

    for _ in range(num_generate):
        # 1. model inference -> predictions
        # 2. sample -> ch -> text_generated
        # 3. update input_eval

        # predictions: [batch_size, input_eval_len, vocab_size]
        predictions = model(input_eval)
        # predictions: logits -> softmax -> prob
        # softmax is proportional to e^xi, so dividing the logits by the
        # temperature changes how peaked the distribution is:
        # eg: 4,2 e^4/(e^4 + e^2) = 0.88, e^2 / (e^4 + e^2) = 0.12
        # eg: 2,1 e^2/(e^2 + e^1) = 0.73, e / (e^2 + e) = 0.27
        predictions = predictions / temperature
        # predictions: [input_eval_len, vocab_size]
        predictions = tf.squeeze(predictions, 0)
        # Sampling yields [input_eval_len, 1] ids; only the id drawn for the
        # last time step is kept as the next character.
        # a b c -> b c d
        predicted_id = tf.random.categorical(predictions, num_samples = 1)[-1, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # s, x -> rnn -> s', y
        # Feed only the newly sampled id back in as a [1, 1] batch.
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

# Generate text with the restored model, seeded with "All: ", and print it.
new_text = generate_text(model=model2, start_string="All: ")
print(new_text)

All: Queet she do me wrong.

ROMEO:
Tut, I can tell thee now, sir? this new 'twas murderers ge.

KATHARINA:
I pray you, let it come to that; proceed.

ISABELLA:
Inou might have you do well:
I have not seen them down to Richmond.

BUCKINGHAM:
Whenever Buckingham do I thank thee, hope to have a brother's work,
That fellow of thy counsel, then, a bawd: a very gross
kind on the prompt: I'ld not sound the remedy.

First Soldier:
For God's sake, let us sit and will not show it.

DUKE VINCENTIO:
My holy sir,
Whom sometimes you did leave to fate the babe,
I know not what it is, when me doth go;
But let him speak in peace that I should wish them seven:
I have thus grievous borne up: by
nothing
to the house of the world cause me be your favour,
Till now have more than any of the people,
The tongues o' the common mouth the white rose and the red:
Am Claudio less than me; and thou
shalt kill the fowl of season: shall we see her one,
Without a cause.

Lord Marshal:
Harry of Herciling liver:
The rav