In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.0.0-beta1
sys.version_info(major=3, minor=6, micro=7, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.16.2
pandas 0.25.3
sklearn 0.22
tensorflow 2.0.0-beta1
tensorflow.python.keras.api._v2.keras 2.2.4-tf


In [2]:
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
print(path_to_file)
text = open(path_to_file, "r").read()
print(len(text))
print(text[0:100])

C:\Users\A\.keras\datasets\shakespeare.txt
1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
# 1. generate vocab
# 2. build mapping  char -> id
# 3. data -> id data
# 4. 预测下一个字符的模型  abcd - > bcd<eos>

#### 1. generate vocab

In [4]:
vocab = sorted(set(text))
print(vocab)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


#### 2. build mapping  char <=> id

In [5]:
# enumerate() 会对每一个元素生成一个index,
char2idx = {char :idx for idx,char in enumerate(vocab) }
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [6]:
idx2char = np.array(vocab)
print(idx2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


#### 3. data => id data

In [7]:
text_as_int = np.array([char2idx[c] for c in text])
print(text_as_int[0:100])

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]


#### 4. make dataset

In [8]:
# 样本抽取,用dataset的方式
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 100
seq_dataset = char_dataset.batch(seq_length + 1, 
                                 drop_remainder = True)

for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])

for seq_id in seq_dataset.take(2):
    print(seq_id)
#     print("".join(idx2char[seq_id.numpy()]))
    print(repr("".join(idx2char[seq_id.numpy()])))

tf.Tensor(18, shape=(), dtype=int32) F
tf.Tensor(47, shape=(), dtype=int32) i
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


#### 5. dataset中的每个batch(0:100) 拆分成（0：99）和（1：100）

In [9]:
# 把字符集分配成输入和输出，作为训练样本
def split_input_target(id_text):
    """
    abcde -> abcd,bcde
    """
    return id_text[0:-1], id_text[1:]


In [10]:
seq_dataset = seq_dataset.map(split_input_target)

for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy())
    print(item_output.numpy())
    

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1]
[56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1 58
 53  1 42

#### 6. 模型建立

In [11]:
batch_size = 64
buffer_size = 10000

seq_dataset = seq_dataset.shuffle(buffer_size).batch(
    batch_size, drop_remainder = True
)

In [12]:
print(text[0:100])
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, 
                               batch_input_shape=[batch_size, None]),
        keras.layers.SimpleRNN(units = rnn_units,
                               return_sequences = True),
        keras.layers.Dense(vocab_size)
    ])
    return model
model = build_model( vocab_size = vocab_size,
                     embedding_dim = embedding_dim,
                     rnn_units = rnn_units,
                     batch_size = batch_size)

model.summary()

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


#### 7. 模型未经过训练，但可以用模型去做预测（把model当函数来用）

In [13]:
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

(64, 100, 65)


In [14]:
# 随机采样 random sampling
# greedy/ random 不同策略，无好坏之分
# 模型最后一层没有激活函数，所以是logits
# logits 是对于分类任务来说，在计算softmax之前的值

sample_indics = tf.random.categorical( 
    logits = example_batch_predictions[0],
    num_samples = 1,
    dtype=None,
    seed=None,
    name=None)
print(sample_indics)

# example_batch_predictions[0].shape -> (100,65)
# -> (100, 1) 

# 去掉squeeze维度
sample_indics = tf.squeeze(sample_indics, axis = -1)
print(sample_indics)

tf.Tensor(
[[36]
 [35]
 [ 5]
 [40]
 [58]
 [60]
 [ 1]
 [ 4]
 [30]
 [16]
 [64]
 [20]
 [36]
 [18]
 [44]
 [32]
 [11]
 [23]
 [38]
 [51]
 [ 5]
 [ 4]
 [57]
 [10]
 [60]
 [29]
 [34]
 [53]
 [41]
 [46]
 [23]
 [ 4]
 [ 8]
 [48]
 [37]
 [14]
 [ 9]
 [40]
 [59]
 [34]
 [42]
 [26]
 [10]
 [ 3]
 [33]
 [62]
 [45]
 [60]
 [36]
 [39]
 [34]
 [39]
 [23]
 [60]
 [50]
 [47]
 [50]
 [ 9]
 [44]
 [36]
 [18]
 [45]
 [22]
 [49]
 [22]
 [51]
 [46]
 [59]
 [33]
 [47]
 [63]
 [ 5]
 [33]
 [22]
 [24]
 [38]
 [52]
 [32]
 [36]
 [52]
 [10]
 [27]
 [23]
 [61]
 [63]
 [20]
 [27]
 [53]
 [17]
 [61]
 [12]
 [23]
 [18]
 [16]
 [56]
 [62]
 [52]
 [39]
 [31]
 [60]], shape=(100, 1), dtype=int64)
tf.Tensor(
[36 35  5 40 58 60  1  4 30 16 64 20 36 18 44 32 11 23 38 51  5  4 57 10
 60 29 34 53 41 46 23  4  8 48 37 14  9 40 59 34 42 26 10  3 33 62 45 60
 36 39 34 39 23 60 50 47 50  9 44 36 18 45 22 49 22 51 46 59 33 47 63  5
 33 22 24 38 52 32 36 52 10 27 23 61 63 20 27 53 17 61 12 23 18 16 56 62
 52 39 31 60], shape=(100,), dtype=int64)


In [15]:
print("input:", repr("".join(idx2char[input_example_batch[0]])))
print()
print("output:", repr("".join(idx2char[target_example_batch[0]])))
print()
print("prediction:", repr("".join(idx2char[sample_indics])))

input: 'er to bring it thee,\nSo fearful were they of infection.\n\nFRIAR LAURENCE:\nUnhappy fortune! by my brot'

output: 'r to bring it thee,\nSo fearful were they of infection.\n\nFRIAR LAURENCE:\nUnhappy fortune! by my broth'

prediction: "XW'btv &RDzHXFfT;KZm'&s:vQVochK&.jYB3buVdN:$UxgvXaVaKvlil3fXFgJkJmhuUiy'UJLZnTXn:OKwyHOoEw?KFDrxnaSv"


In [16]:
def loss(labels, logits):
    return keras.losses.sparse_categorical_crossentropy(labels, 
                                                      logits, 
                                                      from_logits = True)

model.compile(optimizer = "adam", loss = loss)
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())

(64, 100)
4.1892185


#### 8. 模型训练

In [17]:
output_dir = "./text_generation_checkpoints"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
checkpoint_prefix = os.path.join(output_dir, "ckpt_{epoch}")
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
#     monitor='val_loss',
#     verbose=0,
#     save_best_only=False,
    save_weights_only=True,
#     mode='auto',
#     save_freq='epoch',
#     load_weights_on_restart=False
)

epochs = 100

history = model.fit(seq_dataset, epochs = epochs,
                   callbacks= [checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


#### 9. 重载模型（利用checkpoint）

In [18]:
tf.train.latest_checkpoint(output_dir)

'./text_generation_checkpoints\\ckpt_100'

In [19]:
model2 = build_model(vocab_size = vocab_size,
                     embedding_dim = embedding_dim,
                     rnn_units = rnn_units,
                     batch_size = 1)
model2.load_weights(tf.train.latest_checkpoint(output_dir))
model2.build(tf.TensorShape([1,None]))

# start ch sequence A,
# A -> model -> b
# A.append(b) -> B
# B -> model-> c
# B.append(c) -> C
model2.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (1, None, 1024)           1311744   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [20]:
def generate_text(model,start_string, num_generate = 1000):
    input_eval = [char2idx[ch] for ch in start_string]
    input_eval= tf.expand_dims(input_eval, 0)
    
    text_generated = []
    model.reset_states()
    for _ in range(num_generate):
        # 1.model inference ->predictions
        # 2.sample
        # 3.update input_eval
        
        # predictions : [batch_size, input_eval_len, vocab_size]
        predictions = model(input_eval)
        # predictions : [input_eval_len, vocab_size]   消掉batch_size的维度
        predictions = tf.squeeze(predictions, 0)
        # predicetd_ids : [input_eval_len, 1]
        # a b c -> b c d ,只用最后一个 predicetd_ids 就是d
        predicted_id = tf.random.categorical(
            predictions, num_samples = 1
        )[-1,0].numpy()
        text_generated.append(idx2char[predicted_id])
        # 为什么不在input_eval后面添加predicted_id，而是直接替换  => 为了防止低效
        # s, x -> rnn -> s', y  
        input_eval = tf.expand_dims([predicted_id],0)
    return start_string + "".join(text_generated)
new_text = generate_text(model2, "All: ")
print(new_text)

All: frore JUSe f h cupll h hougrre.


DY:
Fo st lof iesal t re:
Ayo ownd s g!---
Th tht th oue s ang s cesing f m, thand.
CHey,
my
Thoumerind thout, gochyofukixtomooth,
An o has
wanou an out w t VI nd d.
V:
Anikngous thowhait
LULI t t?
Se wathore thon rmusise
Wh onyof to d s aveakispok f wantonithandesa--th:
IIIN:
Weyesiceainentore arore?
Adit w,
Why mathe y mive, oupe inelir ie f ce ngesthillitowngempra-g; o,
AMN tr y
O:

CAss y ansent and tut, dd tonditime ussse,
blandisorowaveree, fothodove sunove twang atou a fughofildotth!


Chy ddle O:

WhINUSchethth angamy g h, Whothave, ougll bove ug? e, are chithinckers oke ' Mat t.
TExf ne,
Soriton theas,
SII ys acat, f and th sead uprmenorthy wfront d ag,
MO-



I'd brise,
To th y'
tithad mod t t, jupe ngs! o, s, INEldetl?
TA thow maty bavew;
S:
Mat nd l. iamy, merase:
ELLELLLOMyowal anopag ime wnounane t, y. g,
Ataitthere n. bereear f bare, orore


Hinollos athast, tl e ce! I tonilire a y'st.
Tofey moren,

Awnesupppl tix d INA: mewobreshe 