In [None]:
# 2020-10-30 created by Akson

In [None]:
# Code16.1
# import

import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Code16.2
# download txt

# Download the Shakespeare corpus (or reuse the copy already fetched by get_file)
# and load the whole file into memory as one string.
shakespeare_url = 'https://homl.info/shakespeare'
filepath = keras.utils.get_file('shakespeare.txt', shakespeare_url)
# Pin the encoding so reading the file does not depend on the platform's default locale.
with open(filepath, encoding='utf-8') as f:
    shakespeare_text = f.read()

# Preview the start of the text, then list every distinct (lower-cased) character.
print(shakespeare_text[:148])
print(''.join(sorted(set(shakespeare_text.lower()))))

In [None]:
# Code16.3
# encode

# Character-level tokenizer: every distinct character gets its own integer id.
# The raw string (not a list of texts) is passed here, so the tokenizer iterates
# over it character by character; `document_count` is read further down as the
# size of the corpus.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [None]:
# Code16.4
# test

# Round trip: text -> ids, then a hand-written id sequence -> text.
sample_ids = tokenizer.texts_to_sequences(['First'])
print(sample_ids)
sample_text = tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])
print(sample_text)

# Number of distinct characters, and the tokenizer's document count, which the
# rest of the notebook uses as the dataset size.
max_id, dataset_size = len(tokenizer.word_index), tokenizer.document_count

print(max_id)
print(dataset_size)

In [None]:
# Code16.5
# encode the whole text as character ids

# Encode the full text as one id sequence and shift every id down by one
# (the matching "+ 1" is applied again before decoding predictions).
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text])[0]) - 1

In [None]:
# Code16.6

# The first 90% of the encoded characters form the training set.
train_size = (dataset_size * 90) // 100
training_ids = encoded[:train_size]
dataset = tf.data.Dataset.from_tensor_slices(training_ids)

In [None]:
# Code16.7
# window

n_steps = 100
window_length = n_steps + 1  # one extra character so targets can be shifted by one

# Slide a window over the endlessly repeated id stream, one character at a time,
# then turn every window (a nested dataset) into a single tensor of ids.
dataset = (
    dataset.repeat()
    .window(window_length, shift=1, drop_remainder=True)
    .flat_map(lambda chars: chars.batch(window_length))
)

batch_size = 32
# Shuffle the windows and group them into batches of 32.
dataset = dataset.shuffle(10000).batch(batch_size)
# Inputs are every character but the last; targets are the same window shifted by one.
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))
# One-hot encode the inputs only; the targets stay as integer ids.
dataset = dataset.map(lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets))
# Prefetch one batch.
dataset = dataset.prefetch(1)

# Peek at a single batch to check the shapes.
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

In [None]:
# Code16.8
# train

# Two stacked GRU layers that emit an output at every time step, followed by a
# per-time-step softmax over the `max_id` possible characters.
model = keras.models.Sequential([
    keras.layers.GRU(
        128,
        return_sequences=True,
        input_shape=[None, max_id],
        dropout=0.2,
        recurrent_dropout=0.2,
    ),
    keras.layers.GRU(
        128,
        return_sequences=True,
        dropout=0.2,
        recurrent_dropout=0.2,
    ),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation='softmax')
    ),
])

# Targets are integer ids, hence the sparse variant of the cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# The dataset repeats forever, so the length of an epoch is fixed explicitly.
history = model.fit(
    dataset,
    steps_per_epoch=train_size // batch_size,
    epochs=10,
)

In [None]:
# Code16.9
# use model

def preprocess(texts):
    """One-hot encode a list of strings character by character.

    Each text is converted to character ids with the notebook-level `tokenizer`,
    the ids are shifted down by one, and the result is one-hot encoded with
    depth `max_id`.
    """
    char_ids = tokenizer.texts_to_sequences(texts)
    zero_based_ids = np.array(char_ids) - 1
    return tf.one_hot(zero_based_ids, max_id)

# Predict the character that should follow 'How are yo'.
X_new = preprocess(['How are yo'])
# `Sequential.predict_classes` is not available in current TensorFlow releases;
# taking the argmax over the class axis of the predicted probabilities yields
# the same class ids.
y_pred = np.argmax(model.predict(X_new), axis=-1)
# Shift the ids back up by one before decoding, then keep the last character.
print(tokenizer.sequences_to_texts(y_pred + 1)[0][-1])

In [None]:
# Code16.10

def next_char(text, model, temperature=1):
    """Sample the character that follows `text` from the model's prediction.

    The probabilities predicted for the last time step are converted to logs,
    divided by `temperature`, and a single id is drawn with
    `tf.random.categorical`. The id is shifted up by one and decoded with the
    notebook-level `tokenizer`.
    """
    encoded_text = preprocess([text])
    # Keep only the last time step, but preserve a leading axis of size 1.
    last_step_proba = model.predict(encoded_text)[0, -1:, :]
    rescaled_logits = tf.math.log(last_step_proba) / temperature
    sampled = tf.random.categorical(rescaled_logits, num_samples=1)
    char_id = sampled + 1

    return tokenizer.sequences_to_texts(char_id.numpy())[0]

def complete_text(text, model, n_chars=50, temperature=1):
    """Extend `text` by `n_chars` characters, one `next_char` call at a time.

    Every call receives the text generated so far, so each new character is
    conditioned on all previous ones. Returns the extended string.
    """
    generated = text
    for _ in range(n_chars):
        following = next_char(generated, model, temperature)
        generated = generated + following
    return generated

# Generate text from a one-letter prompt at three different sampling temperatures.
for seed_text, temperature in (('t', 0.2), ('w', 1), ('w', 2)):
    print(complete_text(seed_text, model, temperature=temperature))


In [None]:
# Code16.11
# dataset prepare

# 设置每一批的数量
# Number of sequences per batch.
batch_size = 32

n_steps = 100
window_length = n_steps + 1
# Take the training portion and split it into `batch_size` (32) consecutive parts.
encoded_parts = np.array_split(encoded[:train_size], batch_size)
# One windowed dataset per part.
datasets = []
for encoded_part in encoded_parts:
    # Slice the part into ids, cut it into windows that start `n_steps` apart,
    # and flatten every window (a nested dataset) into a single tensor.
    dataset = (
        tf.data.Dataset.from_tensor_slices(encoded_part)
        .window(window_length, shift=n_steps, drop_remainder=True)
        .flat_map(lambda chars: chars.batch(window_length))
    )
    datasets.append(dataset)

# Zip the parts together: the i-th window of every part is stacked into one batch.
dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *part_windows: tf.stack(part_windows))
# Repeat forever, then split each batch into inputs and targets shifted by one.
dataset = dataset.repeat().map(lambda batch: (batch[:, :-1], batch[:, 1:]))
# One-hot encode the inputs only.
dataset = dataset.map(lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets))
# Prefetch one batch.
dataset = dataset.prefetch(1)

In [None]:
# Code16.12
# state model prepare

# Stateful variant of the character model. A stateful layer needs a fixed batch
# size, so the first layer declares `batch_input_shape` instead of `input_shape`.
model = keras.models.Sequential([
    keras.layers.GRU(
        128,
        return_sequences=True,
        stateful=True,
        dropout=0.2,
        recurrent_dropout=0.2,
        batch_input_shape=[batch_size, None, max_id],
    ),
    keras.layers.GRU(
        128,
        return_sequences=True,
        stateful=True,
        dropout=0.2,
        recurrent_dropout=0.2,
    ),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation='softmax')
    ),
])

class ResetStatesCallback(keras.callbacks.Callback):
    """Keras callback that resets the model's recurrent states when an epoch starts."""

    def on_epoch_begin(self, epoch, logs=None):
        # `logs` defaults to None, like the base-class hook, so the method can
        # also be called without a logs dict.
        self.model.reset_states()

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# Every step consumes `batch_size` windows whose starts are `n_steps` apart, hence the
# extra division by `n_steps`. States are reset at the start of each epoch.
model.fit(
    dataset,
    steps_per_epoch=train_size // batch_size // n_steps,
    epochs=35,
    callbacks=[ResetStatesCallback()],
)

# Same layer sizes without the stateful flag and without a fixed batch size, so
# the trained weights can be reused on inputs of any batch size.
stateless_model = keras.models.Sequential([
    keras.layers.GRU(
        128,
        return_sequences=True,
        input_shape=[None, max_id],
    ),
    keras.layers.GRU(
        128,
        return_sequences=True,
    ),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation='softmax')
    ),
])

# Build the model, copy the trained weights across, then generate some text.
stateless_model.build(tf.TensorShape([None, None, max_id]))
trained_weights = model.get_weights()
stateless_model.set_weights(trained_weights)
print(complete_text('t', stateless_model))

In [None]:
# Code16.13
# keras.imdb test

# Load the pre-encoded IMDB reviews and look at the first ten ids of the first one.
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
print(X_train[0][:10])

# Ids 0, 1 and 2 are given to the special tokens below; every word id from the
# word index is shifted up by 3.
word_index = keras.datasets.imdb.get_word_index()
special_tokens = ('<pad>', '<sos>', '<unk>')
id_to_word = dict(enumerate(special_tokens))
id_to_word.update((id_ + 3, word) for word, id_ in word_index.items())
# Decode the first ten ids of the first review back into words.
' '.join(id_to_word[id_] for id_ in X_train[0][:10])


In [None]:
# Code16.14

import tensorflow_datasets as tfds
from collections import Counter

# Load the IMDB reviews through TensorFlow Datasets as (text, label) pairs,
# together with the dataset metadata.
datasets, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
    try_gcs=True,
)
# Number of training examples, taken from the metadata.
train_size = info.splits['train'].num_examples

def preprocess(X_batch, y_batch):
    """Clean and tokenize a batch of review strings; labels pass through unchanged.

    NOTE: this definition replaces the character-level `preprocess` defined
    earlier in the notebook (Code16.9); `next_char` depends on that earlier
    version.

    Steps: truncate each review with `tf.strings.substr(…, 0, 300)`, replace
    `<br />`-style tags with a space, replace every character that is not a
    letter or an apostrophe with a space, split on whitespace, and pad the
    ragged result with `b'<pad>'` into a dense tensor.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, b'<br\\s*/?>', b' ')
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b' ')
    tokens = tf.strings.split(letters_only)

    return tokens.to_tensor(default_value=b'<pad>'), y_batch

# Count how often each token occurs across the whole training set
# (padding tokens added by `preprocess` are counted as well).
vocabulary = Counter()
batched_reviews = datasets['train'].batch(32).map(preprocess)
for X_batch, y_batch in batched_reviews:
    for review in X_batch:
        review_words = list(review.numpy())
        vocabulary.update(review_words)

# Show the three most frequent tokens.
ranked_words = vocabulary.most_common()
print(ranked_words[:3])
# Keep only the `vocab_size` most frequent tokens.
vocab_size = 10000
truncated_vocabulary = [word for word, _ in ranked_words[:vocab_size]]

# Lookup table mapping each kept word to its rank; words outside the vocabulary
# are assigned to one of `num_oov_buckets` extra ids.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

# Look up the ids of a few sample words.
table.lookup(tf.constant([b'This is movie was faaaaaantastic'.split()]))

In [None]:
# Code16.15
# create train set

def encode_words(X_batch, y_batch):
    """Replace every word in `X_batch` with its id from the notebook-level lookup `table`.

    The labels in `y_batch` are returned untouched.
    """
    word_id_batch = table.lookup(X_batch)
    return word_id_batch, y_batch

# Training pipeline: repeat forever, batch by 32, clean and tokenize the text,
# map words to ids, and prefetch one batch.
train_set = (
    datasets['train']
    .repeat()
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

embed_size = 128
# Embedding (one row per vocabulary word plus one per OOV bucket), two GRU
# layers, and a single sigmoid unit for the binary label.
model = keras.models.Sequential([
    keras.layers.Embedding(
        vocab_size + num_oov_buckets,
        embed_size,
        input_shape=[None],
    ),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# The dataset repeats forever, so the length of an epoch is fixed explicitly.
history = model.fit(
    train_set,
    steps_per_epoch=train_size // 32,
    epochs=5,
)



In [None]:
# Code16.16
# mask

K = keras.backend
embed_size = 128

# Functional-API version of the sentiment model with an explicit mask, so the
# recurrent layers can skip the positions whose word id is 0.
inputs = keras.layers.Input(shape=[None])
# Boolean mask: True wherever the word id differs from 0.
# NOTE(review): id 0 is presumably the '<pad>' token (the most frequent vocabulary entry) - confirm.
mask = keras.layers.Lambda(lambda ids: K.not_equal(ids, 0))(inputs)
z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(128)(z, mask=mask)
outputs = keras.layers.Dense(1, activation='sigmoid')(z)

model = keras.models.Model(inputs=[inputs], outputs=[outputs])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# The result was bound to the misspelled name `hittory`; use `history` like the other cells.
history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5)


In [None]:
# Code16.17
# tf hub

import tensorflow_hub as hub

# Reload the raw IMDB reviews; here the model receives the review strings directly.
datasets, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
    try_gcs=True,
)
train_size = info.splits['train'].num_examples
batch_size = 32
train_set = (
    datasets['train']
    .repeat()
    .batch(batch_size)
    .prefetch(1)
)

# A TF-Hub layer (declared output shape [50]) takes string inputs and feeds a
# small dense classifier.
model = keras.Sequential([
    hub.KerasLayer(
        'https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1',
        dtype=tf.string,
        input_shape=[],
        output_shape=[50],
    ),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(
    train_set,
    steps_per_epoch=train_size // batch_size,
    epochs=5,
)


In [None]:
# Code16.18
# encoder and decoder

import tensorflow_addons as tfa

# Model inputs: encoder token ids, decoder token ids, and one length per example.
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# One embedding layer shared by the encoder and the decoder.
# NOTE: `vocab_size` and `embed_size` overwrite the values used by the IMDB cells above.
vocab_size, embed_size = 100, 10
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

# Encoder: only its final hidden and cell states are handed to the decoder.
encoder = keras.layers.LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

# Sampler from TensorFlow Addons that drives the decoder during training.
sampler = tfa.seq2seq.sampler.TrainingSampler()

# Decoder: an LSTM cell wrapped in a BasicDecoder, with a dense layer that
# produces one score per vocabulary entry.
decoder_cell = keras.layers.LSTMCell(512)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
    decoder_cell,
    sampler,
    output_layer=output_layer,
)

final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings,
    initial_state=encoder_state,
    sequence_length=sequence_lengths,
)
# Turn the decoder's scores into probabilities.
y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.Model(
    inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
    outputs=[y_proba],
)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Random toy data: 1000 source sequences of length 10 and 1000 target sequences
# of length 15, with token ids drawn from [0, 100).
X = np.random.randint(100, size=10 * 1000).reshape(1000, 10)
y = np.random.randint(100, size=15 * 1000).reshape(1000, 15)
# Decoder inputs are the targets shifted right by one step, with id 0 in the first
# position. The zero column is created with y's integer dtype so that X_decoder stays
# an integer id array; a float zero column would promote the whole array to float64.
X_decoder = np.c_[np.zeros((1000, 1), dtype=y.dtype), y[:, :-1]]
# Every target sequence has the full length of 15.
seq_lengths = np.full([1000], 15)

# Train on the toy data: inputs are [encoder ids, decoder ids, sequence lengths]
# and the targets are `y`.
training_inputs = [X, X_decoder, seq_lengths]
history = model.fit(training_inputs, y, epochs=2)