In [1]:
!pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
!python -m pip install -U pip
!pip -V
!pip install sklearn pandas

Writing to /root/.config/pip/pip.conf
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Requirement already up-to-date: pip in /usr/local/lib/python3.6/dist-packages (20.2.4)
pip 20.2.4 from /usr/local/lib/python3.6/dist-packages/pip (python 3.6)
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/


In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
    
print(tf.config.list_physical_devices('GPU'))

2.3.1
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.3.2
numpy 1.18.5
pandas 1.1.3
sklearn 0.23.2
tensorflow 2.3.1
tensorflow.keras 2.4.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt

input_filepath = './shakespeare.txt'
text = open(input_filepath, 'r').read()

print(len(text))
print(text[:100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# 1. generate vocab
# 2. build mapping char->id
# 3. data -> id_data
# 4. abcd -> bcd<eos>

vocab = sorted(set(text))
print(len(vocab))

65


In [5]:
char2idx = {char:idx for idx,char in enumerate(vocab)}
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [6]:
idx2char = np.array(vocab)
print(idx2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [7]:
text_as_int = np.array([char2idx[c] for c in text])
print(text_as_int[:10])
print(text[:10])

[18 47 56 57 58  1 15 47 58 47]
First Citi


In [8]:
def split_input_target(id_text):
    '''
    abcde -> abcd, bcde
    '''
    return id_text[:-1], id_text[1:]

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 100
seq_dataset = char_dataset.batch(seq_length+1, drop_remainder=True)

for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])

for seq_id in seq_dataset.take(2):
    print(seq_id)
    print(repr(''.join(idx2char[seq_id.numpy()])))

tf.Tensor(18, shape=(), dtype=int64) F
tf.Tensor(47, shape=(), dtype=int64) i
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int64)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int64)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [9]:
seq_dataset = seq_dataset.map(split_input_target)
for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy())
    print(item_output.numpy())

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1]
[56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1 58
 53  1 42

In [10]:
batch_size = 64
buffer_size = 10000

seq_dataset = seq_dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [11]:
vocab_size = len(vocab)
embedding_dim=256
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        keras.layers.SimpleRNN(units=rnn_units, return_sequences=True),
        keras.layers.Dense(vocab_size),
    ])
    return model

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [34]:
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

(64, 100, 65)


In [36]:
# random sampling.
# greedy, random.
sample_indices = tf.random.categorical(logits = example_batch_predictions[0], num_samples = 1)
print(sample_indices)
# (100, 65) -> (100, 1)
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)

tf.Tensor(
[[27]
 [ 5]
 [60]
 [56]
 [22]
 [ 4]
 [11]
 [63]
 [44]
 [ 6]
 [36]
 [18]
 [42]
 [31]
 [18]
 [46]
 [27]
 [42]
 [47]
 [46]
 [ 2]
 [ 3]
 [54]
 [36]
 [44]
 [40]
 [27]
 [38]
 [ 7]
 [32]
 [ 4]
 [19]
 [33]
 [ 9]
 [29]
 [14]
 [48]
 [16]
 [51]
 [31]
 [18]
 [46]
 [ 5]
 [37]
 [15]
 [ 2]
 [41]
 [32]
 [52]
 [48]
 [51]
 [17]
 [23]
 [48]
 [28]
 [62]
 [ 3]
 [ 3]
 [33]
 [43]
 [61]
 [31]
 [ 9]
 [49]
 [58]
 [11]
 [64]
 [30]
 [31]
 [48]
 [14]
 [26]
 [12]
 [31]
 [ 8]
 [14]
 [22]
 [35]
 [42]
 [14]
 [30]
 [60]
 [30]
 [27]
 [25]
 [38]
 [57]
 [ 2]
 [59]
 [ 2]
 [44]
 [17]
 [46]
 [37]
 [42]
 [31]
 [51]
 [43]
 [57]
 [ 9]], shape=(100, 1), dtype=int64)
tf.Tensor(
[27  5 60 56 22  4 11 63 44  6 36 18 42 31 18 46 27 42 47 46  2  3 54 36
 44 40 27 38  7 32  4 19 33  9 29 14 48 16 51 31 18 46  5 37 15  2 41 32
 52 48 51 17 23 48 28 62  3  3 33 43 61 31  9 49 58 11 64 30 31 48 14 26
 12 31  8 14 22 35 42 14 30 60 30 27 25 38 57  2 59  2 44 17 46 37 42 31
 51 43 57  9], shape=(100,), dtype=int64)


In [40]:
print('Input: ', repr(''.join(idx2char[input_example_batch[0]])))
print()
print('Output: ', repr(''.join(idx2char[target_example_batch[0]])))
print()
print('Predictions: ', repr(''.join(idx2char[sample_indices])))

Input:  " thy disposition better temper'd.\nHast thou slain Tybalt? wilt thou slay thyself?\nAnd stay thy lady "

Output:  "thy disposition better temper'd.\nHast thou slain Tybalt? wilt thou slay thyself?\nAnd stay thy lady t"

Predictions:  "O'vrJ&;yf,XFdSFhOdih!$pXfbOZ-T&GU3QBjDmSFh'YC!cTnjmEKjPx$$UewS3kt;zRSjBN?S.BJWdBRvROMZs!u!fEhYdSmes3"


In [42]:
def loss(labels, logits):
    return keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)

example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())

(64, 100)
4.18363


In [44]:
output_dir = './text_generation_checkpoints'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
)

epochs = 10
history = model.fit(seq_dataset, epochs = epochs, callbacks = [checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [47]:
print(history.history.keys())

dict_keys(['loss'])


In [49]:
tf.train.latest_checkpoint(output_dir)

'./text_generation_checkpoints/ckpt_10'

In [50]:
model2 = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model2.load_weights(tf.train.latest_checkpoint(output_dir))
model2.build(tf.TensorShape([1, None]))
# start ch sequence A,
# A -> model -> b
# A.append(b) -> B
# B -> model -> c
# B.append(c) -> C
# C(Abc) -> model -> ...

model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (1, None, 1024)           1311744   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [57]:
def generate_text(model, start_string, num_generate = 1000):
    input_eval = [char2idx[ch] for ch in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    
    text_generated = []
    model.reset_states()
    
    for _ in range(num_generate):
        # 1. model inference -> predictions
        # 2. sample -> ch -> text_generated.
        # 3. update input_eval
        
        # predictions: [batch_size, input_eval_len, vocab_size]
        predictions = model(input_eval)
        # predictions: [input_eval_len, vocab_size]
        predictions = tf.squeeze(predictions, 0)
        # predicted_ids: [input_eval_len, 1]
        # a b c -> b c d
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # s,x -> rnn -> s', y
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

new_text = generate_text(model2, 'All: ')
print(new_text)

All: tus ha,
T:
Cofooursorind!
O, ndelit, fous' st bully w ndsthe he; indat,

Ho benice;
TIRe n there ckiced, s his bure hietofu h as;
II armaveaituncume fis my men, l tce nd IOFXETHZALoumizeearachancl pe ntove ond ive wn Bordem pele th citstho, d allanot iord buththars itin IIChacicaithangrthondye irg at w Shisiputever e y t blPe miqu nomatin;
STOMy.
ATI houl be fffake nd ghey dordoand it, tound hars adagee t l t-ponind nyor e mell:
NEdoug ofo, pre.
Thy.

Tun
Whoworeanoure toripo toufot micow, I
GHA:
Aderen nt gh ckithande ICLII: bammendoulldige.
CAnd ge al CK:
Le we, h tes, ve tcoone We aucinco f,
ELoudowivaryonty youtrcareshesond s s INo brerd t momar
I tounocor he t, phea bed gofad bake ll t.
KI ay jes p, de is, io omade weve Chath,
THar t othi'trreseroudurs.

Cacok hethivey thicke g taighedshe bee sswond is:
TUSThoulf an swathtounerof Low s
SA:
PETETho' f y bllicicoce w.
Lout isuped scatioxer
KINTo imyome lle t he
bermaith-gat, th wn?

Stt pe waf blenoracang t, hef berithoufot, t!