# [RNN によるテキスト生成](https://www.tensorflow.org/tutorials/text/text_generation)

In [1]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import os
import time

# Download dataset

In [2]:
# Fetch the Shakespeare corpus through Keras' download helper; the returned
# value is used below as a local file path.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

# Read the raw bytes and decode them explicitly as UTF-8, then report the
# corpus size in characters.
with open(path_to_file, 'rb') as file:
    text = file.read().decode(encoding='utf-8')
print(len(text))

# Preview the first 100 characters, followed by a visual separator.
print(text[:100], end='\n********\n')

# The "vocabulary" is the sorted list of distinct characters in the corpus.
vocab = list(set(text))
vocab.sort()
print('Unique vocabraries: ', len(vocab))
print(vocab[:10])

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
********
Unique vocabraries:  65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']


In [3]:
# Forward lookup: character -> integer id (its position in the sorted vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

# Reverse lookup: integer id -> character, stored as an array so it can be
# indexed with an array of ids.
idx2char = np.asarray(vocab)
print(idx2char)

# Encode the whole corpus as an array of integer ids, one per character.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
print(text_as_int)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']
[18 47 56 ... 45  8  0]


In [4]:
# Show the first 20 character -> id mappings in aligned columns.
# The keys are sliced instead of zipping with range(20) and unpacking into `_`:
# binding `_` at notebook top level shadows IPython's "last output" variable
# for the rest of the session.
print('{}')
for char in list(char2idx)[:20]:
    print(' {:4s}: {:3d}'.format(repr(char), char2idx[char]))

{}
 '\n':   0
 ' ' :   1
 '!' :   2
 '$' :   3
 '&' :   4
 "'" :   5
 ',' :   6
 '-' :   7
 '.' :   8
 '3' :   9
 ':' :  10
 ';' :  11
 '?' :  12
 'A' :  13
 'B' :  14
 'C' :  15
 'D' :  16
 'E' :  17
 'F' :  18
 'G' :  19


In [5]:
# tf.data tuning constant (defined here; not used within this cell).
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Each training example has `seq_length` input characters; chunks are cut one
# character longer so the target can be the input shifted by one position.
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)
print(f'examples_per_epoch {examples_per_epoch}')

# Stream of individual character ids ...
char_ds = tf.data.Dataset.from_tensor_slices(text_as_int)
# ... grouped into fixed-size chunks of seq_length + 1 ids; a trailing
# partial chunk is dropped.
sequences = char_ds.batch(batch_size=seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    Args:
        chunk: Any sliceable sequence (e.g. a tensor, list or string).

    Returns:
        A tuple ``(input_text, target_text)`` where ``input_text`` is the
        chunk without its last element and ``target_text`` is the chunk
        without its first element.

    Side effects:
        Prints both slices each time the function body runs. When this
        function is passed to ``Dataset.map`` the printed values are the
        symbolic tensors, as seen in the recorded cell output.
    """
    input_text, target_text = chunk[:-1], chunk[1:]
    print(f'input_text: {input_text}, \ntarget_text: {target_text}')
    return (input_text, target_text)

# Convert every (seq_length + 1)-long chunk in `sequences` into an
# (input, target) pair by applying split_input_target to it.
dataset = sequences.map(split_input_target)

examples_per_epoch 11043
input_text: Tensor("strided_slice:0", shape=(100,), dtype=int64), 
target_text: Tensor("strided_slice_1:0", shape=(100,), dtype=int64)


In [6]:
# Peek at the first (input, target) pair: print the raw ids, then decode them
# back to text by indexing idx2char with the whole id array at once.
for input_example, target_example in dataset.take(1):
    input_ids = input_example.numpy()
    print(input_ids)
    print('Input data:', repr(''.join(idx2char[input_ids])))

# The string literal below is an illustration of indexing one array with
# another. Because it is the last expression in the cell, the notebook also
# echoes it as the cell's output value.
'''
data = np.array([1,2,3,4,5])
char_dir = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
print(char_dir[data])

> ['b' 'c' 'd' 'e' 'f']
'''

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
Input data: 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'


"\ndata = np.array([1,2,3,4,5])\nchar_dir = np.array(['a', 'b', 'c', 'd', 'e', 'f'])\nprint(char_dir[data])\n\n> ['b' 'c' 'd' 'e' 'f']\n"

In [7]:
# Number of sequences per training batch, and the size of the shuffle buffer.
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs, then group them into full batches only.
# NOTE: this rebinds `dataset`, so re-running this cell without re-running the
# cell that created `dataset` would batch the already-batched dataset again.
dataset = (
    dataset
    .shuffle(buffer_size=BUFFER_SIZE)
    .batch(batch_size=BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

# Modeling

In [9]:
# Model hyperparameters.
vocab_size = len(vocab)   # one output logit per distinct character
embedding_dim = 256
rnn_units = 1024

# Character-level language model: embedding -> GRU -> per-step logits.
# The batch dimension is pinned to BATCH_SIZE in the input shape; the GRU is
# created with stateful=True and returns an output for every time step.
model = keras.Sequential([
    keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[BATCH_SIZE, None],
    ),
    keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    ),
    keras.layers.Dense(vocab_size),
])

# The Dense layer has no activation, so the loss is told to expect raw logits.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

# Train on the shuffled, batched dataset; `history` holds the returned
# training history object.
history = model.fit(dataset, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78