In [2]:
import tensorflow as tf
import tensorflow.keras as keras 
import numpy as np

In [6]:
# 最初のテキストファイルをダウンロードと解析

path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt'
)

text = open(path).read().lower()
print('Corpus length: ', len(text))

Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
Corpus length:  600893


In [16]:
# 文字列シーケンスのベクトル化
maxlen = 60       # 60文字のシーケンス
step = 3          # 3文字おきに新しいシーケンスをサンプリング
sentences = []    # 抽出されたシーケンスを保持
next_chars = []   # 目的地(次に来る文字)を保持

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sequences: ', len(sentences))
print('sentences[0]: ', sentences[0])
print('next_chars[0]: ', next_chars[0])

Number of sequences:  200278
sentences[0]:  preface


supposing that truth is a woman--what then? is the
next_chars[0]:  r


In [28]:
# コーパスの一意な文字のリスト
chars = sorted(list(set(text)))
print('Unique charactors: ', chars)

# これらの文字をリストcharsのインデックスにマッピングするディクショナリ
char_indices = dict((char, chars.index(char)) for char in chars)

print('Vectorization...')

# one-hotエンコーディングを適用して文字を二値の配列に格納
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1

Unique charactors:  ['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'æ', 'é', 'ë']
Vectorization...


In [29]:
# 次の文字を予測する単層LSTMモデル
from tensorflow.keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='relu'))

In [30]:
# モデルのコンパイル設定
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [31]:
# モデルn予測に基づいて次の文字をサンプリングする関数
def sample(preds, temperature=1.0):
    preds = np.asarray(pres).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas) 

In [32]:
import random
import sys

for epoch in range(1, 60):
    print('epoch', epoch)
    # Fit the model for 1 epoch on the available training data
    model.fit(x, y,
              batch_size=128,
              epochs=1)

    # Select a text seed at random
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated_text = text[start_index: start_index + maxlen]
    print('--- Generating with seed: "' + generated_text + '"')

    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ temperature:', temperature)
        sys.stdout.write(generated_text)

        # We generate 400 characters
        for i in range(400):
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

epoch 1
Train on 200278 samples
 15104/200278 [=>............................] - ETA: 14:56 - loss: 6.7380

KeyboardInterrupt: 