In [15]:
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import text
import tensorflow as tf
import logging

# Raise TensorFlow's Python logger threshold to ERROR so lower-severity
# messages (INFO/WARNING) are not emitted into the notebook output.
tf.get_logger().setLevel(logging.ERROR)

In [8]:
# --- Corpus and sliding-window sampling ---
INPUT_FILE_NAME = 'data/frankenstein.txt'
WINDOW_LENGTH = 40  # number of words in each input fragment
WINDOW_STEP = 3  # number of words the window advances between fragments
PREDICT_LENGTH = 3  # NOTE(review): not used in the visible cells; presumably the number of words to generate — confirm

# --- Vocabulary and embedding ---
MAX_WORDS = 10_000  # max size of vocabulary
EMBEDDING_WIDTH = 100  # dimensionality of the word vector

# --- Training schedule ---
BATCH_SIZE = 256
EPOCHS = 32

In [9]:
# Open and read the whole corpus. The context manager guarantees the file
# handle is closed even if reading or decoding raises.
with open(INPUT_FILE_NAME, 'r', encoding='utf-8') as file:
    full_text = file.read()

# Make lower case and split into individual words; from here on `full_text`
# is a list of word strings rather than one long string.
full_text = text.text_to_word_sequence(full_text)

# Create training examples: slide a WINDOW_LENGTH-word window over the corpus
# in steps of WINDOW_STEP. Each window is one input fragment and the word
# immediately following it is the target to predict.
window_starts = range(0, len(full_text) - WINDOW_LENGTH, WINDOW_STEP)
fragments = [full_text[start:start + WINDOW_LENGTH] for start in window_starts]
targets = [full_text[start + WINDOW_LENGTH] for start in window_starts]

# Fail early with a clear message instead of an obscure shape error later on.
if not fragments:
    raise ValueError(
        f"{INPUT_FILE_NAME!r} contains {len(full_text)} words, which is too few "
        f"to build a single training example with WINDOW_LENGTH={WINDOW_LENGTH}"
    )

In [14]:
# Convert words to integer indices. The tokenizer is limited to MAX_WORDS
# entries and is given 'UNK' as its out-of-vocabulary token.
tokenizer = text.Tokenizer(num_words=MAX_WORDS, oov_token='UNK')
tokenizer.fit_on_texts(full_text)
fragments_indexed = tokenizer.texts_to_sequences(fragments)  # convert words to indices
targets_indexed = tokenizer.texts_to_sequences(targets)  # target word indices, one-hot encoded below

# Convert to appropriate input and output formats.
X = np.array(fragments_indexed, dtype=np.int32)
# Allocate the dense one-hot matrix as float32 rather than NumPy's float64
# default: this halves the memory of a (num_examples x MAX_WORDS) array and
# matches Keras's default float type.
y = np.zeros((len(targets_indexed), MAX_WORDS), dtype=np.float32)
for row, target_index in enumerate(targets_indexed):  # one-hot encode the outputs
    # target_index is the list of indices returned for this target word.
    y[row, target_index] = 1

In [16]:
# Build and compile the model: an embedding layer, two stacked LSTM layers,
# a fully connected ReLU layer and a softmax output over MAX_WORDS classes.
training_model = Sequential()

# Layers are listed in stacking order and added to the model in that order.
model_layers = [
    layers.Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_WIDTH,
        mask_zero=True,
        input_length=None,  # input_length=None: accept variable-length sequences
    ),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    layers.Dense(128, activation='relu'),
    layers.Dense(MAX_WORDS, activation='softmax'),
]
for layer in model_layers:
    training_model.add(layer)

training_model.compile(optimizer='adam', loss='categorical_crossentropy')
training_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 lstm (LSTM)                 (None, None, 128)         117248    
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 10000)             1290000   
                                                                 
Total params: 2,555,344
Trainable params: 2,555,344
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the model on the indexed fragments (X) and one-hot targets (y).
# The training options are collected in one dict so they can be read at a
# glance; `history` holds the object returned by fit().
fit_options = dict(
    validation_split=0.05,  # fraction of the data passed to Keras as validation split
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    shuffle=True,
)
history = training_model.fit(X, y, **fit_options)