In [49]:
with open('deu.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
print("文档有 {} 行。".format(len(lines)))
num_samples = 2000 # 使用的语料行数
lines_to_use = lines[ : min(num_samples, len(lines)-1)]
print(lines_to_use)


文档有 176693 行。
['Hi.\tHallo!', 'Hi.\tGrüß Gott!', 'Run!\tLauf!', 'Wow!\tPotzdonner!', 'Wow!\tDonnerwetter!', 'Fire!\tFeuer!', 'Help!\tHilfe!', 'Help!\tZu Hülf!', 'Stop!\tStopp!', 'Wait!\tWarte!', 'Go on.\tMach weiter.', 'Hello!\tHallo!', 'I ran.\tIch rannte.', 'I see.\tIch verstehe.', 'I see.\tAha.', 'I try.\tIch probiere es.', 'I won!\tIch hab gewonnen!', 'I won!\tIch habe gewonnen!', 'Smile.\tLächeln!', 'Cheers!\tZum Wohl!', 'Freeze!\tKeine Bewegung!', 'Freeze!\tStehenbleiben!', 'Got it?\tKapiert?', 'Got it?\tVerstanden?', 'Got it?\tEinverstanden?', 'He ran.\tEr rannte.', 'He ran.\tEr lief.', 'Hop in.\tMach mit!', 'Hug me.\tDrück mich!', 'Hug me.\tNimm mich in den Arm!', 'Hug me.\tUmarme mich!', 'I fell.\tIch fiel.', 'I fell.\tIch fiel hin.', 'I fell.\tIch stürzte.', 'I fell.\tIch bin hingefallen.', 'I fell.\tIch bin gestürzt.', 'I know.\tIch weiß.', 'I lied.\tIch habe gelogen.', 'I lost.\tIch habe verloren.', 'I paid.\tIch habe bezahlt.', 'I paid.\tIch zahlte.', 'I swim.\tIch schwimm

In [50]:
import re
# print(lines_to_use[19516])
for i in range(len(lines_to_use)):
    lines_to_use[i] = re.sub('\d', ' _NUMBER_ ', lines_to_use[i])
    # 用 ' _NUMBER_ ' 替换 数字（\d）
# print(lines_to_use[19516])
input_texts = []
target_texts = []
input_words = set()
target_words = set()
for line in lines_to_use:
    x, y = line.split('\t')
    y = 'BEGIN_ ' + y + ' _END'
    input_texts.append(x)
    target_texts.append(y)
    for word in x.split():
        if word not in input_words:
            input_words.add(word)
    for word in y.split():
        if word not in target_words:
            target_words.add(word)

In [51]:
max_input_seq_len = max([len(seq.split()) for seq in input_texts])
# 11
max_target_seq_len = max([len(seq.split()) for seq in target_texts])
# 15

input_words = sorted(list(input_words))
target_words = sorted(list(target_words))
num_encoder_tokens = len(input_words) # 5724
num_decoder_tokens = len(target_words) # 9126

In [52]:
inputToken_idx = {token : i for (i, token) in enumerate(input_words)}
outputToken_idx = {token : i for (i, token) in enumerate(target_words)}
idx_inputToken = {i : token for (i, token) in enumerate(input_words)}
idx_outputToken = {i : token for (i, token) in enumerate(target_words)}

In [53]:
import numpy as np
encoder_input_data = np.zeros(
    (len(input_texts), max_input_seq_len),
    # 句子数量，         最大输入句子长度
    dtype=np.float32
)

decoder_input_data = np.zeros(
    (len(target_texts), max_target_seq_len),
    # 句子数量，          最大输出句子长度
    dtype=np.float32
)

decoder_output_data = np.zeros(
    (len(target_texts), max_target_seq_len, num_decoder_tokens),
    # 句子数量，          最大输出句子长度,      输出 tokens ids 个数
    dtype=np.float32
)

In [54]:
for i,(input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, word in enumerate(input_text.split()):
        encoder_input_data[i, t] = inputToken_idx[word]
    for t, word in enumerate(target_text.split()):
        decoder_input_data[i, t] = outputToken_idx[word]
        if t > 0:
            # 解码器的输出比输入提前一个时间步
            decoder_output_data[i, t-1, outputToken_idx[word]] = 1.

In [55]:
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

embedding_size = 50 # 嵌入维度

# 编码器
encoder_inputs = Input(shape=(None,))
encoder_after_embedding = Embedding(input_dim=num_encoder_tokens, # 单词个数
                                    output_dim=embedding_size)(encoder_inputs)
encoder_lstm = LSTM(units=50, return_state=True)
# return_state: Boolean. Whether to return
#   the last state in addition to the output.
_, state_h, state_c = encoder_lstm(encoder_after_embedding)
encoder_states = [state_h, state_c]

# 解码器
decoder_inputs = Input(shape=(None,))
decoder_after_embedding = Embedding(input_dim=num_decoder_tokens, # 单词个数
                                    output_dim=embedding_size)(decoder_inputs)
decoder_lstm = LSTM(units=50, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_after_embedding,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [56]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "functional_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 50)     47150       input_5[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 50)     79350       input_6[0][0]                    
______________________________________________________________________________________

In [57]:
history = model.fit(x=[encoder_input_data, decoder_input_data],y=decoder_output_data,
          batch_size=128,epochs=20,validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [58]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(decoder_after_embedding,
                                                            initial_state=decoder_states_inputs)
decoder_states_inf = [state_h_inf, state_c_inf]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

In [59]:
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states_inf
)

In [80]:
# 采样
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = outputToken_idx['BEGIN_']
    stop = False
    decoded_sentence = ''
    while not stop:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
        sampled_token_idx = np.argmax(output_tokens)
        sampled_word = idx_outputToken[sampled_token_idx]
        decoded_sentence += ' ' + sampled_word

        if sampled_word == '_END' or len(decoded_sentence) > 60:
            stop = True
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_idx
        # Update states
        states_value = [h, c]

    return decoded_sentence

# 简单测试 采样
text_to_translate = 'Where is my Birds ?'
encoder_input_to_translate = np.zeros(
    (1, max_input_seq_len),
    dtype=np.float32)
for t, word in enumerate(text_to_translate.split()):
    encoder_input_to_translate[0, t] = inputToken_idx[word]

print(decode_sequence(encoder_input_to_translate))

 Ich Ich _END
