### 讀資料&切字典

In [3]:
num_samples = 10000

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

# 設定開啟txt的方式
lines = open('./Preview/cmn.txt', encoding ='utf8').read().split('\n')

for line in lines[: min(num_samples, len(lines) - 1)]:
    # 切割txt中的每一行英文與中文的間格
    input_text, target_text = line.split('\t')
    target_text = '\t' + target_text + '\n'
    
    # 分別存英文與中文成list
    input_texts.append(input_text)
    target_texts.append(target_text)
    
    # 切割字元存起來當字典
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

### 計算字詞數量

In [4]:
# 字典排序            
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# 計算編碼器、解碼器的最大長度
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

In [5]:
# 以dict儲存字典單字及序號
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

### 處理訓練資料

In [6]:
# 設定編碼器、解碼器input起始值(均為0矩陣)
encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length), dtype='float32')
decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length), dtype='float32')
decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

In [7]:
# 設定 encoder_input、decoder_input對應的順序    
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

### Custom Attention Layer
> https://github.com/neonbjb/ml-notebooks/blob/master/keras-seq2seq-with-attention/keras_translate_notebook.ipynb

In [23]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K
from tensorflow.keras.layers import LSTMCell, RNN

# RNN "Cell" classes in Keras perform the actual data transformations at each timestep. Therefore, in order
# to add attention to LSTM, we need to make a custom subclass of LSTMCell.
class AttentionLSTMCell(LSTMCell):
    def __init__(self, **kwargs):
        self.attentionMode = False
        super(AttentionLSTMCell, self).__init__(**kwargs)
    
    # Build is called to initialize the variables that our cell will use. We will let other Keras
    # classes (e.g. "Dense") actually initialize these variables.
    @tf_utils.shape_type_conversion
    def build(self, input_shape):        
        # Converts the input sequence into a sequence which can be matched up to the internal
        # hidden state.
        self.dense_constant = TimeDistributed(Dense(self.units, name="AttLstmInternal_DenseConstant"))
        
        # Transforms the internal hidden state into something that can be used by the attention
        # mechanism.
        self.dense_state = Dense(self.units, name="AttLstmInternal_DenseState")
        
        # Transforms the combined hidden state and converted input sequence into a vector of
        # probabilities for attention.
        self.dense_transform = Dense(1, name="AttLstmInternal_DenseTransform")
        
        # We will augment the input into LSTMCell by concatenating the context vector. Modify
        # input_shape to reflect this.
        batch, input_dim = input_shape[0]
        batch, timesteps, context_size = input_shape[-1]
        lstm_input = (batch, input_dim + context_size)
        
        # The LSTMCell superclass expects no constant input, so strip that out.
        return super(AttentionLSTMCell, self).build(lstm_input)
    
    # This must be called before call(). The "input sequence" is the output from the 
    # encoder. This function will do some pre-processing on that sequence which will
    # then be used in subsequent calls.
    def setInputSequence(self, input_seq):
        self.input_seq = input_seq
        self.input_seq_shaped = self.dense_constant(input_seq)
        self.timesteps = tf.shape(self.input_seq)[-2]
    
    # This is a utility method to adjust the output of this cell. When attention mode is
    # turned on, the cell outputs attention probability vectors across the input sequence.
    def setAttentionMode(self, mode_on=False):
        self.attentionMode = mode_on
        
    # This method sets up the computational graph for the cell. It implements the actual logic
    # that the model follows.
    def call(self, inputs, states, constants):
        # Separate the state list into the two discrete state vectors.
        # ytm is the "memory state", stm is the "carry state".
        ytm, stm = states
        # We will use the "carry state" to guide the attention mechanism. Repeat it across all
        # input timesteps to perform some calculations on it.
        stm_repeated = K.repeat(self.dense_state(stm), self.timesteps)
        # Now apply our "dense_transform" operation on the sum of our transformed "carry state" 
        # and all encoder states. This will squash the resultant sum down to a vector of size
        # [batch,timesteps,1]
        # Note: Most sources I encounter use tanh for the activation here. I have found with this dataset
        # and this model, relu seems to perform better. It makes the attention mechanism far more crisp
        # and produces better translation performance, especially with respect to proper sentence termination.
        combined_stm_input = self.dense_transform(
            keras.activations.relu(stm_repeated + self.input_seq_shaped))
        # Performing a softmax generates a log probability for each encoder output to receive attention.
        score_vector = keras.activations.softmax(combined_stm_input, 1)
        # In this implementation, we grant "partial attention" to each encoder output based on 
        # it's log probability accumulated above. Other options would be to only give attention
        # to the highest probability encoder output or some similar set.
        context_vector = K.sum(score_vector * self.input_seq, 1)
        
        # Finally, mutate the input vector. It will now contain the traditional inputs (like the seq2seq
        # we trained above) in addition to the attention context vector we calculated earlier in this method.
        inputs = K.concatenate([inputs, context_vector])
        
        # Call into the super-class to invoke the LSTM math.
        res = super(AttentionLSTMCell, self).call(inputs=inputs, states=states)
        
        # This if statement switches the return value of this method if "attentionMode" is turned on.
        if(self.attentionMode):
            return (K.reshape(score_vector, (-1, self.timesteps)), res[1])
        else:
            return res
        
# Custom implementation of the Keras LSTM that adds an attention mechanism.
# This is implemented by taking an additional input (using the "constants" of the
# RNN class) into the LSTM: The encoder output vectors across the entire input sequence.
class LSTMWithAttention(RNN):
    def __init__(self, units, **kwargs):
        cell = AttentionLSTMCell(units=units)
        self.units = units
        super(LSTMWithAttention, self).__init__(cell, **kwargs)
        
    @tf_utils.shape_type_conversion
    def build(self, input_shape):
        self.input_dim = input_shape[0][-1]
        self.timesteps = input_shape[0][-2]
        return super(LSTMWithAttention, self).build(input_shape) 
    
    # This call is invoked with the entire time sequence. The RNN sub-class is responsible
    # for breaking this up into calls into the cell for each step.
    # The "constants" variable is the key to our implementation. It was specifically added
    # to Keras to accomodate the "attention" mechanism we are implementing.
    def call(self, x, constants, **kwargs):
        if isinstance(x, list):
            self.x_initial = x[0]
        else:
            self.x_initial = x
        
        # The only difference in the LSTM computational graph really comes from the custom
        # LSTM Cell that we utilize.
        self.cell._dropout_mask = None
        self.cell._recurrent_dropout_mask = None
        self.cell.setInputSequence(constants[0])
        return super(LSTMWithAttention, self).call(inputs=x, constants=constants, **kwargs)

### 模型Enocder-Decoder

In [26]:
from keras.models import Model, Input
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import CuDNNLSTM, LSTM, TimeDistributed
from keras.layers.embeddings import Embedding

HIDDEN_DIM=50
    
encoder_inputs = Input(shape=(None, ), name="Encoder_input")
encoder_embedding = Embedding(num_encoder_tokens, 40)(encoder_inputs)
encoder_LSTM = CuDNNLSTM(HIDDEN_DIM, return_state=True, name="Encoder_LSTM")
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, ), name="Decoder_input")

decoder_embedding = Embedding(num_decoder_tokens, 40)

decoder_embedding_final = decoder_embedding(decoder_inputs)
decoder_att_lstm = LSTMWithAttention(units=HIDDEN_DIM, return_sequences=True, return_state=True)
# decoder_LSTM = CuDNNLSTM(HIDDEN_DIM, return_state=True, return_sequences=True, name="Decoder_LSTM")
decoder_outputs, _, _ = decoder_att_lstm(inputs=decoder_embedding_final, constants=encoder_outputs, initial_state=[state_h, state_c])
# decoder_outputs, _, _ = decoder_LSTM(decoder_embedding_final, initial_state=[state_h, state_c])

dense_layer = Dense(num_decoder_tokens, activation='softmax', name="Dense")
# outputs = dense_layer(decoder_outputs)
outputs = TimeDistributed(dense_layer)(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], outputs)
    
model.summary()

TypeError: cannot unpack non-iterable NoneType object

In [27]:
vocab_in_size = num_encoder_tokens
vocab_out_size = num_decoder_tokens
embedding_dim = 40
units = 50

attenc_inputs = Input(shape=(None,), name="attenc_inputs")
attenc_emb = Embedding(input_dim=vocab_in_size, output_dim=embedding_dim)
attenc_lstm = CuDNNLSTM(units=units, return_sequences=True, return_state=True)
attenc_outputs, attstate_h, attstate_c = attenc_lstm(attenc_emb(attenc_inputs))
attenc_states = [attstate_h, attstate_c]

attdec_inputs = Input(shape=(None,))
attdec_emb = Embedding(input_dim=vocab_out_size, output_dim=embedding_dim)
attdec_lstm = LSTMWithAttention(units=units, return_sequences=True, return_state=True)
# Note that the only real difference here is that we are feeding attenc_outputs to the decoder now.
attdec_lstm_out, _, _ = attdec_lstm(inputs=attdec_emb(attdec_inputs), 
                                    constants=attenc_outputs, 
                                    initial_state=attenc_states)
attdec_d1 = Dense(units, activation="relu")
attdec_d2 = Dense(vocab_out_size, activation="softmax")
attdec_out = attdec_d2(Dropout(rate=.4)(attdec_d1(Dropout(rate=.4)(attdec_lstm_out))))

attmodel = Model([attenc_inputs, attdec_inputs], attdec_out)
attmodel.compile(optimizer=tf.train.AdamOptimizer(), loss="sparse_categorical_crossentropy", metrics=['sparse_categorical_accuracy'])

TypeError: cannot unpack non-iterable NoneType object

In [20]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=30, epochs=10, validation_split=0.1, verbose=2)

Train on 9000 samples, validate on 1000 samples
Epoch 1/30
 - 4s - loss: 2.1928 - acc: 0.0564 - val_loss: 2.5394 - val_acc: 0.0646
Epoch 2/30
 - 3s - loss: 1.9894 - acc: 0.0664 - val_loss: 2.4957 - val_acc: 0.0706
Epoch 3/30
 - 3s - loss: 1.9589 - acc: 0.0689 - val_loss: 2.4828 - val_acc: 0.0696
Epoch 4/30
 - 3s - loss: 1.9449 - acc: 0.0685 - val_loss: 2.4752 - val_acc: 0.0680
Epoch 5/30
 - 3s - loss: 1.9362 - acc: 0.0710 - val_loss: 2.4694 - val_acc: 0.0729
Epoch 6/30
 - 3s - loss: 1.9298 - acc: 0.0722 - val_loss: 2.4669 - val_acc: 0.0726
Epoch 7/30
 - 3s - loss: 1.9249 - acc: 0.0721 - val_loss: 2.4649 - val_acc: 0.0729
Epoch 8/30
 - 3s - loss: 1.9211 - acc: 0.0720 - val_loss: 2.4644 - val_acc: 0.0739
Epoch 9/30
 - 3s - loss: 1.9179 - acc: 0.0722 - val_loss: 2.4632 - val_acc: 0.0739
Epoch 10/30
 - 3s - loss: 1.9155 - acc: 0.0723 - val_loss: 2.4614 - val_acc: 0.0735
Epoch 11/30
 - 3s - loss: 1.9125 - acc: 0.0724 - val_loss: 2.4592 - val_acc: 0.0740
Epoch 12/30
 - 3s - loss: 1.9103 - ac

<keras.callbacks.History at 0x1d363642d68>

In [None]:
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Generate a new set of tensors for our new inference decoder. Note that we are using new tensors, 
# this does not preclude using the same underlying layers that we trained on. (e.g. weights/biases).
inf_decoder_inputs = Input(shape=(None,), name="inf_decoder_inputs")
# We'll need to force feed the two state variables into the decoder each step.
state_input_h = Input(shape=(units,), name="state_input_h")
state_input_c = Input(shape=(units,), name="state_input_c")
decoder_res, decoder_h, decoder_c = decoder_lstm(
    decoder_emb(inf_decoder_inputs), 
    initial_state=[state_input_h, state_input_c])
inf_decoder_out = decoder_d2(decoder_d1(decoder_res))
inf_model = Model(inputs=[inf_decoder_inputs, state_input_h, state_input_c], 
                  outputs=[inf_decoder_out, decoder_h, decoder_c])

### 預測

In [14]:
# 定義編碼器取樣模型
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder_input (InputLayer)   (None, None)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, None, 40)          2920      
_________________________________________________________________
Encoder_LSTM (CuDNNLSTM)     [(None, 50), (None, 50),  18400     
Total params: 21,320
Trainable params: 21,320
Non-trainable params: 0
_________________________________________________________________


In [15]:
# 定義解碼器的input
state_input_h = Input(shape=(units,), name="state_input_h")
state_input_c = Input(shape=(units,), name="state_input_c")
attenc_seq_out = Input(shape=attenc_outputs.get_shape()[1:], name="attenc_seq_out")
inf_attdec_inputs = Input(shape=(None,), name="inf_attdec_inputs")
attdec_lstm.cell.setAttentionMode(True)

# 做embedding
decoder_embedding_final2 = decoder_embedding(decoder_inputs)

# 定義解碼器 LSTM 模型
# decoder_outputs2, state_h2, state_c2 = decoder_LSTM(decoder_embedding_final2, initial_state=decoder_states_inputs)
attdec_res, attdec_h, attdec_c = attdec_lstm(decoder_embedding_final2, 
                                                initial_state=[state_input_h, state_input_c], 
                                                constants=attenc_seq_out)

# 以編碼器的記憶狀態 h 及 c 為解碼器的記憶狀態  
# decoder_states2 = [state_h2, state_c2]
# decoder_outputs2 = dense_layer(decoder_outputs2)
# decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)
decoder_model = Model(inputs=[inf_attdec_inputs, state_input_h, state_input_c, attenc_seq_out], outputs=[attdec_res, attdec_h, attdec_c])
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 40)     86680       Decoder_input[0][0]              
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
Decoder_LS

In [16]:
# 建立反向的 dict，才能透過查詢將數值轉回文字
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

### 翻譯

In [None]:
def sentence_to_vector(sentence, lang):
    pre = preprocess_sentence(sentence)
    vec = np.zeros(len_input)
    sentence_list = [lang.word2idx[s] for s in pre.split(' ')]
    for i,w in enumerate(sentence_list):
        vec[i] = w
    return vec

def translate(input_sentence, infenc_model, infmodel, attention=False):
    sv = sentence_to_vector(input_sentence, input_lang)
    # Reshape so we can use the encoder model. New shape=[samples,sequence length]
    sv = sv.reshape(1,len(sv))
    [emb_out, sh, sc] = infenc_model.predict(x=sv)
    
    i = 0
    start_vec = target_lang.word2idx["<start>"]
    stop_vec = target_lang.word2idx["<end>"]
    # We will continuously feed cur_vec as an input into the decoder to produce the next word,
    # which will be assigned to cur_vec. Start it with "<start>".
    cur_vec = np.zeros((1,1))
    cur_vec[0,0] = start_vec
    cur_word = "<start>"
    output_sentence = ""
    # Start doing the feeding. Terminate when the model predicts an "<end>" or we reach the end
    # of the max target language sentence length.
    while cur_word != "<end>" and i < (len_target-1):
        i += 1
        if cur_word != "<start>":
            output_sentence = output_sentence + " " + cur_word
        x_in = [cur_vec, sh, sc]
        # This will allow us to accomodate attention models, which we will talk about later.
        if attention:
            x_in += [emb_out]
        [nvec, sh, sc] = infmodel.predict(x=x_in)
        # The output of the model is a massive softmax vector with one spot for every possible word. Convert
        # it to a word ID using argmax().
        cur_vec[0,0] = np.argmax(nvec[0,0])
        cur_word = target_lang.idx2word[np.argmax(nvec[0,0])]
    return output_sentence

In [None]:
translate("Hi.", encoder_model, inf_model)

In [17]:
# # 模型預測，並取得翻譯結果(中文)    
# def decode_sequence(input_seq):
#     # Encode the input as state vectors.
#     states_value = encoder_model.predict(input_seq)
    
#     # Generate empty target sequence of length 1.
#     target_seq = np.zeros((1, 1))
    
#     # Populate the first character of target sequence with the start character.
#     target_seq[0, 0] = 1.
    
#     # Sampling loop for a batch of sequences (to simplify, here we assume a batch of size 1).
#     stop_condition = False
#     decoded_sentence = ''
#     while not stop_condition:
#         output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

#         # Sample a token
#         sampled_token_index = np.argmax(output_tokens[0, -1, :])
#         sampled_char = reverse_target_char_index[sampled_token_index]
#         decoded_sentence += sampled_char

#         # Exit condition: either hit max length or find stop character.
#         if (sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length):
#             stop_condition = True

#         # Update the target sequence (of length 1).
#         target_seq = np.zeros((1, 1))
#         target_seq[0, 0] = 1.

#         # Update states
#         states_value = [h, c]

#     return decoded_sentence

In [18]:
# for seq_index in range(30):
#     # Take one sequence (part of the training test) for trying out decoding.
#     input_seq = encoder_input_data[seq_index: seq_index + 1]
#     decoded_sentence = decode_sequence(input_seq)
#     print('*')
#     print('Input sentence:', input_texts[seq_index])
#     try:
#         print('Decoded sentence:', decoded_sentence)
#     except:
#         # 出現亂碼，以?取代
#         print('Decoded sentence:', decoded_sentence.encode('ascii', 'replace'))
#         #print("error:", sys.exc_info()[0])

*
Input sentence: Hi.
Decoded sentence: 我。。。

*
Input sentence: Hi.
Decoded sentence: 我。。。

*
Input sentence: Run.
Decoded sentence: 我。。。

*
Input sentence: Wait!
Decoded sentence: 我。。。

*
Input sentence: Hello!
Decoded sentence: 我。。。

*
Input sentence: I try.
Decoded sentence: 我。。。

*
Input sentence: I won!
Decoded sentence: 我。。。

*
Input sentence: Oh no!
Decoded sentence: 我。。。

*
Input sentence: Cheers!
Decoded sentence: 我姆。。

*
Input sentence: He ran.
Decoded sentence: 我姆。。

*
Input sentence: Hop in.
Decoded sentence: 我姆。。

*
Input sentence: I lost.
Decoded sentence: 我姆。。

*
Input sentence: I quit.
Decoded sentence: 我姆。。

*
Input sentence: I'm OK.
Decoded sentence: 我姆。。

*
Input sentence: Listen.
Decoded sentence: 我姆。。

*
Input sentence: No way!
Decoded sentence: 我姆。。

*
Input sentence: No way!
Decoded sentence: 我姆。。

*
Input sentence: Really?
Decoded sentence: 我姆。。

*
Input sentence: Try it.
Decoded sentence: 我姆。。

*
Input sentence: We try.
Decoded sentence: 我姆。。

*
Input sentence: