# 預習
## 1. Keras LSTM Seq2Seq
> http://www.zmonster.me/2016/05/29/sequence_to_sequence_with_keras.html

Encoder 只在序列結束時輸出一個語義向量，所以其"return_sequences" 參數設置為"False"

### 讀取txt資料
> https://stackoverflow.com/questions/21546739/load-data-from-txt-with-pandas

In [2]:
import pandas as pd
# 中英文的間距是一個tab
DF = pd.read_csv('cmn.txt', sep="	", header=None)
DF.columns = ['en', 'zh']
DF.head()

Unnamed: 0,en,zh
0,Hi.,嗨。
1,Hi.,你好。
2,Run.,你用跑的。
3,Wait!,等等！
4,Hello!,你好。


In [4]:
DF.isnull().sum()

en    0
zh    0
dtype: int64

#### 另一種讀txt取文字的方式，順便建立字典
> https://ithelp.ithome.com.tw/articles/10194403

In [1]:
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

# 設定開啟txt的方式
lines = open('cmn.txt', encoding ='utf8').read().split('\n')

for line in lines[: min(num_samples, len(lines) - 1)]:
    # 切割txt中的每一行英文與中文的間格
    input_text, target_text = line.split('\t')
    target_text = '\t' + target_text + '\n'
    
    # 分別存英文與中文成list
    input_texts.append(input_text)
    target_texts.append(target_text)
    
    # 切割'字元'存起來當字典
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [2]:
# 字典排序            
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# 計算編碼器、解碼器的最大長度
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

In [6]:
print(max_encoder_seq_length)
print(max_decoder_seq_length)

31
22


In [3]:
# 以dict儲存字典單字及序號
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

In [46]:
print(len(input_token_index))
print(len(target_token_index))

73
2167


In [4]:
# 設定編碼器、解碼器input起始值(均為0矩陣)
encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length), dtype='float32')
decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length), dtype='float32')
decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

In [5]:
# 設定 encoder_input、decoder_input對應的順序    
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

### tokenize/ Make Vocabulary 

**中文跟英文的應該要分開**

In [8]:
from keras.preprocessing.text import Tokenizer

def vocab_creater(text_lists):

  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(text_lists)
  dictionary = tokenizer.word_index
  
  word2idx = {}
  idx2word = {}
  for k, v in dictionary.items():
          word2idx[k] = v
          idx2word[v] = k
          
  return word2idx, idx2word

word2idx, idx2word = vocab_creater(text_lists=DF['en'].values + DF['zh'].values)

In [15]:
def text2seq(encoder_text, decoder_text):

  tokenizer = Tokenizer()
  encoder_sequences = tokenizer.texts_to_sequences(encoder_text)
  decoder_sequences = tokenizer.texts_to_sequences(decoder_text)
  
  return encoder_sequences, decoder_sequences

encoder_sequences, decoder_sequences = text2seq(DF['en'].values, DF['zh'].values) 

In [17]:
encoder_sequences[0]

[]

### 建立模型
> https://towardsdatascience.com/how-to-implement-seq2seq-lstm-model-in-keras-shortcutnlp-6f355f3e5639

> https://towardsdatascience.com/word-level-english-to-marathi-neural-machine-translation-using-seq2seq-encoder-decoder-lstm-model-1a913f2dc4a7

#### 做shared embedding
> https://stackoverflow.com/questions/49477097/keras-seq2seq-word-embedding

#### return_seq and reture_state
> https://blog.csdn.net/u011327333/article/details/78501054

In [13]:
from keras.models import Model, Input
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import CuDNNLSTM, LSTM, TimeDistributed
from keras.layers.embeddings import Embedding

HIDDEN_DIM=50
    
# encoder_inputs = Input(shape=(max_encoder_seq_length, ), name="Encoder_input")
encoder_inputs = Input(shape=(None, ), name="Encoder_input")

# encoder_embedding = Embedding(len(input_token_index), 40, input_length=max_encoder_seq_length)
# encoder_embedding = Embedding(len(input_token_index), 40)
encoder_embedding = Embedding(num_encoder_tokens, 40)(encoder_inputs)

encoder_LSTM = CuDNNLSTM(HIDDEN_DIM, return_state=True, name="Encoder_LSTM")
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)
encoder_states = [state_h, state_c]

# decoder_inputs = Input(shape=(max_decoder_seq_length, ), name="Decoder_input")
decoder_inputs = Input(shape=(None, ), name="Decoder_input")

# decoder_embedding = Embedding(len(target_token_index), 40, input_length=max_decoder_seq_length)
# decoder_embedding =  Embedding(len(target_token_index), 40)
decoder_embedding = Embedding(num_decoder_tokens, 40)

decoder_embedding_final = decoder_embedding(decoder_inputs)
decoder_LSTM = CuDNNLSTM(HIDDEN_DIM, return_state=True, return_sequences=True, name="Decoder_LSTM")
decoder_outputs, _, _ = decoder_LSTM(decoder_embedding_final, initial_state=[state_h, state_c])

dense_layer = Dense(num_decoder_tokens, activation='softmax', name="Dense")
outputs = dense_layer(decoder_outputs)
# outputs = TimeDistributed(Dense(num_decoder_tokens, activation='softmax'))(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], outputs)
    
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 40)     2920        Encoder_input[0][0]              
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 40)     86680       Decoder_input[0][0]              
__________________________________________________________________________________________________
Encoder_LS

In [14]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=300, epochs=40, validation_split=0.2, verbose=2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/20
 - 3s - loss: 2.1527 - acc: 0.0601 - val_loss: 2.5232 - val_acc: 0.0640
Epoch 2/20
 - 3s - loss: 1.9444 - acc: 0.0674 - val_loss: 2.4736 - val_acc: 0.0702
Epoch 3/20
 - 3s - loss: 1.9060 - acc: 0.0696 - val_loss: 2.4511 - val_acc: 0.0688
Epoch 4/20
 - 3s - loss: 1.8902 - acc: 0.0701 - val_loss: 2.4441 - val_acc: 0.0709
Epoch 5/20
 - 3s - loss: 1.8812 - acc: 0.0724 - val_loss: 2.4455 - val_acc: 0.0716
Epoch 6/20
 - 3s - loss: 1.8739 - acc: 0.0726 - val_loss: 2.4362 - val_acc: 0.0728
Epoch 7/20
 - 3s - loss: 1.8687 - acc: 0.0728 - val_loss: 2.4290 - val_acc: 0.0720
Epoch 8/20
 - 3s - loss: 1.8650 - acc: 0.0731 - val_loss: 2.4303 - val_acc: 0.0729
Epoch 9/20
 - 3s - loss: 1.8620 - acc: 0.0728 - val_loss: 2.4310 - val_acc: 0.0730
Epoch 10/20
 - 3s - loss: 1.8591 - acc: 0.0730 - val_loss: 2.4253 - val_acc: 0.0720
Epoch 11/20
 - 3s - loss: 1.8570 - acc: 0.0729 - val_loss: 2.4237 - val_acc: 0.0726
Epoch 12/20
 - 3s - loss: 1.8549 - ac

<keras.callbacks.History at 0x2505f73d320>

### 預測(翻譯)

In [15]:
# 定義編碼器取樣模型
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder_input (InputLayer)   (None, None)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, None, 40)          2920      
_________________________________________________________________
Encoder_LSTM (CuDNNLSTM)     [(None, 50), (None, 50),  18400     
Total params: 21,320
Trainable params: 21,320
Non-trainable params: 0
_________________________________________________________________


In [16]:
# 定義解碼器的input
decoder_state_input_h = Input(shape=(HIDDEN_DIM,))
decoder_state_input_c = Input(shape=(HIDDEN_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# 做embedding
decoder_embedding_final2 = decoder_embedding(decoder_inputs)

# 定義解碼器 LSTM 模型
decoder_outputs2, state_h2, state_c2 = decoder_LSTM(decoder_embedding_final2, initial_state=decoder_states_inputs)
# decoder_outputs2, state_h2, state_c2 = decoder_LSTM(decoder_inputs, initial_state=decoder_states_inputs)

# 以編碼器的記憶狀態 h 及 c 為解碼器的記憶狀態  
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = dense_layer(decoder_outputs2)
# decoder_model = Model([decoder_embedding] + decoder_states_inputs, [decoder_outputs] + decoder_states)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 40)     86680       Decoder_input[0][0]              
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
Decoder_LS

In [17]:
# 建立反向的 dict，才能透過查詢將數值轉回文字
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [20]:
# 模型預測，並取得翻譯結果(中文)    
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    
    # Generate empty target sequence of length 1.
#     target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq = np.zeros((1, 1))
    
    # Populate the first character of target sequence with the start character.
#     target_seq[0, 0, target_token_index['\t']] = 1.
    target_seq[0, 0] = 1.
    
    # Sampling loop for a batch of sequences (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [21]:
for seq_index in range(10):
    # Take one sequence (part of the training test) for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('*')
    print('Input sentence:', input_texts[seq_index])
    try:
        print('Decoded sentence:', decoded_sentence)
    except:
        # 出現亂碼，以?取代
        print('Decoded sentence:', decoded_sentence.encode('ascii', 'replace'))
        #print("error:", sys.exc_info()[0])

*
Input sentence: Hi.
Decoded sentence: 我。。

*
Input sentence: Hi.
Decoded sentence: 我。。

*
Input sentence: Run.
Decoded sentence: 我。。

*
Input sentence: Wait!
Decoded sentence: 我。。

*
Input sentence: Hello!
Decoded sentence: 我。。

*
Input sentence: I try.
Decoded sentence: 我。。

*
Input sentence: I won!
Decoded sentence: 我。。

*
Input sentence: Oh no!
Decoded sentence: 我。。

*
Input sentence: Cheers!
Decoded sentence: 我。。

*
Input sentence: He ran.
Decoded sentence: 我。。

