In [1]:
import sys
import pandas as pd
import numpy as np
import json
from opencc import OpenCC
from keras.layers import Input, LSTM, Dense, merge,concatenate
from tensorflow.keras.optimizers import Adam, SGD
from keras.models import Model,load_model
from tensorflow.keras.utils import plot_model
from keras.models import Sequential

### 可由此開始跳過到最下面

In [None]:
# File names of the raw training and validation corpora.
train_file, valid_file = (
    'translation2019zh_train.json',
    'translation2019zh_valid.json',
)

In [None]:
# Load the first 1/1000 of the training corpus; every line of the file is
# parsed as one JSON document.
# The file is read a single time inside a context manager so the handle is
# always closed, even if reading fails.
with open(train_file, encoding="utf-8") as train_contents:
    train_lines = train_contents.readlines()
train_len = len(train_lines) / 1000
# int(train_len) is the number of leading lines to keep.
train_data = [json.loads(train_content) for train_content in train_lines[:int(train_len)]]

In [None]:
# Display how many training records were kept after sub-sampling.
len(train_data)

In [None]:
# Load the whole validation corpus; every line is parsed as one JSON document.
# Iterating the handle inside `with` streams the file line by line and
# guarantees the handle is closed afterwards.
with open(valid_file, encoding="utf-8") as valid_contents:
    valid_data = [json.loads(valid_content) for valid_content in valid_contents]

In [4]:
# OpenCC converter built from the 's2t' preset
# (OpenCC's Simplified-Chinese -> Traditional-Chinese configuration).
cc = OpenCC('s2t')

In [5]:
# Run every training record's 'chinese' field through the converter, in place.
for record in train_data:
    converted = cc.convert(record['chinese'])
    record['chinese'] = converted

In [None]:
# Run every validation record's 'chinese' field through the converter, in place.
for record in valid_data:
    converted = cc.convert(record['chinese'])
    record['chinese'] = converted

In [6]:
# Dump the training pairs as "<english>\t<chinese>\n" lines.
txt_file = '1by1000_train_data_cc.txt'
# The context manager flushes and closes the file, so the complete text is on
# disk before any later cell reads it back.
with open(txt_file, 'w', encoding='utf-8') as f:
    for pair in train_data:
        f.write(pair['english'] + '\t' + pair['chinese'] + '\n')

In [2]:
# Training hyper-parameters.
NUM_SAMPLES = 500   # number of samples
batch_size = 64
epochs = 1_000
latent_dim = 256    # number of LSTM units

In [3]:
# Tab-separated sentence-pair file read by the following cell.
# NOTE(review): this repeats an earlier assignment, presumably so the training
# cells can run on a fresh kernel without the preprocessing cells - confirm.
txt_file = '1by1000_train_data_cc.txt'

In [4]:
# Read the tab-separated sentence pairs and keep the first NUM_SAMPLES rows
# and the first two columns.
data_path = txt_file
# `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`
# (removed in pandas 2.0): malformed lines are skipped with a warning.
# `.copy()` makes the slice an independent frame so the column assignment
# below does not operate on a view of the parsed data.
df = pd.read_table(data_path, header=None, on_bad_lines='warn').iloc[:NUM_SAMPLES, 0:2].copy()
df.columns = ['inputs', 'targets']
# Wrap every target in '\t' ... '\n'; the decoder uses '\t' as its first input
# token and stops when it emits '\n'.
df['targets'] = df['targets'].apply(lambda x: '\t' + x + '\n')

In [5]:
# Build plain Python lists of the source and target sentences.
input_texts = df['inputs'].tolist()
target_texts = df['targets'].tolist()

In [6]:
# Build the sorted character vocabularies.
# `''.join` concatenates in linear time and yields '' (hence an empty
# vocabulary) for an empty column, whereas summing an object array
# concatenates pairwise and returns the integer 0 when there is nothing to sum.
input_characters = sorted(set(''.join(df['inputs'].unique())))
target_characters = sorted(set(''.join(df['targets'].unique())))

In [7]:
# Vocabulary sizes for the encoder (source) and decoder (target) sides.
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)
# Longest source / target sentence, measured in characters.
# (The spelling INUPT_LENGTH is kept because other cells refer to this name.)
INUPT_LENGTH = max(map(len, input_texts))
OUTPUT_LENGTH = max(map(len, target_texts))

In [8]:
# Vectorisation lookups: character -> index, and index -> character.
input_token_index = {char: idx for idx, char in enumerate(input_characters)}
target_token_index = {char: idx for idx, char in enumerate(target_characters)}
reverse_input_char_index = dict(enumerate(input_characters))
reverse_target_char_index = dict(enumerate(target_characters))

In [9]:
# Allocate the three-dimensional one-hot tensors: (sample, time step, token).
# The sample axis follows the number of sentences actually loaded; sizing it
# with NUM_SAMPLES would leave all-zero phantom samples whenever the data file
# yields fewer rows than requested.
num_samples = len(input_texts)
encoder_input_data = np.zeros((num_samples, INUPT_LENGTH, num_encoder_tokens))
decoder_input_data = np.zeros((num_samples, OUTPUT_LENGTH, num_decoder_tokens))
decoder_target_data = np.zeros((num_samples, OUTPUT_LENGTH, num_decoder_tokens))

In [10]:
# One-hot encode every sentence pair into the pre-allocated tensors.
for sample_idx, (src_text, tgt_text) in enumerate(zip(input_texts, target_texts)):
    for pos, ch in enumerate(src_text):
        encoder_input_data[sample_idx, pos, input_token_index[ch]] = 1.0
    for pos, ch in enumerate(tgt_text):
        token_id = target_token_index[ch]
        decoder_input_data[sample_idx, pos, token_id] = 1.0
        # The target is the decoder input shifted one step to the left, so the
        # first character has no target slot.
        if pos >= 1:
            decoder_target_data[sample_idx, pos - 1, token_id] = 1.0

In [11]:
def create_model():
    """Build the seq2seq training model plus its encoder/decoder inference models.

    Reads the module-level ``num_encoder_tokens``, ``num_decoder_tokens`` and
    ``latent_dim``. The decoder LSTM and Dense layers are created once and
    reused by both the training model and the inference decoder, so they share
    the same layer objects.

    Side effects:
        Writes one diagram per model via ``plot_model``: ``model.png``,
        ``encoder_model.png`` and ``decoder_model.png``. Each call gets its own
        file name; with the default ``to_file`` all three would write
        ``model.png`` and overwrite one another.

    Returns:
        tuple: ``(model, encoder_model, decoder_model)``.
    """
    # Encoder: only the final hidden/cell states are used downstream.
    encoder_inputs = Input(shape = (None,num_encoder_tokens))
    encoder = LSTM(latent_dim,return_state = True)
    encoder_outputs,state_h,state_c = encoder(encoder_inputs)
    encoder_state = [state_h,state_c]
    # Training decoder: initialised with the encoder states, emits a softmax
    # over the target vocabulary at every time step.
    decoder_inputs = Input(shape = (None,num_decoder_tokens))
    decoder_lstm = LSTM(latent_dim,return_state = True,return_sequences = True)
    decoder_outputs,_,_ = decoder_lstm(decoder_inputs,initial_state = encoder_state)
    decoder_dense = Dense(num_decoder_tokens,activation = 'softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    model = Model([encoder_inputs,decoder_inputs],decoder_outputs)
    # Inference encoder: maps an input sequence to its state pair.
    encoder_model = Model(encoder_inputs,encoder_state)
    # Inference decoder: takes the states as explicit inputs and returns the
    # updated states alongside the token distribution.
    decoder_state_input_h = Input(shape = (latent_dim,))
    decoder_state_input_c = Input(shape = (latent_dim,))
    decoder_state_inputs = [decoder_state_input_h,decoder_state_input_c]
    decoder_outputs,state_h,state_c = decoder_lstm(decoder_inputs,initial_state = decoder_state_inputs)
    decoder_states = [state_h,state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_state_inputs,[decoder_outputs] + decoder_states)
    plot_model(model = model,to_file = 'model.png',show_shapes = True)
    plot_model(model = encoder_model,to_file = 'encoder_model.png',show_shapes = True)
    plot_model(model = decoder_model,to_file = 'decoder_model.png',show_shapes = True)
    return model,encoder_model,decoder_model

In [12]:
def decode_sequence(input_seq,encoder_model,decoder_model):
    """Greedily decode one encoded input sequence into a target string.

    Reads the module-level ``num_decoder_tokens``, ``target_token_index``,
    ``reverse_target_char_index`` and ``OUTPUT_LENGTH``.

    Args:
        input_seq: One-hot array passed to ``encoder_model.predict``.
        encoder_model: Model whose prediction is the decoder's initial states.
        decoder_model: Model called with ``[target_seq] + states`` and returning
            ``(output_tokens, h, c)``.

    Returns:
        str: The decoded characters, including the terminating ``'\\n'`` when
        the decoder emitted it.
    """
    states_value = encoder_model.predict(input_seq)
    # Start decoding from the '\t' start-of-sequence character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Greedy choice: most probable token at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char
        # Stop on the end marker or once the output exceeds the longest target
        # sentence. The cap is on decoder output, so it is measured against
        # OUTPUT_LENGTH rather than the source-side INUPT_LENGTH.
        if sampled_char == '\n' or len(decoded_sentence) > OUTPUT_LENGTH:
            stop_condition = True
        # Feed the sampled character and the new states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]
    return decoded_sentence

In [16]:
def train():
    """Build, compile, fit and save the seq2seq model and its inference halves.

    Uses the module-level training tensors, ``batch_size`` and ``epochs``.
    Saves the three models to ``s2s_1000epo.h5``, ``encoder_model_1000epo.h5``
    and ``decoder_model_1000epo.h5``.
    """
    seq2seq_model, enc_model, dec_model = create_model()
    seq2seq_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    training_inputs = [encoder_input_data, decoder_input_data]
    seq2seq_model.fit(
        training_inputs,
        decoder_target_data,
        validation_split=0.2,
        epochs=epochs,
        batch_size=batch_size,
    )
    # Persist the models in the same order: full model, encoder, decoder.
    outputs = (
        (seq2seq_model, 's2s_1000epo.h5'),
        (enc_model, 'encoder_model_1000epo.h5'),
        (dec_model, 'decoder_model_1000epo.h5'),
    )
    for net, path in outputs:
        net.save(path)

### 至此

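有 encoder_model_1000epo.h5 及 decoder_model_1000epo.h5 後，
執行下一格並輸入 test 即可測試。

注意：`test()` 會用到 `INUPT_LENGTH`、`num_encoder_tokens`、`input_token_index` 以及 `decode_sequence()`（後者還需要 `num_decoder_tokens`、`target_token_index`、`reverse_target_char_index`）。因此在全新的 kernel 上，仍須先執行上方讀取 `1by1000_train_data_cc.txt`、建立字典及定義 `decode_sequence` 的儲存格；可以跳過的只有原始資料前處理與訓練步驟。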

In [2]:
def test():
    """Prompt for one English sentence and print its decoded translation.

    Loads the saved encoder/decoder models on every call, reads a line from
    standard input, one-hot encodes it with the module-level
    ``input_token_index`` and prints the result of ``decode_sequence``.
    Entering ``-1`` calls ``sys.exit()``.

    Input longer than ``INUPT_LENGTH`` characters is truncated, and characters
    missing from ``input_token_index`` are left as all-zero time steps; both
    cases are reported instead of raising ``IndexError`` / ``KeyError``.
    """
    encoder_model=load_model('encoder_model_1000epo.h5', compile=False)
    decoder_model=load_model('decoder_model_1000epo.h5', compile=False)
    ss=input("請輸入要翻譯的英文:")
    if ss=='-1':
        sys.exit()
    # The one-hot buffer holds INUPT_LENGTH time steps; anything beyond that
    # would index out of bounds.
    if len(ss) > INUPT_LENGTH:
        print('Input is longer than', INUPT_LENGTH, 'characters; the rest is ignored.')
        ss = ss[:INUPT_LENGTH]
    input_seq=np.zeros((1,INUPT_LENGTH,num_encoder_tokens))
    unknown_chars = []
    for t,char in enumerate(ss):
        token = input_token_index.get(char)
        if token is None:
            # Character not in the vocabulary: keep its time step all-zero.
            unknown_chars.append(char)
            continue
        input_seq[0,t,token]=1.0
    if unknown_chars:
        print('Ignored characters not in the vocabulary:', ''.join(unknown_chars))
    decoded_sentence = decode_sequence(input_seq,encoder_model,decoder_model)
    print('-')
    print('Decoded sentence:', decoded_sentence)

# Entry point: ask which mode to run. "train" trains and saves the models;
# any other answer enters an endless translate-one-sentence loop.
if __name__ == '__main__':
    mode = input("select train model or test model:")
    if mode != "train":
        print("testing.........")
        while True:
            test()
    else:
        print("training...........")
        train()

select train model or test model:test
testing.........


KeyboardInterrupt: Interrupted by user