In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import tensorflow as tf
from keras.utils.training_utils import multi_gpu_model
import MeCab
from argparse import ArgumentParser
from keras.losses import categorical_crossentropy
from keras import backend as K
import math

Using TensorFlow backend.


In [2]:
def parser():
    usage = 'Usage: python --file weight_file_name --mode [train or test]'
    argparser = ArgumentParser(usage=usage)
    argparser.add_argument('--mode','-m', dest='mode', type=str, choices=['train','test'])
    argparser.add_argument('--file', '-f', type=str, required=True, help='set filename of weights for save or load')
    args = argparser.parse_args(args=['-m', 'train', '-f', '.\s2s_weight.h5'])
    return args

def ppx(y_true, y_pred):
    loss = categorical_crossentropy(y_true, y_pred)
    perplexity = K.cast(K.pow(math.e, K.mean(loss, axis=-1)), K.floatx())
    return perplexity

In [3]:
args = parser()

weights_filename = args.file

gpu_count = 2
batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
batch_size = 100
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 30000  # Number of samples to train on.
# Path to the data txt file on disk.

In [4]:
input_data_path = "./input.txt"
target_data_path = "./output.txt"
with  open(input_data_path, 'r', encoding='utf-8') as f:
    input_lines = f.read().split('\n')  # 行ごとのリストに

with open(target_data_path, 'r', encoding='utf-8') as f:
    target_lines = f.read().split('\n')

In [5]:
min_samples = min(num_samples, min(len(input_lines)-1, len(target_lines)-1))

In [6]:
# 単語の配列で表された文の配列に変更
input_texts = []
target_texts = []
input_words = set()
target_words = set()
# 登場する全単語のリストをencoderとdecoder用にそれぞれ作成
for index, (input_line,target_line) in enumerate(zip(input_lines[:min_samples],target_lines[:min_samples])):
    input_text = input_line
    target_text = target_line
    # \tが開始記号で\nが終端記号とする
    target_text = '\t ' + target_text + ' \n'

    # 単語単位に分割
    words = []
    words = input_text.split(' ')
    input_texts.append(words)
    for word in words:
        if word not in input_words:
            input_words.add(word)
    words = target_text.split(' ')
    target_texts.append(words)
    for word in words:
        if word not in target_words:
            target_words.add(word)

In [7]:
input_words = sorted(list(input_words))
target_words = sorted(list(target_words))
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)

# 入力文と出力文それぞれで最大単語数計算
max_encoder_seq_length = max([len(words) for words in input_texts])
max_decoder_seq_length = max([len(words) for words in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input words:', num_encoder_tokens)
print('Number of unique output words:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 30000
Number of unique input words: 10790
Number of unique output words: 10057
Max sequence length for inputs: 119
Max sequence length for outputs: 116


In [8]:
# 単語にIDを割り振る
input_token_index = dict(
    [(word, i) for i, word in enumerate(input_words)])
target_token_index = dict(
    [(word, i) for i, word in enumerate(target_words)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='uint8')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='uint8')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='uint8')

print(input_token_index)
print(input_token_index)

{'': 0, '.....。': 1, '....．。': 2, '060': 3, '11': 4, '12': 5, '13': 6, '24': 7, '40': 8, '90': 9, 'UNK': 10, 'UNKUNK': 11, 'UNKＣ': 12, 'UNKＣＤ': 13, 'UNKＵ': 14, 'in': 15, '—': 16, '−': 17, '−。': 18, '、': 19, '。': 20, '々': 21, '〈': 22, '〉': 23, '「': 24, '」': 25, '『': 26, '』': 27, '〜': 28, 'ぁん': 29, 'あ': 30, 'ああ': 31, 'あい': 32, 'あいこ': 33, 'あいつ': 34, 'あいまい': 35, 'あえて': 36, 'あお': 37, 'あか': 38, 'あかつき': 39, 'あかん': 40, 'あがっ': 41, 'あき': 42, 'あきらめ': 43, 'あきれ': 44, 'あく': 45, 'あくまでも': 46, 'あけ': 47, 'あけれ': 48, 'あげ': 49, 'あげよ': 50, 'あげる': 51, 'あげれ': 52, 'あげん': 53, 'あこがれ': 54, 'あご': 55, 'あさ': 56, 'あさって': 57, 'あさひ': 58, 'あし': 59, 'あした': 60, 'あしらっ': 61, 'あすこ': 62, 'あずき': 63, 'あせっ': 64, 'あそこ': 65, 'あそぶ': 66, 'あた': 67, 'あたし': 68, 'あたっ': 69, 'あたふた': 70, 'あたら': 71, 'あたり': 72, 'あたりまえ': 73, 'あちら': 74, 'あっ': 75, 'あっけない': 76, 'あっけらかん': 77, 'あっさり': 78, 'あったかい': 79, 'あったかく': 80, 'あったかくっ': 81, 'あったかけれ': 82, 'あったまっ': 83, 'あっち': 84, 'あっという間': 85, 'あっという間に': 86, 'あつあつ': 87, 'あつまっ': 88, 'あて': 89, 'あて名': 90, 'あと': 91,


{'': 0, '.....。': 1, '....．。': 2, '060': 3, '11': 4, '12': 5, '13': 6, '24': 7, '40': 8, '90': 9, 'UNK': 10, 'UNKUNK': 11, 'UNKＣ': 12, 'UNKＣＤ': 13, 'UNKＵ': 14, 'in': 15, '—': 16, '−': 17, '−。': 18, '、': 19, '。': 20, '々': 21, '〈': 22, '〉': 23, '「': 24, '」': 25, '『': 26, '』': 27, '〜': 28, 'ぁん': 29, 'あ': 30, 'ああ': 31, 'あい': 32, 'あいこ': 33, 'あいつ': 34, 'あいまい': 35, 'あえて': 36, 'あお': 37, 'あか': 38, 'あかつき': 39, 'あかん': 40, 'あがっ': 41, 'あき': 42, 'あきらめ': 43, 'あきれ': 44, 'あく': 45, 'あくまでも': 46, 'あけ': 47, 'あけれ': 48, 'あげ': 49, 'あげよ': 50, 'あげる': 51, 'あげれ': 52, 'あげん': 53, 'あこがれ': 54, 'あご': 55, 'あさ': 56, 'あさって': 57, 'あさひ': 58, 'あし': 59, 'あした': 60, 'あしらっ': 61, 'あすこ': 62, 'あずき': 63, 'あせっ': 64, 'あそこ': 65, 'あそぶ': 66, 'あた': 67, 'あたし': 68, 'あたっ': 69, 'あたふた': 70, 'あたら': 71, 'あたり': 72, 'あたりまえ': 73, 'あちら': 74, 'あっ': 75, 'あっけない': 76, 'あっけらかん': 77, 'あっさり': 78, 'あったかい': 79, 'あったかく': 80, 'あったかくっ': 81, 'あったかけれ': 82, 'あったまっ': 83, 'あっち': 84, 'あっという間': 85, 'あっという間に': 86, 'あつあつ': 87, 'あつまっ': 88, 'あて': 89, 'あて名': 90, 'あと': 91

In [9]:
# 文のインデックスと単語のインデックスと単語を格納するデータを作成
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, word in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[word]] = 1.
    for t, word in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[word]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[word]] = 1.      

In [10]:
# ネットワークの定義
with tf.device("/cpu:0"):
    encoder_inputs = Input(shape=(None, num_encoder_tokens))
    encoder = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    # We discard `encoder_outputs` and only keep the states.
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, num_decoder_tokens))
    # We set up our decoder to return full output sequences,
    # and to return internal states as well. We don't use the
    # return states in the training model, but we will use them in inference.
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                        initial_state=encoder_states)
    decoder_dense = Dense(num_decoder_tokens, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

Instructions for updating:
Colocations handled automatically by placer.


In [11]:
if args.mode == 'train':
    n_split = int(encoder_input_data.shape[0]*0.8)
    encoder_train, encoder_val = np.vsplit(encoder_input_data,[n_split])   #エンコーダインプットデータを訓練用と評価用に分割
    decoder_train, decoder_val = np.vsplit(decoder_input_data,[n_split])   #デコーダインプットデータを訓練用と評価用に分割
    target_train, target_val = np.vsplit(decoder_target_data,[n_split])   #ラベルデータを訓練用と評価用に分割
    # Run training
    model = multi_gpu_model(model, gpus=gpu_count)
    model.compile(optimizer='rmsprop', loss=ppx) 
    row_train = encoder_train.shape[0]
    row_val = encoder_val.shape[0]
    n_batch = math.ceil(row_train/batch_size)
    loss_bk = 10000
    for j in range(0, epochs) :
        print("Epoch ", j+1, "/", epochs)
        for i in range(0, n_batch):
            start = i * batch_size
            end = min([(i+1)*batch_size, row_train])
            encoder_train_batch = encoder_train[start:end,:]
            decoder_train_batch = decoder_train[start:end,:]
            target_train_batch = target_train[start:end,:]
            
            encoder_val_batch = encoder_val[start%row_val:end%row_val,:]
            decoder_val_batch = decoder_val[start%row_val:end%row_val,:]
            target_val_batch = target_val[start%row_val:end%row_val,:]
            
            train_loss = model.train_on_batch([encoder_train_batch,decoder_train_batch],target_train_batch)
            val_loss = model.test_on_batch([encoder_val_batch, decoder_val_batch] ,target_val_batch)
            print("%d/%d train_loss:%f val_loss:%f" % (start, row_train, train_loss, val_loss))
        if j == 0 or val_loss <= loss_bk:
            loss_bk = val_loss
        else:
            print('EarlyStopping')
            break
    model.save_weights(weights_filename)
        # Next: inference mode (sampling).
        # Here's the drill:
        # 1) encode input and retrieve initial decoder state
        # 2) run one step of decoder with this initial state
        # and a "start of sequence" token as target.
        # Output will be the next target token
        # 3) Repeat with the current target token and current states
elif args.mode == 'test':
    model.load_weights(weights_filename)

Epoch  1 / 100
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
0/24000 train_loss:96.159401 val_loss:2.639523
100/24000 train_loss:3.473336 val_loss:2.414814
200/24000 train_loss:6.040168 val_loss:3.704825
300/24000 train_loss:8.278811 val_loss:3.933697
400/24000 train_loss:2.379376 val_loss:3.964729
500/24000 train_loss:4.180079 val_loss:2.373246
600/24000 train_loss:4.082593 val_loss:2.026850
700/24000 train_loss:1.898166 val_loss:1.726922
800/24000 train_loss:3.174690 val_loss:1.747074
900/24000 train_loss:2.156861 val_loss:2.165034
1000/24000 train_loss:1.900081 val_loss:1.616771
1100/24000 train_loss:1.840805 val_loss:1.580471
1200/24000 train_loss:1.827911 val_loss:1.729175
1300/24000 train_loss:1.826400 val_loss:1.805635
1400/24000 train_loss:1.854851 val_loss:1.927315
1500/24000 train_loss:1.572335 val_loss:1.863522
1600/24000 train_loss:1.724609 val_loss:1.421228
1700/24000 train_loss:1.725693 val_lo

KeyboardInterrupt: 