In [3]:
#-*- coding: utf-8 -*-
import sys, os
import tensorflow as tf
from __future__ import print_function
import numpy as np

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="0"
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess= tf.Session(config=config)

from tensorflow.python.client import device_lib
print (device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17099746119153261046
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 7916630836
locality {
  bus_id: 1
  links {
  }
}
incarnation: 8947178731259729044
physical_device_desc: "device: 0, name: GeForce GTX 1070 Ti, pci bus id: 0000:03:00.0, compute capability: 6.1"
]


In [4]:
# Locations of the raw title / reply text files on disk.
title_path, reply_path = 'title.txt', 'reply.txt'

In [5]:
# Load the raw data: one title per line, and one Python literal holding all replies.
with open(title_path, 'r', encoding='utf-8') as title_file:
    title_text = title_file.read().strip()
titles = title_text.split("\n")

with open(reply_path, 'r', encoding='utf-8') as reply_file:
    reply_text = reply_file.read().strip()
# NOTE(review): eval() executes whatever the file contains - only use it on
# files you produced yourself.
# For the original raw data (one literal per line) the equivalent is:
#   [eval(reply) for reply in reply_text.split("\n")]
replies = eval(reply_text)

In [6]:
# Remove replies whose text is empty.
# First collect the (title index, reply index) position of every empty reply.
error = [
    (i, j)
    for i, reply in enumerate(replies)
    for j, sub_reply in enumerate(reply)
    if sub_reply['text'] == ""
]
for i, j in error:
    print("error on: ", replies[i][j], i, j)

# Delete back-to-front so the positions collected above stay valid.
for i, j in error[::-1]:
    del replies[i][j]

# Persist the cleaned data back to the reply file, but only if something changed.
if error:
    with open(reply_path, 'w', encoding='utf-8') as reply_file:
        reply_file.write(str(replies))

print("total title len: ", len(titles))
print("total reply len: ", len(replies))

total title len:  4055
total reply len:  4055


In [7]:
from konlpy.tag import Komoran
tag = Komoran()

# Morphological analysis.
# Limits on how many replies are kept per title and how long a reply may be.
max_reply_num = 5
max_reply_len = 100

title_morphs, reply_morphs = [], []
title_morphs_set, reply_morphs_set = set(), set()

for title in titles:
    title_tokens = tag.morphs(title)
    title_morphs.append(title_tokens)
    title_morphs_set.update(title_tokens)

for reply in replies:
    kept_replies = []
    for sub_reply in reply:
        # Stop once enough replies have been kept for this title.
        if len(kept_replies) >= max_reply_num:
            break

        reply_tokens = tag.morphs(sub_reply['text'])
        # Keep only replies that still fit after adding the start and end tags.
        if len(reply_tokens) <= max_reply_len - 2:
            kept_replies.append(reply_tokens)
            reply_morphs_set.update(reply_tokens)

    reply_morphs.append(kept_replies)

# The special tags come out joined when run through the analyzer,
# so they are added to the vocabulary separately.
reply_morphs_set.update(('<start>', '<end>'))

print("title morphs num: ", len(title_morphs_set))
print("reply morphs num: ", len(reply_morphs_set))

title morphs num:  7802
reply morphs num:  21170


In [None]:
# Save the morph vocabularies (as the repr of a sorted list).
for vocab_path, vocab in (('title-morphs_set', title_morphs_set),
                          ('reply-morphs_set', reply_morphs_set)):
    with open(vocab_path, 'w', encoding='utf-8') as out:
        out.write(str(sorted(vocab)))

In [None]:
# Save the morph-converted titles and replies (as the repr of the nested lists).
for morphs_path, morphs_data in (('title-morphs', title_morphs),
                                 ('reply-morphs', reply_morphs)):
    with open(morphs_path, 'w', encoding='utf-8') as out:
        out.write(str(morphs_data))

In [1]:
# Once the morphs have been converted and saved, start from this cell to reload them.
from konlpy.tag import Komoran
tag = Komoran()

max_reply_num, max_reply_len = 5, 100

title_morphs_set, reply_morphs_set = [], []
title_morphs, reply_morphs = [], []

# NOTE(review): eval() executes whatever the files contain - only use it on
# files written by the save cells of this notebook.
with open('title-morphs_set', 'r', encoding='utf-8') as src:
    title_morphs_set = eval(src.read())

with open('reply-morphs_set', 'r', encoding='utf-8') as src:
    reply_morphs_set = eval(src.read())

with open('title-morphs', 'r', encoding='utf-8') as src:
    title_morphs = eval(src.read())

with open('reply-morphs', 'r', encoding='utf-8') as src:
    reply_morphs = eval(src.read())

In [8]:
# Pair every title with each of its replies (one sample per reply) and wrap
# each reply in <start>/<end> tags.

train_x = []
train_y = []
for idx, reply in enumerate(reply_morphs):
    for sub_reply in reply:
        train_x.append(title_morphs[idx])
        train_y.append(['<start>'] + sub_reply + ['<end>'])

# Hold out the last 10% of the samples as the test set (no shuffling).
num_total = len(train_x)
split_at = int(num_total*0.9)
train_x, test_x = train_x[:split_at], train_x[split_at:]
train_y, test_y = train_y[:split_at], train_y[split_at:]

num_samples = len(train_x)
num_test_samples = len(test_x)
title_morphs_set = sorted(title_morphs_set)
reply_morphs_set = sorted(reply_morphs_set)
num_encoder_tokens = len(title_morphs_set)
num_decoder_tokens = len(reply_morphs_set)
# Measure titles over train AND test: encoder_test_data is allocated with this
# width, so a test title longer than every training title would otherwise
# raise an IndexError when it is written into that array.
max_encoder_seq_length = max([len(x) for x in train_x + test_x])
max_decoder_seq_length = max([len(y) for y in train_y])

print("num_samples: ", num_samples)
print("num_test_samples: ", num_test_samples)
print("num_encoder_tokens: ", num_encoder_tokens)
print("num_decoder_tokens: ", num_decoder_tokens)
print("max_encoder_seq_length: ", max_encoder_seq_length)
print("max_decoder_seq_length: ", max_decoder_seq_length)

num_samples:  18045
num_test_samples:  2006
num_encoder_tokens:  7802
num_decoder_tokens:  21170
max_encoder_seq_length:  31
max_decoder_seq_length:  100


In [9]:
# Map every morph to its position in the sorted vocabulary.
# NOTE(review): the first morph of each vocabulary gets index 0, which is also
# the fill value of the zero-initialised input arrays, so padding and that
# morph are indistinguishable to the model.
input_token_index = {morph: i for i, morph in enumerate(title_morphs_set)}
target_token_index = {morph: i for i, morph in enumerate(reply_morphs_set)}

In [10]:
# Zero-filled containers for the token ids (inputs) and one-hot targets.
encoder_shape = (num_samples, max_encoder_seq_length)
decoder_shape = (num_samples, max_decoder_seq_length)

encoder_input_data = np.zeros(encoder_shape, dtype="float32")
decoder_input_data = np.zeros(decoder_shape, dtype="float32")
# NOTE(review): dense one-hot targets; at the sizes in the recorded output
# (18045 x 100 x 21170 float32) this is on the order of 150 GB of address space.
decoder_target_data = np.zeros(decoder_shape + (num_decoder_tokens,), dtype="float32")

encoder_test_data = np.zeros((num_test_samples, max_encoder_seq_length), dtype="float32")

In [11]:
# Encode the training pairs as token ids / one-hot targets.
for sample_idx, (title_tokens, reply_tokens) in enumerate(zip(train_x, train_y)):
    for step, morph in enumerate(title_tokens):
        encoder_input_data[sample_idx, step] = input_token_index[morph]

    for step, morph in enumerate(reply_tokens):
        token_id = target_token_index[morph]
        decoder_input_data[sample_idx, step] = token_id
        # The target is the input shifted one step to the left, so it never
        # contains the leading start tag.
        if step > 0:
            decoder_target_data[sample_idx, step - 1, token_id] = 1.

# Encode the held-out titles the same way.
for sample_idx, title_tokens in enumerate(test_x):
    for step, morph in enumerate(title_tokens):
        encoder_test_data[sample_idx, step] = input_token_index[morph]

In [12]:
# Report the shape of every prepared array.
for label, array in (
    ("encoder_input_data shape: ", encoder_input_data),
    ("decoder_input_data shape: ", decoder_input_data),
    ("decoder_target_data shape: ", decoder_target_data),
    ("encoder_test_data shape: ", encoder_test_data),
):
    print(label, array.shape)

encoder_input_data shape:  (18045, 31)
decoder_input_data shape:  (18045, 100)
decoder_target_data shape:  (18045, 100, 21170)
encoder_test_data shape:  (2006, 31)


In [13]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.layers.normalization import BatchNormalization

# Training hyper-parameters.
batch_size = 64  # Batch size for training.
epochs = 10  # Number of epochs to train for.
latent_dim = 64  # Used as both the embedding size and the LSTM state size.

# Encoder: title token ids -> embedding -> batch normalisation -> LSTM.
# The sequence-length dimension of the input is left unspecified (None).
encoder_input_layer = Input(shape=(None,))
enc_embedding_layer = Embedding(num_encoder_tokens, latent_dim)
enc_input = enc_embedding_layer(encoder_input_layer)
enc_normalization_layer = BatchNormalization()
enc_normalized_input = enc_normalization_layer(enc_input)
# return_state=True makes the LSTM also return its final hidden and cell states.
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(enc_normalized_input)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

Using TensorFlow backend.


In [14]:
# Set up the decoder, using `encoder_states` as initial state.
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
# Decoder: reply token ids -> embedding -> batch normalisation -> LSTM -> softmax.
decoder_input_layer = Input(shape=(None,))
dec_embedding_layer = Embedding(num_decoder_tokens, latent_dim)
dec_input = dec_embedding_layer(decoder_input_layer)
dec_normalization_layer = BatchNormalization()
dec_normalized_input = dec_normalization_layer(dec_input)
decoder = LSTM(latent_dim, return_sequences=True, return_state=True)
# The two returned states are discarded here (bound to `_`).
decoder_outputs, _, _ = decoder(dec_normalized_input, initial_state=encoder_states)
# One softmax over the reply vocabulary per output timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_output_layer = decoder_dense(decoder_outputs)

In [15]:
# Training model: maps `encoder_input_data` and `decoder_input_data`
# to `decoder_target_data`.
model = Model(inputs=[encoder_input_layer, decoder_input_layer],
              outputs=decoder_output_layer)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 64)     499328      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 64)     1354880     input_2[0][0]                    
__________________________________________________________________________________________________
batch_norm

In [16]:
from keras.utils import plot_model
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Render the model graph to a PNG file, then read it back and show it inline.
model_image_path = 'model_image.png'
plot_model(model, to_file=model_image_path, show_shapes=True, show_layer_names=False)
model_img = mpimg.imread(model_image_path)
plt.figure(figsize=(10, 50))
plt.imshow(model_img)

<matplotlib.image.AxesImage at 0x7f8c08849978>

In [17]:
from keras import optimizers

# Compile with RMSprop and train; 10% of the training data is held out for validation.
rmsprop = optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['acc'])

training_inputs = [encoder_input_data, decoder_input_data]
model.fit(
    training_inputs,
    decoder_target_data,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
)

# Persist the trained model (runs only after training has finished).
model.save('s2s.h5')

Train on 16240 samples, validate on 1805 samples
Epoch 1/10
  448/16240 [..............................] - ETA: 7:31 - loss: 2.5221 - acc: 0.0056 

KeyboardInterrupt: 

In [None]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
# Encoder model: title token ids -> [state_h, state_c] of the encoder LSTM.
encoder_model = Model(encoder_input_layer, encoder_states)
encoder_model.summary()

# The decoder's initial state is fed in explicitly at inference time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Reuse the embedding / normalisation layer objects built for the training model.
infer_input = dec_embedding_layer(decoder_input_layer)
infer_normalized_input = dec_normalization_layer(infer_input)

# NOTE: this rebinds `decoder_outputs`, `state_h` and `state_c`, which the
# encoder/decoder definition cells also assign.
decoder_outputs, state_h, state_c = decoder(infer_normalized_input, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Decoder model: (token ids, h, c) -> (token probabilities, new h, new c).
decoder_model = Model(
    [decoder_input_layer] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)
decoder_model.summary()

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_morph_index = dict(
    (i, morph) for morph, i in input_token_index.items())
reverse_target_morph_index = dict(
    (i, morph) for morph, i in target_token_index.items())

In [None]:
def decode_sequence(input_seq):
    """Greedily decode a reply for one encoded title.

    Args:
        input_seq: encoder token ids for a single title, either as a
            (1, timesteps) array or as a flat list / 1-D sequence of ids
            (which is promoted to a batch of one).

    Returns:
        The list of generated morphs. It ends with '<end>' when the decoder
        produced the end tag; otherwise generation stopped because the
        sequence grew past ``max_reply_len - 2`` morphs.
    """
    # Promote a plain list / 1-D sequence to a (1, timesteps) batch so callers
    # do not have to add the batch dimension themselves.
    input_seq = np.atleast_2d(np.asarray(input_seq))

    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Length-1 target sequence holding the start tag.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['<start>']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    decoded_sentence = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: the most probable token of the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_morph = reverse_target_morph_index[sampled_token_index]
        decoded_sentence.append(sampled_morph)

        # Exit condition: either hit max length or find the end tag.
        if sampled_morph == '<end>' or len(decoded_sentence) > max_reply_len - 2:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

In [None]:
# Decode a few training titles (every 5th of the first 30) as a sanity check.
for seq_index in range(0, 30, 5):
    sample = slice(seq_index, seq_index + 1)
    input_seq = encoder_input_data[sample]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', train_x[sample])
    print('Decoded sentence:', decoded_sentence)

In [None]:
# Decode a few held-out titles (every 5th of the first 30) and print title / reply.
for seq_index in range(0, 30, 5):
    input_seq = encoder_test_data[seq_index: seq_index+1]
    decoded_sentence = decode_sequence(input_seq)
    # Strip the end tag only if it is really there: when decoding stops on the
    # length limit the last element is a real morph and must be kept.
    if decoded_sentence and decoded_sentence[-1] == '<end>':
        shown_morphs = decoded_sentence[:-1]
    else:
        shown_morphs = decoded_sentence
    print('제목:', ' '.join(test_x[seq_index]))
    print('댓글:', ' '.join(shown_morphs))
    print()

In [None]:
# Decode a reply for a hand-written title.
custom_input = tag.morphs('문재인 가나다라 트럼프')
# Keep only morphs present in the encoder vocabulary; unknown morphs are
# skipped explicitly instead of being hidden behind a bare `except`.
custom_input_seq = [input_token_index[morph]
                    for morph in custom_input
                    if morph in input_token_index]
# Right-pad with zeros up to the encoder length used for the training data.
custom_input_seq = custom_input_seq + [0]*(max_encoder_seq_length - len(custom_input_seq))
# The encoder expects a (batch, timesteps) array, not a flat Python list.
decoded_sentence = decode_sequence(np.array([custom_input_seq], dtype="float32"))
print('Decoded sentence:', decoded_sentence)