In [132]:
import tensorflow as tf
import numpy as np
import re

tf.reset_default_graph()
sess = tf.InteractiveSession()

In [133]:
# tokenizer 함수
def tokenizer(sentence):
        # 공백으로 나누고 특수문자는 따로 뽑아낸다.
        words = []
        _TOKEN_RE_ = re.compile("([.,!?\"':;)(])")

        for fragment in sentence.strip().split():
            words.extend(_TOKEN_RE_.split(fragment))

        return [w for w in words if w]

# data load 함수
def load_data(path):
    with open(path, 'r') as dialogue_file:
        contents = dialogue_file.read()
        all_words = tokenizer(contents)
        word_set = set(all_words)
        
    return word_set

def load_chat_log(path):
    #chat_log path 
    #포맷 한줄에 한 문장
    input = []
    target = []
    input_max_length = 0
    target_max_length = 0
    with open(path) as chat_file:
        for i, line in enumerate(chat_file):
            word_id_list = [word_dict[word] for word in tokenizer(line)]
            if i % 2 == 0:
                input.append(word_id_list)
                if input_max_length < len(word_id_list):
                    input_max_length = len(word_id_list)
            else:
                target.append(word_id_list)
                if target_max_length < len(word_id_list):
                    target_max_length = len(word_id_list)
                
    return input, target, input_max_length,target_max_length
        

In [134]:
_PAD_ = 0
_EOS_ = 1
_GO_ = 2
word_pre_defined = ["_PAD_"]+["_EOS_"]+["_GO_"]
word_set = load_data("./data/chat.log")
word_set = word_pre_defined+list(word_set)
word_dict = { word: i for i,word in enumerate(word_set)}
id2_word = { word_dict[i] : i for i in word_dict}

In [135]:
word_dict, id2_word

({'!': 95,
  ',': 7,
  '.': 9,
  '?': 140,
  '_EOS_': 1,
  '_GO_': 2,
  '_PAD_': 0,
  '가': 100,
  '갖고': 39,
  '같아': 62,
  '거야': 117,
  '거지': 3,
  '것': 146,
  '것만': 44,
  '것에': 34,
  '것은': 144,
  '것이': 84,
  '게': 40,
  '관계가': 82,
  '괴롭히고': 5,
  '그': 155,
  '그거': 26,
  '그건': 136,
  '그들은': 150,
  '그래': 37,
  '그래서': 97,
  '그러구': 116,
  '그러나': 16,
  '그런데': 13,
  '그럴': 113,
  '그럼': 162,
  '금빛': 123,
  '길들여': 157,
  '길들여진다는': 64,
  '길들이렴': 74,
  '길들이면': 126,
  '길들인': 10,
  '꽃이': 148,
  '나는': 104,
  '나를': 163,
  '나하고': 46,
  '난': 52,
  '날': 38,
  '날이지': 27,
  '내': 127,
  '너는': 31,
  '너를': 161,
  '너하고': 30,
  '넌': 83,
  '네': 35,
  '네가': 154,
  '놀': 90,
  '놀자': 135,
  '누구지': 66,
  '눈부시게': 15,
  '눈에': 110,
  '다르게': 128,
  '다른': 156,
  '닭은': 121,
  '닭을': 120,
  '대단하군': 141,
  '대해': 111,
  '도대체': 81,
  '돼': 75,
  '될': 89,
  '뜻이야': 22,
  '마을': 86,
  '만드는': 33,
  '말이야': 17,
  '맞아': 114,
  '모두': 133,
  '목요일엔': 105,
  '목요일은': 158,
  '무슨': 23,
  '무얼': 132,
  '무척': 53,
  '뭐지': 20,
  '뭘': 63,
  '미안해': 79,

In [136]:
input, target, input_max_length, target_max_length = load_chat_log("./data/chat.log")

In [137]:
input,target,len(input),len(target),input_max_length,target_max_length

([[25, 9],
  [129, 87, 9, 112, 139, 9, 9, 9],
  [52, 69, 9],
  [52, 30, 90, 88, 77, 9],
  [83, 132, 51, 41, 140],
  [31, 120, 51, 101, 140],
  [64, 40, 23, 22, 140],
  [82, 142, 140],
  [148, 119, 87, 9, 9, 9, 155, 148, 163, 10, 146, 62, 9, 9, 9],
  [162, 156, 153, 17, 140],
  [155, 164, 107, 87, 140],
  [26, 141, 95, 162, 121, 140],
  [97, 52, 99, 6, 9],
  [72, 7, 8, 71, 140],
  [55, 9, 9, 9, 163, 157, 151, 95],
  [97, 165, 11, 77, 9],
  [68, 94, 42, 140],
  [14, 20, 140],
  [150, 105, 86, 70, 29, 145, 9],
  [52, 7, 160, 44, 62, 9],
  [13, 83, 57, 98, 95],
  [162, 83, 81, 63, 108, 3, 140],
  [48, 87, 9],
  [127, 36, 143, 9],
  [165, 59, 45, 92, 9],
  [83, 154, 10, 34, 111, 103, 80, 87, 9]],
 [[25, 9],
  [83, 66, 140, 137, 115, 9, 9, 9],
  [43, 134, 46, 135, 9],
  [18, 95, 79, 9],
  [52, 118, 51, 87, 9],
  [49, 9, 52, 152, 51, 87, 9],
  [136, 82, 149, 22, 9],
  [37, 9],
  [113, 88, 138, 9, 67, 91, 60, 84, 133, 47, 9, 9, 9],
  [37, 9],
  [49, 77, 9],
  [166, 9],
  [154, 38, 126, 127, 21

In [138]:
def pad(list,max_sequence_length,is_target=False,is_decoder_input=False):
    # list -> 2D array
    # max_sequence_length -> integer
    input = []
    for item in list:
        if is_target:
            extra_pad_length = max_sequence_length - len(item)
            item = item +[_EOS_]+ [_PAD_]*extra_pad_length
            input.append(item)
        elif is_decoder_input:
            extra_pad_length = max_sequence_length - len(item)
            item = [_GO_] + item + [_PAD_]*extra_pad_length
            input.append(item)
        else:
            extra_pad_length = max_sequence_length - len(item)
            item = item + [_PAD_]*extra_pad_length
            input.append(item)
            
    return input

encoder_input = pad(input,input_max_length)
decoder_input = pad(target,target_max_length,is_decoder_input=True)
print(target)
target_input = pad(target,target_max_length,is_target=True)

[[25, 9], [83, 66, 140, 137, 115, 9, 9, 9], [43, 134, 46, 135, 9], [18, 95, 79, 9], [52, 118, 51, 87, 9], [49, 9, 52, 152, 51, 87, 9], [136, 82, 149, 22, 9], [37, 9], [113, 88, 138, 9, 67, 91, 60, 84, 133, 47, 9, 9, 9], [37, 9], [49, 77, 9], [166, 9], [154, 38, 126, 127, 21, 4, 15, 89, 117, 9], [123, 96, 159, 7, 154, 50, 117, 9], [116, 93, 9, 9, 9, 52, 78, 77, 9], [61, 39, 125, 163, 74, 95], [122, 106, 28, 9], [136, 130, 19, 156, 102, 128, 33, 117, 9], [97, 158, 53, 54, 27, 95], [136, 35, 12, 9, 52, 161, 85, 5, 147, 124, 9], [114, 7, 24, 37, 9], [108, 40, 87, 9, 109, 56, 131, 17, 9], [48, 100, 9], [58, 144, 110, 71, 32, 9], [16, 31, 65, 73, 75, 9], [104, 127, 76, 80, 87, 9, 9, 9]]


In [139]:
#[batch time_max ..]
encoder_input,decoder_input,target_input

([[25, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [129, 87, 9, 112, 139, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0],
  [52, 69, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [52, 30, 90, 88, 77, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [83, 132, 51, 41, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [31, 120, 51, 101, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [64, 40, 23, 22, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [82, 142, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [148, 119, 87, 9, 9, 9, 155, 148, 163, 10, 146, 62, 9, 9, 9],
  [162, 156, 153, 17, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [155, 164, 107, 87, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [26, 141, 95, 162, 121, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [97, 52, 99, 6, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [72, 7, 8, 71, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [55, 9, 9, 9, 163, 157, 151, 95, 0, 0, 0, 0, 0, 0, 0],
  [97, 165, 11, 77, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [68, 94, 42, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [14, 20, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [15

In [140]:
batch_size = len(encoder_input)
n_epoch = 2000

vocab_size = len(word_dict)

input_embedding_size = 100
encoder_hidden_units = 100
decoder_hidden_units = 100

In [141]:
encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
decoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_inputs')

embeddings = tf.Variable(tf.random_uniform([vocab_size, input_embedding_size], -1.0, 1.0), dtype=tf.float32)

encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)
decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, decoder_inputs)

In [142]:
encoder_inputs_embedded
# sess.run(encoder_inputs_embedded,feed_dict={encoder_inputs:encoder_input})

<tf.Tensor 'embedding_lookup:0' shape=(?, ?, 100) dtype=float32>

In [143]:
encoder_cell = tf.contrib.rnn.LSTMCell(encoder_hidden_units)

encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_inputs_embedded,
    dtype=tf.float32,time_major=True
)

In [144]:
encoder_cell.weights

[<tf.Variable 'rnn/lstm_cell/kernel:0' shape=(200, 400) dtype=float32_ref>,
 <tf.Variable 'rnn/lstm_cell/bias:0' shape=(400,) dtype=float32_ref>]

In [145]:
encoder_final_state

LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_2:0' shape=(?, 100) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 100) dtype=float32>)

In [146]:
encoder_outputs

<tf.Tensor 'rnn/TensorArrayStack/TensorArrayGatherV3:0' shape=(?, ?, 100) dtype=float32>

In [147]:
decoder_cell = tf.contrib.rnn.LSTMCell(decoder_hidden_units)

decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
    decoder_cell, decoder_inputs_embedded,
    initial_state=encoder_final_state,

    dtype=tf.float32,scope="plain_decoder",time_major=True
)

In [156]:
decoder_logits = tf.contrib.layers.fully_connected(decoder_outputs, vocab_size,activation_fn=None)
decoder_prediction = tf.argmax(tf.transpose(decoder_logits,[1,0,2]), 2)

In [157]:
decoder_logits

<tf.Tensor 'fully_connected_1/BiasAdd:0' shape=(?, ?, 167) dtype=float32>

In [158]:
stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32),
    logits=decoder_logits,
)

loss = tf.reduce_mean(stepwise_cross_entropy)
train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

In [159]:
tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32)

<tf.Tensor 'one_hot_3:0' shape=(?, ?, 167) dtype=float32>

In [160]:
decoder_logits

<tf.Tensor 'fully_connected_1/BiasAdd:0' shape=(?, ?, 167) dtype=float32>

In [161]:
sess.run(tf.global_variables_initializer())

In [162]:
for batch in range(n_epoch):
    _, l = sess.run([train_op, loss], feed_dict={
            encoder_inputs: np.transpose(encoder_input,(1,0)),
            decoder_targets: np.transpose(target_input,(1,0)),
            decoder_inputs: np.transpose(decoder_input,(1,0))
    })
    if(batch % 300 == 0):
        predict = sess.run(decoder_prediction, feed_dict={
            encoder_inputs: np.transpose(encoder_input,(1,0)),
            decoder_targets: np.transpose(target_input,(1,0)),
            decoder_inputs: np.transpose(decoder_input,(1,0))
        })
        for i, (inp, pred) in enumerate(zip(target_input, predict)):
                print('  sample {}:'.format(i + 1))
                print('    input     > {}'.format(inp))
                print('    predicted > {}'.format(pred))

  sample 1:
    input     > [25, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [  5  91 124 124 124   0   0   0   0   0   0   0   0   0]
  sample 2:
    input     > [83, 66, 140, 137, 115, 9, 9, 9, 1, 0, 0, 0, 0, 0]
    predicted > [  5  90  69  26  26  50 124 124 118 124 124   0   0   0]
  sample 3:
    input     > [43, 134, 46, 135, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [ 5  0 83  0 16 16  0  0  0  0  0  0  0  0]
  sample 4:
    input     > [18, 95, 79, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [  5 124  16  16  16 124 124   0   0   0   0   0   0   0]
  sample 5:
    input     > [52, 118, 51, 87, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [  5  90 108  90   0  90   0   0   0   0   0   0   0   0]
  sample 6:
    input     > [49, 9, 52, 152, 51, 87, 9, 1, 0, 0, 0, 0, 0, 0]
    predicted > [  5 124 124  16   0   0   0   0   0   0   0   0   0   0]
  sample 7:
    input     > [136, 82, 149, 22, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [  5 124  16  98  36  

  sample 1:
    input     > [25, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [25  9  1  0  0  0  0  0  0  0  0  0  0  0]
  sample 2:
    input     > [83, 66, 140, 137, 115, 9, 9, 9, 1, 0, 0, 0, 0, 0]
    predicted > [ 83  66 140 137 115   9   9   9   1   0   0   0   0   0]
  sample 3:
    input     > [43, 134, 46, 135, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [ 43 134  46 135   9   1   0   0   0   0   0   0   0   0]
  sample 4:
    input     > [18, 95, 79, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [18 95 79  9  1  0  0  0  0  0  0  0  0  0]
  sample 5:
    input     > [52, 118, 51, 87, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [ 52 118  51  87   9   1   0   0   0   0   0   0   0   0]
  sample 6:
    input     > [49, 9, 52, 152, 51, 87, 9, 1, 0, 0, 0, 0, 0, 0]
    predicted > [ 49   9  52 152  51  87   9   1   0   0   0   0   0   0]
  sample 7:
    input     > [136, 82, 149, 22, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [136  82 149  22   9   1   0   0   0

  sample 1:
    input     > [25, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [25  9  1  0  0  0  0  0  0  0  0  0  0  0]
  sample 2:
    input     > [83, 66, 140, 137, 115, 9, 9, 9, 1, 0, 0, 0, 0, 0]
    predicted > [ 83  66 140 137 115   9   9   9   1   0   0   0   0   0]
  sample 3:
    input     > [43, 134, 46, 135, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [ 43 134  46 135   9   1   0   0   0   0   0   0   0   0]
  sample 4:
    input     > [18, 95, 79, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [18 95 79  9  1  0  0  0  0  0  0  0  0  0]
  sample 5:
    input     > [52, 118, 51, 87, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [ 52 118  51  87   9   1   0   0   0   0   0   0   0   0]
  sample 6:
    input     > [49, 9, 52, 152, 51, 87, 9, 1, 0, 0, 0, 0, 0, 0]
    predicted > [ 49   9  52 152  51  87   9   1   0   0   0   0   0   0]
  sample 7:
    input     > [136, 82, 149, 22, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    predicted > [136  82 149  22   9   1   0   0   0

In [None]:
np.transpose()

In [35]:
np.shape(np.transpose(encoder_input,(1,0)))

(15, 26)

In [36]:
np.shape(np.transpose(target_input,(1,0)))

(14, 26)