In [1]:
from tensorflow.python.lib.io import file_io
from konlpy.tag import Twitter
from os import path, getcwd
from word2vec_review import *
from utils import *

import tensorflow as tf
import pandas as pd
import numpy as np
import pickle
import random
import time

### Load Data
데이터를 로드합니다.

데이터는 네이버 영화 평점 데이터를 사용한다. <br>
데이터에 대한 자세한 설명은 아래 데이터 출처에서 확인 할 수 있다. <br>
(출처: https://github.com/e9t/nsmc) <br>

data field: |id|document|label| <br>
id: key review id <br>
document: 140자 이하의 리뷰 글 <br>
label: 1~4점은 neg(0), 9~10점은 pos(1), 5~8점은 중립으로 데이터에서 제외함

In [2]:
# Root directory that holds the raw data and every artifact this notebook writes.
# NOTE(review): hard-coded absolute path; adjust it to your environment before running.
work_dir = '/home/khlee/tutorial/review_sentiment_classification'
# The ratings file is tab-separated.
data = pd.read_csv(work_dir + '/ratings_train.txt', sep='\t')

In [3]:
## Drop reviews whose text is missing (NaN).
# .copy() detaches the filtered frame from the original one, so adding the
# `twit_pos` column to `data` later does not raise pandas' SettingWithCopyWarning.
data = data[data.document.notnull()].copy()

In [4]:
# Peek at the first five rows to check the parsed columns.
data[:5]

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [5]:
# Number of reviews per label, to check the class balance.
print(data.groupby('label').size())

label
0    75170
1    74825
dtype: int64


### Review Tokenize
한글 리뷰를 토큰화 합니다.
한글 토큰화는 konlpy 패키지의 twitter 분석기를 사용하였습니다.
kkma 등 다른 분석기와 비교해 본 결과, 속도 및 정규화 기능 면에서 더 나은 twitter 분석기를 사용하였습니다.

In [6]:
twit = Twitter()
def twit_pos(x):
    out = twit.pos(x.replace('\\n', ''), norm=True, stem=True)
    return [i[0]+'/'+i[1] for i in out]
%time data['twit_pos'] = data.document.apply(lambda x: twit_pos(x))

CPU times: user 4min 21s, sys: 3.73 s, total: 4min 25s
Wall time: 3min 36s


In [7]:
# Show the review stored under index label 0 next to its tokenized form.
print('review :')
print(data.document[0])
print('tokenize reviw :')
print(data.twit_pos[0])

review :
아 더빙.. 진짜 짜증나네요 목소리
tokenize reviw :
['아/Exclamation', '더빙/Noun', '../Punctuation', '진짜/Noun', '짜증/Noun', '나다/Verb', '목소리/Noun']


In [9]:
# Cache the tokenized frame on disk; the tokenization cell above takes minutes to run.
result_file = path.join(work_dir, 'twit_pos_df.pkl')
with file_io.FileIO(result_file, mode='wb') as out_file:
    pickle.dump(data, out_file)

In [12]:
# If the data was saved after the first twit_pos run, uncomment the lines below to load it back
# instead of tokenizing again.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
#with open(result_file, 'rb') as fp:
#    data = pickle.load(fp)

### Make Word dictionary
word를 int로 변환하는 인코딩 사전과 다시 int를 word로 변환하는 디코딩 사전을 만듭니다.

In [17]:
# Build the decoding (int -> word) and encoding (word -> int) tables from the tokenized reviews.
int_to_word, word_to_int = mk_lookup_table(data.twit_pos)
# Persist both dictionaries together as a two-element list.
result_file = path.join(work_dir, 'word_dic.pkl')
with file_io.FileIO(result_file, mode='wb') as dict_file:
    pickle.dump([int_to_word, word_to_int], dict_file)

Total word: 2194536
Unique word: 48764


tokenize한 review word들을 위 인코드 사전을 사용해서 int로 변경합니다.

In [18]:
# Encode the tokenized reviews with the word_to_int dictionary
# (helper defined outside this notebook; the printed sample shows one list of ids per review).
int_word = raw_word_to_int(data.twit_pos, word_to_int)

In [19]:
# Compare the tokens of the first review with their integer encoding.
print(data.twit_pos[0])
print(int_word[0])

['아/Exclamation', '더빙/Noun', '../Punctuation', '진짜/Noun', '짜증/Noun', '나다/Verb', '목소리/Noun']
[9456, 9007, 9535, 9507, 9335, 9414, 8783]


### Review_word2Vec
Review의 word 데이터는 cardinality가 높아 one_hot 인코딩 시 매우 sparse하고 단어와 단어 사이의 연관성이 사라집니다. word data를 사용하기 위해서 word를 저차원 공간의 vector로 embedding하도록 프리트레이닝합니다.
word2vec 수행 과정은 다음과 같이 하였습니다.
1. subsampling을 수행합니다.
    - 빈번하게 발생하는 단어는 적게, 희소한 단어는 최대한 살리도록 sampling하여 word들이 균형 있게 학습되도록 합니다. 단어의 sample 확률 계산은 다음과 같이 합니다. <br>
    $p(w_i) = \sqrt{t / f(w_i)}$, p: 확률, f: 빈도, t: threshold
2. Skip-gram 방법으로 학습셋을 구성하고 학습합니다.
    - window size = 10 <br>
    - embedding size = 32

In [20]:
## A larger embedding size or more epochs lengthens the run time, so both are kept low to suit this machine.
word2vec_review(int_word, int_to_word, work_dir, epochs=3, n_embedding=32)
# The word2vec graph would clash with the classifier graph built later, so reset the default graph.
tf.reset_default_graph()

ELAPSED TIME: 0.22 sec
size of data: 140028
ELAPSED TIME: 1.13 sec
Epoch 1/3 Iteration: 5000 Avg. Training loss: 252.2906 0.0094 sec/batch 11 total sec
Epoch 1/3 Iteration: 10000 Avg. Training loss: 154.5364 0.0088 sec/batch 19 total sec
Epoch 1/3 Iteration: 15000 Avg. Training loss: 102.2329 0.0087 sec/batch 28 total sec
Epoch 1/3 Iteration: 20000 Avg. Training loss: 73.2507 0.0085 sec/batch 37 total sec
Epoch 1/3 Iteration: 25000 Avg. Training loss: 57.3365 0.0085 sec/batch 45 total sec
Epoch 1/3 Iteration: 30000 Avg. Training loss: 47.3191 0.0085 sec/batch 54 total sec
Epoch 1/3 Iteration: 35000 Avg. Training loss: 39.9027 0.0085 sec/batch 62 total sec
Epoch 1/3 Iteration: 40000 Avg. Training loss: 34.8042 0.0085 sec/batch 71 total sec
Epoch 1/3 Iteration: 45000 Avg. Training loss: 32.1070 0.0085 sec/batch 79 total sec
Epoch 1/3 Iteration: 50000 Avg. Training loss: 29.0050 0.0084 sec/batch 88 total sec
Epoch 1/3 Iteration: 55000 Avg. Training loss: 26.5205 0.0085 sec/batch 96 total 

Epoch 3/3 Iteration: 350000 Avg. Training loss: 16.2012 0.0089 sec/batch 615 total sec
Epoch 3/3 Iteration: 355000 Avg. Training loss: 15.8062 0.0090 sec/batch 624 total sec
Epoch 3/3 Iteration: 360000 Avg. Training loss: 16.4414 0.0090 sec/batch 633 total sec
Epoch 3/3 Iteration: 365000 Avg. Training loss: 16.1106 0.0090 sec/batch 642 total sec
Epoch 3/3 Iteration: 370000 Avg. Training loss: 15.5705 0.0089 sec/batch 651 total sec
Epoch 3/3 Iteration: 375000 Avg. Training loss: 15.5328 0.0088 sec/batch 660 total sec
Epoch 3/3 Iteration: 380000 Avg. Training loss: 16.1143 0.0089 sec/batch 669 total sec
Epoch 3/3 Iteration: 385000 Avg. Training loss: 15.7420 0.0089 sec/batch 677 total sec
Epoch 3/3 Iteration: 390000 Avg. Training loss: 15.2589 0.0090 sec/batch 686 total sec
Epoch 3/3 Iteration: 395000 Avg. Training loss: 15.8782 0.0090 sec/batch 695 total sec
Epoch 3/3 Iteration: 400000 Avg. Training loss: 16.0207 0.0088 sec/batch 704 total sec
Nearest to 똥망/Noun: 폭소/Noun, 짱구/Noun, 예의/No

### Word Embedding Visualize
word2vec으로 embedding한 word들을 3차원으로 시각화해 보고, word들이 적절히 embedding 되었는지 탐색하고 평가한다.
* word2vec은 데이터가 충분해야 학습이 잘 되지만 주어진 데이터가 적기 때문에 좋은 임베딩에는 한계가 있었습니다.

In [21]:
# Helper defined outside this notebook; per its log output it restores the saved word2vec
# checkpoint and prepares data that can be inspected with TensorBoard.
visualize_embedding(work_dir)

INFO:tensorflow:Restoring parameters from /home/khlee/tutorial/review_pos_neg_classification/w2v_model/abuse.ckpt-last
Run `tensorboard --logdir=/home/khlee/tutorial/review_pos_neg_classification/w2v_model/tb` to run visualize result on tensorboard


training loss graph
<img src="img/train_loss.png" width="480" height="320" />

embedding plot
<img src="img/embedding_plot.png" width="480" height="320" />

## Pos/Neg Modeling

### preprocessing
label 필드(output)와 토크나이즈한 review word를 int로 변환한 필드(input)로 데이터를 구성한다.

In [23]:
# Model input frame: the label (output) next to the integer-encoded review (input).
# The label index is reset so both columns line up on the same 0..n-1 integer index.
# The index is deliberately left as integers (no str conversion), so that
# model_df['int_review'][0] is a plain label lookup rather than a positional fallback.
model_df = pd.concat(
    [data['label'].reset_index(drop=True), pd.Series(int_word, name='int_review')],
    axis=1,
)
# concat pads with NaN when the inputs differ in length, so fail loudly instead.
assert len(model_df) == len(data) == len(int_word), 'labels and encoded reviews are misaligned'

In [24]:
# Peek at the first five rows of the model input frame.
model_df[:5]

Unnamed: 0,label,int_review
0,0,"[9456, 9007, 9535, 9507, 9335, 9414, 8783]"
1,1,"[8486, 9530, 9011, 9480, 8846, 9540, 9279, 948..."
2,0,"[9521, 8772, 1, 9514, 8924, 9537, 9269, 9539]"
3,0,"[2604, 9404, 627, 9535, 9272, 9459, 9526, 9525..."
4,1,"[1, 9536, 1444, 8079, 9500, 9533, 8611, 9540, ..."


review의 length가 서로 다르기 때문에, 최대 길이의 review length만큼 앞단에서 0으로 채워줍니다.

In [25]:
# First review before padding.
print(model_df['int_review'][0])
# Length of the longest encoded review; every review is padded to this length.
max_len = model_df['int_review'].map(len).max()
model_df['int_review'] = model_df['int_review'].apply(zero_pad, args=(max_len,))
# Same review after padding.
print(model_df['int_review'][0])

[9456, 9007, 9535, 9507, 9335, 9414, 8783]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0 9456 9007 9535 9507 9335 9414 8783]


### split trn & val & tst set
data를 train, validation, test set으로 분리합니다.

In [32]:
# Positional row ids of model_df, and the test / validation set sizes (20% / 10% of all rows).
idx = np.arange(len(model_df))
tst_size, val_size = (round(len(model_df) * frac) for frac in (0.2, 0.1))

In [33]:
# Fix the seed so the split is the same on every run.
np.random.seed(seed=1050)
# Draw the test rows first, then the validation rows from what is left; the remainder is the training set.
# assume_unique=True keeps the original order of the first argument, and both inputs hold unique ids.
tst_idx = np.random.choice(idx, size=tst_size, replace=False)
idx1 = np.setdiff1d(idx, tst_idx, assume_unique=True)
val_idx = np.random.choice(idx1, size=val_size, replace=False)
trn_idx = np.setdiff1d(idx1, val_idx, assume_unique=True)
print('trn_size:', len(trn_idx), 'val_size:', len(val_idx), 'tst_size:', len(tst_idx))

trn_size: 104996 val_size: 15000 tst_size: 29999


In [80]:
def mk_model_data(df, shuffle=False, random_state=None):
    """Convert a model frame into the ``(int_review, label)`` pair fed to the network.

    Args:
        df: DataFrame with an ``int_review`` column (encoded reviews) and a ``label`` column.
        shuffle: when True, the rows are randomly permuted before conversion.
        random_state: forwarded to ``DataFrame.sample`` when shuffling; pass a seed to make
            the permutation reproducible. ``None`` keeps the previous behaviour.

    Returns:
        A tuple ``(reviews, labels)``: ``reviews`` is the ``int_review`` column as a list,
        ``labels`` is a list of one-element lists (one label per row).
    """
    if shuffle:
        df = df.sample(frac=1, random_state=random_state)
    reviews = df['int_review'].values.tolist()
    # Add a trailing axis so every label becomes its own one-element row.
    labels = np.expand_dims(df['label'].values, axis=1).tolist()
    return (reviews, labels)

모델에 사용할 데이터를 다음과 같은 포맷으로 변경합니다. <br>
format : (int_review , label)

In [81]:
# Materialise the fixed (unshuffled) test and validation sets once, in that order.
tst_data, val_data = (mk_model_data(model_df.iloc[rows]) for rows in (tst_idx, val_idx))

### Set Model Parameters
* epoch: max epoch으로 학습시 early stopping logic 사용
* rnn_size: 토크나이즈된 review의 최대 길이
* n_classes: 예측 노드 수로, label 하나를 사용하므로 1.

In [96]:
# Maximum number of epochs; the training loop may stop earlier.
epoch = 10
# Sequence length fed to the network: the padded review length.
rnn_size = max_len
# Number of stacked recurrent cells.
rnn_layer = 2
# Number of output nodes; a single label is predicted.
n_classes = 1
# Number of units in each recurrent cell.
n_hidden = 8
batch_size = 128
learning_rate = 0.001
# Dropout keep probability used while training (evaluation feeds 1.0).
drop_keep_prob = 0.8
#tf.reset_default_graph()

### build the model graph

In [97]:
# set placeholder
inputs = tf.placeholder(tf.int32, [None, rnn_size], name='inputs')
target = tf.placeholder(tf.float32, [None, n_classes], name='target')
keep_prob = tf.placeholder(tf.float32, name='keep_prop')

위에서 학습한 word2vec 모델을 로드하고 embedding matrix를 불러옵니다. <br>
embedding vector의 index 0 행은 zero padding을 의미하므로 값을 0으로 대체합니다.

In [98]:
# Load the word2vec model trained earlier. The path is built from work_dir (where the
# visualisation step restores it from) so the cell does not depend on the current
# working directory of the kernel.
w2v = ImportGraph(path.join(work_dir, 'w2v_model'))
embedding, softmax_w, softmax_b = w2v.run()
# Row 0 is the id used for zero padding, so its embedding is forced to all zeros.
embedding[0] = np.zeros_like(embedding[0])

INFO:tensorflow:Restoring parameters from w2v_model/abuse.ckpt-last


int로 구성된 review data를 embedding 합니다. <br>
(?, rnn_size) 로 구성된 data가 임베딩 후 (?, rnn_size, 32)로 변경됩니다.

In [99]:
# Embedding layer
embedding = tf.Variable(embedding, name = 'embedding')
rnn_inputs = tf.nn.embedding_lookup(embedding, inputs)
print(inputs)
print(rnn_inputs)

Tensor("inputs:0", shape=(?, 95), dtype=int32)
Tensor("embedding_lookup:0", shape=(?, 95, 32), dtype=float32)


RNN과 linear layer를 통해 feedforward output을 계산합니다. <br>
최종 output은 0~1 사이값으로 예측할 것이기 때문에 sigmoid를 사용합니다.

In [100]:
# make rnn layers
def lstm_cell():
    return tf.contrib.rnn.LayerNormBasicLSTMCell(n_hidden, 
                                                 layer_norm=False,
                                                 dropout_keep_prob=keep_prob, 
                                                 reuse=tf.AUTO_REUSE)

cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(rnn_layer)], state_is_tuple=True)
initial_state = cell.zero_state(tf.shape(inputs)[0], tf.float32)

# RNN outputs
outputs, states = tf.nn.dynamic_rnn(cell, rnn_inputs, 
                                    initial_state=initial_state, dtype=tf.float32)
# linear outputs
with tf.name_scope('outputs'):
    outputs = tf.contrib.layers.fully_connected(outputs[:, -1], n_classes, activation_fn=tf.sigmoid)

mean square error loss 와 adam optimizer를 사용합니다.

In [101]:
# mean square error loss
with tf.name_scope('cost'):
    cost = tf.losses.mean_squared_error(target, outputs)
    tf.summary.scalar('cost', cost)
# adamoptimizer
with tf.name_scope('optimizer'):
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

### evaluation metric
label 긍, 부정 데이터 비율이 비슷하므로 단순히 accuracy 지표를 사용합니다.

In [102]:
# Review count per label, as a plain list.
print(data.groupby('label').size().tolist())

[75170, 74825]


In [103]:
## evaluation
with tf.name_scope('evaluation'):
    pos_eval = tf.equal(tf.round(outputs), tf.round(target))
    accuracy = tf.reduce_mean(tf.cast(pos_eval, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

### summaries & saver
학습과정을 시각화할 지표들을 summary하고 모델을 저장할 saver를 설정한다.

In [104]:
# Remove the output of a previous run so stale event files and checkpoints do not mix with
# this run. The directory is built with path.join: plain `work_dir + 'model'` lacks the path
# separator and therefore never matched the real model directory.
model_dir = path.join(work_dir, 'model')
if file_io.is_directory(model_dir):
    file_io.delete_recursively(model_dir)
# Merge all the summaries
summary_mg = tf.summary.merge_all()
# Separate writers so TensorBoard can show the training and validation curves side by side.
trn_writer = tf.summary.FileWriter(path.join(model_dir, 'tb', 'train'), graph=tf.get_default_graph())
tst_writer = tf.summary.FileWriter(path.join(model_dir, 'tb', 'test'), graph=tf.get_default_graph())
# saver
saver = tf.train.Saver()
saver.export_meta_graph(path.join(model_dir, 'abuse.ckpt.meta'))
# Trailing empty tuple: presumably keeps the cell from echoing the value returned above.
()

()

### Training
모델을 학습하는데, 각 epoch의 validation step 마다 loss를 누적하고, epoch가 끝났을 때, 평균 loss가 이전 epoch loss보다 크다면 penalty를 주고 누적 penalty가 2점이면 학습을 조기 종료합니다.

In [105]:
sess = tf.Session()

tic = time.time()
# Accumulated seconds: [0] optimisation steps, [1] validation passes, [2] unused.
spend_t = [0, 0, 0]
# The validation set is evaluated every `val_step` training batches. It is defined before
# the loops because the early-stopping average below needs it even if no batch ran.
val_step = 3
# Number of epochs whose mean validation loss got worse; training stops at 2.
trn_stop_penalty = 0

try:
    init = tf.global_variables_initializer()
    sess.run(init)

    # Validation loss of the untrained model: the baseline the first epoch has to beat.
    prev_val_epoc_loss = sess.run(cost, feed_dict={inputs: val_data[0],
                                                   target: val_data[1], keep_prob: 1.0})
    print('current val_loss:', prev_val_epoc_loss)

    for e in range(epoch):
        val_epoc_loss = 0
        # Reshuffle the training rows at the start of every epoch.
        trn_data = mk_model_data(model_df.iloc[trn_idx,:], shuffle=True)
        # Rows that do not fill a whole batch are dropped.
        batch_num = len(trn_data[0])//batch_size

        for i in range(batch_num):
            batch_x = trn_data[0][batch_size*(i):batch_size*(i+1)]
            batch_y = trn_data[1][batch_size*(i):batch_size*(i+1)]

            otic = time.time()
            _, summary, loss, trn_acc = sess.run([optimizer, summary_mg, cost, accuracy],
                        feed_dict={inputs: batch_x, target: batch_y,
                                   keep_prob: drop_keep_prob})
            # Global step = batches seen so far across all epochs.
            trn_writer.add_summary(summary, i + batch_num*e)
            otoc = time.time() - otic
            spend_t[0] += otoc

            if i % val_step == 0:
                etic = time.time()
                # Dropout is disabled (keep_prob 1.0) when evaluating.
                val_loss, val_acc, summary = \
                    sess.run([cost, accuracy, summary_mg], \
                             feed_dict={inputs: val_data[0], target: val_data[1], keep_prob: 1.0})
                val_epoc_loss += val_loss
                tst_writer.add_summary(summary, i + batch_num*e)
                etoc = time.time() - etic
                spend_t[1] += etoc

            if i % 10 == 0:
                print("Epoch:", e+1, "Batch_iter:", i, '\n',
                      "Train Loss:", loss, "Valid Loss:", val_loss, '\n',
                      "trn_acc:", round(trn_acc,3), "val_acc:", round(val_acc,3))
                print("spend time = { row/sec:", batch_size//otoc,
                      "optimize:", round(otoc,3), "eval:", round(etoc,3),
                      "Elapsed:", round(time.time() - tic,3), "}")

        ## train stop logic
        # Mean validation loss of the epoch: the sum divided by the number of validation passes.
        val_epoc_loss = val_epoc_loss / np.ceil(batch_num / val_step)
        if val_epoc_loss <= prev_val_epoc_loss:
            print("\n", "Save the model : current loss ", val_epoc_loss, "<=",
                  prev_val_epoc_loss, "prev loss", "\n")
            # Keep a per-epoch checkpoint and overwrite the '-last' checkpoint that the
            # evaluation step restores.
            saver.save(sess, work_dir + '/model/abuse.ckpt', global_step=e)
            saver.save(sess, work_dir + '/model/abuse.ckpt-last')
            prev_val_epoc_loss = val_epoc_loss
        else:
            trn_stop_penalty += 1
            print("\n", "train stop penalty:", trn_stop_penalty,
                  "current loss ", val_epoc_loss, ">",
                  prev_val_epoc_loss, "prev loss", "\n")
            if trn_stop_penalty == 2:
                break
finally:
    # Runs on normal completion, early stop, interruption or error: push pending
    # summaries to disk and release the session.
    trn_writer.flush()
    tst_writer.flush()
    sess.close()

current val_loss: 0.252392
Epoch: 1 Batch_iter: 0 
 Train Loss: 0.25644886 Valid Loss: 0.25115082 
 trn_acc: 0.492 val_acc: 0.504
spend time = { row/sec: 287.0 optimize: 0.445 eval: 1.694 Elapsed: 3.583 }
Epoch: 1 Batch_iter: 10 
 Train Loss: 0.25010484 Valid Loss: 0.24984755 
 trn_acc: 0.539 val_acc: 0.499
spend time = { row/sec: 2436.0 optimize: 0.053 eval: 2.948 Elapsed: 12.088 }
Epoch: 1 Batch_iter: 20 
 Train Loss: 0.24988881 Valid Loss: 0.24960558 
 trn_acc: 0.523 val_acc: 0.525
spend time = { row/sec: 2566.0 optimize: 0.05 eval: 1.68 Elapsed: 18.97 }
Epoch: 1 Batch_iter: 30 
 Train Loss: 0.2518602 Valid Loss: 0.2494943 
 trn_acc: 0.438 val_acc: 0.559
spend time = { row/sec: 2703.0 optimize: 0.047 eval: 0.768 Elapsed: 27.148 }
Epoch: 1 Batch_iter: 40 
 Train Loss: 0.24977326 Valid Loss: 0.24942265 
 trn_acc: 0.523 val_acc: 0.511
spend time = { row/sec: 2488.0 optimize: 0.051 eval: 1.195 Elapsed: 31.002 }
Epoch: 1 Batch_iter: 50 
 Train Loss: 0.24814078 Valid Loss: 0.24907835 
 tr

Epoch: 1 Batch_iter: 450 
 Train Loss: 0.1237121 Valid Loss: 0.1279443 
 trn_acc: 0.891 val_acc: 0.826
spend time = { row/sec: 2552.0 optimize: 0.05 eval: 0.733 Elapsed: 178.118 }
Epoch: 1 Batch_iter: 460 
 Train Loss: 0.15374365 Valid Loss: 0.12957256 
 trn_acc: 0.773 val_acc: 0.823
spend time = { row/sec: 2310.0 optimize: 0.055 eval: 0.553 Elapsed: 180.315 }
Epoch: 1 Batch_iter: 470 
 Train Loss: 0.12711717 Valid Loss: 0.12778895 
 trn_acc: 0.828 val_acc: 0.824
spend time = { row/sec: 2556.0 optimize: 0.05 eval: 0.53 Elapsed: 182.612 }
Epoch: 1 Batch_iter: 480 
 Train Loss: 0.11788119 Valid Loss: 0.12640192 
 trn_acc: 0.852 val_acc: 0.826
spend time = { row/sec: 2082.0 optimize: 0.061 eval: 0.548 Elapsed: 185.902 }
Epoch: 1 Batch_iter: 490 
 Train Loss: 0.11825576 Valid Loss: 0.12650897 
 trn_acc: 0.805 val_acc: 0.826
spend time = { row/sec: 2625.0 optimize: 0.049 eval: 0.648 Elapsed: 188.211 }
Epoch: 1 Batch_iter: 500 
 Train Loss: 0.1263632 Valid Loss: 0.13468084 
 trn_acc: 0.836 v

Epoch: 2 Batch_iter: 80 
 Train Loss: 0.123590484 Valid Loss: 0.12241151 
 trn_acc: 0.828 val_acc: 0.829
spend time = { row/sec: 2648.0 optimize: 0.048 eval: 0.592 Elapsed: 304.884 }
Epoch: 2 Batch_iter: 90 
 Train Loss: 0.11948401 Valid Loss: 0.11827834 
 trn_acc: 0.844 val_acc: 0.834
spend time = { row/sec: 2757.0 optimize: 0.046 eval: 0.579 Elapsed: 308.795 }
Epoch: 2 Batch_iter: 100 
 Train Loss: 0.12515639 Valid Loss: 0.1193208 
 trn_acc: 0.828 val_acc: 0.832
spend time = { row/sec: 2194.0 optimize: 0.058 eval: 0.591 Elapsed: 311.012 }
Epoch: 2 Batch_iter: 110 
 Train Loss: 0.14182281 Valid Loss: 0.117350444 
 trn_acc: 0.812 val_acc: 0.836
spend time = { row/sec: 2734.0 optimize: 0.047 eval: 0.541 Elapsed: 313.282 }
Epoch: 2 Batch_iter: 120 
 Train Loss: 0.117130056 Valid Loss: 0.12007254 
 trn_acc: 0.836 val_acc: 0.834
spend time = { row/sec: 2691.0 optimize: 0.048 eval: 0.544 Elapsed: 316.111 }
Epoch: 2 Batch_iter: 130 
 Train Loss: 0.11870572 Valid Loss: 0.116863266 
 trn_acc: 

Epoch: 2 Batch_iter: 530 
 Train Loss: 0.10009559 Valid Loss: 0.11256022 
 trn_acc: 0.883 val_acc: 0.844
spend time = { row/sec: 2664.0 optimize: 0.048 eval: 0.594 Elapsed: 425.279 }
Epoch: 2 Batch_iter: 540 
 Train Loss: 0.12214409 Valid Loss: 0.114951186 
 trn_acc: 0.844 val_acc: 0.842
spend time = { row/sec: 2738.0 optimize: 0.047 eval: 0.6 Elapsed: 428.146 }
Epoch: 2 Batch_iter: 550 
 Train Loss: 0.100346036 Valid Loss: 0.11215104 
 trn_acc: 0.867 val_acc: 0.845
spend time = { row/sec: 2448.0 optimize: 0.052 eval: 0.573 Elapsed: 430.419 }
Epoch: 2 Batch_iter: 560 
 Train Loss: 0.123110905 Valid Loss: 0.11508161 
 trn_acc: 0.828 val_acc: 0.839
spend time = { row/sec: 2709.0 optimize: 0.047 eval: 0.611 Elapsed: 432.626 }
Epoch: 2 Batch_iter: 570 
 Train Loss: 0.12251048 Valid Loss: 0.11559145 
 trn_acc: 0.836 val_acc: 0.839
spend time = { row/sec: 2707.0 optimize: 0.047 eval: 0.54 Elapsed: 436.076 }
Epoch: 2 Batch_iter: 580 
 Train Loss: 0.08928628 Valid Loss: 0.11269289 
 trn_acc: 0

Epoch: 3 Batch_iter: 160 
 Train Loss: 0.06796479 Valid Loss: 0.111259945 
 trn_acc: 0.922 val_acc: 0.848
spend time = { row/sec: 2259.0 optimize: 0.057 eval: 0.833 Elapsed: 537.593 }
Epoch: 3 Batch_iter: 170 
 Train Loss: 0.121445775 Valid Loss: 0.1123686 
 trn_acc: 0.844 val_acc: 0.847
spend time = { row/sec: 2804.0 optimize: 0.046 eval: 0.531 Elapsed: 539.649 }
Epoch: 3 Batch_iter: 180 
 Train Loss: 0.09963629 Valid Loss: 0.11510251 
 trn_acc: 0.852 val_acc: 0.844
spend time = { row/sec: 2669.0 optimize: 0.048 eval: 0.553 Elapsed: 542.402 }
Epoch: 3 Batch_iter: 190 
 Train Loss: 0.10137465 Valid Loss: 0.112087384 
 trn_acc: 0.852 val_acc: 0.847
spend time = { row/sec: 2675.0 optimize: 0.048 eval: 0.543 Elapsed: 544.737 }
Epoch: 3 Batch_iter: 200 
 Train Loss: 0.1255343 Valid Loss: 0.112550765 
 trn_acc: 0.828 val_acc: 0.846
spend time = { row/sec: 2755.0 optimize: 0.046 eval: 0.553 Elapsed: 546.887 }
Epoch: 3 Batch_iter: 210 
 Train Loss: 0.095077895 Valid Loss: 0.121329896 
 trn_ac

Epoch: 3 Batch_iter: 610 
 Train Loss: 0.08029018 Valid Loss: 0.11081175 
 trn_acc: 0.906 val_acc: 0.847
spend time = { row/sec: 2681.0 optimize: 0.048 eval: 0.525 Elapsed: 648.5 }
Epoch: 3 Batch_iter: 620 
 Train Loss: 0.11317252 Valid Loss: 0.11667991 
 trn_acc: 0.859 val_acc: 0.837
spend time = { row/sec: 2820.0 optimize: 0.045 eval: 0.584 Elapsed: 650.636 }
Epoch: 3 Batch_iter: 630 
 Train Loss: 0.15340579 Valid Loss: 0.115701936 
 trn_acc: 0.805 val_acc: 0.84
spend time = { row/sec: 2760.0 optimize: 0.046 eval: 0.56 Elapsed: 653.46 }
Epoch: 3 Batch_iter: 640 
 Train Loss: 0.13253084 Valid Loss: 0.111465834 
 trn_acc: 0.82 val_acc: 0.847
spend time = { row/sec: 2485.0 optimize: 0.051 eval: 0.578 Elapsed: 655.634 }
Epoch: 3 Batch_iter: 650 
 Train Loss: 0.10814216 Valid Loss: 0.110484555 
 trn_acc: 0.82 val_acc: 0.848
spend time = { row/sec: 2763.0 optimize: 0.046 eval: 0.588 Elapsed: 657.789 }
Epoch: 3 Batch_iter: 660 
 Train Loss: 0.088245526 Valid Loss: 0.110498525 
 trn_acc: 0.8

Epoch: 4 Batch_iter: 240 
 Train Loss: 0.10963529 Valid Loss: 0.111531995 
 trn_acc: 0.828 val_acc: 0.846
spend time = { row/sec: 2706.0 optimize: 0.047 eval: 0.548 Elapsed: 755.745 }
Epoch: 4 Batch_iter: 250 
 Train Loss: 0.081118576 Valid Loss: 0.113649964 
 trn_acc: 0.898 val_acc: 0.844
spend time = { row/sec: 2742.0 optimize: 0.047 eval: 0.53 Elapsed: 757.854 }
Epoch: 4 Batch_iter: 260 
 Train Loss: 0.11027515 Valid Loss: 0.11552221 
 trn_acc: 0.812 val_acc: 0.841
spend time = { row/sec: 2681.0 optimize: 0.048 eval: 0.537 Elapsed: 760.052 }
Epoch: 4 Batch_iter: 270 
 Train Loss: 0.09220943 Valid Loss: 0.1283331 
 trn_acc: 0.867 val_acc: 0.826
spend time = { row/sec: 2667.0 optimize: 0.048 eval: 0.549 Elapsed: 762.842 }
Epoch: 4 Batch_iter: 280 
 Train Loss: 0.09940116 Valid Loss: 0.112046994 
 trn_acc: 0.852 val_acc: 0.845
spend time = { row/sec: 2665.0 optimize: 0.048 eval: 0.51 Elapsed: 764.935 }
Epoch: 4 Batch_iter: 290 
 Train Loss: 0.09200232 Valid Loss: 0.11031881 
 trn_acc: 

Epoch: 4 Batch_iter: 690 
 Train Loss: 0.103175715 Valid Loss: 0.111420445 
 trn_acc: 0.867 val_acc: 0.846
spend time = { row/sec: 2722.0 optimize: 0.047 eval: 0.536 Elapsed: 861.36 }
Epoch: 4 Batch_iter: 700 
 Train Loss: 0.115284875 Valid Loss: 0.1096127 
 trn_acc: 0.844 val_acc: 0.849
spend time = { row/sec: 2438.0 optimize: 0.052 eval: 0.568 Elapsed: 863.572 }
Epoch: 4 Batch_iter: 710 
 Train Loss: 0.10187134 Valid Loss: 0.116349176 
 trn_acc: 0.867 val_acc: 0.839
spend time = { row/sec: 2696.0 optimize: 0.047 eval: 0.529 Elapsed: 865.711 }
Epoch: 4 Batch_iter: 720 
 Train Loss: 0.10512663 Valid Loss: 0.12957379 
 trn_acc: 0.867 val_acc: 0.817
spend time = { row/sec: 2664.0 optimize: 0.048 eval: 0.543 Elapsed: 868.426 }
Epoch: 4 Batch_iter: 730 
 Train Loss: 0.10076995 Valid Loss: 0.109803274 
 trn_acc: 0.867 val_acc: 0.849
spend time = { row/sec: 2504.0 optimize: 0.051 eval: 0.525 Elapsed: 870.581 }
Epoch: 4 Batch_iter: 740 
 Train Loss: 0.10246977 Valid Loss: 0.113822885 
 trn_ac

Epoch: 5 Batch_iter: 320 
 Train Loss: 0.097957775 Valid Loss: 0.11105356 
 trn_acc: 0.883 val_acc: 0.85
spend time = { row/sec: 2606.0 optimize: 0.049 eval: 0.52 Elapsed: 967.384 }
Epoch: 5 Batch_iter: 330 
 Train Loss: 0.08608013 Valid Loss: 0.11462837 
 trn_acc: 0.867 val_acc: 0.846
spend time = { row/sec: 2778.0 optimize: 0.046 eval: 0.537 Elapsed: 970.002 }
Epoch: 5 Batch_iter: 340 
 Train Loss: 0.09514638 Valid Loss: 0.1104063 
 trn_acc: 0.883 val_acc: 0.846
spend time = { row/sec: 2368.0 optimize: 0.054 eval: 0.552 Elapsed: 972.217 }
Epoch: 5 Batch_iter: 350 
 Train Loss: 0.102976345 Valid Loss: 0.11273019 
 trn_acc: 0.867 val_acc: 0.842
spend time = { row/sec: 2607.0 optimize: 0.049 eval: 0.547 Elapsed: 974.298 }
Epoch: 5 Batch_iter: 360 
 Train Loss: 0.063602656 Valid Loss: 0.11139064 
 trn_acc: 0.922 val_acc: 0.848
spend time = { row/sec: 2668.0 optimize: 0.048 eval: 0.525 Elapsed: 977.061 }
Epoch: 5 Batch_iter: 370 
 Train Loss: 0.07450007 Valid Loss: 0.11247922 
 trn_acc: 0

Epoch: 5 Batch_iter: 770 
 Train Loss: 0.09670307 Valid Loss: 0.11198778 
 trn_acc: 0.852 val_acc: 0.845
spend time = { row/sec: 2743.0 optimize: 0.047 eval: 0.518 Elapsed: 1070.887 }
Epoch: 5 Batch_iter: 780 
 Train Loss: 0.1111305 Valid Loss: 0.109556526 
 trn_acc: 0.859 val_acc: 0.851
spend time = { row/sec: 2805.0 optimize: 0.046 eval: 0.541 Elapsed: 1073.544 }
Epoch: 5 Batch_iter: 790 
 Train Loss: 0.07854281 Valid Loss: 0.10902131 
 trn_acc: 0.914 val_acc: 0.849
spend time = { row/sec: 2644.0 optimize: 0.048 eval: 0.509 Elapsed: 1075.588 }
Epoch: 5 Batch_iter: 800 
 Train Loss: 0.07119894 Valid Loss: 0.110599 
 trn_acc: 0.875 val_acc: 0.851
spend time = { row/sec: 2563.0 optimize: 0.05 eval: 0.567 Elapsed: 1077.697 }
Epoch: 5 Batch_iter: 810 
 Train Loss: 0.11305578 Valid Loss: 0.115653515 
 trn_acc: 0.859 val_acc: 0.839
spend time = { row/sec: 2771.0 optimize: 0.046 eval: 0.546 Elapsed: 1080.295 }

 Save the model : current loss  0.11245834569100045 <= 0.11325716045107284 prev l

Epoch: 6 Batch_iter: 400 
 Train Loss: 0.08032751 Valid Loss: 0.113704026 
 trn_acc: 0.898 val_acc: 0.844
spend time = { row/sec: 2768.0 optimize: 0.046 eval: 0.527 Elapsed: 1174.129 }
Epoch: 6 Batch_iter: 410 
 Train Loss: 0.11744004 Valid Loss: 0.11063331 
 trn_acc: 0.859 val_acc: 0.85
spend time = { row/sec: 2496.0 optimize: 0.051 eval: 0.535 Elapsed: 1176.23 }
Epoch: 6 Batch_iter: 420 
 Train Loss: 0.11368914 Valid Loss: 0.112040594 
 trn_acc: 0.852 val_acc: 0.844
spend time = { row/sec: 2742.0 optimize: 0.047 eval: 0.525 Elapsed: 1178.855 }
Epoch: 6 Batch_iter: 430 
 Train Loss: 0.07538034 Valid Loss: 0.1100735 
 trn_acc: 0.891 val_acc: 0.85
spend time = { row/sec: 2554.0 optimize: 0.05 eval: 0.524 Elapsed: 1180.95 }
Epoch: 6 Batch_iter: 440 
 Train Loss: 0.13152826 Valid Loss: 0.110134 
 trn_acc: 0.844 val_acc: 0.848
spend time = { row/sec: 2648.0 optimize: 0.048 eval: 0.554 Elapsed: 1183.054 }
Epoch: 6 Batch_iter: 450 
 Train Loss: 0.09123443 Valid Loss: 0.109193236 
 trn_acc: 0

Epoch: 7 Batch_iter: 30 
 Train Loss: 0.07516505 Valid Loss: 0.110232964 
 trn_acc: 0.93 val_acc: 0.85
spend time = { row/sec: 2649.0 optimize: 0.048 eval: 0.514 Elapsed: 1280.008 }
Epoch: 7 Batch_iter: 40 
 Train Loss: 0.075590216 Valid Loss: 0.113606155 
 trn_acc: 0.914 val_acc: 0.848
spend time = { row/sec: 2452.0 optimize: 0.052 eval: 0.555 Elapsed: 1282.145 }
Epoch: 7 Batch_iter: 50 
 Train Loss: 0.08429375 Valid Loss: 0.11387999 
 trn_acc: 0.898 val_acc: 0.847
spend time = { row/sec: 2596.0 optimize: 0.049 eval: 0.52 Elapsed: 1284.324 }
Epoch: 7 Batch_iter: 60 
 Train Loss: 0.11641014 Valid Loss: 0.12781806 
 trn_acc: 0.859 val_acc: 0.831
spend time = { row/sec: 2359.0 optimize: 0.054 eval: 0.535 Elapsed: 1286.948 }
Epoch: 7 Batch_iter: 70 
 Train Loss: 0.10257378 Valid Loss: 0.118319094 
 trn_acc: 0.844 val_acc: 0.835
spend time = { row/sec: 2299.0 optimize: 0.056 eval: 0.569 Elapsed: 1289.128 }
Epoch: 7 Batch_iter: 80 
 Train Loss: 0.10001368 Valid Loss: 0.109480895 
 trn_acc: 

Epoch: 7 Batch_iter: 480 
 Train Loss: 0.103255205 Valid Loss: 0.10903087 
 trn_acc: 0.859 val_acc: 0.847
spend time = { row/sec: 2734.0 optimize: 0.047 eval: 0.543 Elapsed: 1384.287 }
Epoch: 7 Batch_iter: 490 
 Train Loss: 0.12845698 Valid Loss: 0.10972373 
 trn_acc: 0.82 val_acc: 0.849
spend time = { row/sec: 2600.0 optimize: 0.049 eval: 0.549 Elapsed: 1386.453 }
Epoch: 7 Batch_iter: 500 
 Train Loss: 0.08982992 Valid Loss: 0.110193826 
 trn_acc: 0.891 val_acc: 0.848
spend time = { row/sec: 2578.0 optimize: 0.05 eval: 0.502 Elapsed: 1388.536 }
Epoch: 7 Batch_iter: 510 
 Train Loss: 0.10480264 Valid Loss: 0.11088042 
 trn_acc: 0.852 val_acc: 0.847
spend time = { row/sec: 2765.0 optimize: 0.046 eval: 0.539 Elapsed: 1391.143 }
Epoch: 7 Batch_iter: 520 
 Train Loss: 0.10976503 Valid Loss: 0.1143513 
 trn_acc: 0.812 val_acc: 0.844
spend time = { row/sec: 2176.0 optimize: 0.059 eval: 0.583 Elapsed: 1393.953 }
Epoch: 7 Batch_iter: 530 
 Train Loss: 0.05238349 Valid Loss: 0.11013213 
 trn_ac

Epoch: 8 Batch_iter: 110 
 Train Loss: 0.07518141 Valid Loss: 0.11211671 
 trn_acc: 0.891 val_acc: 0.848
spend time = { row/sec: 2773.0 optimize: 0.046 eval: 0.541 Elapsed: 1490.191 }
Epoch: 8 Batch_iter: 120 
 Train Loss: 0.09092942 Valid Loss: 0.110318735 
 trn_acc: 0.859 val_acc: 0.849
spend time = { row/sec: 2562.0 optimize: 0.05 eval: 0.525 Elapsed: 1492.867 }
Epoch: 8 Batch_iter: 130 
 Train Loss: 0.063900545 Valid Loss: 0.112003006 
 trn_acc: 0.914 val_acc: 0.85
spend time = { row/sec: 2599.0 optimize: 0.049 eval: 0.534 Elapsed: 1494.96 }
Epoch: 8 Batch_iter: 140 
 Train Loss: 0.096899934 Valid Loss: 0.11096823 
 trn_acc: 0.883 val_acc: 0.849
spend time = { row/sec: 2677.0 optimize: 0.048 eval: 0.52 Elapsed: 1497.054 }
Epoch: 8 Batch_iter: 150 
 Train Loss: 0.060415477 Valid Loss: 0.110241376 
 trn_acc: 0.93 val_acc: 0.848
spend time = { row/sec: 2592.0 optimize: 0.049 eval: 0.52 Elapsed: 1499.665 }
Epoch: 8 Batch_iter: 160 
 Train Loss: 0.10356181 Valid Loss: 0.11229427 
 trn_a

Epoch: 8 Batch_iter: 560 
 Train Loss: 0.09607889 Valid Loss: 0.1106959 
 trn_acc: 0.859 val_acc: 0.85
spend time = { row/sec: 2781.0 optimize: 0.046 eval: 0.528 Elapsed: 1593.651 }
Epoch: 8 Batch_iter: 570 
 Train Loss: 0.09118281 Valid Loss: 0.11023497 
 trn_acc: 0.898 val_acc: 0.847
spend time = { row/sec: 2782.0 optimize: 0.046 eval: 0.529 Elapsed: 1596.328 }
Epoch: 8 Batch_iter: 580 
 Train Loss: 0.104209855 Valid Loss: 0.11273009 
 trn_acc: 0.828 val_acc: 0.846
spend time = { row/sec: 2114.0 optimize: 0.061 eval: 0.558 Elapsed: 1598.484 }
Epoch: 8 Batch_iter: 590 
 Train Loss: 0.10313464 Valid Loss: 0.112320505 
 trn_acc: 0.844 val_acc: 0.847
spend time = { row/sec: 2723.0 optimize: 0.047 eval: 0.544 Elapsed: 1600.762 }
Epoch: 8 Batch_iter: 600 
 Train Loss: 0.11174908 Valid Loss: 0.11604168 
 trn_acc: 0.875 val_acc: 0.84
spend time = { row/sec: 2632.0 optimize: 0.049 eval: 0.546 Elapsed: 1603.398 }
Epoch: 8 Batch_iter: 610 
 Train Loss: 0.087901 Valid Loss: 0.11277831 
 trn_acc:

Epoch: 9 Batch_iter: 190 
 Train Loss: 0.06241507 Valid Loss: 0.11627412 
 trn_acc: 0.922 val_acc: 0.847
spend time = { row/sec: 2536.0 optimize: 0.05 eval: 0.515 Elapsed: 1698.096 }
Epoch: 9 Batch_iter: 200 
 Train Loss: 0.067480296 Valid Loss: 0.11451844 
 trn_acc: 0.93 val_acc: 0.845
spend time = { row/sec: 2441.0 optimize: 0.052 eval: 0.536 Elapsed: 1700.234 }
Epoch: 9 Batch_iter: 210 
 Train Loss: 0.10530302 Valid Loss: 0.113457955 
 trn_acc: 0.891 val_acc: 0.846
spend time = { row/sec: 2757.0 optimize: 0.046 eval: 0.525 Elapsed: 1702.824 }
Epoch: 9 Batch_iter: 220 
 Train Loss: 0.059348434 Valid Loss: 0.112959474 
 trn_acc: 0.922 val_acc: 0.849
spend time = { row/sec: 2408.0 optimize: 0.053 eval: 0.55 Elapsed: 1705.002 }
Epoch: 9 Batch_iter: 230 
 Train Loss: 0.08412037 Valid Loss: 0.1109132 
 trn_acc: 0.883 val_acc: 0.85
spend time = { row/sec: 2662.0 optimize: 0.048 eval: 0.524 Elapsed: 1707.126 }
Epoch: 9 Batch_iter: 240 
 Train Loss: 0.08878236 Valid Loss: 0.11881346 
 trn_ac

Epoch: 9 Batch_iter: 640 
 Train Loss: 0.0838186 Valid Loss: 0.11139777 
 trn_acc: 0.898 val_acc: 0.846
spend time = { row/sec: 2741.0 optimize: 0.047 eval: 0.618 Elapsed: 1802.64 }
Epoch: 9 Batch_iter: 650 
 Train Loss: 0.08339214 Valid Loss: 0.111094646 
 trn_acc: 0.875 val_acc: 0.85
spend time = { row/sec: 2593.0 optimize: 0.049 eval: 0.516 Elapsed: 1804.716 }
Epoch: 9 Batch_iter: 660 
 Train Loss: 0.10736982 Valid Loss: 0.112472005 
 trn_acc: 0.844 val_acc: 0.844
spend time = { row/sec: 2693.0 optimize: 0.048 eval: 0.678 Elapsed: 1807.557 }
Epoch: 9 Batch_iter: 670 
 Train Loss: 0.073526435 Valid Loss: 0.109199055 
 trn_acc: 0.898 val_acc: 0.847
spend time = { row/sec: 2286.0 optimize: 0.056 eval: 0.555 Elapsed: 1809.671 }
Epoch: 9 Batch_iter: 680 
 Train Loss: 0.10389748 Valid Loss: 0.10980479 
 trn_acc: 0.844 val_acc: 0.847
spend time = { row/sec: 2680.0 optimize: 0.048 eval: 0.552 Elapsed: 1811.811 }
Epoch: 9 Batch_iter: 690 
 Train Loss: 0.1034797 Valid Loss: 0.11147627 
 trn_a

tensorboard --logdir work_dir/model/tb <br>
위 tensorboard를 실행하면 학습과정을 확인할 수 있다. <br>
파란선은 train, 주황선은 valid
<img src="img/cost.png" width="480" height="320" />
<img src="img/accuracy.png" width="480" height="320" />

### Evaluation of test set 

best model parameter를 restore 합니다.

In [106]:
# Open a new session and load the weights from the '-last' checkpoint written during training.
sess = tf.Session()
# NOTE(review): `init` is created but never run in this cell; the variable values come from restore().
init = tf.global_variables_initializer()
saver.restore(sess, work_dir + "/model/abuse.ckpt-last")

INFO:tensorflow:Restoring parameters from /home/khlee/tutorial/review_pos_neg_classification/model/abuse.ckpt-last


In [107]:
# Evaluate accuracy on the held-out test set; keep_prob 1.0 turns dropout off.
tst_feed = {inputs: tst_data[0], target: tst_data[1], keep_prob: 1.0}
tst_acc = sess.run(accuracy, feed_dict=tst_feed)
print('test set accuracy:', tst_acc)

test set accuracy: 0.85126173


## Conclusion
네이버 영화 리뷰 평점을 가지고 긍부정 리뷰를 분리하는 모델을 만드는, 튜토리얼 수준으로 진행하였다. <br>
모델 과정을 단순하게 요약하면 다음과 같다. <br>

1. 리뷰 단어들을 형태소 분석기를 통해 분리한다.
2. 단어들을 word2vec model을 통해 벡터화 한다.
3. RNN 기반의 모델을 통해 binomial classification 문제를 수행한다.


## Future Work
위에서 수행한 과정은 리뷰의 긍부정을 분리하기 위해 단순하게 수행한 튜토리얼 수준으로 진행하였다. <br>
모델을 고도화 하기 위해서는 다음과 같은 실험들이 수행되어야 한다. <br>

1. 적절한 형태소 분석기의 선택.
    * 위에서는 단순히 가장 많이 사용되는 twitter 형태소 분석기를 그대로 사용하였고, 형태소가 잘 분리 되었는지, 더 처리 되어야할 word들이 있는지에 대한 자세한 전처리 과정은 생략하였다.
2. word2vec model 검증.
    * 위에서는 프리트레이닝 개념으로 컴퓨팅 파워가 허락되는 선에서 간단하게 수행하였다.
    * word2vec 모델의 파라미터 튜닝 등 적절히 embedding 되었는지에 대한 추가 검증이 필요할 것으로 생각한다.
3. RNN 모델의 고도화
    * 위에서는 가장 많이 사용되는 LSTM 모형을 사용하였고, 파라미터 서치를 거치지 않았다.
    * GRU, Bi-Rnn, Attention 등의 고도화된 모델을 수행해볼 필요가 있다.
4. Label 데이터 변경
    * 위 데이터는 긍, 부정(1,0)으로 프로세싱된 데이터를 참조하였다. 
    * binomial이 아닌 1~10점을 그대로 linear 변수를 사용하여 Score 예측을 수행하고, 후에 점수를 커팅하여 긍부정을 분리한다면, 더 좋은 성능을 보이지 않을까 생각해 본다.
    * score로 예측하는 것이 추후 활용면에서 더 다양할 것으로 생각된다.