# BiLSTM 모델 훈련시키기
Word2Vec으로 임베딩한 네이버 영화 리뷰 데이터를 BiLSTM으로 긍정/부정 분류하는 예제


In [2]:
import time
import os
import tensorflow as tf
import numpy as np

from konlpy.tag import Twitter
import gensim

## 워드 임베딩 클래스 만들기

In [3]:
class Word2Vec():
    """Preprocessing helpers: tokenizing, embedding lookup, padding, one-hot labels.

    The class wraps a konlpy POS tagger and a pre-trained gensim Word2Vec
    model file; it does not train an embedding itself.
    """

    def __init__(self):
        # The POS tagger is created lazily on first use so that building the
        # helper object stays cheap and has no side effects.
        self._pos_tagger = None

    def _tokenize(self, doc):
        """Return the tokens of `doc` as a list of 'surface/POS' strings.

        The tagger is built once and reused; constructing a new tagger for
        every document is wasteful when a whole corpus is tokenized.
        """
        tagger = getattr(self, '_pos_tagger', None)
        if tagger is None:
            tagger = self._pos_tagger = Twitter()
        return ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)]

    def _read_data(self, filename):
        """Read a UTF-8 tab-separated file and return its rows without the first line.

        Each row is the list of tab-separated fields of one line. Lines are
        taken from file iteration rather than `str.splitlines()`, because
        `splitlines()` also breaks on characters such as form feed or U+2028
        that may occur inside a text field and would split one row into two.
        """
        with open(filename, 'r', encoding='utf-8') as f:
            data = [line.rstrip('\n').split('\t') for line in f]
        return data[1:]  # drop the first (header) line

    def _word2vec_model(self, model_name):
        """Load and return the gensim Word2Vec model stored at `model_name`."""
        model = gensim.models.word2vec.Word2Vec.load(model_name)
        return model

    def _convert2vec(self, model_name, doc):  ## Convert corpus into vectors
        """Map every token of every sentence in `doc` to a vector.

        Args:
            model_name: path of a saved gensim Word2Vec model.
            doc: iterable of sentences, each an iterable of tokens.

        Returns:
            A list with one list of vectors per sentence. Tokens missing from
            the model get a fresh uniform random vector in [-0.25, 0.25).
        """
        model = self._word2vec_model(model_name)
        vectors = model.wv
        # Size the out-of-vocabulary vector from the model instead of assuming 300.
        dim = vectors.vector_size

        word_vec = []
        for sent in doc:
            sub = []
            for word in sent:
                # Membership on the keyed vectors avoids the `.vocab`
                # attribute, which newer gensim releases no longer provide.
                if word in vectors:
                    sub.append(vectors[word])
                else:
                    sub.append(np.random.uniform(-0.25, 0.25, dim))  ## used for OOV words
            word_vec.append(sub)

        return word_vec

    def _zero_padding(self, train_batch_X, Batch_size, Maxseq_length, Vector_size):
        """Copy a batch of variable-length sentences into a zero-padded array.

        Args:
            train_batch_X: sequence of sentences, each a 2-D (words x dims)
                array-like.
            Batch_size: number of rows in the returned array.
            Maxseq_length: number of time steps in the returned array.
            Vector_size: embedding width of the returned array.

        Returns:
            Array of shape (Batch_size, Maxseq_length, Vector_size). A batch
            shorter than `Batch_size` leaves the remaining rows zero, an empty
            sentence stays all-zero, and a sentence longer than
            `Maxseq_length` is truncated.
        """
        zero_pad = np.zeros((Batch_size, Maxseq_length, Vector_size))
        for i, sentence in enumerate(train_batch_X[:Batch_size]):
            vectors = np.asarray(sentence)
            if vectors.ndim != 2:
                # An empty sentence has no (words x dims) shape; leave it as zeros.
                continue
            n_words = min(vectors.shape[0], Maxseq_length)
            zero_pad[i, :n_words, :vectors.shape[1]] = vectors[:n_words]

        return zero_pad

    def _one_hot(self, data):
        """Return one one-hot vector (1-D float array) per label in `data`.

        Class indices follow the sorted order of the distinct labels, so the
        mapping is the same on every run (for labels 0 and 1: 0 -> [1, 0],
        1 -> [0, 1]). Labels that cannot be ordered fall back to
        first-appearance order.
        """
        labels = list(data)
        try:
            classes = sorted(set(labels))
        except TypeError:
            classes = list(dict.fromkeys(labels))

        index_dict = {value: index for index, value in enumerate(classes)}
        num_classes = len(index_dict)

        result = []
        for value in labels:
            one_hot = np.zeros(num_classes)
            one_hot[index_dict[value]] = 1
            result.append(one_hot)

        return result

In [4]:
# Shared preprocessing helper used by the cells below
W2V = Word2Vec()

In [5]:
# Load the training file as a list of tab-split rows (the first line is dropped by _read_data)
train_data = W2V._read_data("../data/ratings_train.txt")

In [6]:
# Preview the first five rows
train_data[:5]

[['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0'],
 ['3819312', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'],
 ['10265843', '너무재밓었다그래서보는것을추천한다', '0'],
 ['9045019', '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '0'],
 ['6483659',
  '사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다',
  '1']]

In [8]:
## Tokenize every review once and keep only the rows that produced tokens
print("Tokenize Start!\n 시간이 조금 걸립니다...")
tokenized_rows = []
for row in train_data:
    # Tokenizing is the slow step, so run it exactly once per row.
    row_tokens = W2V._tokenize(row[1])
    if row_tokens:
        tokenized_rows.append((row_tokens, int(row[2])))

# Fill an explicit (N, 2) object array: column 0 holds the token list and
# column 1 the integer label. Building it cell by cell avoids asking NumPy to
# infer a shape from ragged nested lists, which recent releases reject.
tokens = np.empty((len(tokenized_rows), 2), dtype=object)
for i, (row_tokens, label) in enumerate(tokenized_rows):
    tokens[i, 0] = row_tokens
    tokens[i, 1] = label
print("Tokenize Done!")

train_X = tokens[:, 0]
train_Y = tokens[:, 1]

Tokenize Start!
 시간이 조금 걸립니다...
Tokenize Done!


In [9]:
# Inspect the first (token list, label) pair
tokens[0]

array([list(['아/Exclamation', '더빙/Noun', '../Punctuation', '진짜/Noun', '짜증나다/Adjective', '목소리/Noun']),
       0], dtype=object)

In [11]:
# Encode the integer labels as one-hot vectors.
train_Y_ = W2V._one_hot(
    train_Y,
)
# Look up every token in the previously trained Word2Vec model.
train_X_ = W2V._convert2vec(
    "../data/Word2Vec.model",
    train_X,
)

X : word 는 벡터로 변환

In [12]:
# Show the first sentence as tokens, then as vectors, then the shape of the embedded corpus
print(train_X[0])
print('===================================================================================================')
print(train_X_[0])
print('===================================================================================================')
print("모양은 ",np.shape(train_X_))

['아/Exclamation', '더빙/Noun', '../Punctuation', '진짜/Noun', '짜증나다/Adjective', '목소리/Noun']
[array([ 7.32496306e-02,  2.22933188e-01, -9.02157873e-02, -4.99585450e-01,
        2.36245394e-01, -1.31890193e-01, -2.19496593e-01,  1.25076905e-01,
        1.32397085e-01, -1.35619342e-01, -4.08657826e-03, -2.67318219e-01,
        4.22551222e-02, -9.23841745e-02, -4.60642219e-01, -1.30053654e-01,
       -3.67718697e-01, -1.95943817e-01, -9.36812982e-02,  9.62777734e-02,
        1.40498474e-01, -2.34336369e-02,  1.90773055e-01, -2.41759449e-01,
       -1.28186852e-01, -8.79751816e-02,  2.48150826e-01, -1.45640582e-01,
        1.96511790e-01,  4.62780923e-01, -6.52259737e-02, -1.39704287e-01,
        2.08631471e-01, -4.46586937e-01,  1.38629675e-02,  3.39872003e-01,
       -1.17882267e-01,  1.04133867e-01, -9.91496146e-02, -2.44869933e-01,
       -1.47121936e-01,  1.32883474e-01, -4.94030684e-01, -3.84940505e-02,
       -1.91327259e-01,  1.78052127e-01,  2.72258967e-01,  5.64066529e-01,
        1.9

Y : label은 one-hot 으로 변환
- 긍정은 : [0, 1]
- 부정은 : [1, 0]

In [13]:
# Show the first label as an integer, then as a one-hot vector, then the shape of all labels
print(train_Y[0])
print('===================================================================================================')
print(train_Y_[0])
print('===================================================================================================')
print("모양은 ",np.shape(train_Y_))

0
[1. 0.]
모양은  (149995, 2)


## Train 시작!

클래스를 정의
- lstm_dims : LSTM 은닉 상태(hidden state)의 차원 수
- num_class : 분류할 클래스 수 (긍정/부정이므로 2)
- keep_prob : dropout 적용 시 출력을 유지할 확률 (drop 비율 = 1 - keep_prob)

In [14]:
class Bi_LSTM():
    """Bidirectional LSTM classifier assembled from TensorFlow 1.x graph ops.

    The constructor creates the forward and backward cells and the output
    projection; the remaining methods add logits, loss/optimizer and summary
    ops to the default graph.
    """

    def __init__(self, lstm_dims, num_class, keep_prob):
        """Create the two LSTM cells and the projection variables.

        Args:
            lstm_dims: number of units in each LSTM cell.
            num_class: width of the output projection.
            keep_prob: value passed as `output_keep_prob` to both dropout wrappers.
        """
        self.lstm_dims = lstm_dims

        with tf.variable_scope('forward', reuse=tf.AUTO_REUSE):
            fw_cell = tf.nn.rnn_cell.LSTMCell(lstm_dims, forget_bias=1.0, state_is_tuple=True)
            self.lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=keep_prob)

        with tf.variable_scope('backward', reuse=tf.AUTO_REUSE):
            bw_cell = tf.nn.rnn_cell.LSTMCell(lstm_dims, forget_bias=1.0, state_is_tuple=True)
            self.lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=keep_prob)

        with tf.variable_scope('Weights', reuse=tf.AUTO_REUSE):
            # The projection input is the forward and backward states side by side.
            concat_dims = 2 * lstm_dims
            self.W = tf.get_variable(
                name="W",
                shape=[concat_dims, num_class],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(),
            )
            self.b = tf.get_variable(
                name="b",
                shape=[num_class],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
            )

    def _logits(self, X, W, b, seq_len):
        """Run the bidirectional RNN over `X` and project its final states.

        Args:
            X: input tensor fed to the RNN.
            W: projection matrix multiplied with the concatenated states.
            b: bias added after the projection.
            seq_len: per-example sequence lengths passed to the RNN.

        Returns:
            The tensor `matmul(concat(fw_state[1], bw_state[1]), W) + b`.
        """
        _, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
            self.lstm_fw_cell,
            self.lstm_bw_cell,
            inputs=X,
            sequence_length=seq_len,
            dtype=tf.float32,
        )
        # Element 1 of each direction's final state tuple is joined along the feature axis.
        final_states = tf.concat([fw_state[1], bw_state[1]], axis=1)
        return tf.matmul(final_states, W) + b

    def _model_build(self, logits, labels, learning_rate = 0.001):
        """Add the softmax cross-entropy loss and an Adam training op.

        Returns:
            A `(loss, train_op)` pair: the batch-mean loss and the op that
            minimizes it.
        """
        with tf.variable_scope("loss"):
            per_example = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
            mean_loss = tf.reduce_mean(per_example)  # [batch] -> scalar
            train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(mean_loss)

        return mean_loss, train_op

    def _graph_build(self):
        """Create loss/accuracy placeholders, attach scalar summaries, return the merged op."""
        self.loss = tf.placeholder(tf.float32)
        self.acc = tf.placeholder(tf.float32)
        for tag, value in (('Loss', self.loss), ('Accuracy', self.acc)):
            tf.summary.scalar(tag, value)
        return tf.summary.merge_all()

In [15]:
# Model and optimizer settings
learning_rate = 0.001
lstm_units = 128
num_class = 2  # positive / negative
training_epochs = 10  # passes over the training data
keep_prob = 0.75

# Input geometry
vector_size = 300
seq_length = list(map(len, train_X))  # token count of every sentence
maxseq_length = max(seq_length)  ## 95

In [16]:
# Padded word vectors for a batch of sentences
X = tf.placeholder(
    tf.float32,
    shape=(None, maxseq_length, vector_size),
    name='X',
)
# One-hot labels
Y = tf.placeholder(
    tf.float32,
    shape=(None, num_class),
    name='Y',
)
# Sequence length of every sentence in the batch
seq_len = tf.placeholder(tf.int32, shape=(None,))

In [17]:
# Create the cells and projection variables; keep_prob is passed here as a plain Python number
BiLSTM = Bi_LSTM(lstm_units, num_class, keep_prob)

손실 함수와 역전파 함수를 정의

In [18]:
# Build logits, loss and the training op inside a "loss" variable scope.
# NOTE(review): _model_build opens its own "loss" scope as well, so its ops are
# nested one level deeper; renaming either scope would presumably change the
# variable names stored in existing checkpoints - confirm before touching.
with tf.variable_scope("loss", reuse = tf.AUTO_REUSE):
    logits = BiLSTM._logits(X, BiLSTM.W, BiLSTM.b, seq_len)
    loss, optimizer = BiLSTM._model_build(logits, Y, learning_rate)

In [19]:
# Class probabilities from the logits
prediction = tf.nn.softmax(logits)
# An example counts as correct when the predicted class equals the labelled class
correct_pred = tf.equal(
    tf.argmax(prediction, axis=1),
    tf.argmax(Y, axis=1),
)
# Fraction of correct examples in the batch
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [20]:
# Op that initializes every global variable created so far
init = tf.global_variables_initializer()

학습 준비

In [21]:
batch_size = 32 # each training step uses 32 sentences
total_size = len(train_X)  # number of training sentences

In [22]:
# Number of full batches per epoch; a trailing partial batch is not counted
total_batch = total_size // batch_size
total_batch

4687

In [23]:
print("Start training!")
# Checkpoint path that the training cell writes to once training finishes
modelName = "../data/BiLSTM_model.ckpt"
saver = tf.train.Saver()

Start training!


In [25]:
# Train for `training_epochs` epochs, log per-epoch averages, then save a checkpoint.
with tf.Session() as sess:

    start_time = time.time()
    sess.run(init)
    train_writer = tf.summary.FileWriter("../data", sess.graph)
    merged = BiLSTM._graph_build()

    try:
        for epoch in range(training_epochs):

            avg_acc, avg_loss = 0. , 0.
            for step in range(total_batch):

                batch_start = step * batch_size
                batch_end = batch_start + batch_size

                train_batch_X = train_X_[batch_start:batch_end]
                train_batch_Y = train_Y_[batch_start:batch_end]
                batch_seq_length = seq_length[batch_start:batch_end]

                train_batch_X = W2V._zero_padding(train_batch_X, batch_size, maxseq_length, vector_size)
                feed = {X: train_batch_X, Y: train_batch_Y, seq_len: batch_seq_length}

                # Fetch the update, loss and accuracy in one run so the batch
                # is evaluated once and both metrics describe the same pass.
                _, loss_, acc = sess.run([optimizer, loss, accuracy], feed_dict=feed)

                # Accumulate the epoch averages
                avg_loss += loss_ / total_batch
                avg_acc += acc / total_batch
                if step < 10 or step % 10 == 0:
                    print("epoch : {:02d} step : {:04d} loss = {:.6f} accuracy= {:.6f}".format(epoch+1, step+1, loss_, acc))

            summary = sess.run(merged, feed_dict = {BiLSTM.loss : avg_loss, BiLSTM.acc : avg_acc})
            train_writer.add_summary(summary, epoch)
    finally:
        # Close the event file even when training is interrupted.
        train_writer.close()

    save_path = saver.save(sess, modelName)

    duration = time.time() - start_time
    minute = int(duration / 60)
    second = int(duration) % 60
    print("걸린시간 : %d 분 %d 초" % (minute,second))

    print ('해당 경로에 저장됨 : ',save_path)

epoch : 01 step : 0001 loss = 0.693555 accuracy= 0.593750
epoch : 01 step : 0002 loss = 0.701532 accuracy= 0.500000
epoch : 01 step : 0003 loss = 0.683355 accuracy= 0.500000
epoch : 01 step : 0004 loss = 0.656963 accuracy= 0.625000
epoch : 01 step : 0005 loss = 0.641137 accuracy= 0.625000
epoch : 01 step : 0006 loss = 0.688791 accuracy= 0.531250
epoch : 01 step : 0007 loss = 0.747891 accuracy= 0.437500
epoch : 01 step : 0008 loss = 0.673151 accuracy= 0.531250
epoch : 01 step : 0009 loss = 0.611888 accuracy= 0.875000
epoch : 01 step : 0010 loss = 0.626608 accuracy= 0.843750
epoch : 01 step : 0011 loss = 0.661692 accuracy= 0.625000
epoch : 01 step : 0021 loss = 0.536810 accuracy= 0.750000
epoch : 01 step : 0031 loss = 0.535836 accuracy= 0.656250
epoch : 01 step : 0041 loss = 0.633448 accuracy= 0.531250
epoch : 01 step : 0051 loss = 0.478704 accuracy= 0.781250
epoch : 01 step : 0061 loss = 0.390426 accuracy= 0.843750
epoch : 01 step : 0071 loss = 0.589718 accuracy= 0.750000
epoch : 01 ste

KeyboardInterrupt: 