# BiLSTM을 이용한 영화 리뷰 감정 분석
Word2Vec으로 임베딩한 네이버 영화 리뷰 데이터를 BiLSTM으로 긍정/부정 분류하는 예제


## Train

In [2]:
import time
import os
import tensorflow as tf
import numpy as np
import Bi_LSTM
import Word2Vec



In [3]:
# Locations of every data artefact used by this notebook, all relative to
# DATA_PATH. Paths are assembled with a literal "/" separator.
DATA_PATH = "../data"
TRAIN_DATA = "{}/ratings_train.txt".format(DATA_PATH)   # training split
TEST_DATA = "{}/ratings_test.txt".format(DATA_PATH)     # test split
MODEL_DATA = "{}/Word2Vec.model".format(DATA_PATH)      # model file handed to W2V.Convert2Vec
META_DATA = "{}/metadata.tsv".format(DATA_PATH)
WORD2VEC_PATH = "{}/word2vec".format(DATA_PATH)

In [4]:
# Project-local Word2Vec helper object. The cells in this notebook call its
# read_data, tokenize, One_hot, Convert2Vec and Zero_padding methods.
W2V = Word2Vec.Word2Vec()

In [5]:
# Load the training file at TRAIN_DATA. The tokenize step indexes each row as
# row[1] (passed to W2V.tokenize) and row[2] (cast to int and used as the label).
train_data = W2V.read_data(TRAIN_DATA)

In [6]:
## Tokenize every training row and split the result into inputs and labels.
## Rows whose text tokenizes to an empty list are dropped.
print("Tokenize Start!\nCould take minutes...")
tokenized_rows = []
for row in train_data:
    # Tokenize once per row: this is the slow step, so do not repeat it
    # for the emptiness check.
    row_tokens = W2V.tokenize(row[1])
    if row_tokens != []:
        tokenized_rows.append([row_tokens, int(row[2])])

# Build the (N, 2) object array explicitly. The token lists have different
# lengths, and recent NumPy versions refuse to create such ragged arrays
# implicitly. Pre-allocating also keeps the column slices below valid when no
# row survives the filter.
tokens = np.empty((len(tokenized_rows), 2), dtype=object)
for row_index, (row_tokens, label) in enumerate(tokenized_rows):
    tokens[row_index, 0] = row_tokens
    tokens[row_index, 1] = label
print("Tokenize Done!")

train_X = tokens[:, 0]  # per-sample token lists
train_Y = tokens[:, 1]  # per-sample integer labels

Tokenize Start!
Could take minutes...


  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


Tokenize Done!


In [None]:
# Prepare the arrays that the training loop slices into batches and feeds to
# the Y and X placeholders.
train_Y_ = W2V.One_hot(train_Y)  ## convert the integer labels to one-hot form
train_X_ = W2V.Convert2Vec(MODEL_DATA,train_X)  ## uses the previously trained Word2Vec model at MODEL_DATA; presumably maps each token to its word vector (confirm in Word2Vec.py)

- num_class : 분류할 클래스 수 (긍정 / 부정 → 2)
- keep_prob : dropout 적용 시 유닛을 유지할 확률 (drop rate = 1 - keep_prob)

In [None]:
# --- Sizes derived from the tokenized training data ---
Total_size = len(train_X)                # number of training samples
seq_length = list(map(len, train_X))     # token count of each sample
Maxseq_length = max(seq_length)          # longest sample (95 in the original run)

# --- Model / optimisation hyper-parameters ---
Vector_size = 300        # last dimension of the X placeholder
lstm_units = 128
num_class = 2
keep_prob = 0.75
learning_rate = 0.001
Batch_size = 32
training_epochs = 10

In [None]:
# Graph inputs. The leading None dimension leaves the batch size open.
#   X       : padded input batch, Maxseq_length steps of Vector_size floats
#   Y       : one target row of num_class floats per sample
#   seq_len : one int32 length per sample
X = tf.placeholder(dtype=tf.float32, shape=(None, Maxseq_length, Vector_size), name='X')
Y = tf.placeholder(dtype=tf.float32, shape=(None, num_class), name='Y')
seq_len = tf.placeholder(dtype=tf.int32, shape=(None,))

In [None]:
# Project-local Bi_LSTM model object built from the hyper-parameters.
# Its W and b attributes are passed to BiLSTM.logits when the graph is assembled.
BiLSTM = Bi_LSTM.Bi_LSTM(lstm_units, num_class, keep_prob)

In [None]:
# Assemble the training graph inside the "loss" variable scope.
# reuse=tf.AUTO_REUSE makes tf.get_variable create a variable on first use and
# reuse it afterwards, so this cell can be re-run in the same graph.
with tf.variable_scope("loss", reuse = tf.AUTO_REUSE):
    # Class scores for the batch in X, given the per-sample lengths in seq_len.
    logits = BiLSTM.logits(X, BiLSTM.W, BiLSTM.b, seq_len)
    # Loss tensor and the training op that is run once per batch.
    loss, optimizer = BiLSTM.model_build(logits, Y, learning_rate)

In [None]:
# Batch accuracy: the fraction of samples whose highest-probability class
# matches the class marked in the target row Y.
prediction = tf.nn.softmax(logits)
predicted_class = tf.argmax(prediction, axis=1)
actual_class = tf.argmax(Y, axis=1)
correct_pred = tf.equal(predicted_class, actual_class)
accuracy = tf.reduce_mean(tf.cast(correct_pred, dtype=tf.float32))

In [None]:
# Op that initializes all global variables; the training session runs it
# once before the first batch.
init = tf.global_variables_initializer()

In [None]:
# Number of full batches per epoch; a trailing partial batch is not counted.
total_batch = Total_size // Batch_size

In [None]:
# Announce the start of training and prepare checkpointing: the Saver object
# plus the path the trained model is written to once training finishes.
print("Start training!")
saver = tf.train.Saver()
modelName = '{}/BiLSTM_model.ckpt'.format(DATA_PATH)

In [None]:
# Train the model for `training_epochs` passes over the training data, write
# per-epoch average loss/accuracy summaries for TensorBoard, then save a
# checkpoint to `modelName`.
with tf.Session() as sess:

    start_time = time.time()
    sess.run(init)
    train_writer = tf.summary.FileWriter(DATA_PATH, sess.graph)
    merged = BiLSTM.graph_build()

    try:
        for epoch in range(training_epochs):

            avg_acc, avg_loss = 0., 0.
            for step in range(total_batch):
                batch_start = step * Batch_size
                batch_end = batch_start + Batch_size

                train_batch_X = train_X_[batch_start:batch_end]
                train_batch_Y = train_Y_[batch_start:batch_end]
                batch_seq_length = seq_length[batch_start:batch_end]

                # Pad the batch so it matches the fixed shape of placeholder X.
                train_batch_X = W2V.Zero_padding(train_batch_X, Batch_size, Maxseq_length, Vector_size)

                # Fetch the update op, loss and accuracy in ONE run call. The
                # reported numbers then come from the same forward pass as the
                # gradient step, instead of recomputing the network twice more
                # per batch just for logging.
                batch_feed = {X: train_batch_X, Y: train_batch_Y, seq_len: batch_seq_length}
                _, loss_, acc = sess.run([optimizer, loss, accuracy], feed_dict=batch_feed)

                # Running per-epoch averages.
                avg_loss += loss_ / total_batch
                avg_acc += acc / total_batch
                print("epoch : {:02d} step : {:04d} loss = {:.6f} accuracy= {:.6f}".format(epoch+1, step+1, loss_, acc))

            # Feed the epoch averages into the summary placeholders exposed by
            # the model object and log them under the epoch index.
            summary = sess.run(merged, feed_dict={BiLSTM.loss: avg_loss, BiLSTM.acc: avg_acc})
            train_writer.add_summary(summary, epoch)
    finally:
        # Always flush and close the event file, even if training is interrupted.
        train_writer.close()

    duration = time.time() - start_time
    minute, second = divmod(int(duration), 60)
    print("%dminutes %dseconds" % (minute, second))
    save_path = saver.save(sess, modelName)

    print('save_path', save_path)

    ## To inspect the run in TensorBoard: the event files are written to DATA_PATH,
    ## so open a terminal and run `tensorboard --logdir=<DATA_PATH>`
    ## (or cd into that directory and run `tensorboard --logdir=./`).