# **[Tensorflow] Doc2vec과 LSTM을 이용한 분류기**

## **1. Data loading**

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import math
from konlpy.tag import Komoran
import gensim
import multiprocessing

from gensim.models.word2vec import Word2Vec

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from collections import Counter

In [2]:
train_df = pd.read_csv('./input/ratings_train.txt', delimiter='\t')
pos_df = train_df[train_df['label'] == 1]['document']
neg_df = train_df[train_df['label'] == 0]['document']

test_df = pd.read_csv('./input/ratings_test.txt', delimiter='\t')
test_pos_df = test_df[test_df['label'] == 1]['document']
test_neg_df = test_df[test_df['label'] == 0]['document']

## **2. Data preprocessing**
### **(1) Text를 word의 seq로 변경**

In [3]:
komoran = Komoran() 

##### **Train set**

In [4]:
pos_data = []
pos_label = []
for pos in pos_df:
    try:
        words = komoran.nouns(pos)
        # 단어가 3개 이상 등장한 review 에 대해서만 데이터 set을 생성한다.
        if len(words) > 3:
            pos_data.append(words)
            # 긍정을 의미하는 label 이다.
            pos_label.append(1)
    except:
        pass
    
neg_data = []
neg_label = []
for neg in neg_df:
    try:
        words = komoran.nouns(neg)
        if len(words) > 3:
            neg_data.append(words)
            neg_label.append(0)
    except:
        pass
    
texts = pos_data + neg_data
labels = pos_label + neg_label

print(len(texts))
print(len(labels))

78986
78986


In [5]:
texts[:5]

[['포스터', '초딩', '영화', '줄', '오버', '연기'],
 ['익살', '연기', '영화', '스파이더맨', '커스틴 던스트'],
 ['액션', '재미', '안', '영화'],
 ['평점', '것', '만', '헐리우드', '식'],
 ['볼', '때', '눈물', '년대', '향수', '자극', '허진호', '감성', '절제', '멜로', '달인']]

##### **Test set**

In [6]:
test_pos_data = []
test_pos_label = []
for pos in test_pos_df:
    try:
        words = komoran.nouns(pos)
        # 단어가 3개 이상 등장한 review 에 대해서만 데이터 set을 생성한다.
        if len(words) > 3:
            test_pos_data.append(words)
            # 긍정을 의미하는 label 이다.
            test_pos_label.append(1)
    except:
        pass
    
test_neg_data = []
test_neg_label = []
for neg in test_neg_df:
    try:
        words = komoran.nouns(neg)
        if len(words) > 3:
            test_neg_data.append(words)
            test_neg_label.append(0)
    except:
        pass
    
test_texts = test_pos_data + test_neg_data
test_labels = test_pos_label + test_neg_label

print(len(test_texts))
print(len(test_labels))

26457
26457


In [7]:
test_texts[:5]

[['음악', '주가', '최고', '음악', '영화'],
 ['이별', '아픔', '뒤', '인연', '기쁨', '사람'],
 ['청춘', '이성', '찰나', '포착', '수채화', '퀴어', '영화'],
 ['눈', '반전', '영화', '흡인력'],
 ['13일의 금요일',
  '나이트메어',
  '시리즈',
  '시리즈',
  '양산',
  '레이저',
  '시리즈',
  '편',
  '작가',
  '상상력',
  '작품',
  '갈고리',
  '고어',
  '씨',
  '충격']]

### **(2) Word의 seq를 number의 seq로 변경**

##### **Train set**

In [8]:
# Indexing dictionary
words = []
for text in texts:
    words.extend(text)
    
voca_size = 25000
corpus = {}

# 빈도수가 높은 단어 순서대로, indexing이 된다.
for word, freq in Counter(words).most_common(voca_size):
    corpus[word] = len(corpus)

# 결과 디버깅을 위하여, number seq에서, number에 대응하는 word를 찾기 위한 lookup dictionary
corpus_rev = dict(zip(corpus.values(), corpus.keys()))

# Number seq를 생성한다. 즉 단어에 대응되는 num의 seq로 바꾸는 작업
num_seqs = []

for text in texts:
    num_seq = []
    for word in text:
        if word in corpus:
            idx = corpus[word]
        else:
            idx = 0  # 없는 단어는 0으로 한다. voca_size를 지정했기 때문이다.
        num_seq.append(idx)
    num_seqs.append(num_seq)

##### **Test set**

In [9]:
# Indexing dictionary
test_words = []
for test_text in test_texts:
    test_words.extend(test_text)
    
voca_size = 25000
test_corpus = {}

# 빈도수가 높은 단어 순서대로, indexing이 된다.
for test_word, freq in Counter(test_words).most_common(voca_size):
    corpus[test_word] = len(test_corpus)

# 결과 디버깅을 위하여, number seq에서, number에 대응하는 word를 찾기 위한 lookup dictionary
test_corpus_rev = dict(zip(test_corpus.values(), test_corpus.keys()))

# Number seq를 생성한다. 즉 단어에 대응되는 num의 seq로 바꾸는 작업
test_num_seqs = []

for test_text in test_texts:
    test_num_seq = []
    for test_word in test_text:
        if test_word in test_corpus:
            idx = test_corpus[test_word]
        else:
            idx = 0  # 없는 단어는 0으로 한다. voca_size를 지정했기 때문이다.
        test_num_seq.append(idx)
    test_num_seqs.append(test_num_seq)

In [10]:
merged_text = texts + test_texts
merged_labels = labels + test_labels

## **3. Word embedding**

### **(1) Doc2vec**

In [11]:
# Doc와 doc_id를 가지는 TaggedDocument의 generator를 생성한다.
class LabledLineSentence(object):
    def __init__(self, doc_list, doc_id_list):
        self.doc_list = doc_list
        self.doc_id_list = doc_id_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield gensim.models.doc2vec.TaggedDocument(words=doc, tags=[self.doc_id_list[idx]])

In [12]:
it = LabledLineSentence(merged_text, merged_labels)

In [13]:
doc2vec_model = gensim.models.Doc2Vec(
    vector_size=128, window=3, alpha=0.025, min_alpha=0.025, min_count=2, workers=multiprocessing.cpu_count()
)

In [14]:
doc2vec_model.build_vocab(it)
doc2vec_model.train(it, total_examples=len(merged_text), epochs=100)

len(doc2vec_model.wv.vocab)

19202

### **(2) Word2vec**

In [17]:
word2vec_model = gensim.models.Word2Vec(
    merged_text, size=128, window=1, min_count=2, workers=multiprocessing.cpu_count()
)

## **5. LSTM for classification**

### **(1) Input, output 설정**

In [20]:
# We change LabelEncoder's return shape.
class ReshapedLabelEncoder(LabelEncoder):
    def fit_transform(self, y, *args, **kwargs):
        return super().fit_transform(y).reshape(-1, 1)

In [21]:
label_encoder = ReshapedLabelEncoder()
onehot_encoder = OneHotEncoder()

pipeline = Pipeline([
    ('label_encoder', label_encoder),
    ('onehot_encoder', onehot_encoder)
])

# For output
onehot_labels = pipeline.fit_transform(labels).toarray()
test_onehot_labels = pipeline.fit_transform(test_labels).toarray()

### **(2) LSTM 모델 정의**

In [22]:
# Parameter 설정
EPOCHS = 10
BATCH_SIZE = 128
SEQUENCE_LENGTH = 10
HIDDEN_SIZE = 128
DIM_INPUT = 128
DIM_OUTPUT = len(label_encoder.classes_)

In [23]:
# Placeholders
X = tf.placeholder(tf.float32, shape=[None, SEQUENCE_LENGTH, DIM_INPUT])
t = tf.placeholder(tf.float32, shape=[None, DIM_OUTPUT])
batch_size = tf.placeholder(tf.int32, [])

In [24]:
# Model
def inference(x, batch_size):
    # Cell를 정의
    lstm_cell = tf.nn.rnn_cell.MultiRNNCell([
        tf.nn.rnn_cell.LSTMCell(num_units=HIDDEN_SIZE),
        tf.nn.rnn_cell.LSTMCell(num_units=HIDDEN_SIZE),
        tf.nn.rnn_cell.LSTMCell(num_units=HIDDEN_SIZE),
    ])
    # TODO : Dropout 설정
    # Initial state 정의
    initial_state = lstm_cell.zero_state(batch_size, dtype=tf.float32)
    # Cell output 정의
    cell_outputs, state = tf.nn.dynamic_rnn(lstm_cell, x, initial_state=initial_state, dtype=tf.float32)
    final_cell_output = cell_outputs[:, -1, :]
    # Weight matrix
    V = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, DIM_OUTPUT]))
    # Biases
    c = tf.Variable(tf.zeros([DIM_OUTPUT]))
    return tf.matmul(final_cell_output, V) + c

In [25]:
# Loss function
def loss_func(y, t):
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=y, labels=t))

In [26]:
# Define optimizer
def train(loss):
    return tf.train.AdamOptimizer().minimize(loss)

In [27]:
# Connect graph nodes
y = inference(X, batch_size)
loss = loss_func(y, t)
train_step = train(loss)

In [28]:
# Evaluation
correct_pred = tf.equal(tf.argmax(y, 1), tf.argmax(t, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

### **(3) Batch data 생성**

##### **Train set**

In [29]:
input_data = []
targets = []

for text, onehot_label in zip(texts, onehot_labels):
    if len(text) < SEQUENCE_LENGTH:
        temp_input_data = []
        num_na = SEQUENCE_LENGTH - len(text)
        text = text + num_na * ['NA']
        for word in text:
            if word == 'NA':
                temp_input_data.append(np.zeros([DIM_INPUT]))
            else:
                try:
                    # Word2vec vs Doc2vec
                    temp_input_data.append(doc2vec_model.wv.get_vector(word))
                except:
                    temp_input_data.append(np.zeros([DIM_INPUT]))
        input_data.append(np.array(temp_input_data))
        targets.append(onehot_label)
    else:
        for i in range(len(text) - SEQUENCE_LENGTH):
            temp_input_data = []
            for word in text[i:i+SEQUENCE_LENGTH]:
                try:
                    temp_input_data.append(doc2vec_model.wv.get_vector(word))
                except:
                    temp_input_data.append(np.zeros([DIM_INPUT]))
            input_data.append(np.array(temp_input_data))
            targets.append(onehot_label)

In [30]:
input_data[0][0].shape

(128,)

In [31]:
# RNN, LSTM cell의 input 형태로 reshape한다.
input_data = np.array(input_data).reshape(-1, SEQUENCE_LENGTH, DIM_INPUT)
targets = np.array(targets).reshape(-1, DIM_OUTPUT)

In [32]:
input_data.shape

(154867, 10, 128)

##### **Test set**

In [33]:
test_input_data = []
test_targets = []

for text, onehot_label in zip(test_texts, test_onehot_labels):
    if len(text) < SEQUENCE_LENGTH:
        temp_input_data = []
        num_na = SEQUENCE_LENGTH - len(text)
        text = text + num_na * ['NA']
        for word in text:
            if word == 'NA':
                temp_input_data.append(np.zeros([DIM_INPUT]))
            else:
                try:
                    # Word2vec vs Doc2vec
                    temp_input_data.append(doc2vec_model.wv.get_vector(word))
                except:
                    temp_input_data.append(np.zeros([DIM_INPUT]))
        test_input_data.append(np.array(temp_input_data))
        test_targets.append(onehot_label)
    else:
        for i in range(len(text) - SEQUENCE_LENGTH):
            temp_input_data = []
            for word in text[i:i+SEQUENCE_LENGTH]:
                try:
                    temp_input_data.append(doc2vec_model.wv.get_vector(word))
                except:
                    temp_input_data.append(np.zeros([DIM_INPUT]))
            test_input_data.append(np.array(temp_input_data))
            test_targets.append(onehot_label)

In [34]:
# RNN, LSTM cell의 input 형태로 reshape한다.
test_input_data = np.array(test_input_data).reshape(-1, SEQUENCE_LENGTH, DIM_INPUT)
test_targets = np.array(test_targets).reshape(-1, DIM_OUTPUT)

In [35]:
test_input_data.shape

(52262, 10, 128)

### **(4) 학습**

In [36]:
init = tf.global_variables_initializer()
sess = tf.Session()

In [37]:
sess.run(init)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(input_data, targets, test_size = 0.1)
num_batches = len(X_train) // BATCH_SIZE
num_validation = len(X_test)

for epoch in range(EPOCHS):
    X_samp, y_samp = shuffle(X_train, y_train)
    for i in range(num_batches):
        start = i * BATCH_SIZE
        end = start + BATCH_SIZE
        
        _, train_loss = sess.run([train_step, loss], feed_dict={
            X: X_samp[start:end],
            t: y_samp[start:end],
            batch_size: BATCH_SIZE
        })
        
    validation_acc = sess.run(accuracy, feed_dict={
        X: X_test,
        t: y_test,
        batch_size: num_validation
    })

    test_acc = sess.run(accuracy, feed_dict={
        X: test_input_data,
        t: test_targets,
        batch_size: len(test_input_data)
    })
        
    print("%d epoch's final - train loss: %f, validation_acc: %f, test_acc: %f"
          %(epoch, train_loss, validation_acc, test_acc))

0 epoch's final - train loss: 0.431219, validation_acc: 0.795635, test_acc: 0.738816
1 epoch's final - train loss: 0.345797, validation_acc: 0.844127, test_acc: 0.745972
2 epoch's final - train loss: 0.191326, validation_acc: 0.865952, test_acc: 0.740155
3 epoch's final - train loss: 0.270534, validation_acc: 0.873830, test_acc: 0.741457
4 epoch's final - train loss: 0.171740, validation_acc: 0.874411, test_acc: 0.732789
5 epoch's final - train loss: 0.187230, validation_acc: 0.866662, test_acc: 0.726321
6 epoch's final - train loss: 0.086780, validation_acc: 0.881255, test_acc: 0.735085
7 epoch's final - train loss: 0.122334, validation_acc: 0.882611, test_acc: 0.736596
8 epoch's final - train loss: 0.093612, validation_acc: 0.880610, test_acc: 0.731277
9 epoch's final - train loss: 0.115922, validation_acc: 0.879512, test_acc: 0.729536


In [39]:
accuracy.eval(session=sess, feed_dict={
    X: test_input_data,
    t: test_targets,
    batch_size: len(test_input_data)
})

0.7295358