In [1]:
import json
import os
from datetime import datetime

import keras
import numpy as np
import tensorflow as tf
from keras.layers import CuDNNLSTM, Dense, Embedding, Flatten
from keras.models import Sequential
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


## 넘파이 배열로 저장된 데이터 로드

In [2]:
# Locations and file names of the preprocessed training artifacts.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
INPUT_TRAIN_DATA = 'review_train_input.npy'
LABEL_TRAIN_DATA = 'review_train_label.npy'
DATA_CONFIGS = 'data_configs.json'

# Pass paths (not bare open() handles) so np.load opens and closes the
# files itself; the previous open(...) handles were never closed.
input_data = np.load(os.path.join(DATA_IN_PATH, INPUT_TRAIN_DATA))
label_data = np.load(os.path.join(DATA_IN_PATH, LABEL_TRAIN_DATA))

# The context manager guarantees the config file is closed after parsing.
with open(os.path.join(DATA_IN_PATH, DATA_CONFIGS), 'r') as config_file:
    prepro_configs = json.load(config_file)

In [8]:
# Experiment settings.
TEST_SPLIT = 0.1        # fraction of the loaded data held out for validation
RNG_SEED = 13371447     # fixed seed so the split below is reproducible
VOCAB_SIZE = prepro_configs['vocab_size']
EMB_SIZE = 32
# NOTE(review): the fit calls visible in this notebook pass epochs=10 and
# batch_size=32 as literals rather than using these two constants.
BATCH_SIZE = 16
NUM_EPOCHS = 1

# Hold out TEST_SPLIT of the samples as an evaluation set.
split_parts = train_test_split(
    input_data,
    label_data,
    test_size=TEST_SPLIT,
    random_state=RNG_SEED,
)
input_train, input_eval, label_train, label_eval = split_parts

In [7]:
# Display the vocabulary size used as the Embedding input dimension.
VOCAB_SIZE

43756

## 1. DNN

In [9]:
# Baseline feed-forward classifier: embed every token, flatten the sequence,
# then classify with one hidden layer and a sigmoid output.
model1 = Sequential()

# Flatten + Dense need a fixed sequence length. Take it from the training
# data instead of hard-coding it, so the model follows the preprocessing.
sequence_length = input_train.shape[1]
model1.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMB_SIZE, input_length=sequence_length))
model1.add(Flatten())

# Add a hidden (learning) layer.
model1.add(Dense(32, activation='relu'))

# Add the binary classifier head.
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model1.summary()

history1 = model1.fit(input_train, label_train,
                      epochs=10,
                      batch_size=32,
                      validation_data=(input_eval, label_eval))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 8, 32)             1400192   
_________________________________________________________________
flatten_3 (Flatten)          (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                8224      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 1,408,449
Trainable params: 1,408,449
Non-trainable params: 0
_________________________________________________________________
Train on 135000 samples, validate on 15000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# File names of the preprocessed test artifacts (located under DATA_IN_PATH).
INPUT_TEST_DATA = 'review_test_input.npy'
LABEL_TEST_DATA = 'review_test_label.npy'

# Pass paths (not bare open() handles) so np.load opens and closes the
# files itself; the previous open(...) handles were never closed.
input_test = np.load(os.path.join(DATA_IN_PATH, INPUT_TEST_DATA))
# NOTE: despite its name, input_label is loaded from the test *label* file.
input_label = np.load(os.path.join(DATA_IN_PATH, LABEL_TEST_DATA))

In [12]:
# Evaluate the DNN on the test arrays; the result unpacks into loss and accuracy.
test_loss, test_acc = model1.evaluate(x=input_test, y=input_label)



In [13]:
# Report the accuracy obtained from the test-set evaluation.
test_acc_message = 'test_acc : {}'.format(test_acc)
print(test_acc_message)

test_acc : 0.80286


## 2. LSTM

In [15]:
import keras

In [22]:
# Training callbacks for the first LSTM run.
callbacks_list = [
    # Stop training after 3 epochs without improvement in validation accuracy.
    keras.callbacks.EarlyStopping(monitor='val_acc', patience=3),
    # Write the model to review_lstm.h5 only when validation loss improves.
    keras.callbacks.ModelCheckpoint(filepath='review_lstm.h5', monitor='val_loss', save_best_only=True),
]

In [19]:
from keras.layers import CuDNNLSTM

# Single-layer LSTM classifier: embedding -> CuDNNLSTM(32) -> sigmoid output.
lstm_model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMB_SIZE),
    CuDNNLSTM(32),
    Dense(1, activation='sigmoid'),
])
lstm_model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Train with early stopping / checkpointing from callbacks_list.
lstm_history = lstm_model.fit(
    input_train,
    label_train,
    epochs=10,
    batch_size=32,
    validation_data=(input_eval, label_eval),
    callbacks=callbacks_list,
)

Train on 135000 samples, validate on 15000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [21]:
# Training callbacks with a longer early-stopping patience (5 epochs).
# NOTE(review): ModelCheckpoint appears to keep its best monitored value on the
# instance, so passing this same list to fit() for several different models
# would make them share one "best" value and one checkpoint file — confirm
# before reusing it.
callbacks_list1 = [
    # Stop training after 5 epochs without improvement in validation accuracy.
    keras.callbacks.EarlyStopping(monitor='val_acc', patience=5),
    # Write the model to review_lstm1.h5 only when validation loss improves.
    keras.callbacks.ModelCheckpoint(filepath='review_lstm1.h5', monitor='val_loss', save_best_only=True),
]

In [23]:
from keras.layers import CuDNNLSTM

# Wider single-layer LSTM classifier: embedding -> CuDNNLSTM(64) -> sigmoid output.
lstm_model1 = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMB_SIZE),
    CuDNNLSTM(64),
    Dense(1, activation='sigmoid'),
])
lstm_model1.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Train with early stopping / checkpointing from callbacks_list1.
lstm_history1 = lstm_model1.fit(
    input_train,
    label_train,
    epochs=10,
    batch_size=32,
    validation_data=(input_eval, label_eval),
    callbacks=callbacks_list1,
)

Train on 135000 samples, validate on 15000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Stacked LSTM classifier: the first CuDNNLSTM returns the full sequence so the
# second CuDNNLSTM can consume it, followed by a sigmoid output.
lstm_model2 = Sequential()
lstm_model2.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMB_SIZE))
lstm_model2.add(CuDNNLSTM(32, return_sequences=True))
lstm_model2.add(CuDNNLSTM(16))
lstm_model2.add(Dense(1, activation='sigmoid'))

lstm_model2.compile(optimizer='rmsprop',
                    loss='binary_crossentropy',
                    metrics=['acc'])

# Fresh callbacks for this model only. ModelCheckpoint stores its best
# monitored value on the instance, so reusing the callbacks (and checkpoint
# file) of an earlier model would compare this model's val_loss against that
# model's best and overwrite its saved file.
lstm_model2_callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_acc', patience=5),
    keras.callbacks.ModelCheckpoint(
        filepath='review_lstm2.h5',
        monitor='val_loss',
        save_best_only=True,
    ),
]

lstm_history2 = lstm_model2.fit(input_train, label_train,
                                epochs=10,
                                batch_size=32,
                                validation_data=(input_eval, label_eval),
                                callbacks=lstm_model2_callbacks)

Train on 135000 samples, validate on 15000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [29]:
# Inspect the labels of the held-out evaluation split.
label_eval

array([1, 0, 0, ..., 1, 0, 1], dtype=int64)

#### < Note >
    
    훈련 정확도 자체가 크게 오르지 않는 것은 데이터가 불충분해서일 수 있다. 현재 데이터의 양(quantity)은 부족하지 않으므로,
    질(quality)이 부족하다고 가정하고 embedding dimensionality도 조정해 보자.
    
    딕셔너리(vocabulary)의 크기에 비해 임베딩 차원 수가 부족해 전체 단어를 잘 표현하지 못하고 있을 가능성이 있다.

## embedding dimensionality 조정

- EMB_SIZE * 2

In [30]:
# Single-layer LSTM classifier with the embedding width doubled (EMB_SIZE * 2).
lstm_model3 = Sequential()
lstm_model3.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMB_SIZE * 2))
lstm_model3.add(CuDNNLSTM(32))
lstm_model3.add(Dense(1, activation='sigmoid'))

lstm_model3.compile(optimizer='rmsprop',
                    loss='binary_crossentropy',
                    metrics=['acc'])

# Fresh callbacks for this model only. ModelCheckpoint stores its best
# monitored value on the instance, so reusing the callbacks (and checkpoint
# file) of an earlier model would compare this model's val_loss against that
# model's best and overwrite its saved file.
lstm_model3_callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_acc', patience=5),
    keras.callbacks.ModelCheckpoint(
        filepath='review_lstm3.h5',
        monitor='val_loss',
        save_best_only=True,
    ),
]

lstm_history3 = lstm_model3.fit(input_train, label_train,
                                epochs=10,
                                batch_size=32,
                                validation_data=(input_eval, label_eval),
                                callbacks=lstm_model3_callbacks)

Train on 135000 samples, validate on 15000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


- EMB_SIZE * 8

In [None]:
# Single-layer LSTM classifier with the embedding width multiplied by 8 (EMB_SIZE * 8).
lstm_model4 = Sequential()
lstm_model4.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMB_SIZE * 8))
lstm_model4.add(CuDNNLSTM(32))
lstm_model4.add(Dense(1, activation='sigmoid'))

lstm_model4.compile(optimizer='rmsprop',
                    loss='binary_crossentropy',
                    metrics=['acc'])

# Fresh callbacks for this model only. ModelCheckpoint stores its best
# monitored value on the instance, so reusing the callbacks (and checkpoint
# file) of an earlier model would compare this model's val_loss against that
# model's best and overwrite its saved file.
lstm_model4_callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_acc', patience=5),
    keras.callbacks.ModelCheckpoint(
        filepath='review_lstm4.h5',
        monitor='val_loss',
        save_best_only=True,
    ),
]

# Fit lstm_model4 itself; the previous code called lstm_model3.fit here, which
# kept training the EMB_SIZE * 2 model and left lstm_model4 untrained.
lstm_history4 = lstm_model4.fit(input_train, label_train,
                                epochs=10,
                                batch_size=32,
                                validation_data=(input_eval, label_eval),
                                callbacks=lstm_model4_callbacks)