In [1]:
import logging, os
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'

import json
import numpy as np
import tensorflow as tf
from keras.datasets import imdb
from keras.preprocessing import sequence

# 난수 고정
np.random.seed(123)
tf.random.set_seed(123)

# data load를 위함 함수 정의
np_load_old = np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)

n_of_training_ex = 5000
n_of_testing_ex = 1000

PATH = "data\\"

def imdb_data_load():
    # data load
    X_train = np.load(PATH + "data_X_train.npy")[:n_of_training_ex]
    y_train = np.load(PATH + "data_y_train.npy")[:n_of_training_ex]
    X_test = np.load(PATH + "data_X_test.npy")[:n_of_testing_ex]
    y_test = np.load(PATH + "data_y_test.npy")[:n_of_testing_ex]
    
    # json 파일에 저장된 단어 index 불러오기
    with open(PATH + "data_imdb_word_index.json") as f:
        word_index = json.load(f)
    # Dictionary의 "단어: Index" 를 "Index: 단어" 로 변환
    inverted_word_index = dict((i, word) for (word, i) in word_index.items())
    # 인덱스를 기준 단어를 문장으로 변환
    decoded_sequence = " ".join(inverted_word_index[i] for i in X_train[0])
    
    print("First X_train data sample: \n", decoded_sequence)
    print("\n First train data sample token index sequence: \n", X_train[0])
    print("\nLength of first train data sample token index sequence: ", len(X_train[0]))
    print("\nFirst y_train data: ", y_train[0])
    
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = imdb_data_load()

First X_train data sample: 
 the as you with out themselves powerful and and their becomes and had and of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every and and movie except her was several of enough more with is now and film as you of and and unfortunately of you than him that with out themselves her get for was and of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of and and with heart had and they of here that with her serious to have does when from why what have and they is you that isn't one will very to as itself with other and in of seen over and for anyone of and br and to whether from than out themselves history he name half some br of and and was two most of mean for 1 any an and she he should is thought and but of script you not while history he heart to real at and but when from one bit then have two of script their with her and

In [2]:
# padding 수행
# maxlen: 최대 길이 설정 (넘으면 자른다)
# padding: default=pre (왼쪽에 0) / padding=-1 (-1) / padding='post' (오른쪽 0)

max_review_length = 300

X_train = sequence.pad_sequences(X_train, maxlen=max_review_length, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length, padding='post')

print("\n<Padding> First X_train data sample token index sequence: \n", X_train[0])


<Padding> First X_train data sample token index sequence: 
 [  1  14  22  16  43 530 973   2   2  65 458   2  66   2   4 173  36 256
   5  25 100  43 838 112  50 670   2   9  35 480 284   5 150   4 172 112
 167   2 336 385  39   4 172   2   2  17 546  38  13 447   4 192  50  16
   6 147   2  19  14  22   4   2   2 469   4  22  71  87  12  16  43 530
  38  76  15  13   2   4  22  17 515  17  12  16 626  18   2   5  62 386
  12   8 316   8 106   5   4   2   2  16 480  66   2  33   4 130  12  16
  38 619   5  25 124  51  36 135  48  25   2  33   6  22  12 215  28  77
  52   5  14 407  16  82   2   8   4 107 117   2  15 256   4   2   7   2
   5 723  36  71  43 530 476  26 400 317  46   7   4   2   2  13 104  88
   4 381  15 297  98  32   2  56  26 141   6 194   2  18   4 226  22  21
 134 476  26 480   5 144  30   2  18  51  36  28 224  92  25 104   4 226
  65  16  38   2  88  12  16 283   5  16   2 113 103  32  15  16   2  19
 178  32   0   0   0   0   0   0   0   0   0   0   0   0   0   

In [3]:
# 모델 구현
embedding_vector_length = 32

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(1000, embedding_vector_length, input_length = max_review_length),   # Word Embedding
    tf.keras.layers.SimpleRNN(5),
    tf.keras.layers.Dense(1, activation='sigmoid')   # 이진분류라서 output을 1로 설정
])

print(model.summary())

# 모델 학습 방법 설정
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 모델 학습
model_history = model.fit(X_train, y_train, epochs=5, verbose=2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 32)           32000     
                                                                 
 simple_rnn (SimpleRNN)      (None, 5)                 190       
                                                                 
 dense (Dense)               (None, 1)                 6         
                                                                 
Total params: 32,196
Trainable params: 32,196
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
157/157 - 5s - loss: 0.6925 - accuracy: 0.5248 - 5s/epoch - 32ms/step
Epoch 2/5
157/157 - 4s - loss: 0.6814 - accuracy: 0.5680 - 4s/epoch - 27ms/step
Epoch 3/5
157/157 - 4s - loss: 0.6742 - accuracy: 0.5806 - 4s/epoch - 27ms/step
Epoch 4/5
157/157 - 4s - loss: 0.6640 - accuracy: 0.6014 - 4s/epoch - 2

In [4]:
# 모델 평가
loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

# 예측
predictions = model.predict(X_test)

print('\nTest loss: {:.4f} | Test accuracy: {}'.format(loss, test_acc))
print('\nPredicted test data class: ', 1 if predictions[0]>=0.5 else 0)


Test loss: 0.6805 | Test accuracy: 0.5490000247955322

Predicted test data class:  0
