In [5]:
# reference : https://wikidocs.net/85337

In [68]:
from tensorflow.keras.models import Sequential, Model
import urllib.request
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Input, Flatten, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
from tensorflow.keras import datasets

In [69]:
vocab_size = 10000
(X_train, y_train), (X_test, y_test) = datasets.imdb.load_data(num_words = vocab_size)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [71]:
max_len = 200
X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)

In [72]:
print("훈련용 이메일 데이터의 크기(shape): ", X_train.shape)
print("테스트용 이메일 데이터의 크기(shape): ", X_test.shape)
print("훈련용 레이블의 크기(shape): ", y_train.shape)
print("테스트용 레이블의 크기(shape): ", y_test.shape)

훈련용 이메일 데이터의 크기(shape):  (25000, 200)
테스트용 이메일 데이터의 크기(shape):  (25000, 200)
훈련용 레이블의 크기(shape):  (25000,)
테스트용 레이블의 크기(shape):  (25000,)


In [73]:
embedding_dim = 128
dropout_prob = (0.5, 0.8)
num_filters = 128

In [74]:
model_input = Input(shape = (max_len,))
z = Embedding(vocab_size, embedding_dim, input_length = max_len, name="embedding")(model_input)
z = Dropout(dropout_prob[0])(z)

In [75]:
conv_blocks = []

for sz in [3, 4, 5]:
    conv = Conv1D(filters = num_filters,
                         kernel_size = sz,
                         padding = "valid",
                         activation = "relu",
                         strides = 1)(z)
    conv = GlobalMaxPooling1D()(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)

In [76]:
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = Dropout(dropout_prob[1])(z)
z = Dense(128, activation="relu")(z)
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

In [77]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('CNN_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.fit(X_train, y_train, batch_size = 64, epochs=10, validation_data = (X_test, y_test), verbose=2, callbacks=[es, mc])

Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.77996, saving model to CNN_model.h5
391/391 - 81s - loss: 0.6566 - acc: 0.5786 - val_loss: 0.4684 - val_acc: 0.7800
Epoch 2/10

Epoch 00002: val_acc improved from 0.77996 to 0.85076, saving model to CNN_model.h5
391/391 - 84s - loss: 0.4267 - acc: 0.8087 - val_loss: 0.3550 - val_acc: 0.8508
Epoch 3/10

Epoch 00003: val_acc improved from 0.85076 to 0.86704, saving model to CNN_model.h5
391/391 - 83s - loss: 0.3222 - acc: 0.8657 - val_loss: 0.3119 - val_acc: 0.8670
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.86704
391/391 - 83s - loss: 0.2650 - acc: 0.8954 - val_loss: 0.3269 - val_acc: 0.8588
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.86704
391/391 - 86s - loss: 0.2225 - acc: 0.9134 - val_loss: 0.3257 - val_acc: 0.8629
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.86704
391/391 - 90s - loss: 0.1934 - acc: 0.9267 - val_loss: 0.3371 - val_acc: 0.8607
Epoch 7/10

Epoch 00007: val_acc did not improve 

<tensorflow.python.keras.callbacks.History at 0x2c49842f2c8>

In [78]:
loaded_model = load_model('CNN_model.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))


 테스트 정확도: 0.8670


In [79]:
from konlpy.tag import Okt

In [80]:
okt = Okt()

In [81]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [82]:
def sentiment_predict(new_sentence):
    new_sentence = okt.morphs(new_sentence, stem=True) # 토큰화
    new_sentence = [word for word in new_sentence if not word in stopwords] # 불용어 제거
    encoded = tokenizer.texts_to_sequences([new_sentence]) # 정수 인코딩
    pad_new = pad_sequences(encoded, maxlen = max_len) # 패딩
    score = float(model.predict(pad_new)) # 예측
    
    if(score > 0.5):
        print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
    else:
        print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - score) * 100))

In [86]:
sentiment_predict('이 영화 노잼 ㅋㅋㅋ')

55.54% 확률로 긍정 리뷰입니다.



In [87]:
sentiment_predict('이 영화 핵꿀잼 ㅋㅋㅋ')

55.54% 확률로 긍정 리뷰입니다.

