In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Seed NumPy's and TensorFlow's global random generators with the same value.
SEED = 3
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Load the IMDB reviews, limited to `vocab_size` word ids via `num_words`.
# (The dataset is loaded exactly once; dumping an unrestricted second copy of
# the whole dataset to the cell output was removed.)
vocab_size = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)     # (25000,) (25000,) (25000,) (25000,)

# Inspect the labels and derive the number of classes from the largest label.
print(y_train[:3])
num_classes = np.max(y_train) + 1
print("num_classes : ", num_classes)
print(set(y_train), ' ', np.unique(y_train))

# First training sample: the integer-encoded review and its label.
print(x_train[0])
print(y_train[0])

# Visualization: distribution of training-review lengths
len_result = [len(s) for s in x_train]
print('리뷰 최대 길이 : ', np.max(len_result))      # 2494
print('리뷰 평균 길이 : ', np.mean(len_result))     # 238.7

# Left panel: box plot of the lengths; right panel: 50-bin histogram.
fig, (ax_box, ax_hist) = plt.subplots(1, 2)
ax_box.boxplot(len_result)
ax_hist.hist(len_result, bins=50)
plt.show()

In [None]:
# Frequency of each label (positive / negative) in the training set
unique_ele, counts_ele = np.unique(y_train, return_counts=True)
print(np.asarray((unique_ele, counts_ele)))

# Build the reverse lookup: word rank -> word
word_to_index = imdb.get_word_index()
index_to_word = {v: k for k, v in word_to_index.items()}

# Show only a small preview; the full dictionary is far too large to print.
print(len(index_to_word))
print(dict(list(index_to_word.items())[:10]))
print(index_to_word[1])
print(index_to_word[1408])

# Decode the first review back to text.
# `imdb.load_data` shifts every word rank by `index_from` (3 by default) and
# reserves ids 0, 1 and 2 for padding, start-of-sequence and out-of-vocabulary,
# so the shift has to be undone before looking a word up.
index_from = 3
special_tokens = {0: '<pad>', 1: '<sos>', 2: '<unk>'}

print(x_train[0])
print(' '.join(
    special_tokens[int(index)] if index < index_from
    else index_to_word.get(int(index) - index_from, '<unk>')
    for index in x_train[0]
))

In [None]:
# LSTM으로 감성분류 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

# Pad / truncate every review to exactly `max_len` tokens.
max_len = 500
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)
print(x_train[0].shape)


# Embedding -> LSTM -> single sigmoid unit for binary sentiment.
model = Sequential()
model.add(Embedding(vocab_size, 120))
model.add(LSTM(120, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))

# `summary()` prints by itself and returns None, so it is not wrapped in print().
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# No `baseline`: with baseline=0.01 the patience counter is only reset when
# val_loss drops below 0.01, so training would always stop after `patience`
# epochs regardless of how much the model is still improving.
es = EarlyStopping(monitor='val_loss', mode='auto', patience=3)
# Keep only the weights of the epoch with the best validation accuracy.
mc = ModelCheckpoint('tfrnn12.h5', monitor='val_acc', mode='max', save_best_only=True)
# NOTE(review): the test split doubles as validation data here, so the
# checkpoint is selected on the same data it is later evaluated on.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=100, batch_size=64, verbose=2, callbacks=[es, mc])

# Reload the best checkpoint and evaluate it once (evaluate returns [loss, acc]).
loaded_model = load_model('tfrnn12.h5')
test_loss, test_acc = loaded_model.evaluate(x_test, y_test)
print('acc : ', test_acc)
print('loss : ', test_loss)

In [None]:
# CNN 으로 텍스트 분류
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, Dropout

# Embedding -> 1D convolution -> global max pooling -> single sigmoid unit.
model = Sequential()
model.add(Embedding(vocab_size, 256))
model.add(Conv1D(256,  kernel_size=3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(1, activation='sigmoid'))
# `summary()` prints by itself and returns None, so it is not wrapped in print().
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# No `baseline`: with baseline=0.01 the patience counter is only reset when
# val_loss drops below 0.01, so training would always stop after `patience`
# epochs regardless of how much the model is still improving.
es = EarlyStopping(monitor='val_loss', mode='auto', patience=3)
# Keep only the weights of the epoch with the best validation accuracy.
mc = ModelCheckpoint('tfrnn12_1.h5', monitor='val_acc', mode='max', save_best_only=True)
# NOTE(review): the test split doubles as validation data here, so the
# checkpoint is selected on the same data it is later evaluated on.
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=100, batch_size=64, verbose=2, callbacks=[es, mc])

# Reload the best checkpoint and evaluate it once (evaluate returns [loss, acc]).
loaded_model = load_model('tfrnn12_1.h5')
test_loss, test_acc = loaded_model.evaluate(x_test, y_test)
print('acc : ', test_acc)
print('loss : ', test_loss)

In [None]:
# Visualization: training loss vs. validation loss for each epoch of `history`
loss = history.history['loss']
vloss = history.history['val_loss']
x_len = np.arange(len(loss))

fig, ax = plt.subplots()
ax.plot(x_len, vloss, marker='+', c='black', label='val_loss')
ax.plot(x_len, loss, marker='o', c='red', label='loss')
ax.legend()
ax.grid()
plt.show()

In [None]:
import re

def sentiment_predict(new_sentence):
  """Print the sentiment that `loaded_model` predicts for an English sentence.

  The sentence is stripped of everything except ASCII letters, digits and
  spaces, lower-cased, integer-encoded with the IMDB word index, padded to
  `max_len` and fed to the module-level `loaded_model`.

  Relies on the module-level names `word_to_index`, `vocab_size`, `max_len`
  and `loaded_model`.

  Args:
    new_sentence: Raw review text.

  Returns:
    None. The predicted class and its probability are printed.
  """
  new_sentence = re.sub('[^0-9a-zA-Z ]', '', new_sentence).lower()

  # Integer encoding.
  # `imdb.load_data` shifts every word rank by 3 and maps any shifted id that
  # is >= num_words to the out-of-vocabulary id 2. The same rule is applied
  # here so that no id falls outside the embedding table of size `vocab_size`.
  index_from = 3
  unk_index = 2
  encoded = []
  for word in new_sentence.split():
    rank = word_to_index.get(word)
    if rank is not None and rank + index_from < vocab_size:
      encoded.append(rank + index_from)
    else:
      # Unknown word, or a word too rare to be part of the training vocabulary.
      encoded.append(unk_index)

  pad_new = pad_sequences([encoded], maxlen = max_len) # padding

  # Predict. The model returns an array of shape (1, 1); index the single
  # element explicitly instead of converting the whole array to a float.
  score = float(loaded_model.predict(pad_new)[0, 0])
  if(score > 0.5):
    print("{:.2f}% 확률로 긍정!.".format(score * 100))
  else:
    print("{:.2f}% 확률로 부정!".format((1 - score) * 100))


# Positive / negative predictions for two sample reviews
sample_reviews = (
  "This movie was just way too overrated. The fighting was not professional and in slow motion.",
  " I was lucky enough to be included in the group to see the advanced screening in Melbourne on the 15th of April, 2012. And, firstly, I need to say a big thank-you to Disney and Marvel Studios.",
)
for temp_str in sample_reviews:
  sentiment_predict(temp_str)