# III. 영화 리뷰 데이터 딥러닝 트레이닝과 추론

---
### 1) 데이터 로드 : dataframe

In [None]:
import os
import pickle

In [None]:
# Filesystem layout for the preprocessed training set; every path is relative
# to the current working directory.
DATA_DIR = 'data'
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'processed')

# File names of the pickled inputs and labels, followed by their full paths.
PROCESSED_DATA_FILE, PROCESSED_LABEL_FILE = "train_padded.p", "train_label.p"

PROCESSED_DATA_PATH, PROCESSED_LABEL_PATH = (
    os.path.join(PROCESSED_DATA_DIR, file_name)
    for file_name in (PROCESSED_DATA_FILE, PROCESSED_LABEL_FILE)
)

In [None]:
# Load the pickled training inputs (train_padded) and labels (train_y).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so these files must come from a trusted source (presumably an earlier
# preprocessing step — confirm).
with open(PROCESSED_DATA_PATH, "rb" ) as file:
    train_padded = pickle.load(file)
with open(PROCESSED_LABEL_PATH, "rb" ) as file:
    train_y = pickle.load(file)    

In [None]:
# Peek at the first five entries of the loaded training inputs.
train_padded[:5]

In [None]:
train_y

In [None]:
# Restore the tokenizer that was saved as a JSON file so that its num_words
# setting can be read; that value is used as vocab_size.

TOKENIZED_FILE = 'tokenized.json'
TOKENIZED_PATH = os.path.join(DATA_DIR, TOKENIZED_FILE)

import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# NOTE(review): tokenizer_from_json takes a JSON *string*, so this assumes the
# file stores the tokenizer JSON wrapped in one more json.dumps layer (so that
# json.load yields a str) — confirm against the code that wrote the file.
with open(TOKENIZED_PATH) as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

In [None]:
# Input vocabulary size for the Embedding layer. Use the tokenizer's num_words
# cap when one was configured. A tokenizer created without a cap reports None,
# which Embedding cannot accept; in that case every entry of word_index can
# occur, plus index 0, which is reserved for padding.
vocab_size = tokenizer.get_config()['num_words'] or len(tokenizer.word_index) + 1

---
### 2) 모델 만들기와 트레이닝

In [None]:
# Model inputs: alias for the loaded training sequences.
train_X = train_padded

In [None]:
train_X

In [None]:
train_X.shape

In [None]:
# Training labels, aliased to match the train_X naming.
train_Y = train_y

---
#### (가) 네트워크 모델 설정

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Sequence length passed to pad_sequences when a sentence is padded for inference.
# NOTE(review): presumably this must equal the length train_padded was padded to — confirm.
max_len = 30

In [None]:
# Binary sentiment classifier: token ids -> 100-dimensional embeddings ->
# 128-unit LSTM -> a single sigmoid output unit.
model = Sequential([
    Embedding(vocab_size, 100),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

In [None]:
model.summary()

In [None]:
# Directory that receives the model diagram and the checkpoint file; it is
# created if missing.
MODEL_DIR = 'model'
os.makedirs(MODEL_DIR, exist_ok=True)

In [None]:
# Render the network architecture, including layer shapes, to a PNG in MODEL_DIR.
MODEL_SUMMARY_FILE = "movie_review_model.png"
MODEL_SUMMARY_PATH = os.path.join(MODEL_DIR, MODEL_SUMMARY_FILE)

from tensorflow.keras.utils import plot_model
# NOTE(review): plot_model relies on pydot and Graphviz being installed — confirm for this environment.
plot_model(model, to_file = MODEL_SUMMARY_PATH, show_shapes=True)

In [None]:
# Stop training once val_loss has failed to improve for 4 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)

MODEL_FILE = "best_model.h5"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILE)

# Write the model to MODEL_PATH only when val_acc beats its best value so far.
# NOTE(review): the checkpoint tracks val_acc while early stopping tracks
# val_loss, so the two callbacks can favor different epochs — confirm this is
# intended.
mc = ModelCheckpoint(MODEL_PATH, monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [None]:
# Binary cross-entropy matches the single sigmoid output unit. The metric is
# requested under the name 'acc'; the checkpoint callback ('val_acc') and the
# accuracy plot ('acc' / 'val_acc') read the history under those same names.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [None]:
# Train for up to 15 epochs in batches of 300, holding out 20% of the training
# data for validation. The callbacks may end training early (es) and save the
# best checkpoint (mc). Labels go in through the train_Y alias so the call
# matches the train_X / test_X / test_Y naming; train_Y is the same object as
# train_y.
history = model.fit(train_X, train_Y, epochs=15, callbacks=[es, mc], batch_size=300, validation_split=0.2)

In [None]:
print(history.history.keys())

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit().

loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)  # number epochs from 1 on the x-axis

for series, label in ((loss, 'Training Loss'), (val_loss, 'Validation Loss')):
    plt.plot(epochs, series, label=label)
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Plot the per-epoch training and validation accuracy recorded by model.fit().

acc = history.history['acc']
val_acc = history.history['val_acc']
# Derive the x-axis from this cell's own series rather than from `loss`, which
# only exists if the loss-plot cell has been run first.
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, label='Training Accuracy')
plt.plot(epochs, val_acc, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

---
### 3) 테스트 : 추론(Inference)

#### (가) 테스트 데이터로 평가

In [None]:
# The pickled test inputs and labels are read from PROCESSED_DATA_DIR, the
# same directory as the training files.
PROCESSED_TEST_DATA_FILE, PROCESSED_TEST_LABEL_FILE = "test_padded.p", "test_label.p"

PROCESSED_TEST_DATA_PATH, PROCESSED_TEST_LABEL_PATH = (
    os.path.join(PROCESSED_DATA_DIR, file_name)
    for file_name in (PROCESSED_TEST_DATA_FILE, PROCESSED_TEST_LABEL_FILE)
)

In [None]:
# Load the pickled test inputs (test_padded) and labels (test_y).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load files from a trusted source.
with open(PROCESSED_TEST_DATA_PATH, "rb" ) as file:
    test_padded = pickle.load(file)
with open(PROCESSED_TEST_LABEL_PATH, "rb" ) as file:
    test_y = pickle.load(file)    

In [None]:
test_X = test_padded
test_Y = test_y

# Score the checkpoint that ModelCheckpoint saved (best val_acc) rather than
# the in-memory model. EarlyStopping is created without restore_best_weights,
# so the in-memory model holds the weights of the last epoch that ran, which
# may be past its best epoch. Rebinding `model` also makes the later
# prediction cells use the same weights that are scored here.
model = load_model(MODEL_PATH)

# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(test_X, test_Y, batch_size=128)
# results is [loss, acc], following the loss and metric given to compile().
print("test loss, test acc:", results)

In [None]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
# Only the first three test samples are scored; the final layer is a single
# sigmoid unit, so each prediction is one value between 0 and 1.
print("Generate predictions for 3 samples")
predictions = model.predict(test_X[:3])
print("predictions shape:", predictions.shape)

In [None]:
predictions

#### (나) 문장을 입력하여 추론하는 프로그램
  1. 형태소 분석
  2. 시퀀스 만들기
  3. 패딩
  4. 추론 : model.predict()

In [None]:
def get_morphed_word(wiki_result):
    """Return element 0 of every token in the first analysis candidate.

    Args:
        wiki_result: Sequence of analysis candidates; ``get_morph`` passes in
            the value returned by ``Kiwi.analyze``. Each candidate is indexed
            as ``(tokens, ...)`` and each token as ``(form, ...)``. Only the
            first candidate (presumably the best-scoring one — confirm
            against the Kiwi documentation) is used.

    Returns:
        A list with the first field of each token of the first candidate,
        in the original order.

    Raises:
        IndexError: If *wiki_result* is empty.
    """
    first_candidate_tokens = wiki_result[0][0]
    return [token[0] for token in first_candidate_tokens]

In [None]:
from kiwipiepy import Kiwi
# Morphological analysis helpers.
_KIWI_ANALYZER = None  # lazily created Kiwi instance shared by all get_morph calls


def _get_kiwi():
    """Return the shared Kiwi analyzer, creating it on first use."""
    global _KIWI_ANALYZER
    if _KIWI_ANALYZER is None:
        analyzer = Kiwi()
        # Kept from the original per-call setup.
        # NOTE(review): newer kiwipiepy releases reportedly prepare lazily and
        # deprecate this call — confirm against the installed version.
        analyzer.prepare()
        _KIWI_ANALYZER = analyzer
    return _KIWI_ANALYZER


def get_morph(input_text):
    """Split *input_text* into morphemes with Kiwi.

    The analyzer is built once and reused; constructing and preparing a new
    ``Kiwi()`` for every sentence repeats the same setup on each prediction.

    Args:
        input_text: Sentence to analyze.

    Returns:
        The list produced by ``get_morphed_word`` for the Kiwi analysis of
        *input_text*.
    """
    return get_morphed_word(_get_kiwi().analyze(input_text))

In [None]:
# max_len (set to 30 in an earlier cell) is the fixed sequence length.
trunc_type = "post" # when a sequence is longer than max_len, drop the tail
padding_type = "post" # when a sequence is shorter than max_len, fill the tail with 0

# pad_sequences applies the padding/truncation settings above.
from tensorflow.keras.preprocessing.sequence import pad_sequences

def sentiment_predict(sentence):
    """Print the predicted sentiment of *sentence* with its confidence.

    The sentence is split into morphemes by ``get_morph``, integer-encoded
    with the module-level ``tokenizer``, padded or truncated to ``max_len``
    and scored by the module-level ``model``.

    Args:
        sentence: Raw review text.

    Returns:
        None. The verdict is printed: positive when the score is above 0.5,
        negative otherwise, with the confidence as a percentage.
    """
    morphed = get_morph(sentence)
    # texts_to_sequences takes a batch, so wrap the single tokenized sentence
    # in a list.
    encoded = tokenizer.texts_to_sequences([morphed])  # integer encoding
    padded = pad_sequences(encoded, maxlen=max_len, padding=padding_type, truncating=trunc_type)
    # predict() yields one row per input and one column for the single output
    # unit, i.e. shape (1, 1) here. Pick the scalar explicitly: calling
    # float() on an array with ndim > 0 is deprecated since NumPy 1.25.
    score = float(model.predict(padded)[0][0])

    if score > 0.5:
        print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
    else:
        print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - score) * 100))

In [None]:
input_txt = '이 영화 정말 재밌네!'

In [None]:
sentiment_predict(input_txt)

In [None]:
sentiment_predict('이 영화 개꿀잼 ㅋㅋㅋ')

In [None]:
sentiment_predict('이 영화 핵노잼')

In [None]:
sentiment_predict('뭐 이런 영화가 다있어?')

In [None]:
sentiment_predict('이 영화 핵노잼 ㅠㅠ')

In [None]:
sentiment_predict('이딴게 영화냐 ㅉㅉ')

In [None]:
sentiment_predict('감독 뭐하는 놈이냐?')

In [None]:
sentiment_predict('와 개쩐다 정말 세계관 최강자들의 영화다')

In [None]:
sentiment_predict('재미있네요.')