In [None]:
pip install konlpy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from konlpy.tag import Okt
import re
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from imblearn.over_sampling import SMOTE
import pickle
import warnings
# VisibleDeprecationWarning lives in np.exceptions since NumPy 1.25 and is no longer
# available as np.VisibleDeprecationWarning in NumPy 2.0, so resolve it from whichever
# namespace provides it (falling back to DeprecationWarning if neither does).
visible_deprecation_warning = getattr(
    getattr(np, "exceptions", np), "VisibleDeprecationWarning", DeprecationWarning
)
warnings.filterwarnings("ignore", category=visible_deprecation_warning)

# 데이터 전처리

## 영화 + 스팀 게임 + 맛집 리뷰 데이터 / 테스트 데이터

In [None]:
def clean_reviews(reviews: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a review table.

    Steps, applied to the 'document' column:
      1. drop rows whose document duplicates an earlier row,
      2. remove every character that is not Hangul or a space,
      3. strip leading spaces,
      4. turn documents that became empty into NaN,
    and finally drop every row that contains a NaN in any column.

    Args:
        reviews: table with at least a 'document' column.

    Returns:
        A new DataFrame; the input is not modified.
    """
    cleaned = reviews.drop_duplicates(subset=['document']).copy()
    # Assign the result back instead of calling replace(..., inplace=True) on the
    # column selection: the chained in-place form does not update the frame under
    # pandas Copy-on-Write, which would leave empty documents in the data.
    cleaned['document'] = (
        cleaned['document']
        .str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
        .str.replace('^ +', "", regex=True)
        .replace('', np.nan)
    )
    return cleaned.dropna(how='any')


# Both files are read with pandas' default tab separator.
train_data = clean_reviews(pd.read_table('steam_movie_matjip_train.txt'))
test_data = clean_reviews(pd.read_table('test_sample_0or1.txt'))

print('전처리 후 트레이닝용 샘플의 개수 :',len(train_data))
print('전처리 후 테스트용 샘플의 개수 :',len(test_data))

stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
okt = Okt()

# Set copy of the stopword list so each membership test is a hash lookup.
stopword_set = frozenset(stopwords)


def tokenize_reviews(documents):
    """Split every document into stemmed morphemes and drop stopwords.

    Args:
        documents: iterable of review strings.

    Returns:
        A list with one list of tokens per document, in input order.
    """
    tokenized = []
    for sentence in tqdm(documents):
        morphemes = okt.morphs(sentence, stem=True)  # tokenize with stemming
        tokenized.append([word for word in morphemes if word not in stopword_set])  # remove stopwords
    return tokenized


# The token lists are pickled after each pass; to skip re-tokenizing in a later
# session, the same files can be read back with pickle.load instead.
X_train = tokenize_reviews(train_data['document'])
with open('x_train_matjip.pkl', 'wb') as f:
    pickle.dump(X_train, f)

X_test = tokenize_reviews(test_data['document'])
with open('x_test_matjip.pkl', 'wb') as f:
    pickle.dump(X_test, f)

# First pass: fit on the full training vocabulary only to collect word frequencies.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

threshold = 3
total_cnt = len(tokenizer.word_index)  # number of distinct words

# Frequencies of the words that occur fewer than `threshold` times.
rare_frequencies = [count for count in tokenizer.word_counts.values() if count < threshold]
rare_cnt = len(rare_frequencies)                  # how many rare words there are
rare_freq = sum(rare_frequencies)                 # total occurrences of rare words
total_freq = sum(tokenizer.word_counts.values())  # total occurrences of all words

# Keep every non-rare word; the +1 accounts for index 0, which the tokenizer never assigns.
vocab_size = total_cnt - rare_cnt + 1

# Second pass: a tokenizer limited to `vocab_size` drops the rare words when encoding.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Labels as plain arrays, in the same row order as the encoded documents.
y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])

# Positions of training reviews that have no tokens left after integer encoding.
drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]

# Remove the empty samples.
# X_train is a list of variable-length lists here; np.delete would first have to turn
# it into an array, which NumPy >= 1.24 refuses for ragged input (ValueError), so the
# sequences are filtered in plain Python. The labels are a regular 1-D array.
empty_positions = set(drop_train)
X_train = [sentence for index, sentence in enumerate(X_train) if index not in empty_positions]
y_train = np.delete(y_train, drop_train, axis=0)

print(len(X_train))
print(len(y_train))

# Every review is padded or truncated to this many tokens.
max_len = 30

X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)

# Oversampling
# NOTE(review): SMOTE builds new rows by interpolating between existing rows, and the
# rows here are padded token-id sequences, so synthetic rows are presumably not real
# sentences. Consider random oversampling or class weights instead - to be confirmed.
smote = SMOTE(random_state=11)
X_train, y_train = smote.fit_resample(X_train, y_train)

# The resampler appends the synthetic rows after the original ones, and Keras'
# validation_split takes the *last* fraction of the data without shuffling. Shuffle
# once (seeded, so reruns match) so that the validation slice is a mix of all rows.
shuffle_order = np.random.default_rng(11).permutation(len(X_train))
X_train, y_train = X_train[shuffle_order], y_train[shuffle_order]

print("Oversampling 한 트레이닝 데이터 개수 :", X_train.shape[0])

# 데이터 학습 및 성능 평가

## 1. DNN model

### Case 1. Embedding dimension = 100

In [None]:
# Feed-forward baseline: averaged 100-d word embeddings -> Dense(16) -> Dense(4) -> sigmoid,
# trained with Adam.
input_length = max_len
embedding_dim = 100

# Keras 3 only accepts checkpoint paths that end in ".keras" or ".h5"; the path is
# defined once so that saving and loading cannot drift apart.
checkpoint_path = 'movie_game_matjip_e100_dnn_model_adam.h5'

model_dnn_e100_adam = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=input_length),
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    Dense(units=4, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model_dnn_e100_adam.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Stop after 7 epochs without a val_loss improvement; keep only the epoch with the best val_acc.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint(checkpoint_path, monitor='val_acc', mode='max', verbose=1, save_best_only=True)
history = model_dnn_e100_adam.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

# Evaluate the best checkpoint rather than the weights of the last epoch.
loaded_model = load_model(checkpoint_path)
print("\n e100 adam 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

In [None]:
# Same feed-forward baseline as the Adam variant (100-d embeddings), trained with RMSprop.
# The hyper-parameters are set here explicitly so the cell does not depend on which
# cell happened to run before it.
input_length = max_len
embedding_dim = 100

# Keras 3 only accepts checkpoint paths that end in ".keras" or ".h5"; the path is
# defined once so that saving and loading cannot drift apart.
checkpoint_path = 'movie_game_matjip_e100_dnn_model_rmsprop.h5'

model_dnn_e100_rmsprop = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=input_length),
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    Dense(units=4, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model_dnn_e100_rmsprop.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Stop after 7 epochs without a val_loss improvement; keep only the epoch with the best val_acc.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint(checkpoint_path, monitor='val_acc', mode='max', verbose=1, save_best_only=True)
history = model_dnn_e100_rmsprop.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

# Evaluate the best checkpoint rather than the weights of the last epoch.
loaded_model = load_model(checkpoint_path)
print("\n e100 rmsprop 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

### Case 2. Embedding dimension = 75

In [None]:
# Feed-forward baseline with 75-d word embeddings, trained with Adam.
input_length = max_len
embedding_dim = 75

# Keras 3 only accepts checkpoint paths that end in ".keras" or ".h5"; the path is
# defined once so that saving and loading cannot drift apart.
checkpoint_path = 'movie_game_matjip_e75_dnn_model_adam.h5'

model_dnn_e75_adam = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=input_length),
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    Dense(units=4, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model_dnn_e75_adam.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Stop after 7 epochs without a val_loss improvement; keep only the epoch with the best val_acc.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint(checkpoint_path, monitor='val_acc', mode='max', verbose=1, save_best_only=True)
history = model_dnn_e75_adam.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

# Evaluate the best checkpoint rather than the weights of the last epoch.
loaded_model = load_model(checkpoint_path)
print("\n e75 adam 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

In [None]:
# Same feed-forward baseline as the Adam variant (75-d embeddings), trained with RMSprop.
# The hyper-parameters are set here explicitly so the cell does not depend on which
# cell happened to run before it.
input_length = max_len
embedding_dim = 75

# Keras 3 only accepts checkpoint paths that end in ".keras" or ".h5"; the path is
# defined once so that saving and loading cannot drift apart.
checkpoint_path = 'movie_game_matjip_e75_dnn_model_rmsprop.h5'

model_dnn_e75_rmsprop = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=input_length),
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    Dense(units=4, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model_dnn_e75_rmsprop.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Stop after 7 epochs without a val_loss improvement; keep only the epoch with the best val_acc.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint(checkpoint_path, monitor='val_acc', mode='max', verbose=1, save_best_only=True)
history = model_dnn_e75_rmsprop.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

# Evaluate the best checkpoint rather than the weights of the last epoch.
loaded_model = load_model(checkpoint_path)
print("\n e75 rmsprop 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

### Case 3. Embedding dimension = 50

In [None]:
# Feed-forward baseline with 50-d word embeddings, trained with Adam.
input_length = max_len
embedding_dim = 50

# Keras 3 only accepts checkpoint paths that end in ".keras" or ".h5"; the path is
# defined once so that saving and loading cannot drift apart.
checkpoint_path = 'movie_game_matjip_e50_dnn_model_adam.h5'

model_dnn_e50_adam = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=input_length),
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    Dense(units=4, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model_dnn_e50_adam.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Stop after 7 epochs without a val_loss improvement; keep only the epoch with the best val_acc.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint(checkpoint_path, monitor='val_acc', mode='max', verbose=1, save_best_only=True)
history = model_dnn_e50_adam.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

# Evaluate the best checkpoint rather than the weights of the last epoch.
loaded_model = load_model(checkpoint_path)
print("\n e50 adam 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

In [None]:
# Same feed-forward baseline as the Adam variant (50-d embeddings), trained with RMSprop.
# The hyper-parameters are set here explicitly so the cell does not depend on which
# cell happened to run before it.
input_length = max_len
embedding_dim = 50

# Keras 3 only accepts checkpoint paths that end in ".keras" or ".h5"; the path is
# defined once so that saving and loading cannot drift apart.
checkpoint_path = 'movie_game_matjip_e50_dnn_model_rmsprop.h5'

model_dnn_e50_rmsprop = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=input_length),
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    Dense(units=4, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model_dnn_e50_rmsprop.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Stop after 7 epochs without a val_loss improvement; keep only the epoch with the best val_acc.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint(checkpoint_path, monitor='val_acc', mode='max', verbose=1, save_best_only=True)
history = model_dnn_e50_rmsprop.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

# Evaluate the best checkpoint rather than the weights of the last epoch.
loaded_model = load_model(checkpoint_path)
print("\n e50 rmsprop 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

## 2. LSTM model

### Hidden state = 64

In [None]:
# Two single-layer LSTM classifiers (64 hidden units, 100-d embeddings) that differ
# only in the optimizer. Each is trained for a fixed number of epochs and the
# final-epoch model is written to disk.
embedding_dim = 100
hidden_units = 64
epoch = 5

print("lstm model with adam optimizer")
lstm_model_adam = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(1, activation='sigmoid'),
])
lstm_model_adam.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model_adam_history = lstm_model_adam.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=epoch,
)
# Persist the Adam-trained model.
lstm_model_adam.save('lstm_model_adam.h5')

print("lstm model with rmsprop optimizer")
lstm_model_rmsprop = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(1, activation='sigmoid'),
])
lstm_model_rmsprop.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
lstm_model_rmsprop_history = lstm_model_rmsprop.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=epoch,
)
# Persist the RMSprop-trained model.
lstm_model_rmsprop.save('lstm_model_rmsprop.h5')

In [None]:
# evaluate() returns [loss, accuracy] for these models; report the test accuracy of each.
adam_test_accuracy = lstm_model_adam.evaluate(X_test, y_test)[1]
print("\n LSTM Adam 맛집테스트 정확도 %.4f" % adam_test_accuracy)

rmsprop_test_accuracy = lstm_model_rmsprop.evaluate(X_test, y_test)[1]
print("\n LSTM RMSProp 맛집테스트 정확도: %.4f" % rmsprop_test_accuracy)

### Hidden state = 128

In [None]:
# Two single-layer LSTM classifiers (128 hidden units, 100-d embeddings) that differ
# only in the optimizer. Each is trained for a fixed number of epochs and the
# final-epoch model is written to disk.
embedding_dim = 100
hidden_units = 128
epoch = 5

print("lstm model with adam optimizer")
lstm_model_adam2 = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(1, activation='sigmoid'),
])
lstm_model_adam2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model_adam_history2 = lstm_model_adam2.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=epoch,
)
# Persist the Adam-trained model.
lstm_model_adam2.save('lstm_model_adam2.h5')

print("lstm model with rmsprop optimizer")
lstm_model_rmsprop2 = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(1, activation='sigmoid'),
])
lstm_model_rmsprop2.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
lstm_model_rmsprop_history2 = lstm_model_rmsprop2.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=epoch,
)
# Persist the RMSprop-trained model.
lstm_model_rmsprop2.save('lstm_model_rmsprop2.h5')

In [None]:
# evaluate() returns [loss, accuracy] for these models; report the test accuracy of each.
adam_test_accuracy = lstm_model_adam2.evaluate(X_test, y_test)[1]
print("\n LSTM Adam 맛집테스트 정확도 %.4f" % adam_test_accuracy)

rmsprop_test_accuracy = lstm_model_rmsprop2.evaluate(X_test, y_test)[1]
print("\n LSTM RMSProp 맛집테스트 정확도: %.4f" % rmsprop_test_accuracy)

## 3. BiLSTM model

### Hidden state = 64

In [None]:
embedding_dim = 100
hidden_units = 64
input_length = max_len

model_bilstm_adam = Sequential()
model_bilstm_adam.add(Embedding(vocab_size, embedding_dim))
model_bilstm_adam.add(Bidirectional(LSTM(hidden_units)))
model_bilstm_adam.add(Dense(1, activation='sigmoid'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint('movie_game_matjip_e100_h64_bilstm_model_adam.h1', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model_bilstm_adam.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model_bilstm_adam.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

In [None]:
loaded_model = load_model('movie_game_matjip_e100_h64_bilstm_model_adam.h1')
print("\n BiLSTM Adam 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

In [None]:
model_bilstm_adam = Sequential()
model_bilstm_adam.add(Embedding(vocab_size, embedding_dim))
model_bilstm_adam.add(Bidirectional(LSTM(hidden_units)))
model_bilstm_adam.add(Dense(1, activation='sigmoid'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint('movie_game_matjip_e100_h64_bilstm_model_rmsprop.h1', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model_bilstm_adam.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model_bilstm_adam.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

In [None]:
loaded_model = load_model('movie_game_matjip_e100_h64_bilstm_model_rmsprop.h1')
print("\n BiLSTM RMSProp 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

### Hidden state = 128

In [None]:
embedding_dim = 100
hidden_units = 128
input_length = max_len

model_bilstm_adam = Sequential()
model_bilstm_adam.add(Embedding(vocab_size, embedding_dim))
model_bilstm_adam.add(Bidirectional(LSTM(hidden_units)))
model_bilstm_adam.add(Dense(1, activation='sigmoid'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint('movie_game_matjip_e100_h128_bilstm_model_adam.h1', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model_bilstm_adam.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model_bilstm_adam.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

In [None]:
loaded_model = load_model('movie_game_matjip_e100_h128_bilstm_model_adam.h1')
print("\n BiLSTM Adam 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

In [None]:
model_bilstm_adam = Sequential()
model_bilstm_adam.add(Embedding(vocab_size, embedding_dim))
model_bilstm_adam.add(Bidirectional(LSTM(hidden_units)))
model_bilstm_adam.add(Dense(1, activation='sigmoid'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint('movie_game_matjip_e100_h128_bilstm_model_rmsprop.h1', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model_bilstm_adam.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model_bilstm_adam.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

In [None]:
loaded_model = load_model('movie_game_matjip_e100_h128_bilstm_model_rmsprop.h1')
print("\n BiLSTM RMSProp 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

### Hidden state = 256

In [None]:
embedding_dim = 100
hidden_units = 256
input_length = max_len

model_bilstm_adam = Sequential()
model_bilstm_adam.add(Embedding(vocab_size, embedding_dim))
model_bilstm_adam.add(Bidirectional(LSTM(hidden_units)))
model_bilstm_adam.add(Dense(1, activation='sigmoid'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint('movie_game_matjip_e100_h256_bilstm_model_adam.h1', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model_bilstm_adam.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model_bilstm_adam.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

In [None]:
loaded_model = load_model('movie_game_matjip_e100_h256_bilstm_model_adam.h1')
print("\n BiLSTM Adam 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

In [None]:
model_bilstm_adam = Sequential()
model_bilstm_adam.add(Embedding(vocab_size, embedding_dim))
model_bilstm_adam.add(Bidirectional(LSTM(hidden_units)))
model_bilstm_adam.add(Dense(1, activation='sigmoid'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
mc = ModelCheckpoint('movie_game_matjip_e100_h256_bilstm_model_rmsprop.h1', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model_bilstm_adam.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model_bilstm_adam.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

In [None]:
loaded_model = load_model('movie_game_matjip_e100_h256_bilstm_model_rmsprop.h1')
print("\n BiLSTM RMSProp 맛집 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))