In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the tab-separated review file into the training and test frames.
# NOTE(review): both frames are read from the same file; confirm whether a
# separate held-out file was intended for test_data.
train_data = pd.read_csv('review.txt', sep='\t')
test_data = pd.read_csv('review.txt', sep='\t')

In [3]:
# Number of distinct review texts and distinct labels, shown as a tuple.
tuple(train_data[column].nunique() for column in ('document', 'label'))

(1735, 2)

In [4]:
# Remove rows whose 'document' text duplicates an earlier row.
train_data = train_data.drop_duplicates(subset=['document'])

In [5]:
# Remove every character that is not Hangul (jamo or syllable) or a space.
# regex=True is required: the pattern is a character class, and pandas >= 2.0
# treats the pattern as a literal string by default, which would leave the
# text unchanged.
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
train_data[:5]

  train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")


Unnamed: 0,id,document,label
0,1,친절하시고 깔끔하고 좋았습니다,1.0
1,2,조용하고 고기도 굿,1.0
2,3,갈비탕과 냉면 육회비빔밥이 맛있습니다,1.0
3,4,대체적으로 만족하나와인의 구성이 살짝 아쉬움,1.0
4,5,고기도 맛있고 서비스는 더 최고입니다,1.0


In [6]:
# Strip leading spaces so that whitespace-only reviews become empty strings.
# regex=True is required because '^ +' is a regular expression, not a literal.
train_data['document'] = train_data['document'].str.replace('^ +', "", regex=True)

# Turn empty strings into NaN. The result is assigned back instead of calling
# replace(..., inplace=True) on the column selection, which is a chained
# in-place operation and is not guaranteed to modify train_data itself.
train_data['document'] = train_data['document'].replace('', np.nan)

# Drop every row that has a missing value in any column.
train_data = train_data.dropna(how='any')

  train_data['document'] = train_data['document'].str.replace('^ +', "") # white space 데이터를 empty value로 변경


In [7]:
# Tokens that are discarded after morphological analysis.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

okt = Okt()

# Tokenize every review into stemmed morphemes and remove the stopwords.
X_train = []
for sentence in tqdm(train_data['document']):
    tokenized_sentence = okt.morphs(sentence, stem=True)
    X_train.append([word for word in tokenized_sentence if word not in stopwords])

# First pass: fit on the full vocabulary only to obtain word frequencies.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

threshold = 3
total_cnt = len(tokenizer.word_index)  # number of distinct words
rare_cnt = 0    # number of words that occur fewer than `threshold` times
total_freq = 0  # total number of word occurrences in the training data
rare_freq = 0   # total occurrences contributed by the rare words

# word_counts maps each word to the number of times it occurs.
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    if value < threshold:
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

# Exclude the rare words from the vocabulary; + 1 accounts for index 0,
# which is used as the padding token.
vocab_size = total_cnt - rare_cnt + 1

# Second pass: a tokenizer limited to vocab_size encodes the reviews as
# integer sequences.
tokenizer = Tokenizer(vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)

y_train = np.array(train_data['label'])

# Positions of reviews that ended up with no tokens after encoding.
drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]

# X_train is a ragged list of lists, so filter it in plain Python; passing it
# to np.delete would first convert it to an array, which raises for ragged
# input on NumPy >= 1.24.
X_train = [sentence for sentence in X_train if len(sentence) >= 1]
y_train = np.delete(y_train, drop_train, axis=0)

def below_threshold_len(max_len, nested_list):
  """Return the fraction of sequences whose length is at most max_len.

  Args:
    max_len: Maximum sequence length to count as "short enough".
    nested_list: A sized collection of sequences (anything supporting len()).

  Returns:
    A float between 0.0 and 1.0: the share of sequences in nested_list with
    len(sequence) <= max_len. Returns 0.0 when nested_list is empty.
  """
  total = len(nested_list)
  if total == 0:
    # Avoid dividing by zero when there is nothing to measure.
    return 0.0
  count = sum(1 for sentence in nested_list if len(sentence) <= max_len)
  return count / total

# Every encoded review is padded or truncated to this many tokens.
max_len = 30
below_threshold_len(nested_list=X_train, max_len=max_len)

# Produce the fixed-length input matrix for the model.
X_train = pad_sequences(sequences=X_train, maxlen=max_len)



100%|██████████| 1733/1733 [00:05<00:00, 306.56it/s]
  return array(a, dtype, copy=False, order=order)


In [8]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential, load_model

# Model hyperparameters.
embedding_dim = 100
hidden_units = 128

# Embedding -> LSTM -> one sigmoid unit for binary sentiment classification.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Stop once val_loss has not improved for 4 epochs; independently keep the
# weights of the epoch with the highest val_acc on disk.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

# NOTE(review): validation_split holds out a slice of the data as passed in;
# if the rows are ordered by label the validation set may be unbalanced.
# Confirm the row order, or shuffle beforehand.
history = model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=64,
    validation_split=0.2,
    callbacks=[es, mc],
)

# Reload the best checkpoint written by ModelCheckpoint for inference.
loaded_model = load_model('best_model.h5')

Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.68497, saving model to best_model.h5
Epoch 2/15

Epoch 00002: val_acc improved from 0.68497 to 0.74855, saving model to best_model.h5
Epoch 3/15

Epoch 00003: val_acc improved from 0.74855 to 0.78613, saving model to best_model.h5
Epoch 4/15

Epoch 00004: val_acc improved from 0.78613 to 0.83526, saving model to best_model.h5
Epoch 5/15

Epoch 00005: val_acc improved from 0.83526 to 0.84682, saving model to best_model.h5
Epoch 6/15

Epoch 00006: val_acc did not improve from 0.84682
Epoch 7/15

Epoch 00007: val_acc did not improve from 0.84682
Epoch 8/15

Epoch 00008: val_acc did not improve from 0.84682
Epoch 9/15

Epoch 00009: val_acc improved from 0.84682 to 0.86127, saving model to best_model.h5
Epoch 10/15

Epoch 00010: val_acc did not improve from 0.86127
Epoch 00010: early stopping


In [29]:
def sentiment_predict(new_sentence):
  """Classify a single review as positive (1) or negative (0).

  The text goes through the same steps as the training data: non-Hangul
  characters are removed, the text is tokenized into stemmed morphemes,
  stopwords are dropped, tokens are integer-encoded and the sequence is
  padded to max_len.

  Args:
    new_sentence: The review to classify. It is converted with str(), so
      non-string values are accepted.

  Returns:
    1 if the model's score is greater than 0.5, otherwise 0.
  """
  text = str(new_sentence).replace("\n","")
  text = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','', text)
  tokens = [word for word in okt.morphs(text, stem=True) if word not in stopwords]
  encoded = tokenizer.texts_to_sequences([tokens])
  pad_new = pad_sequences(encoded, maxlen = max_len)
  # predict returns a 2-D array holding one score for the single input row.
  # Index the element explicitly: calling float() on an array with ndim > 0
  # is deprecated in NumPy. verbose=0 keeps repeated calls from printing a
  # progress bar each time.
  score = float(loaded_model.predict(pad_new, verbose=0)[0][0])
  return 1 if score > 0.5 else 0

In [43]:

# Load the scraped reviews; the column names are supplied explicitly.
df = pd.read_csv('data5.csv',encoding='utf-8',names=['names','menu','review','loc','review_num','score'])

review_df = df['review']

# Replace the score column with the predicted sentiment (1.0 or 0.0) of each
# review. map() applies the prediction row by row without relying on a manual
# counter lining up with the index labels.
df['score'] = review_df.map(sentiment_predict).astype('float')

df = df.drop(['review'],axis=1)

# Average predicted sentiment per restaurant name. Only the score column is
# aggregated: transforming the whole group would also try to average the
# other columns and return a multi-column frame that cannot be assigned to a
# single column.
df['avg_score'] = df.groupby('names')['score'].transform('mean')

# Keep one row per name and drop the per-review score that is no longer needed.
df = df.drop_duplicates(subset=['names'])
df = df.drop(['score'],axis=1)

df.to_csv('data_5.csv',encoding='utf-8-sig')

