# In [None]:

# Hotel review sentiment analysis (LSTM)

import pandas as pd
# Training split; this file is decoded as cp949.
hotelreview = pd.read_csv('train_set3.csv', encoding = 'cp949')
# Test split; read with pandas' default encoding.
# NOTE(review): the two files are decoded differently -- confirm that
# test_set.csv is really not cp949-encoded as well.
hotelreview2 = pd.read_csv('test_set.csv')
# Preview of the first test rows (a bare expression: only rendered in a notebook).
hotelreview2.head()



import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
#import urllib.request
from konlpy.tag import Okt
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
#!pip install git+https://github.com/ssut/py-hanspell.git
#from hanspell import spell_checker

# Work on the loaded frames under split-specific names.  These are aliases,
# not copies, so the in-place steps below also modify `hotelreview` and
# `hotelreview2`.
train_data = hotelreview
test_data = hotelreview2

# Bare expressions such as these previews are only rendered in a notebook.
train_data.head()
test_data.head()
train_data.tail()
test_data.tail()
print(len(train_data), len(test_data))

train_data['reviews'].nunique(), train_data['label'].nunique()

# Keep a single row per distinct review text.
train_data.drop_duplicates(subset=['reviews'], inplace=True)
print(len(train_data))

# Class balance, as a bar chart and as a table.
train_data['label'].value_counts().plot(kind='bar')

print(train_data.groupby('label').size().reset_index(name='count'))

print(train_data.isnull().values.any())

# Strip every character outside the listed Hangul ranges and the space.
# regex=True is mandatory: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text uncleaned.
# NOTE(review): the class contains a lone 'ㅏ'; if standalone vowels such as
# 'ㅠ' should survive, the intended range may be 'ㅏ-ㅣ' -- confirm, and keep
# it in sync with the pattern used in sentiment_predict().
train_data['reviews'] = train_data['reviews'].str.replace(
    r"[^ㄱ-ㅎㅏ가-힣 ]", "", regex=True)
train_data[:5]

# Remove leading spaces, then turn reviews that ended up empty into NaN so
# that dropna() can discard them.  The result is assigned back instead of
# using a chained `inplace=True`, which does not reliably update the frame.
train_data['reviews'] = train_data['reviews'].str.replace(r'^ +', "", regex=True)
train_data['reviews'] = train_data['reviews'].replace('', np.nan)
print(train_data.isnull().sum())

train_data.loc[train_data.reviews.isnull()]

train_data = train_data.dropna(how='any')
print(len(train_data))

# Apply exactly the same cleaning steps to the test split.
test_data.drop_duplicates(subset=['reviews'], inplace=True)
test_data['reviews'] = test_data['reviews'].str.replace(
    r"[^ㄱ-ㅎㅏ가-힣 ]", "", regex=True)
test_data['reviews'] = test_data['reviews'].str.replace(r'^ +', "", regex=True)
test_data['reviews'] = test_data['reviews'].replace('', np.nan)
test_data = test_data.dropna(how='any')
print(len(test_data))

# Tokens that are removed from every review after morphological analysis.
# Order and contents are kept as originally written ('하다' is listed twice).
stopwords = [
    '의', '가', '이', '은', '들', '는', '걍', '과', '도', '을', '를', '로', '거', '님',
    '때', '으로', '자', '에', '와', '한', '하다', '이다', '에서', '수', '고', '이라',
    '구', '적', '점', '듯', '그', '에는',
    '있다', '하다', '있고', '나다', '들다', '나', '요', '이에요',
    '해주다', '되어다', '제주', '제주도',
]

okt = Okt()
# Sanity check of the analyser on one sample sentence (only rendered in a notebook).
okt.morphs('깔끔하고 넓어서 좋았어요창으로 선산봉도 우도도 잘보이네요', stem = True)

# Spell correction with hanspell (spell_checker.check(...).checked) was tried
# here and is disabled; the corresponding import is commented out above.


def _tokenize_reviews(reviews):
    """Split each review into stemmed morphemes and drop stopwords.

    Both splits go through this one helper so that they are preprocessed
    identically (``norm=True, stem=True``), matching what
    ``sentiment_predict`` does at inference time.  Previously the test split
    was tokenised without ``norm=True``.

    Args:
        reviews: iterable of cleaned review strings.

    Returns:
        A list with one list of tokens per review, in input order.
    """
    tokenized = []
    for sentence in tqdm(reviews):
        morphemes = okt.morphs(sentence, norm=True, stem=True)
        tokenized.append([word for word in morphemes if word not in stopwords])
    return tokenized


X_train = _tokenize_reviews(train_data['reviews'])

print(X_train[:4])

X_test = _tokenize_reviews(test_data['reviews'])

# First pass: fit on the full training corpus to obtain per-word frequencies.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

print(tokenizer.word_index)

# Words seen fewer than `threshold` times are counted as rare.
threshold = 3
total_cnt = len(tokenizer.word_index)

word_freqs = list(tokenizer.word_counts.values())
rare_freqs = [freq for freq in word_freqs if freq < threshold]

total_freq = sum(word_freqs)   # occurrences of all words
rare_cnt = len(rare_freqs)     # number of distinct rare words
rare_freq = sum(rare_freqs)    # occurrences contributed by rare words

print('단어 집합의 크기: ', total_cnt)
print('등장 빈도가 3번 미만인 단어 수: ', rare_cnt)
print('단어 집합에서 희귀 단어의 비율: ', (rare_cnt / total_cnt))
print('전체 등장 빈도에서 희귀 단어 등장 빈도 비율: ', (rare_freq / total_freq)*100)


# Remove rare words: keep only as many words as are not rare.  The +1 is
# there because Keras word indices start at 1 (index 0 is reserved).
vocab_size = total_cnt - rare_cnt + 1
print('단어 집합의 크기: ', vocab_size)

# Second pass: a tokenizer limited to `vocab_size` encodes both splits as
# sequences of integer word indices.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

print(X_train[:4])

# Label vectors, positionally aligned with the encoded reviews.
y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])

# Positions of training reviews whose encoded sequence is empty.
drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]

# X_train is a ragged list of index lists.  np.delete would first coerce it
# to an ndarray, which NumPy >= 1.24 rejects for ragged input (ValueError),
# so the sequences are filtered in plain Python instead.  The labels form a
# regular 1-D array, so np.delete remains fine for them.
_dropped_positions = set(drop_train)
X_train = [
    sequence
    for index, sequence in enumerate(X_train)
    if index not in _dropped_positions
]
y_train = np.delete(y_train, drop_train, axis=0)
print(len(X_train), len(y_train))

# Length statistics of the remaining training sequences, used to pick max_len.
print('리뷰의 최대 길이: ', max(len(review) for review in X_train))
print('리뷰의 평균 길이: ', sum(map(len, X_train))/len(X_train))
plt.hist([len(review) for review in X_train], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

def below_threshold_len(max_len, nested_list):
    """Print the percentage of samples whose length is at most ``max_len``.

    Args:
        max_len: candidate maximum sequence length.
        nested_list: sized collection of sequences (anything supporting
            ``len``); must not be empty, since the share is a division by
            its length.

    Returns:
        None. The result is only printed.
    """
    short_enough = sum(1 for tokens in nested_list if len(tokens) <= max_len)
    share = short_enough / len(nested_list)
    print(f'전체 샘플 중 길이가 {max_len} 이하인 샘플의 비율: {share*100}')

# Fixed sequence length fed to the model; report how many training samples
# already fit within it.
max_len = 100
below_threshold_len(max_len, X_train)

# Bring both splits to exactly max_len entries per sequence.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (X_train, X_test)
)

# Vocabulary size used by the embedding layer (only rendered in a notebook).
vocab_size

## LSTM

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# One-hot encode the three sentiment classes.
y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=3)
y_test_encoded = tf.keras.utils.to_categorical(y_test, num_classes=3)

embedding_dim = 30
hidden_units = 10

# Embedding -> LSTM -> 3-way softmax classifier.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(3, activation='softmax'),
])

# Stop once validation loss has not improved for 4 epochs; independently,
# save the weights of the epoch with the best validation accuracy.
es = EarlyStopping(monitor='val_loss', mode='min', patience=4, verbose=1)
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(
    X_train,
    y_train_encoded,
    validation_split=0.2,
    batch_size=10,
    epochs=30,
    callbacks=[es, mc],
)


# Evaluate the checkpointed model rather than the final-epoch weights.
loaded_model = load_model('best_model.h5')
test_metrics = loaded_model.evaluate(X_test, y_test_encoded)
print(f'테스트 정확도: {test_metrics[1]}')

def sentiment_predict(new_sentence):
    """Run one raw review string through the trained pipeline.

    The text is reduced to the allowed Hangul characters and spaces,
    tokenised with Okt (normalised and stemmed), stripped of stopwords,
    encoded with the fitted tokenizer and padded to ``max_len``.

    Args:
        new_sentence: raw review text.

    Returns:
        Whatever ``loaded_model.predict`` returns for the single padded
        sequence.
    """
    cleaned = re.sub(r'[^ㄱ-ㅎㅏ가-힣 ]', '', new_sentence)
    tokens = [
        morph
        for morph in okt.morphs(cleaned, norm=True, stem=True)
        if morph not in stopwords
    ]
    padded = pad_sequences(tokenizer.texts_to_sequences([tokens]), maxlen=max_len)
    return loaded_model.predict(padded)


def reviewer():
    """Prompt for a review on stdin and print its predicted sentiment.

    Prints the probability vector returned by ``sentiment_predict`` followed
    by a message for the most probable class and that probability as a
    percentage.  The class is chosen with ``argmax`` (first index on ties)
    instead of comparing floats against ``max()``.

    Returns:
        None. The result is only printed.
    """
    myreview = input('리뷰를 입력하세요: ')
    probabilities = sentiment_predict(myreview).squeeze()
    print(probabilities)

    # Message per class index, in the order the original branches tested them.
    messages = (
        '재방문하지 않을 확률이 높습니다.',
        '중립리뷰일 확률이 높습니다.',
        '재방문할 확률이 높습니다.',
    )
    best = int(probabilities.argmax())
    print(f'{messages[best]} {probabilities[best]*100:.2f}%')

# Ask for one review on stdin and print its predicted sentiment.
reviewer()


# Run configuration: hanspell not used, vocab size = vocab_size, batch size = 10, threshold = 3