<a href="https://colab.research.google.com/github/kovus380/testtest/blob/master/review_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get update
!apt-get install g++ openjdk-8-jdk 
!pip install konlpy

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Download the Naver shopping review corpus and store it locally as ratings_total.txt.
urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt",
    filename="ratings_total.txt",
)

In [None]:
# Read the tab-separated download into a frame; the two column names are supplied explicitly.
total_data = pd.read_csv('ratings_total.txt', sep='\t', names=['ratings', 'reviews'])

In [None]:
# Binary sentiment target: 1 when the rating is greater than 3, otherwise 0.
total_data['label'] = np.where(total_data['ratings'] > 3, 1, 0)

In [None]:
# Distinct-value counts for the rating, review-text and label columns, in that order.
tuple(total_data[column].nunique() for column in ('ratings', 'reviews', 'label'))

In [None]:
# Keep only the first row for each distinct review text (modifies total_data in place).
total_data.drop_duplicates(subset='reviews', inplace=True)
# Print whether any cell of the frame is still missing.
print(total_data.isna().to_numpy().any())

In [None]:
# Hold out a quarter of the rows as the test set; the fixed random_state pins the shuffle.
train_data, test_data = train_test_split(
    total_data,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Clean the training reviews:
#   1. strip every character that is not Hangul or a space,
#   2. turn reviews that became empty into NaN,
#   3. drop rows containing NaN, as the test-set cell does, so the tokenizer never receives NaN.
# regex=True is explicit because recent pandas versions treat the pattern as a literal
# string by default, which would leave the text unchanged.
# The result is rebound rather than edited with a chained inplace=True call, which is not
# guaranteed to write back into the frame.
train_data = train_data.assign(
    reviews=train_data['reviews']
    .str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
    .replace('', np.nan)
)
train_data = train_data.dropna(how='any')

In [None]:
# Clean the held-out reviews. Each step rebinds test_data instead of mutating a column
# selection in place, which is not guaranteed to write back into the frame.
test_data = test_data.drop_duplicates(subset=['reviews'])  # remove duplicate reviews
test_data = test_data.assign(
    reviews=test_data['reviews']
    # Keep only Hangul and spaces; regex=True is explicit because recent pandas versions
    # treat the pattern as a literal string by default.
    .str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
    # Reviews that became empty are marked as missing.
    .replace('', np.nan)
)
test_data = test_data.dropna(how='any')  # drop rows with missing values

In [None]:
# Create the KoNLPy Okt tagger; later cells call its morphs() method on every review.
okt = Okt()

In [None]:
# Tokens filtered out of every tokenised review.
# The list previously contained '듯' twice; the duplicate entry has been removed.
stopwords = ['도', '는', '다', '의', '가', '이', '은', '한', '에', '하', '고', '을', '를', '인', '듯', '과', '와', '네', '들', '지', '임', '게']

In [None]:
# Tokenise each training review with Okt and filter out the stopwords in a single pass.
train_data['tokenized'] = train_data['reviews'].apply(
    lambda review: [token for token in okt.morphs(review) if token not in stopwords]
)

In [None]:
# Tokenise each test review with Okt and filter out the stopwords in a single pass.
test_data['tokenized'] = test_data['reviews'].apply(
    lambda review: [token for token in okt.morphs(review) if token not in stopwords]
)

In [None]:
# Concatenate the per-review token lists of the training set into one flat array per label.
negative_words = np.hstack(train_data.loc[train_data['label'] == 0, 'tokenized'].values)  # tokens from label-0 (negative) reviews
positive_words = np.hstack(train_data.loc[train_data['label'] == 1, 'tokenized'].values)  # tokens from label-1 (positive) reviews

In [None]:
from collections import Counter

# Token frequency table for each label, built from the flattened token arrays.
negative_word_count, positive_word_count = (
    Counter(tokens) for tokens in (negative_words, positive_words)
)

In [None]:
# Token lists (inputs) and labels (targets) of the training split.
X_train = train_data['tokenized'].values
y_train = train_data['label'].values
# X_test was never assigned anywhere in the notebook, yet a later cell passes it to
# tokenizer.texts_to_sequences(); define it here next to its training counterpart so
# the notebook survives a fresh-kernel "Run All".
X_test = test_data['tokenized'].values

In [None]:
import csv

# Persist the tokenised training reviews: one review per CSV row, one token per field.
path = '/content/X_train.csv'
# The csv module expects its file object to be opened with newline='' so that the
# writer's own line terminators are written unmodified.
with open(path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(X_train)

In [None]:
# Reload the tokenised training reviews from the CSV file: each row becomes one list of tokens.
X_train = []
vocab_size = 42019
# newline='' lets the csv reader handle line endings itself, as the csv module expects.
with open(path, 'r', newline='') as f:
    X_train = list(csv.reader(f))

In [None]:
# Fit a Tokenizer with default settings on the training token lists.
# NOTE(review): a later cell rebinds `tokenizer` to a size-limited instance and fits it again,
# and nothing visible in between reads this one; confirm whether this fit is still needed.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Vocabulary size handed to the size-limited Tokenizer and to the Embedding layer further down.
# NOTE(review): hard-coded; presumably taken from word statistics of an earlier run.
# TODO confirm it still matches the current data.
vocab_size = 42019

In [None]:
# Rebuild the tokenizer with num_words limited to vocab_size and 'OOV' as the out-of-vocabulary
# token, fit it on the training tokens, then encode both splits as integer sequences.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# Copy the label columns of both splits out into standalone NumPy arrays.
y_train = train_data['label'].to_numpy(copy=True)
y_test = test_data['label'].to_numpy(copy=True)

In [None]:
# Sequence length passed as `maxlen` to pad_sequences for both splits.
max_len = 50

In [None]:
# Bring every encoded review in both splits to the same length, max_len.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (X_train, X_test)
)

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Classifier: 100-dimensional embedding -> 128-unit LSTM -> single sigmoid output unit.
model = Sequential([
    Embedding(vocab_size, 100),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Early stopping watches validation loss (lower is better) with a patience of 4 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', patience=4, verbose=1)
# Checkpointing writes review_model.h5 only when validation accuracy reaches a new maximum.
mc = ModelCheckpoint(
    filepath='review_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Configure the model for binary classification; the metric is named 'acc' so that the
# 'val_acc' key monitored by the checkpoint callback exists.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
# Train for up to 15 epochs, validating on the last 20% of the training arrays.
history = model.fit(
    X_train,
    y_train,
    batch_size=60,
    epochs=15,
    validation_split=0.2,
    callbacks=[es, mc],
)

In [None]:
# Reload the checkpointed model and report its accuracy on the held-out test set.
loaded_model = load_model('review_model.h5')
# evaluate() returns the loss followed by the compiled metrics; index 1 is the accuracy.
test_scores = loaded_model.evaluate(X_test, y_test)
print("\n 테스트 정확도: %.4f" % test_scores[1])