In [1]:
# Load the pre-tokenized NSMC train/test data from pickle files.
# (Unpickling can execute arbitrary code, so only load files you trust.)
import pickle

with open('train_text.pk', 'rb') as train_file:
    train = pickle.load(train_file)

with open('test_text.pk', 'rb') as test_file:
    test = pickle.load(test_file)

In [2]:
# Number of (document, label) examples in the training and test sets.
len(train), len(test)

(146124, 48760)

In [3]:
# import
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Hyperparameters
max_words, max_len = 35000, 50   # tokenizer vocabulary cap, padded sequence length
batch_size, epochs = 128, 30     # mini-batch size, maximum number of training epochs

In [5]:
# Data preprocessing: integer-encode the documents, then zero-pad them.
import numpy as np

# Labels as integer arrays (each example is a (document, label) pair).
y_train = np.array([int(target) for _, target in train])
y_test = np.array([int(target) for _, target in test])

# Fit the vocabulary on the training documents only.
train_x = [document for document, _ in train]
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_x)

# Map documents to index sequences and pad them at the front to max_len,
# so every model input has the same length.
x_train = pad_sequences(
    tokenizer.texts_to_sequences(train_x),
    maxlen=max_len,
    padding='pre',
)
x_test = pad_sequences(
    tokenizer.texts_to_sequences([document for document, _ in test]),
    maxlen=max_len,
    padding='pre',
)

# Wrap the arrays as batched tf.data pipelines for training and validation.
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)

In [6]:
# Inspect the batched training dataset (element shapes and dtypes).
train_ds

<BatchDataset shapes: ((None, 50), (None,)), types: (tf.int32, tf.int32)>

In [7]:
# Model definition, implemented by subclassing tf.keras.Model.
class MyLSTM(tf.keras.Model):
    """Binary classifier: Embedding -> LSTM -> single sigmoid unit.

    Args:
        vocab_size: Input dimension of the embedding layer. When ``None``
            (the default) the notebook-level ``max_words`` is used, looked up
            at construction time.
        embedding_dim: Size of each embedding vector.
        lstm_units: Number of LSTM units.
        dropout: Dropout rate applied to the LSTM inputs.
        recurrent_dropout: Dropout rate applied to the LSTM recurrent state.

    Calling ``MyLSTM()`` with no arguments builds the same layers as before.
    """

    def __init__(self, vocab_size=None, embedding_dim=100, lstm_units=128,
                 dropout=0.2, recurrent_dropout=0.2):
        # Initialise the Keras base class before creating any layers.
        super().__init__()
        if vocab_size is None:
            vocab_size = max_words
        self.emb = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(lstm_units, dropout=dropout,
                         recurrent_dropout=recurrent_dropout)
        self.dense = Dense(1, activation='sigmoid')

    def call(self, x, training=None):
        """Run the forward pass.

        Args:
            x: Batch of padded integer sequences.
            training: Forwarded to the LSTM so its dropout is only active
                while training; ``None`` lets Keras decide from context.

        Returns:
            The sigmoid output of the final dense layer, one value per example.
        """
        x = self.emb(x)
        x = self.lstm(x, training=training)
        return self.dense(x)

In [8]:
# Instantiate the model.
model = MyLSTM()

In [9]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [10]:
# Early stopping: watch val_loss with a patience of 3 epochs.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
)
# Checkpoint: keep only the model with the lowest val_loss seen so far.
mc = tf.keras.callbacks.ModelCheckpoint(
    'nsmc_bestmodel.cp',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

In [11]:
# Train the model.
# train_ds and test_ds were already batched with .batch(batch_size) when they
# were built, so batch_size is not passed to fit(): Keras does not accept a
# batch size for tf.data.Dataset inputs (it is ignored or raises, depending on
# the version).
# NOTE(review): validation_data is the test set, so early stopping and the
# checkpoint are selected on test data; consider a separate validation split.
history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=epochs,
    callbacks=[es, mc],
)

Epoch 1/30
Epoch 2/30
 223/1142 [====>.........................] - ETA: 2:05 - loss: 0.3559 - acc: 0.8396

KeyboardInterrupt: 

# Evaluate the model's current in-memory weights on the padded test set;
# reports the compiled loss and accuracy metric.
model.evaluate(x_test, y_test)

In [None]:
# Plot the training curves. Loss and accuracy are on different scales, so each
# gets its own labelled panel instead of four unlabelled lines on one axis.
import matplotlib.pyplot as plt

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 8))

loss_ax.plot(history.history['loss'], label='train')
loss_ax.plot(history.history['val_loss'], label='validation')
loss_ax.set(title='Loss', xlabel='epoch', ylabel='binary cross-entropy')
loss_ax.legend()

acc_ax.plot(history.history['acc'], label='train')
acc_ax.plot(history.history['val_acc'], label='validation')
acc_ax.set(title='Accuracy', xlabel='epoch', ylabel='accuracy')
acc_ax.legend()

plt.show()

In [None]:
# Load the review texts from the database.
import getpass
import os

import pymysql

# The password is never stored in the notebook: it is read from the
# DB_PASSWORD environment variable, or prompted for when that is unset.
# DB_HOST / DB_USER may override the previous host and user.
db_password = os.environ.get('DB_PASSWORD') or getpass.getpass('DB password: ')
conn = pymysql.connect(
    host=os.environ.get('DB_HOST', '15.164.232.248'),
    user=os.environ.get('DB_USER', 'root'),
    password=db_password,
    db='collection',
    charset='utf8mb4',
)
try:
    # The cursor context manager closes the cursor even if the query fails.
    with conn.cursor() as cur:
        cur.execute('select review from 29cm_lumir where product_no = 650636')
        data = cur.fetchall()
finally:
    # Always release the connection, including on errors.
    conn.close()

In [None]:
# Convert the rows returned by cur.fetchall() to a list.
data = list(data)

In [None]:
# Peek at the review text (first column) of the fourth fetched row.
data[3][0]

In [None]:
# Keep only the review text (first column) of each row.
review = [row[0] for row in data]

In [None]:
# Display every review text.
# NOTE(review): this prints the whole list; slice it (e.g. review[:5]) if the
# output is large or should not be saved with the notebook.
review

In [None]:
import functools


@functools.lru_cache(maxsize=None)
def _preprocessing_resources():
    """Load the stop-word set and create the Mecab tagger once; reuse afterwards."""
    from eunjeon import Mecab
    with open('stopwords-ko.txt', encoding='utf-8') as f:
        stopwords = {line.replace('\n', '') for line in f}
    return stopwords, Mecab()


def preprocessing(txt):
    """Clean one text and split it into filtered morphemes.

    Every character that is not Hangul (jamo or syllable) or a space is
    removed, the text is split with Mecab's ``morphs``, and tokens that are
    stop words or only one character long are dropped.

    The stop-word file (``stopwords-ko.txt``) and the Mecab instance are
    loaded on the first call and cached, instead of being rebuilt on every
    call.

    Args:
        txt: Raw text string.

    Returns:
        List of the remaining morpheme strings, in their original order.
    """
    import re
    txt = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', txt)
    stopwords, mecab = _preprocessing_resources()
    return [word for word in mecab.morphs(txt)
            if word not in stopwords and len(word) > 1]

In [None]:
# Predict with the trained model and collect the results in a list.
def predict_func(review, pre_data):
    """Score every review with the trained model and append the predictions.

    All reviews are preprocessed, encoded and padded together and sent through
    a single ``model.predict`` call, rather than one call per review.

    Args:
        review: Iterable of raw review strings.
        pre_data: List to extend with the predictions (modified in place).

    Returns:
        The same ``pre_data`` list. One entry is appended per review, each a
        ``(1, n_outputs)`` slice of the prediction array, so existing
        ``row[0][0]`` indexing keeps working.
    """
    docs = [preprocessing(text) for text in review]
    if not docs:
        # Nothing to score; skip calling the model on an empty batch.
        return pre_data
    sequences = tokenizer.texts_to_sequences(docs)
    padded = pad_sequences(sequences, maxlen=max_len, padding='pre')
    predictions = model.predict(padded)
    # Slice with i:i + 1 to keep the per-review (1, n_outputs) shape.
    pre_data.extend(predictions[i:i + 1] for i in range(len(predictions)))
    return pre_data

In [None]:
# Run the prediction over all reviews; predict_func returns the list it was
# given, so out_data and pre_data refer to the same list.
pre_data = []
out_data = predict_func(review,pre_data)

In [None]:
# Turn each prediction into a score: multiply by 100 and round to the nearest
# integer.
result = [row[0][0] for row in out_data]
# int(...) guarantees built-in ints: round() on a NumPy float scalar can return
# a NumPy scalar (depending on the NumPy version), which a database driver may
# not know how to serialise.
score = [int(round(num * 100)) for num in result]
score

In [None]:
# Pair each score with its review text as (score, review) tuples.
# Note: this rebinds `data`, which previously held the fetched rows.
data = tuple(zip(score,review))

In [None]:
# Preview the first (score, review) pair.
data[:1]

In [None]:
# Write the scores back to the database.
import getpass
import os

# The password is never stored in the notebook: it is read from the
# DB_PASSWORD environment variable, or prompted for when that is unset.
# DB_HOST / DB_USER may override the previous host and user.
db_password = os.environ.get('DB_PASSWORD') or getpass.getpass('DB password: ')
conn = pymysql.connect(
    host=os.environ.get('DB_HOST', '15.164.232.248'),
    user=os.environ.get('DB_USER', 'root'),
    password=db_password,
    db='collection',
    charset='utf8mb4',
)
try:
    # Create a cursor from the connection; it is closed when the block exits.
    with conn.cursor() as cur:
        # NOTE(review): rows are matched on the review text alone, so every row
        # with the same text gets the same score; prefer a key column if the
        # table has one.
        for d in data:
            cur.execute("update 29cm_lumir set score = %s where review = %s", d)
    # Commit only after every update succeeded.
    conn.commit()
finally:
    # Always release the connection, including on errors.
    conn.close()