In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import sys

In [5]:
data_path = pd.read_csv("../data/dataset_fustion.csv")

In [6]:
#train & test 데이터로 나누기
from sklearn.model_selection import train_test_split
                                                         
train_data, test_data = train_test_split(data_path, test_size=0.25, random_state=0)

In [7]:
# document 열의 중복 제거
train_data.drop_duplicates(subset=['document'], inplace=True)

In [8]:
print('총 샘플의 수 :',len(train_data))

총 샘플의 수 : 23923


In [9]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
okt = Okt()
okt.morphs('와 이런 것도 영화라고 차라리 뮤직비디오를 만드는 게 나을 뻔', stem = True)

['오다', '이렇다', '것', '도', '영화', '라고', '차라리', '뮤직비디오', '를', '만들다', '게', '나다', '뻔']

In [10]:
X_train = []
#돌아가는지 확인용...
cnt=-1 
for sentence in train_data['document']: 
    cnt = cnt +1 
    if cnt%2000==0:
        print(cnt)
    temp_X = [] 
    temp_X = okt.morphs(sentence, stem=True) # 토큰화
    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거 
    X_train.append(temp_X) 

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000


In [11]:
max_words = 38000
tokenizer = Tokenizer(num_words = max_words) 
tokenizer.fit_on_texts(X_train) 
X_train = tokenizer.texts_to_sequences(X_train) 

In [15]:
print("문장의 최대 길이 : ", max(len(l) for l in X_train)) 
print("문장의 평균 길이 : ", sum(map(len, X_train))/ len(X_train)) 

문장의 최대 길이 :  113
문장의 평균 길이 :  13.783137566358734


In [16]:
y_train = []
#원핫인코딩
for i in range(len(train_data['label'])): 
    if train_data['label'].iloc[i] == 1: 
        y_train.append([0, 0, 1]) 
    elif train_data['label'].iloc[i] == 0:
        y_train.append([0, 1, 0]) 
    elif train_data['label'].iloc[i] == -1:
        y_train.append([1, 0, 0])

y_train = np.array(y_train)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.1, random_state = 100)

In [18]:
max_len = 35 # 전체 데이터의 길이를 15로 맞춘다 

x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen = max_len)

In [19]:
from keras.layers import Embedding, Dense, LSTM, Dropout
from keras.models import Sequential 
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences 
from keras.layers import BatchNormalization
import keras

In [20]:
model = Sequential()
model.add(Embedding(max_words,128))
model.add(LSTM(64, return_sequences = True))
model.add(BatchNormalization())
model.add(Dropout(0.6)) # 드롭아웃 추가. 비율은 60%
model.add(LSTM(32, return_sequences = False))
model.add(BatchNormalization()) 
model.add(Dropout(0.2)) # 드롭아웃 추가. 비율은 20%
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1)) # 드롭아웃 추가. 비율은 20%
model.add(Dense(9, activation='relu')) 
model.add(Dense(3, activation='softmax'))

In [21]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model2.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [22]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, batch_size=100, epochs=15, callbacks=[es, mc], validation_data=(x_test, y_test))

Epoch 1/15
Epoch 1: val_acc improved from -inf to 0.54033, saving model to best_model2.h5
Epoch 2/15
  3/216 [..............................] - ETA: 8s - loss: 0.3455 - acc: 0.7600

  saving_api.save_model(


Epoch 2: val_acc improved from 0.54033 to 0.73422, saving model to best_model2.h5
Epoch 3/15
Epoch 3: val_acc improved from 0.73422 to 0.76097, saving model to best_model2.h5
Epoch 4/15
Epoch 4: val_acc improved from 0.76097 to 0.76264, saving model to best_model2.h5
Epoch 5/15
Epoch 5: val_acc improved from 0.76264 to 0.76473, saving model to best_model2.h5
Epoch 6/15
Epoch 6: val_acc did not improve from 0.76473
Epoch 7/15
Epoch 7: val_acc did not improve from 0.76473
Epoch 7: early stopping


In [25]:
with open('emotion_tokenizer.pickle', 'wb') as handle:
     pickle.dump(tokenizer, handle)

In [26]:
from tensorflow.keras.models import load_model
loaded_model = load_model('emotion_model.h5')

print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(x_test, y_test)[1]))


 테스트 정확도: 0.7647
