In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from soynlp.normalizer import *
from konlpy.tag import Mecab

In [2]:
data = pd.read_csv('/home/kdt-admin/data/final_sentiment_dialogues.csv', index_col = 0)

# 중복 문장 제거
data.drop_duplicates(subset=['문장'], inplace=True)
data.reset_index(drop=True, inplace = True)

# 감정 object를 수치형 object로 변경
data.loc[(data['감정'] == "중립"), '감정'] = 0  
data.loc[(data['감정'] == "슬픔"), '감정'] = 1  
data.loc[(data['감정'] == "분노"), '감정'] = 2  
data.loc[(data['감정'] == "불안"), '감정'] = 3  
data.loc[(data['감정'] == "행복"), '감정'] = 4  
data.loc[(data['감정'] == "당황"), '감정'] = 5  

def remove_html_tags(text):
    clean = re.compile('<*>')
    return re.sub(clean, '', text)

def remove_noise(text):
    # 이모지 제거
    text = re.sub(r'[^\!\?\.\~\w\sㄱ-ㅎㅏ-ㅣ가-힣]', '', text)
    return text

#'문장' 열에 함수 적용
data['문장'] = data['문장'].apply(remove_html_tags)
data['문장'] = data['문장'].apply(remove_noise)
data[:5]

def custom_normalize(text, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])

    return text.strip()

# ㅋ의 반복에 ㅌ이 섞여있는 경우 ㅋ으로 변환
custom_mapping = {"ㅌ": "ㅋ"}

data['문장'] = data['문장'].apply(lambda x: custom_normalize(x, custom_mapping)) # ㅋㅋㅋㅌㅋㅋ 같은 ㅌ이 중간에 껴있는 경우 ㅌ을 ㅋ으로 대체
data['문장'] = data['문장'].apply(lambda x: repeat_normalize(x, num_repeats=3)) # ㅋㅋㅋㅋㅋㅋㅋ 같은 반복되는 문자열 ㅋㅋㅋ로 정규화


y = data['감정']

stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

mecab = Mecab()
# 불용어 제거하고 형태소를 분리하는 작업
x = []

for sentence in data['문장']:
    x.append([word for word in mecab.morphs(sentence) if not word in stopwords])

x_train, x_test, y_train, y_test = train_test_split(data['문장'], y, test_size = 0.2, 
                                                    random_state = 42)

y_train = y_train.astype('float64')
y_test = y_test.astype('float64')

# RNN과 LSTM 공통 전처리

### 라이브러리 불러오기

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import sequence


2024-04-14 14:36:50.373425: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-14 14:36:50.499757: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-04-14 14:36:50.499775: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


2024-04-14 14:36:51.121014: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-04-14 14:36:51.121083: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [4]:
x_train.info()

<class 'pandas.core.series.Series'>
Index: 138932 entries, 62730 to 121958
Series name: 문장
Non-Null Count   Dtype 
--------------   ----- 
138932 non-null  object
dtypes: object(1)
memory usage: 2.1+ MB


In [5]:
# 텍스트 토큰화
tokenizer = Tokenizer()

# 각 단어에 고유한 정수 인덱스 할당
tokenizer.fit_on_texts(data['문장'])
train_seq = tokenizer.texts_to_sequences(x_train)
test_seq = tokenizer.texts_to_sequences(x_test)

In [6]:
# 패딩
max_len = 80
train_pad = sequence.pad_sequences(train_seq, maxlen = max_len)
test_pad = sequence.pad_sequences(test_seq, maxlen = max_len)

# RNN 모델 적용

In [7]:
# from tensorflow.keras.utils import to_categorical
# y_train = to_categorical(y_train, num_classes = 6)
# y_test = to_categorical(y_test, num_classes = 6)

In [8]:
len(tokenizer.word_index)+1

235424

In [9]:
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Flatten
from tensorflow.keras.models import Sequential

#모델 설계, 구축
model_rnn1 = Sequential()
model_rnn1.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128))

#순환층
model_rnn1.add(SimpleRNN(128, return_sequences=False)) 
model_rnn1.add(Flatten())

#Dense 층
model_rnn1.add(Dense(64, activation = 'relu'))
model_rnn1.add(Dense(6, activation = 'softmax'))

model_rnn1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         30134272  
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               32896     
                                                                 
 flatten (Flatten)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 6)                 390       
                                                                 
Total params: 30,175,814
Trainable params: 30,175,814
Non-trainable params: 0
_________________________________________________________________


2024-04-14 14:36:57.411403: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-04-14 14:36:57.411612: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-04-14 14:36:57.411669: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-04-14 14:36:57.411706: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-04-14 14:36:57.411741: W tensorflow/c

In [None]:
#컴파일
model_rnn1.compile(optimizer = 'adam', loss='sparse_categorical_crossentropy', 
                   metrics = ['accuracy'])

# 모델 적용
model_rnn1.fit(train_pad, y_train, epochs = 10, batch_size = 64, validation_split=0.2, shuffle = True)

# 손실 계산
loss1 = model_rnn1.evaluate(test_pad, y_test, verbose = 0)
print(f'Test Loss : {loss1}')

# LSTM

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.optimizers import RMSprop
max_features = len(train_pad)

In [None]:
# layers = 1, epochs = 10, batch_size = 128
model_lstm1 = models.Sequential()
model_lstm1.add(layers.Embedding(max_features, 32))

model_lstm1.add(layers.LSTM(32))  

model_lstm1.add(layers.Dense(6, activation = 'softmax'))

model_lstm1.compile(optimizer=RMSprop(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['acc'])

history = model_lstm1.fit(train_pad, y_train, epochs=10, batch_size=128, validation_split=0.2)

In [None]:
model_lstm1.summary()

In [None]:
#손실 계산
test_loss1, test_acc1 = model_lstm1.evaluate(test_pad,y_test)
print('Test acc:', test_acc1)
print(f'Test Loss : {test_loss1}')

In [None]:
# 예측
y_pred_lstm1 = model_lstm1.predict(test_pad)
# 다중 출력 모델의 출력을 다중 클래스 형식으로 변환
y_pred_classes_lstm = np.argmax(y_pred_lstm1, axis=1)

# 다중 클래스 형식으로 변환된 예측 결과와 실제 레이블 간의 F1-score 계산
f1_score_lstm1 = f1_score(y_test, y_pred_classes_lstm, average='weighted')

print(f'F1_Score : {f1_score_lstm1}')

In [None]:
# layers = 2, epochs = 30, batch_size = 128
model_lstm2 = models.Sequential()
model_lstm2.add(layers.Embedding(max_features, 32))

model_lstm2.add(layers.LSTM(32, return_sequences=True))  
model_lstm2.add(layers.LSTM(32))  

model_lstm2.add(layers.Dense(6, activation = 'softmax'))

model_lstm2.compile(optimizer=RMSprop(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['acc'])

history = model_lstm2.fit(train_pad, y_train, epochs=30, batch_size=128, validation_split=0.2)

In [None]:
model_lstm2.summary()

In [None]:
# 손실 계산
test_loss_2, test_acc_2 = model_lstm_2.evaluate(test_pad_tf,y_test_encoded)
print('Test acc:', test_acc_2)
print(f'Test Loss : {test_loss_2}')

In [None]:
# 예측
y_pred_lstm2 = model_lstm2.predict(test_pad)

# 다중 출력 모델의 출력을 다중 클래스 형식으로 변환
y_pred_classes_lstm2 = np.argmax(y_pred_lstm1, axis=1)

# 다중 클래스 형식으로 변환된 예측 결과와 실제 레이블 간의 F1-score 계산
f1_score_lstm2 = f1_score(y_test, y_pred_classes_lstm2, average='weighted')