In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate

In [2]:
# 데이터 읽어오기
train_file = "./cnn_model_data/ChatbotData.csv"
data = pd.read_csv(train_file, delimiter=",")
features = data["Q"].tolist()
labels = data["label"].tolist()

In [6]:
# 단어 인덱스 시퀀스 벡터
corpus = [preprocessing.text.text_to_word_sequence(text) for text in features]
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
word_index = tokenizer.word_index

MAX_SEQ_LEN = 15 # 단어 시퀀스 벡터의 크기 설정
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding="post")
print(padded_seqs.shape)

In [7]:
# 학습, 검증, 테스트 데이터셋 생성 (7:2:1 비율로 할당)
ds = tf.data.Dataset.from_tensor_slices((padded_seqs, labels))
ds = ds.shuffle(len(features))

In [14]:
train_size = int(len(padded_seqs) * 0.7)
val_size = int(len(padded_seqs) * 0.2)
test_size = int(len(padded_seqs) * 0.1)

In [15]:
train_ds = ds.take(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(20)

In [18]:
# 하이퍼 파라미터 설정
dropout_prob = 0.5
EMB_SIZE = 128
EPOCH = 5
VOCAB_SIZE = len(word_index) + 1 # 전체 단어 수

# CNN 모델 정의
input_layer = Input(shape=(MAX_SEQ_LEN, ))
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

conv1 = Conv1D(filters=128, kernel_size=3, padding="valid", activation=tf.nn.relu)(dropout_emb)
pool1 = GlobalMaxPool1D()(conv1)

conv2 = Conv1D(filters=128, kernel_size=4, padding="valid", activation=tf.nn.relu)(dropout_emb)
pool2 = GlobalMaxPool1D()(conv2)

conv3 = Conv1D(filters=128, kernel_size=5, padding="valid", activation=tf.nn.relu)(dropout_emb)
pool3 = GlobalMaxPool1D()(conv3)

# 3, 4, 5-gram 이후 합치기
concat = concatenate([pool1, pool2, pool3])

hidden = Dense(128, activation=tf.nn.relu)(concat)
dropout_hidden = Dropout(rate=dropout_prob)(hidden)
logits = Dense(3, name="logits")(dropout_hidden)
predictions = Dense(3, activation=tf.nn.softmax)(logits)

In [19]:
# 모델 생성
model = Model(inputs=input_layer, outputs=predictions)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [20]:
# 모델 학습
model.fit(train_ds, validation_data=val_ds, epochs=EPOCH, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x23c42f85648>

In [26]:
# 모델 평가 (테스트 데이터셋 활용)
loss, accuracy = model.evaluate(test_ds, verbose=1)
print("Accuracy: {:.4f}%".format(accuracy * 100))
print("Loss: {:.4f}".format(loss))

# 모델 저장
model.save("cnn_model.h5")

Accuracy: 96.8697%
Loss: 0.0958


In [27]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import preprocessing

# 데이터 읽어오기
train_file = "./cnn_model_data/ChatbotData.csv"
data = pd.read_csv(train_file, delimiter=",")
features = data["Q"].tolist()
labels = data["label"].tolist()

# 단어 인덱스 시퀀스 벡터
corpus = []
for text in features:
    word_vec = preprocessing.text.text_to_word_sequence(text)
    corpus.append(word_vec)

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

MAX_SEQ_LEN = 15 # 단어 시퀀스 벡터의 크기
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding="post")

In [31]:
# 테스트 데이터셋 생성
ds = tf.data.Dataset.from_tensor_slices((padded_seqs, labels))
ds = ds.shuffle(len(features))
test_ds = ds.take(2000).batch(20) # 테스트 데이터 셋

In [32]:
# 감정 분류 CNN 모델 불러오기
model = load_model("cnn_model.h5")
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 15, 128)      1715072     input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 15, 128)      0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 13, 128)      49280       dropout[0][0]                    
______________________________________________________________________________________________

In [33]:
model.evaluate(test_ds, verbose=2)

100/100 - 0s - loss: 0.0708 - accuracy: 0.9770


[0.07083510607481003, 0.9769999980926514]

In [34]:
# 테스트 데이터셋의 10212번째 데이터 출력
print("단어 시퀀스: ", corpus[10212])
print("단어 인덱스 시퀀스: ", padded_seqs[10212])
print("문장 분류(정답): ", labels[10212])

단어 시퀀스:  ['썸', '타는', '여자가', '남사친', '만나러', '간다는데', '뭐라', '해']
단어 인덱스 시퀀스:  [   13    61   127  4320  1333 12162   856    31     0     0     0     0
     0     0     0]
문장 분류(정답):  2


In [35]:
# 테스트 데이터셋의 10212번째 데이터 감정 예측
picks = [10212]
predict = model.predict(padded_seqs[picks])
predict_class = tf.math.argmax(predict, axis=1)

print("감정 예측 점수: ", predict)
print("감정 예측 클래스: ", predict_class.numpy())

감정 예측 점수:  [[6.6040997e-07 5.0315560e-07 9.9999881e-01]]
감정 예측 클래스:  [2]
