# Seq2Seq Q&A Chatbot 구현

### 데이터 취득

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/songys/Chatbot_data/refs/heads/master/ChatbotData.csv')
df = df[['Q', 'A']]
df

Unnamed: 0,Q,A
0,12시 땡!,하루가 또 가네요.
1,1지망 학교 떨어졌어,위로해 드립니다.
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.
4,PPL 심하네,눈살이 찌푸려지죠.
...,...,...
11818,훔쳐보는 것도 눈치 보임.,티가 나니까 눈치가 보이는 거죠!
11819,훔쳐보는 것도 눈치 보임.,훔쳐보는 거 티나나봐요.
11820,흑기사 해주는 짝남.,설렜겠어요.
11821,힘든 연애 좋은 연애라는게 무슨 차이일까?,잘 헤어질 수 있는 사이 여부인 거 같아요.


### 데이터 전처리

##### 토커나이저 학습 (sentencepice 활용)

- 접두사, 접미사 처리 (bos, eos)
    - Train() 인자 cmd 옵션을 추가 (<bos>, <eos>, <pad>, <oov>...)
    - set_encode_extra_options(':')
    - set_encode_extra_options('bos:')
    - set_encode_extra_options(':eos')
    - set_encode_extra_options('bos:eos')

In [2]:
q_inputs = [str(q) for q in df['Q']]
a_inputs = [str(a) for a in df['A']]
a_targets = [str(a) for a in df['A']]

##### 학습용 데이터 Q_input, A_input, A_target 생성

- 패딩처리까지

In [3]:
import sentencepiece as spm

all_text = q_inputs + a_inputs + a_targets
with open("all_text.txt", "w", encoding="utf-8") as f:
    for text in all_text:
        f.write(text + "\n")

INPUT = 'all_text.txt'
MODEL_PREFIX = 'chatbot'
VOCAB_SIZE = 9900

cmd = f'--input={INPUT} --model_prefix={MODEL_PREFIX} --vocab_size={VOCAB_SIZE}'

spm.SentencePieceTrainer.Train(cmd)

sp = spm.SentencePieceProcessor()
sp.load(f'{MODEL_PREFIX}.model')

for doc in df['Q'].values[:3]:
  print(doc)
  print(sp.encode_as_pieces(doc))
  print(sp.encode_as_ids(doc))


12시 땡!
['▁12', '시', '▁땡', '!']
[5512, 514, 4608, 53]
1지망 학교 떨어졌어
['▁1', '지망', '▁학교', '▁떨어졌어']
[346, 7847, 875, 2316]
3박4일 놀러가고 싶다
['▁3', '박', '4', '일', '▁놀러가고', '▁싶다']
[392, 1709, 3652, 95, 3532, 152]


In [4]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

q_input_seq = [sp.encode_as_ids(q) for q in q_inputs]
a_input_seq = [[sp.bos_id()] + sp.encode_as_ids(a) for a in a_inputs]
a_target_seq = [sp.encode_as_ids(a) + [sp.eos_id()] for a in a_targets]

q_max_len = max(len(seq) for seq in q_input_seq)
a_max_len = max(len(seq) for seq in a_input_seq)

# padding
q_input_padded = pad_sequences(q_input_seq, maxlen=q_max_len, padding='pre')
a_input_padded = pad_sequences(a_input_seq, maxlen=a_max_len, padding='post')
a_target_padded = pad_sequences(a_target_seq, maxlen=a_max_len, padding='post')

q_input_padded.shape, a_input_padded.shape, a_target_padded.shape

((11823, 23), (11823, 36), (11823, 36))

### 모델 생성 및 학습

##### 인코더 생성

##### 인코더 모델

In [5]:
from tensorflow.keras import layers, models

VOCAB_SIZE = sp.get_piece_size()
LATENT_DIM = 512
EMBEDDING_DIM = 100

encoder_inputs = layers.Input(shape=(q_max_len,))
embedding_layer = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, trainable=False)

x = embedding_layer(encoder_inputs)
encoder_outputs, h, c = layers.LSTM(LATENT_DIM, return_state=True)(x)
encoder_status = [h, c]

encoder_model = models.Model(inputs=encoder_inputs, outputs=encoder_status)
encoder_model.summary()

##### 디코더 (teacher-forcing 모델) 생성

In [6]:
decoder_inputs = layers.Input(shape=(a_max_len,))
embedding_layer = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, trainable=False)

x = embedding_layer(decoder_inputs)

decoder_lstm = layers.LSTM(LATENT_DIM, return_sequences=True, return_state=True)    # hidden layer size: LATENT_DIM
x, h, c = decoder_lstm(x, initial_state=encoder_status)     # 초기값을 인코더에서 넘어온 걸로 설정

decoder_dense = layers.Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(x)

decoder_teacher_forcing_model = models.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs
)

decoder_teacher_forcing_model.summary()

##### 학습 compile - fit

In [7]:
decoder_teacher_forcing_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

history = decoder_teacher_forcing_model.fit(
    [q_input_padded, a_input_padded],
    a_target_padded,
    batch_size=32,
    epochs=70,
    validation_split=0.2
)

Epoch 1/70
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 66ms/step - accuracy: 0.7790 - loss: 2.2702 - val_accuracy: 0.7791 - val_loss: 1.5892
Epoch 2/70
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 66ms/step - accuracy: 0.8302 - loss: 1.2153 - val_accuracy: 0.8140 - val_loss: 1.4616
Epoch 3/70
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 64ms/step - accuracy: 0.8441 - loss: 1.1246 - val_accuracy: 0.8138 - val_loss: 1.4579
Epoch 4/70
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 63ms/step - accuracy: 0.8445 - loss: 1.1010 - val_accuracy: 0.8140 - val_loss: 1.4459
Epoch 5/70
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 64ms/step - accuracy: 0.8443 - loss: 1.0867 - val_accuracy: 0.8142 - val_loss: 1.4441
Epoch 6/70
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 65ms/step - accuracy: 0.8457 - loss: 1.0680 - val_accuracy: 0.8142 - val_loss: 1.4456
Epoch 7/70
[1m2

### 모델 추론

##### 디코더 (추론 모델) 생성

In [9]:
decoder_hidden_state = layers.Input(shape=(LATENT_DIM,))
decoder_cell_state = layers.Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_hidden_state, decoder_cell_state]

decoder_single_input = layers.Input(shape=(1,))

x = embedding_layer(decoder_single_input)

x, h, c = decoder_lstm(x, initial_state=decoder_states_inputs)
decoder_states = [h, c]

decoder_outputs_ = decoder_dense(x)

decoder_inference_model = models.Model(
    inputs=[decoder_single_input] + decoder_states_inputs,
    outputs=[decoder_outputs_] + decoder_states
)
decoder_inference_model.summary()

##### 추론 함수

In [10]:
def translate(input_seq):
  encoder_states_value = encoder_model.predict(input_seq)
  decoder_states_value = encoder_states_value

  bos_index = sp.piece_to_id('<bos>')
  eos_index = sp.piece_to_id('<eos>')

  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = bos_index

  output_sentence = []

  for _ in range(q_max_len):
    output_tokens, h, c = decoder_inference_model.predict([target_seq] + decoder_states_value)

    pred_proba = output_tokens[0, 0, :]
    pred_index = int(np.argmax(pred_proba))

    if pred_index == eos_index:
      break

    if pred_index > 0:
      word = sp.id_to_piece(pred_index)
      output_sentence.append(word)

    target_seq[0, 0] = pred_index
    decoder_states_value = [h, c]

  return sp.decode_pieces(output_sentence)

  # return " ".join(output_sentence)


##### 테스트

In [11]:
for _ in range(5):
  idx = np.random.choice(len(q_input_padded))
  input_seq = q_input_padded[idx:idx+1]
  output_sent = translate(input_seq)

  display("입력:", q_inputs[idx])
  display("학습:", a_inputs[idx])
  display("추론:", output_sent)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

'입력:'

'운명 같은 사랑 가능할까?'

'학습:'

'저는 있다고 믿어요.'

'추론:'

'좋은 곳으로 데려다 줄 거예요.텐데보세요.같이.도 보고.같이을 인지시켜'

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32

'입력:'

'인테리어 혼자 해볼까'

'학습:'

'손길이 가서 더 애정이 생길 것 같아요.'

'추론:'

'좋은 곳으로 데려다 줄 거예요.텐데보세요.같이.도 보고.같이을 인지시켜'

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40

'입력:'

'미안해 모든게 다 미안'

'학습:'

'마음 아프네요.'

'추론:'

'좋은 곳으로 데려다 줄 거예요.텐데보세요.같이.도 보고.같이을 인지시켜'

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38

'입력:'

'사랑이 다가왔어'

'학습:'

'당신도 다가가 보세요.'

'추론:'

'좋은 곳으로 데려다 줄 거예요.텐데보세요.같이.도 보고.같이을 인지시켜'

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37

'입력:'

'뭐 깨진지 몇달도 더 되었지만'

'학습:'

'떠오르는 생각은 지울 수가 없나봐요.'

'추론:'

'좋은 곳으로 데려다 줄 거예요.텐데보세요.같이.도 보고.같이을 인지시켜'

### 간단한 Chatbot 구현

1. 사용자의 입력을 받아 (처리)
2. 추론 함수에 전달해서
3. 응답을 출력
4. 1~3 '종료' 전까지 반복

In [None]:
while True:
  input_text = input("질문을 입력하세요.: ")

  if input_text == "종료":
    print("감사합니다.")
    break

  seq = sp.encode_as_ids(input_text)
  input_seq = pad_sequences([seq], maxlen=q_max_len, padding='pre')

  output_text = translate(input_text)

  print("chatbot:", output_text)