In [2]:
import tensorflow as tf
import pickle
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# 데이터 로드 (csv 파일 경로에 맞게 수정)
train_data_path = 'C:/deep_learning_project01/cafe_train.csv'
validation_data_path = 'C:/deep_learning_project01/cafe_validation.csv'

train_df = pd.read_csv(train_data_path)
validation_df = pd.read_csv(validation_data_path)

# 데이터에서 질문과 답변 텍스트 분리
train_questions = train_df['q'].values
train_answers = train_df['a'].values
validation_questions = validation_df['q'].values
validation_answers = validation_df['a'].values

train_questions

array(['빙수 위에 뿌려진 과일에서 쉰 냄새가 나는데 어떻게 하면 되죠?', '마카롱 주문했는데 마카롱이 엄청 딱딱해요.',
       '와플 지금 받았는데 너무 눅눅해서 못먹겠어요 이거 어떻게 하면 되나요?', ...,
       '모바일상품권도 결제할 수 있어요?', '기프티콘으로 음료 계산할 수 있는 거 맞죠?',
       '후레쉬크림빵 포인트로 결제할 수 있는거죠?'], dtype=object)

In [4]:
# 특수 토큰 설정(start, end)
start_token = '<start>'
end_token = '<end>'

# Start/End 토큰을 답변에 추가
train_answers_with_tokens = [start_token + ' ' + ans + ' ' + end_token for ans in train_answers]
validation_answers_with_tokens = [start_token + ' ' + ans + ' ' + end_token for ans in validation_answers]

# 텍스트 토큰화를 위한 Tokenizer 설정
tokenizer = Tokenizer()
#질문과 시작,종료토큰을 추가한 답변데이터를 기반으로 어휘 사전 생성 및 학습
tokenizer.fit_on_texts(train_questions.tolist() + train_answers_with_tokens)
tokenizer.word_index

{'start': 1,
 'end': 2,
 '네': 3,
 '수': 4,
 '있나요': 5,
 '있어요': 6,
 '있습니다': 7,
 '한': 8,
 '거': 9,
 '주세요': 10,
 '주문': 11,
 '얼마에요': 12,
 '아메리카노': 13,
 '더': 14,
 '이': 15,
 '가능합니다': 16,
 '잔': 17,
 '마카롱': 18,
 '얼마예요': 19,
 '하나': 20,
 '되나요': 21,
 '아이스': 22,
 '두': 23,
 '입니다': 24,
 '건가요': 25,
 '맞죠': 26,
 '있는': 27,
 '됩니다': 28,
 '좀': 29,
 '아니요': 30,
 '어떤': 31,
 '드리겠습니다': 32,
 '몇': 33,
 '혹시': 34,
 '어떻게': 35,
 '알겠습니다': 36,
 '지금': 37,
 '다': 38,
 '안': 39,
 '많이': 40,
 '드릴게요': 41,
 '추가': 42,
 '개': 43,
 '제일': 44,
 '아이스크림': 45,
 '얼마인가요': 46,
 '케이크': 47,
 '라떼': 48,
 '가능한가요': 49,
 '거예요': 50,
 '결제': 51,
 '거죠': 52,
 '주스': 53,
 '오늘': 54,
 '다시': 55,
 '세트': 56,
 '딸기': 57,
 '포장': 58,
 '것': 59,
 '들어가는': 60,
 '커피': 61,
 '쿠키': 62,
 '총': 63,
 '걸로': 64,
 '주문했는데': 65,
 '여기': 66,
 '따뜻한': 67,
 '같이': 68,
 '잘': 69,
 '따로': 70,
 '테이크아웃': 71,
 '맞습니다': 72,
 '있을까요': 73,
 '가격이': 74,
 '게': 75,
 '만든': 76,
 '바로': 77,
 '들어갑니다': 78,
 '변경': 79,
 '초콜릿': 80,
 '언제': 81,
 '때': 82,
 '가장': 83,
 '도넛': 84,
 '안에': 85,
 '맞나요': 86,
 '건': 87,
 '다른'

In [6]:
# 생성한 어휘사전을 통해 텍스트를 시퀀스로 변환
train_questions_seq = tokenizer.texts_to_sequences(train_questions)
train_answers_seq = tokenizer.texts_to_sequences(train_answers_with_tokens)
validation_questions_seq = tokenizer.texts_to_sequences(validation_questions)
validation_answers_seq = tokenizer.texts_to_sequences(validation_answers_with_tokens)

In [8]:
# 패딩 처리 (각각 질문과 답변의 최대 길이에 맞춰 패딩)
#각 질문과 답변 데이터중 가장 긴 시퀀스 길이
max_len_q = max(len(seq) for seq in train_questions_seq)
max_len_a = max(len(seq) for seq in train_answers_seq)
#post-padding방식으로 시퀀스길이를 최대 길이에 맞춤
train_questions_padded = pad_sequences(train_questions_seq, maxlen=max_len_q, padding='post')
train_answers_padded = pad_sequences(train_answers_seq, maxlen=max_len_a, padding='post')
validation_questions_padded = pad_sequences(validation_questions_seq, maxlen=max_len_q, padding='post')
validation_answers_padded = pad_sequences(validation_answers_seq, maxlen=max_len_a, padding='post')

# 단어 집합 크기 계산
vocab_size = len(tokenizer.word_index) + 1  # 패딩 토큰을 고려하여 +1

# 데이터셋 전처리 상태 확인
print({
    "vocab_size": vocab_size,
    "max_len_q": max_len_q,
    "max_len_a": max_len_a,
    "train_sample_shape": train_questions_padded.shape
})

{'vocab_size': 42436, 'max_len_q': 21, 'max_len_a': 36, 'train_sample_shape': (54922, 21)}


In [9]:
# 모델 하이퍼파라미터 설정
embedding_dim = 256  # 임베딩 차원
latent_dim = 512  # LSTM의 잠재 차원

# 인코더 설정
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
#mask_zer=True -> 패딩을 통해 추가된 0을 무시함
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(latent_dim, return_state=True, name='encoder_lstm')(encoder_embedding)
encoder_states = [state_h, state_c]

# 디코더 설정
decoder_inputs = Input(shape=(None,), name='decoder_inputs')
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# 모델 정의
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# 모델 컴파일
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 모델 구조 확인
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    10863616    ['encoder_inputs[0][0]']         
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    10863616    ['decoder_inputs[0][0]']         
                                                                                              

In [10]:
# 디코더 출력은 정답 시퀀스에서 오른쪽으로 한 칸씩 이동한 값이 필요
train_answers_shifted = train_answers_padded[:, 1:]  # 마지막 한 칸을 제외한 시퀀스
train_answers_padded = train_answers_padded[:, :-1]  # 처음 한 칸을 제외한 시퀀스


early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# 모델 훈련
batch_size = 64
epochs = 30

history = model.fit(
    [train_questions_padded, train_answers_padded],  # 인코더 입력과 디코더 입력
    train_answers_shifted,  # 디코더 출력
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([validation_questions_padded, validation_answers_padded[:, :-1]], validation_answers_padded[:, 1:]),  # 검증 데이터
    callbacks=[early_stopping]
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30


In [15]:
encoder_model = Model(encoder_inputs, encoder_states)

# 디코더 모델 구성
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [28]:
# 모델 저장
model.save('C:/deep_learning_project01/seq2seq_model/seq2seq_model.h5')
encoder_model.save('C:/deep_learning_project01/seq2seq_model/seq2seq_encoder_model.h5')
decoder_model.save('C:/deep_learning_project01/seq2seq_model/seq2seq_decoder_model.h5')

# Tokenizer 저장
with open('C:/deep_learning_project01/seq2seq_model/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# max_len_q와 max_len_a 값을 파일로 저장
with open('C:/deep_learning_project01/attention/seq2seq_max_len_q.pickle', 'wb') as f:
    pickle.dump(max_len_q, f)

with open('C:/deep_learning_project01/attention/seq2seq_max_len_a.pickle', 'wb') as f:
    pickle.dump(max_len_a, f)
    



In [29]:
model2 = tf.keras.models.load_model('C:/deep_learning_project01/seq2seq_model/seq2seq_model.h5')

In [30]:
model2.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    10863616    ['encoder_inputs[0][0]']         
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    10863616    ['decoder_inputs[0][0]']         
                                                                                              

In [31]:
encoder_model2 = tf.keras.models.load_model('C:/deep_learning_project01/seq2seq_model/seq2seq_encoder_model.h5')
decoder_model2 = tf.keras.models.load_model('C:/deep_learning_project01/seq2seq_model/seq2seq_decoder_model.h5')



In [32]:
encoder_model2.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_inputs (InputLayer)  [(None, None)]           0         
                                                                 
 embedding (Embedding)       (None, None, 256)         10863616  
                                                                 
 encoder_lstm (LSTM)         [(None, 512),             1574912   
                              (None, 512),                       
                              (None, 512)]                       
                                                                 
Total params: 12,438,528
Trainable params: 12,438,528
Non-trainable params: 0
_________________________________________________________________


In [33]:
decoder_model2.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    10863616    ['decoder_inputs[0][0]']         
                                                                                                  
 input_1 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 512)]        0           []                               
                                                                                            

In [35]:
tokenizer2 = None
max_len_a2 = None
max_len_q2 = None

with open('C:/deep_learning_project01/seq2seq_model/tokenizer.pickle', 'rb') as handle:
    tokenizer2 = pickle.load(handle)

with open('C:/deep_learning_project01/attention/seq2seq_max_len_q.pickle', 'rb') as handle:
    max_len_q2 = pickle.load(handle)

with open('C:/deep_learning_project01/attention/seq2seq_max_len_a.pickle', 'rb') as handle:
    max_len_a2 = pickle.load(handle) 

print(max_len_a2)
print(max_len_q2)

36
21


In [36]:
tokenizer2.word_index

{'start': 1,
 'end': 2,
 '네': 3,
 '수': 4,
 '있나요': 5,
 '있어요': 6,
 '있습니다': 7,
 '한': 8,
 '거': 9,
 '주세요': 10,
 '주문': 11,
 '얼마에요': 12,
 '아메리카노': 13,
 '더': 14,
 '이': 15,
 '가능합니다': 16,
 '잔': 17,
 '마카롱': 18,
 '얼마예요': 19,
 '하나': 20,
 '되나요': 21,
 '아이스': 22,
 '두': 23,
 '입니다': 24,
 '건가요': 25,
 '맞죠': 26,
 '있는': 27,
 '됩니다': 28,
 '좀': 29,
 '아니요': 30,
 '어떤': 31,
 '드리겠습니다': 32,
 '몇': 33,
 '혹시': 34,
 '어떻게': 35,
 '알겠습니다': 36,
 '지금': 37,
 '다': 38,
 '안': 39,
 '많이': 40,
 '드릴게요': 41,
 '추가': 42,
 '개': 43,
 '제일': 44,
 '아이스크림': 45,
 '얼마인가요': 46,
 '케이크': 47,
 '라떼': 48,
 '가능한가요': 49,
 '거예요': 50,
 '결제': 51,
 '거죠': 52,
 '주스': 53,
 '오늘': 54,
 '다시': 55,
 '세트': 56,
 '딸기': 57,
 '포장': 58,
 '것': 59,
 '들어가는': 60,
 '커피': 61,
 '쿠키': 62,
 '총': 63,
 '걸로': 64,
 '주문했는데': 65,
 '여기': 66,
 '따뜻한': 67,
 '같이': 68,
 '잘': 69,
 '따로': 70,
 '테이크아웃': 71,
 '맞습니다': 72,
 '있을까요': 73,
 '가격이': 74,
 '게': 75,
 '만든': 76,
 '바로': 77,
 '들어갑니다': 78,
 '변경': 79,
 '초콜릿': 80,
 '언제': 81,
 '때': 82,
 '가장': 83,
 '도넛': 84,
 '안에': 85,
 '맞나요': 86,
 '건': 87,
 '다른'

In [37]:
# 디코딩 함수에서 예측
def decode_sequence(input_seq):
    # 인코더 상태 추출
    states_value = encoder_model2.predict(input_seq)
    
    # <start> 토큰으로 시작하는 타겟 시퀀스
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer2.word_index['start']
    
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model2.predict([target_seq] + states_value)
        
        # 예측된 토큰을 텍스트로 변환
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer2.index_word.get(sampled_token_index, '')
        decoded_sentence += ' ' + sampled_word
        
        # <end> 토큰을 만나거나 일정 길이를 넘으면 중단
        if (sampled_word == 'end' or len(decoded_sentence) > max_len_a2):
            stop_condition = True
        
        # 타겟 시퀀스 업데이트 (다음 단계 예측을 위해)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        # 상태 업데이트
        states_value = [h, c]
    
    return decoded_sentence

In [39]:
import numpy as np
# 예시 질문
input_question = "빨때는 빼고 줘"  # 새로운 질문 입력
input_seq = tokenizer2.texts_to_sequences([input_question])
input_seq = pad_sequences(input_seq, maxlen=max_len_q2, padding='post')

# 예측
predicted_sentence = decode_sequence(input_seq)

print(f"질문: {input_question}")
print(f"예측된 답변: {predicted_sentence}")


질문: 빨때는 빼고 줘
예측된 답변:  네 고객님 end
