라이브러리 불러오기

In [1]:
import numpy as np
import pandas as pd
import re
import unicodedata
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

텍스트 데이터 전처리에 필요한 함수 정의

In [2]:
#유니코드 문자를 아스키로 변경함

#독일어에 사용되는 알파벳 세트를 영어에 사용되는 알파벳 세트와 일치시키기 위한 작업임
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def preprocess_sentence(sent):
    #문장을 소문자로 변경시키고 아스키 문자로 변경함.
    sent = unicode_to_ascii(sent.lower())

    # 단어와 구두점 사이에 공백을 만듬.
    # Ex) "he is a boy." => "he is a boy ."
    sent = re.sub(r"([?.!,¿])", r" \1", sent)

    # (a-z, A-Z, ".", "?", "!", ",")를 제외하고는 전부 공백으로 변환.
    sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)

    # 한 칸을 넘는 공백은 제거
    sent = re.sub(r"\s+", " ", sent)
    return sent

def load_preprocessed_data(num_samples):
  encoder_input, decoder_input, decoder_target = [], [], []

  with open("deu.txt", "r") as lines:
    for i, line in enumerate(lines):

      # src 데이터와 tar 데이터 분리
      src_line, tar_line, _ = line.strip().split('\t')

      # src 데이터 전처리
      src_line_input = [w for w in preprocess_sentence(src_line).split()]

      # tar 데이터 전처리
      tar_line = preprocess_sentence(tar_line)
      # 디코더의 입력값으로 들어갈 데이터에는 <sos> 태그를 붙여주고,
      tar_line_input = [w for w in ("<sos> " + tar_line).split()]
      # 디코더의 출력값과 비교될 타겟 데이터에는 <eos> 태그를 붙여줌.
      tar_line_target = [w for w in (tar_line + " <eos>").split()]

      encoder_input.append(src_line_input)
      decoder_input.append(tar_line_input)
      decoder_target.append(tar_line_target)

      if i == num_samples - 1:
        break
  
  return encoder_input, decoder_input, decoder_target


텍스트 전처리

In [3]:
# 30,000개의 텍스트 데이터를 전처리해서 사용함
# teacher forcing을 위해 decoder에 들어갈 input이 필요함.
sents_en_in, sents_deu_in, sents_deu_out = load_preprocessed_data(30000)

print(len(sents_en_in))
print(len(sents_deu_in))
print(len(sents_deu_out))

print(sents_en_in[0])
print(sents_deu_in[0])
print(sents_deu_out[0])


30000
30000
30000
['go', '.']
['<sos>', 'geh', '.']
['geh', '.', '<eos>']


단어 토큰화 시키기

In [4]:
# 텍스트 데이터 전처리 과정에서 이미 한 번 필터시켰기 때문에 따로 필터를 거치고 소문자 변환 작업을 할 필요가 없음.
tokenizer_en = Tokenizer(filters="", lower=False)
tokenizer_deu = Tokenizer(filters="", lower=False)

# 각 토크나이저 학습
tokenizer_en.fit_on_texts(sents_en_in)
tokenizer_deu.fit_on_texts(sents_deu_in + ["<eos>"])


print(tokenizer_deu.word_index["<sos>"])
print(tokenizer_deu.word_index["<eos>"])

# Keras 공식 문서에 의하면 Embedding 레이어에 들어가기 위해서 1을 더해줘야함.
en_vocab_size = len(tokenizer_en.word_index) + 1
deu_vocab_size = len(tokenizer_deu.word_index) + 1

print(f"the size of the english vocabulary is {en_vocab_size}")
print(f"the size of the english vocabulary is {deu_vocab_size}")

tokenized_en_in = tokenizer_en.texts_to_sequences(sents_en_in)
tokenized_deu_in = tokenizer_deu.texts_to_sequences(sents_deu_in)
tokenized_deu_out = tokenizer_deu.texts_to_sequences(sents_deu_out)


print(tokenized_en_in[5482])
print(tokenized_deu_in[5482])
print(tokenized_deu_out[5482])

1
7058
the size of the english vocabulary is 4460
the size of the english vocabulary is 7059
[20, 5, 19, 7, 4]
[1, 94, 12, 10, 83, 6]
[94, 12, 10, 83, 6, 7058]


인덱스-단어 사전, 단어-인덱스 사전 만들기

In [5]:
en_to_index = tokenizer_en.word_index
index_to_en = tokenizer_en.index_word

deu_to_index = tokenizer_deu.word_index
index_to_deu = tokenizer_deu.index_word

시퀸스 데이터에 패딩 적용

In [6]:
padded_en_in = pad_sequences(tokenized_en_in, padding='post')
padded_deu_in = pad_sequences(tokenized_deu_in, padding='post')
padded_deu_out = pad_sequences(tokenized_deu_out, padding='post')

# 각 입력값의 패딩 길이 확인.
print(len(padded_en_in[0]))
print(len(padded_deu_in[0]))

7
12


훈련, 검증 데이터 분할

In [7]:
# shuffle
indices = np.arange(padded_en_in.shape[0])
np.random.shuffle(indices)
print(indices)

encoder_input = padded_en_in[indices]
decoder_input = padded_deu_in[indices]
decoder_output = padded_deu_out[indices]

# 분할
n_train = int(encoder_input.shape[0]*0.9)

train_encoder_input = encoder_input[:n_train, :]
train_decoder_input = decoder_input[:n_train, :]
train_decoder_output = decoder_output[:n_train, :]

val_encoder_input = encoder_input[n_train:, :]
val_decoder_input = decoder_input[n_train:, :]
val_decoder_output = decoder_output[n_train:, :]

print(train_encoder_input.shape)
print(val_encoder_input.shape)

print(train_decoder_input.shape)
print(val_decoder_input.shape)

print(train_decoder_output.shape)
print(val_decoder_output.shape)

[26430 11794  3989 ... 22805 12049  2228]
(27000, 7)
(3000, 7)
(27000, 12)
(3000, 12)
(27000, 12)
(3000, 12)


모델 만들기 위해 필요한 모듈 import

In [8]:
from tensorflow.keras.layers import Dense, Embedding, Masking, LSTM, Input
from tensorflow.keras import Model

seq2seq 모델 만들기

In [9]:
# 임베딩 크기
embedding_size = 100
# LSTM 레이어의 셀 개수
latent_size = 50

encoder_input = Input(shape=(None, ))
encoder_embedding_layer = Embedding(input_dim=en_vocab_size, output_dim=embedding_size)
encoder_embedding_output = encoder_embedding_layer(encoder_input)
# 0으로 패딩된 시퀸스는 건너뛰도록 마스킹 레이어를 추가해줌.
encoder_masking_output = Masking(mask_value=0.0)(encoder_embedding_output)
encoder_lstm_layer = LSTM(units=latent_size, return_state=True)
encoder_lstm_output, encoder_state_h, encoder_state_c = encoder_lstm_layer(encoder_masking_output)
encoder_states = [encoder_state_h, encoder_state_c]

decoder_input = Input(shape=(None, ))
decoder_embedding_layer = Embedding(input_dim=deu_vocab_size, output_dim=embedding_size)
decoder_embedding_output = decoder_embedding_layer(decoder_input)
# 0으로 패딩된 시퀸스는 건너뛰도록 마스킹 레이어를 추가해줌.
decoder_masking_output = Masking(mask_value=0.0)(decoder_embedding_output)
# many to many이므로 return_sequences를 True로 설정해줌.
decoder_lstm_layer = LSTM(units=latent_size, return_state=True, return_sequences=True)
decoder_lstm_output, _, _ = decoder_lstm_layer(decoder_masking_output, initial_state=encoder_states)
# 독일어 어휘의 개수만큼 Dense 레이어의 유닛을 설정해줌.
decoder_dense_layer = Dense(units=deu_vocab_size, activation="softmax")
decoder_dense_output = decoder_dense_layer(decoder_lstm_output)

model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_dense_output])
model.compile(optimizer='rmsprop', metrics=['acc'], loss="sparse_categorical_crossentropy")
model.summary()



Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 100)    446000      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    705900      input_2[0][0]                    
______________________________________________________________________________________________

seq2seq 모델 훈련시키기

In [10]:
model.fit(x=[train_encoder_input, train_decoder_input], 
          y=train_decoder_output, 
          validation_data=([val_encoder_input, val_decoder_input], val_decoder_output),
          batch_size=128,
          epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f8386831450>

인코더 모델 빌드

In [11]:
encoder_model = Model(encoder_input, encoder_states)

encoder_model.summary()


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         446000    
_________________________________________________________________
masking (Masking)            (None, None, 100)         0         
_________________________________________________________________
lstm (LSTM)                  [(None, 50), (None, 50),  30200     
Total params: 476,200
Trainable params: 476,200
Non-trainable params: 0
_________________________________________________________________


디코더 모델 빌드

In [12]:
decoder_state_input_h = Input(shape=(latent_size,))
decoder_state_input_c = Input(shape=(latent_size,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding_output2 = decoder_embedding_layer(decoder_input)
decoder_outputs2, state_h2, state_c2 = decoder_lstm_layer(decoder_embedding_output2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

decoder_outputs2 = decoder_dense_layer(decoder_outputs2)

decoder_model = Model(
    [decoder_input] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

decoder_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    705900      input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 50)]         0                                            
____________________________________________________________________________________________

eng to deu 번역 함수 만들기

In [13]:
def decode_sentences(sentences):
    
    
    results = []
    sequenced_sentences = tokenizer_en.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(sequenced_sentences, maxlen=7, padding="post")

    for i in range(padded_sequences.shape[0]):
      padded_sequence = padded_sequences[i, :].reshape(1, -1)
      
      # 입력으로부터 인코더의 상태를 얻음
      states_value = encoder_model.predict(padded_sequence)


      target_seq = np.zeros((1, 1))
      target_seq[0, 0] = deu_to_index['<sos>']

      sentence = ""


      while True:
        output, h, c = decoder_model.predict([target_seq] + states_value)

        output_word_index = np.argmax(np.squeeze(output))
        output_word = index_to_deu[output_word_index]

        if output_word == "<eos>" or len(sentence) > 50:
          break
        

        sentence = sentence + " " + output_word

        states_value = [h, c]

        target_seq = np.zeros((1,1))
        target_seq[0, 0] = output_word_index
      
      results.append(sentence)

    return results

단어 번역 실험하기

In [14]:
input_sentences = ["hello", "how are you", "nice to meet you", "where is your mother?"]
results = decode_sentences(input_sentences)
print(results)

[' ?', ' wie sind sie ?', ' beschaftigt sie ?', ' wo ist deine .']
