# seq2seq으로 한글,영어 번역기 만들기 프로젝트

### 라이브러리 불러오기

In [1]:
import requests
import tarfile

import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split

import matplotlib.ticker as ticker
import matplotlib.pyplot as plt

import time
import re
import os
import io

from konlpy.tag import Mecab

In [2]:
# 다운로드할 파일의 URL
url = "https://raw.githubusercontent.com/jungyeul/korean-parallel-corpora/master/korean-english-news-v1/korean-english-park.train.tar.gz"

# 파일을 저장할 경로
filename = "korean-english-park.train.tar.gz"

# 파일 다운로드
response = requests.get(url)
with open(filename, 'wb') as file:
    file.write(response.content)

print(f"{filename} 파일이 성공적으로 다운로드되었습니다.")

# 파일 경로
file_name = "korean-english-park.train.tar.gz"

# 압축 해제 및 추출
with tarfile.open(file_name, "r:gz") as tar:
    tar.extractall()  # 현재 디렉토리에 파일을 추출
    print(f"{file_name} 파일이 성공적으로 추출되었습니다.")

korean-english-park.train.tar.gz 파일이 성공적으로 다운로드되었습니다.
korean-english-park.train.tar.gz 파일이 성공적으로 추출되었습니다.


In [3]:
ko_path_to_file = 'korean-english-park.train.ko'
en_path_to_file = 'korean-english-park.train.en'

with open(ko_path_to_file, "r") as f:
    ko_raw = f.read().splitlines()

with open(en_path_to_file, "r") as f:
    en_raw = f.read().splitlines()
len(ko_raw)

94123

In [4]:
# set 중복 데이터 제거

seen = set()  # 중복 여부를 확인할 set
ko_clean_corp = []
en_clean_corp = []

for ko, en in zip(ko_raw, en_raw):
    if ko not in seen:
        ko_clean_corp.append(ko)  # 중복되지 않는 경우만 추가
        en_clean_corp.append(en)
        seen.add(ko)

In [5]:
print(len(ko_clean_corp))
print(len(en_clean_corp))

print(ko_clean_corp[1430])
print(en_clean_corp[1430])

77591
77591
그러나 박스데일은 최근 해커들이 넷스케이프 소프트웨어 보안 암호를 해독했을 때 스스로 아주 민첩함을 입증했다. 어떻게 할 것인가?
But Barksdale proved himself to be fairly nimble recently when hackers cracked a security code in Netscape's software.


In [6]:
# 한글 정규식을 빼고 전처리 진행

def kor_preprocess_sentence(sentence):
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^가-힣?.!,]+", " ", sentence)

    sentence = sentence.strip()
    return sentence 
    
def preprocess_sentence(sentence, s_token=False, e_token=False):
    sentence = sentence.lower().strip()

    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)

    sentence = sentence.strip()

    if s_token:
        sentence = '<start> ' + sentence

    if e_token:
        sentence += ' <end>'
    
    return sentence

In [7]:
## 전처리 코드를 돌려서 corpus에 넣기

f_enc_corpus = []
f_kor_corpus = []



for ko_sentence, en_sentence in zip(ko_clean_corp, en_clean_corp):
    if len(kor_preprocess_sentence(ko_sentence)) < 40:
        f_kor_corpus.append(kor_preprocess_sentence(ko_sentence))  # 한국어 문장 추가
        f_enc_corpus.append(preprocess_sentence(en_sentence, s_token=True, e_token=True))  # 영어 문장 추가

print("Korean:", f_kor_corpus[125])   
print("English:", f_enc_corpus[125])   

print(len(f_kor_corpus))
print(len(f_enc_corpus))

Korean: 하지만 멀티미디어용 컴퓨터 한 대에는 약 명의 학생들이 있다 .
English: <start> however , there are about students for every multimedia computer . <end>
16647
16647


In [8]:
mecab = Mecab()

def ko_tokenize(corpus):
    mecab = Mecab()
    
    tokenized_corpus = [mecab.morphs(sentence) for sentence in corpus]
    
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(tokenized_corpus)

    tensor = tokenizer.texts_to_sequences(tokenized_corpus)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    return tensor, tokenizer

def en_tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(corpus)

    tensor = tokenizer.texts_to_sequences(corpus)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    return tensor, tokenizer

In [9]:
# 토큰화 시키고 테스트, 검증셋 분리
kor_tensor, kor_tokenizer = ko_tokenize(f_kor_corpus)
eng_tensor, eng_tokenizer = en_tokenize(f_enc_corpus)

In [10]:
kor_tensor

array([[ 911,  479,  430, ...,    0,    0,    0],
       [  54,  284,    5, ...,    0,    0,    0],
       [ 110,    5,  431, ...,    0,    0,    0],
       ...,
       [  72,    5, 1390, ...,    0,    0,    0],
       [  54,  304, 1147, ...,    0,    0,    0],
       [ 895,  636,   30, ...,    0,    0,    0]], dtype=int32)

In [11]:
print('한국어 단어장크기 : ',len(kor_tokenizer.word_index))
print('영어 단어장크기 : ',len(eng_tokenizer.word_index))

한국어 단어장크기 :  15829
영어 단어장크기 :  17604


# 모델 설계

In [12]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.w_dec = tf.keras.layers.Dense(units)
        self.w_enc = tf.keras.layers.Dense(units)
        self.w_com = tf.keras.layers.Dense(1)
    
    def call(self, h_enc, h_dec):
        # h_enc shape: [batch x length x units]
        # h_dec shape: [batch x units]

        h_enc = self.w_enc(h_enc)
        h_dec = tf.expand_dims(h_dec, 1)
        h_dec = self.w_dec(h_dec)

        score = self.w_com(tf.nn.tanh(h_dec + h_enc))
        
        attn = tf.nn.softmax(score, axis=1)

        context_vec = attn * h_enc
        context_vec = tf.reduce_sum(context_vec, axis=1)

        return context_vec, attn
    
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()

        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(enc_units,
                                       return_sequences=True)

    def call(self, x):
        emb_x = self.embedding(x)
        return self.gru(emb_x)
    
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()

        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.attention = BahdanauAttention(self.dec_units)   # Attention 필수 사용!
        self.gru= tf.keras.layers.GRU(dec_units,
                                       return_sequences=True,
                                       return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, h_dec, enc_out):
        context_vec, attn = self.attention(enc_out, h_dec)
        
        out = self.embedding(x)
        out = tf.concat([tf.expand_dims(context_vec, 1), out], axis=-1)
        out, h_dec = self.gru(out)

        out = tf.reshape(out, (-1, out.shape[2]))
        out = self.fc(out)

        return out, h_dec, attn

In [13]:
BATCH_SIZE     = 64
SRC_VOCAB_SIZE = len(kor_tokenizer.index_word) + 1
TGT_VOCAB_SIZE = len(eng_tokenizer.index_word) + 1

units         = 1024
embedding_dim = 512

encoder = Encoder(SRC_VOCAB_SIZE, embedding_dim, units)
decoder = Decoder(TGT_VOCAB_SIZE, embedding_dim, units)

# sample input
sequence_len = 30

sample_enc = tf.random.uniform((BATCH_SIZE, sequence_len))
print ('Encoder Input:', sample_enc.shape)

sample_output = encoder(sample_enc)

print ('Encoder Output:', sample_output.shape)

sample_state = tf.random.uniform((BATCH_SIZE, units))

sample_logits, h_dec, attn = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                     sample_state, sample_output)

print ('Decoder Output:', sample_logits.shape)
print ('Decoder Hidden State:', h_dec.shape)
print ('Attention:', attn.shape)

Encoder Input: (64, 30)
Encoder Output: (64, 30, 1024)
Decoder Output: (64, 17605)
Decoder Hidden State: (64, 1024)
Attention: (64, 30, 1)


# 학습

In [14]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    
    return tf.reduce_mean(loss)

@tf.function
def train_step(src, tgt, encoder, decoder, optimizer, dec_tok):
    bsz = src.shape[0]
    loss = 0

    with tf.GradientTape() as tape:
        enc_out = encoder(src)
        h_dec = enc_out[:, -1]
        dec_src = tf.expand_dims([dec_tok.word_index['<start>']] * bsz, 1)

        for t in range(1, tgt.shape[1]):
            # 디코더에 dec_src = 첫번째는 start, h_dec = 첫번째는 encoder의 마지막 state
            pred, h_dec, _ = decoder(dec_src, h_dec, enc_out)

            loss += loss_function(tgt[:, t], pred)
            dec_src = tf.expand_dims(tgt[:, t], 1)
        
    batch_loss = (loss / int(tgt.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss

def test_step(src_list, kor_tokenizer, eng_tokenizer):
    for src in src_list:
        output = kor_preprocess_sentence(src)
        output = mecab.morphs(output)
        output = kor_tokenizer.texts_to_sequences(output)
        pre_src = tf.keras.preprocessing.sequence.pad_sequences(output, padding='post')
        pre_src = tf.reshape(pre_src, (1, pre_src.shape[0]))
        enc_out = encoder(pre_src)
    
        h_dec = enc_out[:, -1]
        bsz = 1
        dec_src = tf.expand_dims([eng_tokenizer.word_index['<start>']] * bsz, 1)
        
        dec_output = []
        
        max_length = 25
        for t in range(max_length):
            pred, h_dec, _ = decoder(dec_src, h_dec, enc_out)
            index = np.argmax(pred)
            eng_word = eng_tokenizer.index_word[index]
            dec_output.append(eng_word)
            if tf.equal(index, eng_tokenizer.word_index['<end>']):
                break
            dec_src = tf.expand_dims([index], 0)
        
        result = ' '.join(dec_output)
        print('원문 : ',src)
        print('번역 : ',result)
    return

In [15]:
test_step(['너는 직업이 무엇이니?'],kor_tokenizer, eng_tokenizer)

원문 :  너는 직업이 무엇이니?
번역 :  earnings solomon listed basics slowing villagers heavens lexi unanswered absolutely stays accusations accidents surpassed angelina strangers alphabet larisa effects pardo astronomy indefinite mainland athletes recovered


In [20]:
import random
from tqdm import tqdm

EPOCHS = 3

# 예문
src_list = ['오바마는 대통령이다.', '시민들은 도시 속에 산다.', '커피는 필요 없다.',' 일곱 명의 사망자가 발생했다.','진민준은 천재다.']

for epoch in range(EPOCHS):
    total_loss = 0
    
    idx_list = list(range(0, kor_tensor.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm(idx_list)
    
    for (batch, idx) in enumerate(t):
        batch_loss = train_step(kor_tensor[idx:idx+BATCH_SIZE],
                                eng_tensor[idx:idx+BATCH_SIZE],
                                encoder,
                                decoder,
                                optimizer,
                                eng_tokenizer)
    
        total_loss += batch_loss
        
        
        
        t.set_description_str('Epoch %2d' % (epoch + 1))
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))
        
    test_step(src_list, kor_tokenizer, eng_tokenizer)    

Epoch  1: 100%|██████████| 521/521 [04:25<00:00,  1.97it/s, Loss 0.3767]


원문 :  오바마는 대통령이다.
번역 :  obama will be president elect barack obama will be president elect barack obama will be president elect barack obama will be president elect barack obama
원문 :  시민들은 도시 속에 산다.
번역 :  louis city alderman wants people to arm themselves to arm themselves to arm themselves to arm themselves to arm themselves to arm themselves to arm
원문 :  커피는 필요 없다.
번역 :  the coffee has hundreds of coffee has hundreds of coffee has hundreds of coffee has hundreds of coffee has hundreds of coffee has hundreds of
원문 :   일곱 명의 사망자가 발생했다.
번역 :  two of the fire was taken to the children . <end>
원문 :  진민준은 천재다.
번역 :  a group a day checkout period . <end>


Epoch  2: 100%|██████████| 521/521 [04:24<00:00,  1.97it/s, Loss 0.3326]


원문 :  오바마는 대통령이다.
번역 :  obama s speech in the first wired president . <end>
원문 :  시민들은 도시 속에 산다.
번역 :  india makes the gunshot wound . <end>
원문 :  커피는 필요 없다.
번역 :  his boss , or shoppers can buy g packs of coffee beans to make up <end>
원문 :   일곱 명의 사망자가 발생했다.
번역 :  his wife was taken to the hospital , and some was taken to the hospital , and some was taken to the hospital , and
원문 :  진민준은 천재다.
번역 :  the airstrip comes back into the spotlight back into the spotlight back into the spotlight back into the spotlight back into the spotlight back into


Epoch  3: 100%|██████████| 521/521 [04:24<00:00,  1.97it/s, Loss 0.2978]


원문 :  오바마는 대통령이다.
번역 :  obama s meet the first time to decelerate . <end>
원문 :  시민들은 도시 속에 산다.
번역 :  india cnn life are the da to arm themselves to arm themselves to arm themselves to arm themselves to arm themselves to arm themselves to
원문 :  커피는 필요 없다.
번역 :  ice is available , which will become compounds . <end>
원문 :   일곱 명의 사망자가 발생했다.
번역 :  his wife said . <end>
원문 :  진민준은 천재다.
번역 :  you re scheduled to a day of benjamin button . <end>
