In [None]:
import tensorflow as tf
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
import pandas as pd

# -----------------------------
# 1. Sentence preprocessing
# -----------------------------
def tokenize(sentence):
    """Normalize a sentence so it can be split on single spaces.

    The text is lowercased, each of ``? . ! ,`` is surrounded by spaces so it
    becomes its own token, and every run of spaces and double-quote characters
    is collapsed into one space.

    Args:
        sentence: Raw input string.

    Returns:
        The normalized string, without leading or trailing whitespace.
    """
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    # Spacing out punctuation re-introduces a space at either end when the
    # sentence starts or ends with one of those marks, so strip once more.
    return sentence.strip()

# -----------------------------
# 2. Data loading
# -----------------------------

df = pd.read_csv("ko_en_100k.csv")
# pandas reads an empty cell as NaN (a float), which tokenize() cannot
# lowercase, so only complete sentence pairs are used.
complete_pairs = df.dropna(subset=['ko', 'en'])
raw_inputs = complete_pairs['ko'].tolist()
raw_outputs = complete_pairs['en'].tolist()
# Preprocessing
inputs = [tokenize(s) for s in raw_inputs]
outputs = [tokenize(s) for s in raw_outputs]
# Decoding is primed with '<start>' and stops at '<end>', and both ids are
# looked up in the target vocabulary, so every target sentence must carry the
# markers. Sentences that already start with '<start>' are left untouched to
# avoid wrapping them twice.
outputs = [
    s if s.startswith('<start>') else f"<start> {s} <end>"
    for s in outputs
]

# -----------------------------
# 3. Tokenizers and sequence conversion
# -----------------------------
def _fit_and_encode(texts):
    """Fit a whitespace tokenizer on `texts` and return it with the right-padded id matrix."""
    tok = Tokenizer(filters='', oov_token='<OOV>')
    tok.fit_on_texts(texts)
    encoded = tok.texts_to_sequences(texts)
    return tok, pad_sequences(encoded, padding='post')


inp_tokenizer, inp_seq = _fit_and_encode(inputs)
out_tokenizer, out_seq = _fit_and_encode(outputs)

# Ids of the start/end markers in the target vocabulary
start_token, end_token = (
    out_tokenizer.word_index[marker] for marker in ('<start>', '<end>')
)

# Reverse lookup: target id -> word
index_to_word = dict(
    zip(out_tokenizer.word_index.values(), out_tokenizer.word_index.keys())
)

# -----------------------------
# 4. Transformer model definition
# -----------------------------
# Layer sizes shared by the encoder and the decoder.
EMBED_DIM, NUM_HEADS, FF_DIM = 64, 2, 128
# Vocabulary sizes are one larger than the number of known words
# (presumably to leave room for id 0, which pad_sequences uses as filler).
ENC_VOCAB = 1 + len(inp_tokenizer.word_index)
DEC_VOCAB = 1 + len(out_tokenizer.word_index)

class SimpleEncoder(tf.keras.Model):
    """One-block encoder: embedding, self-attention with a residual add, feed-forward."""

    def __init__(self):
        super().__init__()
        self.embed = layers.Embedding(ENC_VOCAB, EMBED_DIM)
        self.attn = layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)
        hidden = layers.Dense(FF_DIM, activation='relu')
        projection = layers.Dense(EMBED_DIM)
        self.ff = tf.keras.Sequential([hidden, projection])

    def call(self, x):
        """Encode source token ids `x` into EMBED_DIM-wide feature vectors."""
        tokens = self.embed(x)
        # Self-attention: the embedded tokens act as query, value and key.
        attended = self.attn(tokens, value=tokens, key=tokens)
        return self.ff(attended + tokens)

class SimpleDecoder(tf.keras.Model):
    """One-block decoder: embedding, cross-attention over the encoder output, feed-forward, vocabulary projection.

    There is no self-attention over the target tokens; each target position
    only attends to the encoder output.
    """

    def __init__(self):
        super().__init__()
        self.embed = layers.Embedding(DEC_VOCAB, EMBED_DIM)
        self.attn = layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)
        hidden = layers.Dense(FF_DIM, activation='relu')
        projection = layers.Dense(EMBED_DIM)
        self.ff = tf.keras.Sequential([hidden, projection])
        # No activation: the layer emits unnormalized scores per vocabulary entry.
        self.out = layers.Dense(DEC_VOCAB)

    def call(self, x, enc_output):
        """Score every target-vocabulary entry for each position of `x`.

        Args:
            x: Target-side token ids.
            enc_output: Encoder features used as attention value and key.

        Returns:
            Unnormalized scores with DEC_VOCAB entries per position.
        """
        tokens = self.embed(x)
        # Cross-attention: target tokens query the encoder output.
        attended = self.attn(tokens, value=enc_output, key=enc_output)
        features = self.ff(attended + tokens)
        return self.out(features)

class TransformerModel(tf.keras.Model):
    """Encoder-decoder wrapper that maps (source ids, target ids) to decoder scores."""

    def __init__(self):
        super().__init__()
        self.encoder = SimpleEncoder()
        self.decoder = SimpleDecoder()

    def call(self, inputs):
        """Run the encoder on the first element of `inputs` and decode the second against it."""
        source_ids, target_ids = inputs
        encoded = self.encoder(source_ids)
        return self.decoder(target_ids, encoded)

# -----------------------------
# 5. Model training
# -----------------------------
model = TransformerModel()
# The decoder's final Dense layer has no softmax, so the model emits raw
# logits. The string alias 'sparse_categorical_crossentropy' treats its input
# as probabilities, so the loss is built explicitly with from_logits=True.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

# Teacher forcing: the decoder sees every token but the last and is trained
# to predict the sequence shifted left by one position.
decoder_input = out_seq[:, :-1]
decoder_target = out_seq[:, 1:]
decoder_target = np.expand_dims(decoder_target, -1)

model.fit([inp_seq, decoder_input], decoder_target, epochs=100, verbose=0)

# -----------------------------
# 6. Translation helpers
# -----------------------------
# Padded widths of the training matrices; inference pads to the same widths.
max_input_len, max_output_len = inp_seq.shape[1], out_seq.shape[1]

def sample_from_probs(probs, temperature=1.0):
    """Draw an index from a probability vector after temperature scaling.

    Args:
        probs: 1-D array-like of probabilities.
        temperature: Values below 1 sharpen the distribution, values above 1
            flatten it. A value of 0 or less selects the most probable index
            deterministically (greedy decoding).

    Returns:
        The sampled index as a plain ``int``.

    Raises:
        ValueError: If `probs` is empty.
    """
    probs = np.asarray(probs).astype('float64')
    if probs.size == 0:
        raise ValueError("probs must not be empty")

    if temperature <= 0:
        # Dividing log-probabilities by 0 yields inf/nan, which used to fall
        # through to the uniform fallback below. The limit of temperature -> 0
        # is argmax, so return it directly (NaN entries never win).
        return int(np.argmax(np.where(np.isnan(probs), -np.inf, probs)))

    # Clip so that log() never sees 0.
    probs = np.clip(probs, 1e-10, 1.0)
    scaled = np.log(probs) / temperature
    # Subtract the maximum before exponentiating for numerical stability.
    weights = np.exp(scaled - np.max(scaled))
    probs = weights / np.sum(weights)
    if np.any(np.isnan(probs)) or np.sum(probs) == 0:
        # Unusable distribution (e.g. NaN input): fall back to uniform.
        probs = np.ones_like(probs) / len(probs)

    return int(np.random.choice(len(probs), p=probs))

def translate(sentence):
    """Translate one source sentence by sampling target tokens one at a time.

    Decoding starts from the '<start>' id and stops when the '<end>' id is
    sampled or after `max_output_len` steps.

    Args:
        sentence: Raw source-language sentence.

    Returns:
        The generated target words joined by single spaces.
    """
    sentence = tokenize(sentence)
    input_seq = inp_tokenizer.texts_to_sequences([sentence])
    input_seq = pad_sequences(input_seq, maxlen=max_input_len, padding='post')

    decoded_words = []
    decoder_input_ids = [start_token]

    for _ in range(max_output_len):
        dec_input = pad_sequences([decoder_input_ids], maxlen=max_output_len, padding='post')
        preds = model.predict([input_seq, dec_input], verbose=0)

        # Read the scores at the position of the most recently fed token.
        time_step = min(len(decoder_input_ids) - 1, preds.shape[1] - 1)

        # The model's output layer has no softmax, so these are logits;
        # sample_from_probs expects probabilities, so normalize first.
        logits = np.asarray(preds[0, time_step], dtype='float64')
        exp_logits = np.exp(logits - np.max(logits))
        probs = exp_logits / np.sum(exp_logits)
        pred_id = int(sample_from_probs(probs, temperature=0.8))

        if pred_id == end_token:
            break

        # Ids without a vocabulary entry (e.g. padding id 0) add no text but
        # are still fed back so decoding can continue.
        word = index_to_word.get(pred_id, '')
        if word:
            decoded_words.append(word)
        decoder_input_ids.append(pred_id)

    return " ".join(decoded_words)


In [None]:
# -----------------------------
# 7. Smoke test
# -----------------------------
# Translate one sample sentence and print the input next to the result.
sample_sentence = "나는 밥을 먹었다"
print("입력:", sample_sentence)
print("번역 결과:", translate(sample_sentence))