# 미션 11: Seq2Seq + Attention 기반 한-영 번역 실습

제공된 일상 회화 한-영 병렬 코퍼스를 이용해 기본 Seq2Seq 모델과 Attention이 포함된 모델을 모두 구현하고 비교합니다. 노트북 전체 흐름은 데이터 로딩 → 전처리 → 어휘 사전 구축 → PyTorch Dataset/DataLoader 구성 → Encoder/Decoder 구현 → 학습 및 평가 → 번역 예시 확인 순서로 이어집니다.


## 노트북 구성

1. **데이터 준비**: JSON 원본을 불러와 간단한 정규화/토크나이즈를 수행합니다.
2. **어휘 사전**: 토큰 빈도를 기준으로 `<pad>/<sos>/<eos>/<unk>` 토큰을 포함한 Vocabulary를 만듭니다.
3. **Dataset & DataLoader**: `<sos>`/`<eos>`가 붙은 시퀀스를 텐서로 변환하고, 배치 단위로 패딩합니다.
4. **모델 정의**: GRU 기반 Encoder/Decoder, Bahdanau Attention을 구현합니다.
5. **학습/평가 루프**: Teacher Forcing, Gradient Clipping, 간단한 BLEU 계산을 포함한 공용 함수로 두 모델을 모두 돌립니다.
6. **결과 확인**: 검증 세트와 임의 문장의 번역 결과를 비교합니다.


In [23]:

import json
import math
import random
import re
import time
from collections import Counter
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import torch
from torch import nn
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pad_sequence
from torch.utils.data import DataLoader, Dataset

SEED = 42


def set_seed(seed: int = 42) -> None:
    """Seed every random number generator used here and pin cuDNN behaviour.

    Args:
        seed: Value handed to the Python, NumPy and PyTorch (CPU and CUDA) seeders.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)
    # Disable autotuning and request deterministic kernels for repeatable runs.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed all RNGs once, before any sampling or model construction further down.
set_seed(SEED)

# NOTE(review): DATA_DIR is not referenced by any code visible in this file; the
# CONFIG paths below are absolute strings instead — confirm whether it is needed.
DATA_DIR = Path("Part3_mission_11")
# Data limits, model sizes and training hyperparameters read throughout the notebook.
CONFIG = {
    "train_file": "/mnt/nas/jayden_code/Codeit_Practice/Part3_mission_11/일상생활및구어체_한영_train_set.json",
    "valid_file": "/mnt/nas/jayden_code/Codeit_Practice/Part3_mission_11/일상생활및구어체_한영_valid_set.json",
    "train_limit": 60000,  # Lower this value if training takes too long.
    "valid_limit": 5000,
    "min_freq": 3,
    "max_length": 40,
    "batch_size": 128,
    "embedding_dim": 256,
    "hidden_dim": 384,
    "dropout": 0.2,
    "num_layers": 1,
    "learning_rate": 1e-3,
    "epochs": 8,
    "teacher_forcing": 0.5,
    "grad_clip": 1.0,
    "num_workers": 2,
}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print("[CONFIG]")
# Path values (if any) are stringified so the dict is JSON-serialisable; the
# values defined above are all plain str/int/float.
print(json.dumps({k: (str(v) if isinstance(v, Path) else v) for k, v in CONFIG.items()}, indent=2, ensure_ascii=False))


Using device: cuda
[CONFIG]
{
  "train_file": "/mnt/nas/jayden_code/Codeit_Practice/Part3_mission_11/일상생활및구어체_한영_train_set.json",
  "valid_file": "/mnt/nas/jayden_code/Codeit_Practice/Part3_mission_11/일상생활및구어체_한영_valid_set.json",
  "train_limit": 60000,
  "valid_limit": 5000,
  "min_freq": 3,
  "max_length": 40,
  "batch_size": 128,
  "embedding_dim": 256,
  "hidden_dim": 384,
  "dropout": 0.2,
  "num_layers": 1,
  "learning_rate": 0.001,
  "epochs": 8,
  "teacher_forcing": 0.5,
  "grad_clip": 1.0,
  "num_workers": 2
}


## 데이터 로딩 및 간단 전처리

- JSON 파일을 불러온 뒤 한국어/영어 문장을 각각 정규화하고 공백 단위로 토크나이즈합니다.
- 전체 120만 문장으로는 학습 시간이 오래 걸리므로, 기본값으로는 무작위 6만/5천 문장만 사용하도록 제한해 두었습니다. 필요하면 `CONFIG['train_limit']` 값을 조정하면 됩니다.
- 문장 길이 분포를 확인해 Encoder/Decoder 최대 길이(`max_length`)가 적절한지 점검합니다.


In [24]:

def normalize_text(text: str) -> str:
    """Trim the ends of ``text`` and collapse each whitespace run to one space."""
    return re.sub(r"\s+", " ", text.strip())


def tokenize_ko(text: str) -> List[str]:
    """Split Korean text into whitespace-delimited tokens.

    Every run of characters other than digits, ASCII letters, Hangul
    syllables/jamo and ``!?.,'`` is treated as a separator.
    """
    cleaned = re.sub(r"[^0-9A-Za-z가-힣ㄱ-ㅎㅏ-ㅣ!?.,']+", " ", normalize_text(text))
    # After the substitution the only whitespace left is the ASCII space, so a
    # bare split() yields exactly the non-empty space-separated pieces.
    return cleaned.split()


def tokenize_en(text: str) -> List[str]:
    """Lower-case English text and split it into whitespace-delimited tokens.

    Every run of characters other than ``a-z``, digits and ``!?.,'`` is
    treated as a separator.
    """
    lowered = normalize_text(text.lower())
    cleaned = re.sub(r"[^a-z0-9!?.,']+", " ", lowered)
    # Only ASCII spaces remain as whitespace here, so split() drops nothing
    # but the empty pieces.
    return cleaned.split()


def load_sentence_pairs(path: Path, limit: int = None, seed: int = SEED) -> List[Dict[str, List[str]]]:
    """Load Korean/English sentence pairs from a JSON file and tokenize them.

    The file must hold a top-level ``"data"`` list whose items carry the
    Korean sentence under ``"ko"`` and the English sentence under ``"mt"``.

    Args:
        path: Location of the JSON file.
        limit: If given and smaller than the number of records, a random
            subset of this size is drawn; otherwise all records are used.
        seed: Seed for the private RNG that draws the subset.

    Returns:
        One dict per kept pair with keys ``ko_text``, ``en_text`` (stripped
        strings) and ``ko_tokens``, ``en_tokens`` (token lists). Pairs where
        either side tokenizes to nothing are dropped.
    """
    with open(path, encoding="utf-8") as handle:
        records = json.load(handle)["data"]

    if limit is not None and limit < len(records):
        # A dedicated Random instance keeps the global RNG state untouched.
        chosen = random.Random(seed).sample(range(len(records)), limit)
        records = [records[i] for i in chosen]

    examples = []
    for record in records:
        ko_text = record["ko"].strip()
        en_text = record["mt"].strip()
        ko_tokens = tokenize_ko(ko_text)
        en_tokens = tokenize_en(en_text)
        if ko_tokens and en_tokens:
            examples.append(
                {
                    "ko_text": ko_text,
                    "en_text": en_text,
                    "ko_tokens": ko_tokens,
                    "en_tokens": en_tokens,
                }
            )
    return examples


def describe_lengths(pairs: List[Dict[str, List[str]]], label: str) -> None:
    """Print the sample count and token-length percentiles for both languages.

    Args:
        pairs: Examples carrying ``"ko_tokens"`` and ``"en_tokens"`` lists.
        label: Text printed in front of the sample count.
    """
    quantiles = (50, 75, 90, 95)

    def _summary(lengths: List[int]) -> Dict[str, float]:
        stats = {}
        for q in quantiles:
            stats[f"p{q}"] = float(np.percentile(lengths, q))
        return stats

    ko_lengths = [len(pair["ko_tokens"]) for pair in pairs]
    en_lengths = [len(pair["en_tokens"]) for pair in pairs]

    print(f"{label} | samples = {len(pairs)}")
    for header, lengths in ((" KO lengths:", ko_lengths), (" EN lengths:", en_lengths)):
        print(header, _summary(lengths))


# Load (and optionally subsample) both splits; the validation split uses a
# different seed than the training split.
train_examples = load_sentence_pairs(CONFIG["train_file"], CONFIG["train_limit"], SEED)
valid_examples = load_sentence_pairs(CONFIG["valid_file"], CONFIG["valid_limit"], SEED + 1)
print(f"Loaded {len(train_examples)} training pairs / {len(valid_examples)} validation pairs")
# Length percentiles help judge whether CONFIG["max_length"] is adequate.
describe_lengths(train_examples, "Train")
describe_lengths(valid_examples, "Valid")

# Print the first three raw training pairs as a sanity check.
print("샘플 데이터 3건")
for sample in train_examples[:3]:
    print("KO:", sample["ko_text"])
    print("EN:", sample["en_text"])
    print("-")

# Keep only the token lists as (source, target) tuples for vocab/dataset code.
train_pairs = [(ex["ko_tokens"], ex["en_tokens"]) for ex in train_examples]
valid_pairs = [(ex["ko_tokens"], ex["en_tokens"]) for ex in valid_examples]


Loaded 60000 training pairs / 5000 validation pairs
Train | samples = 60000
 KO lengths: {'p50': 6.0, 'p75': 9.0, 'p90': 12.0, 'p95': 14.0}
 EN lengths: {'p50': 9.0, 'p75': 13.0, 'p90': 18.0, 'p95': 21.0}
Valid | samples = 5000
 KO lengths: {'p50': 6.0, 'p75': 9.0, 'p90': 12.0, 'p95': 14.0}
 EN lengths: {'p50': 9.0, 'p75': 13.0, 'p90': 18.0, 'p95': 21.0}
샘플 데이터 3건
KO: 그리고 더블 크립 제품이 하자 없는 배송 부탁드립니다.
EN: Also, please ensure that the double creep product is delivered without defects.
-
KO: 날씨 보호 테크 시스템, 특허받은 용접 모서리 및 반전된 솔기가 물을 막아줍니다.
EN: Weather protection tech system, patented welded edges and inverted seams keep water out.
-
KO: 무엇을 도와드릴까요?
EN: How may I help you?
-


## Vocabulary 생성

- `<pad>`, `<sos>`, `<eos>`, `<unk>` 4개의 특수 토큰을 고정으로 추가합니다.
- 토큰 빈도가 `min_freq` 미만이면 `<unk>`로 치환합니다.
- 학습 데이터를 기반으로 소스/타깃 Vocabulary를 각각 구축합니다.


In [25]:

class Vocabulary:
    """Token <-> id mapping with four fixed special tokens.

    The special tokens ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>`` always
    receive ids 0-3 in that order; remaining tokens are added by descending
    frequency (ties broken alphabetically) when :meth:`build` is called.
    """

    def __init__(self, min_freq: int = 1):
        """Create an empty vocabulary.

        Args:
            min_freq: Tokens seen fewer than this many times are left out of
                the vocabulary and therefore encode to ``<unk>``.
        """
        self.min_freq = min_freq
        self.special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]
        self.token2idx: Dict[str, int] = {}
        self.idx2token: Dict[int, str] = {}
        self.pad_token, self.sos_token, self.eos_token, self.unk_token = self.special_tokens
        self._built = False

    def build(self, sentences: List[List[str]]) -> None:
        """(Re)build the mapping from tokenized sentences.

        Args:
            sentences: Iterable of token lists to count.
        """
        counter = Counter()
        for sentence in sentences:
            counter.update(sentence)
        self.token2idx = {}
        self.idx2token = {}
        for token in self.special_tokens:
            self._add_token(token)
        # Sort by (-freq, token) so that ids are deterministic across runs.
        for token, freq in sorted(counter.items(), key=lambda x: (-x[1], x[0])):
            if freq < self.min_freq:
                continue
            # A corpus token that spells a special token keeps the special id.
            if token in self.token2idx:
                continue
            self._add_token(token)
        self._built = True

    def _add_token(self, token: str) -> None:
        """Assign the next free id to ``token``."""
        idx = len(self.token2idx)
        self.token2idx[token] = idx
        self.idx2token[idx] = token

    @property
    def size(self) -> int:
        """Number of entries, special tokens included."""
        return len(self.token2idx)

    @property
    def pad_id(self) -> int:
        """Id of the ``<pad>`` token."""
        return self.token2idx[self.pad_token]

    @property
    def sos_id(self) -> int:
        """Id of the ``<sos>`` token."""
        return self.token2idx[self.sos_token]

    @property
    def eos_id(self) -> int:
        """Id of the ``<eos>`` token."""
        return self.token2idx[self.eos_token]

    @property
    def unk_id(self) -> int:
        """Id of the ``<unk>`` token."""
        return self.token2idx[self.unk_token]

    def encode(
        self,
        tokens: List[str],
        add_sos: bool = True,
        add_eos: bool = True,
        max_length: int = None,
    ) -> List[int]:
        """Convert tokens to ids, optionally wrapping them in ``<sos>``/``<eos>``.

        Args:
            tokens: Tokens to encode; unknown tokens map to ``<unk>``.
            add_sos: Prepend ``<sos>``.
            add_eos: Append ``<eos>``.
            max_length: Upper bound on the returned length, counting the
                requested special tokens. ``None`` or ``0`` disables
                truncation. When the bound is smaller than the number of
                requested special tokens, only those special tokens are
                returned.

        Returns:
            The encoded id sequence.
        """
        assert self._built, "Vocabulary.build()를 먼저 호출하세요."
        if max_length:
            # Reserve room only for the special tokens actually requested, and
            # never let the bound go negative (a negative slice end would
            # silently drop trailing tokens instead of truncating).
            reserved = int(add_sos) + int(add_eos)
            truncated = tokens[: max(max_length - reserved, 0)]
        else:
            truncated = tokens
        seq = []
        if add_sos:
            seq.append(self.sos_id)
        seq.extend(self.token2idx.get(tok, self.unk_id) for tok in truncated)
        if add_eos:
            seq.append(self.eos_id)
        return seq

    def decode(self, ids: List[int]) -> List[str]:
        """Convert ids back to tokens.

        ``<pad>`` and ``<sos>`` are skipped, decoding stops at the first
        ``<eos>``, and ids missing from the mapping become ``<unk>``.
        """
        tokens = []
        for idx in ids:
            token = self.idx2token.get(int(idx), self.unk_token)
            if token in {self.pad_token, self.sos_token}:
                continue
            if token == self.eos_token:
                break
            tokens.append(token)
        return tokens


In [26]:

# Source (Korean) vocabulary, built from the training split only.
src_vocab = Vocabulary(min_freq=CONFIG["min_freq"])
src_vocab.build([src for src, _ in train_pairs])

# Target (English) vocabulary, also built from the training split only.
tgt_vocab = Vocabulary(min_freq=CONFIG["min_freq"])
tgt_vocab.build([tgt for _, tgt in train_pairs])

print(f"Source vocab size : {src_vocab.size}")
print(f"Target vocab size : {tgt_vocab.size}")


Source vocab size : 18907
Target vocab size : 12566


## Dataset & DataLoader

- Dataset은 (소스, 타깃) 토큰 리스트를 숫자 시퀀스로 변환합니다.
- Collate 함수가 배치 단위로 패딩, 길이 텐서 계산, Decoder input/target 분리를 담당합니다.
- 학습 시에는 셔플을 켜고, 검증 시에는 셔플을 끕니다.


In [27]:

class TranslationDataset(Dataset):
    """Map-style dataset that turns token pairs into id tensors on access."""

    def __init__(self, pairs: List[Tuple[List[str], List[str]]], src_vocab: Vocabulary, tgt_vocab: Vocabulary, max_length: int):
        """Store the raw pairs and the vocabularies used to encode them.

        Args:
            pairs: ``(source_tokens, target_tokens)`` tuples.
            src_vocab: Vocabulary used for the source side.
            tgt_vocab: Vocabulary used for the target side.
            max_length: Passed to ``Vocabulary.encode`` as ``max_length``.
        """
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of sentence pairs."""
        return len(self.pairs)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the ``idx``-th pair as two 1-D ``long`` tensors (source, target)."""
        src_tokens, tgt_tokens = self.pairs[idx]
        encode_kwargs = {"add_sos": True, "add_eos": True, "max_length": self.max_length}
        src_ids = self.src_vocab.encode(src_tokens, **encode_kwargs)
        tgt_ids = self.tgt_vocab.encode(tgt_tokens, **encode_kwargs)
        src_tensor = torch.tensor(src_ids, dtype=torch.long)
        tgt_tensor = torch.tensor(tgt_ids, dtype=torch.long)
        return src_tensor, tgt_tensor


def collate_batch(batch):
    """Pad a list of ``(src, tgt)`` tensor pairs into one batch dict.

    Padding values come from the module-level ``src_vocab`` / ``tgt_vocab``.

    Returns:
        Dict with ``src`` (padded sources), ``src_lengths`` (unpadded source
        lengths), ``decoder_inputs`` (padded targets without the last column)
        and ``decoder_targets`` (padded targets without the first column).
    """
    sources, targets = zip(*batch)
    lengths = torch.tensor(list(map(len, sources)), dtype=torch.long)
    padded_src = pad_sequence(sources, batch_first=True, padding_value=src_vocab.pad_id)
    padded_tgt = pad_sequence(targets, batch_first=True, padding_value=tgt_vocab.pad_id)
    # Shift by one: the decoder reads position t and is scored on position t + 1.
    return {
        "src": padded_src,
        "src_lengths": lengths,
        "decoder_inputs": padded_tgt[:, :-1],
        "decoder_targets": padded_tgt[:, 1:],
    }


# Both splits share the vocabularies and the same max_length.
train_dataset = TranslationDataset(train_pairs, src_vocab, tgt_vocab, CONFIG["max_length"])
valid_dataset = TranslationDataset(valid_pairs, src_vocab, tgt_vocab, CONFIG["max_length"])
print(f"Dataset sizes | train={len(train_dataset)}, valid={len(valid_dataset)}")

# Pinned host memory is only requested when running on CUDA.
pin_memory = device.type == "cuda"
# Training batches are reshuffled by the loader.
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=True,
    num_workers=CONFIG["num_workers"],
    pin_memory=pin_memory,
    collate_fn=collate_batch,
)
# Validation batches keep dataset order.
valid_loader = DataLoader(
    valid_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=False,
    num_workers=CONFIG["num_workers"],
    pin_memory=pin_memory,
    collate_fn=collate_batch,
)
print("DataLoader 준비 완료")


Dataset sizes | train=60000, valid=5000
DataLoader 준비 완료


## 모델 정의: Encoder, Decoder, Attention

- Encoder와 Decoder 모두 GRU 1층 구조를 기본으로 합니다.
- 기본 Decoder는 Context 없이 이전 Hidden state만 사용합니다.
- Attention Decoder는 Bahdanau 스타일 가중합을 통해 Context vector를 계산합니다.


In [28]:

class Encoder(nn.Module):
    """Embedding + GRU encoder that runs on packed (padding-free) sequences."""

    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int, pad_idx: int, num_layers: int = 1, dropout: float = 0.1):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Embedding width.
            hidden_dim: GRU hidden width.
            pad_idx: Embedding ``padding_idx``.
            num_layers: Number of stacked GRU layers.
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode a padded batch.

        Args:
            src: Batch-first tensor of token ids.
            lengths: Unpadded length of every row in ``src``.

        Returns:
            ``(outputs, hidden)`` — the re-padded per-step GRU outputs and the
            final GRU hidden state.
        """
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        # pack_padded_sequence needs the lengths on the CPU.
        packed_input = pack_padded_sequence(
            embedded,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        packed_output, hidden = self.gru(packed_input)
        outputs, _ = pad_packed_sequence(packed_output, batch_first=True)
        return outputs, hidden


class DecoderRNN(nn.Module):
    """Single-step GRU decoder without attention."""

    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int, pad_idx: int, num_layers: int = 1, dropout: float = 0.1):
        """Build the layers.

        Args:
            vocab_size: Target vocabulary size; also the width of the logits.
            embed_dim: Embedding width.
            hidden_dim: GRU hidden width.
            pad_idx: Embedding ``padding_idx``.
            num_layers: Number of stacked GRU layers.
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.output_dim = vocab_size
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token: torch.Tensor, hidden: torch.Tensor, encoder_outputs: torch.Tensor, mask: torch.Tensor = None):
        """Decode one time step.

        ``encoder_outputs`` and ``mask`` are accepted for signature parity
        with the attention decoder but are not used.

        Returns:
            ``(prediction, hidden, None)`` — vocabulary logits, the updated
            hidden state, and ``None`` in place of attention weights.
        """
        token_embedding = self.embedding(input_token)
        # Insert a length-1 time axis so the GRU sees a one-step sequence.
        step_input = self.dropout(token_embedding).unsqueeze(1)
        step_output, hidden = self.gru(step_input, hidden)
        logits = self.fc_out(step_output.squeeze(1))
        return logits, hidden, None


class BahdanauAttention(nn.Module):
    """Additive attention scoring the top decoder state against encoder outputs."""

    def __init__(self, hidden_dim: int):
        """Create the projection layers.

        Args:
            hidden_dim: Width shared by the decoder state and encoder outputs.
        """
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden: torch.Tensor, encoder_outputs: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute the context vector and attention weights.

        Args:
            hidden: Decoder hidden state; only ``hidden[-1]`` is used.
            encoder_outputs: Batch-first encoder outputs.
            mask: Per-position source mask; positions equal to 0 are excluded.

        Returns:
            ``(context, attn_weights)`` — the weighted sum of encoder outputs
            (with a length-1 middle axis) and the softmax-normalised weights.
        """
        src_len = encoder_outputs.size(1)
        # Broadcast the top-layer state across every source position.
        query = hidden[-1].unsqueeze(1).expand(-1, src_len, -1)
        features = torch.cat((query, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(features))
        scores = self.v(energy).squeeze(-1)
        # A very large negative score drives masked positions to ~0 weight.
        padded_positions = mask == 0
        scores = scores.masked_fill(padded_positions, -1e9)
        attn_weights = torch.softmax(scores, dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)
        return context, attn_weights


class AttnDecoderRNN(nn.Module):
    """Single-step, single-layer GRU decoder with additive attention."""

    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int, pad_idx: int, dropout: float = 0.1):
        """Build the layers.

        Args:
            vocab_size: Target vocabulary size; also the width of the logits.
            embed_dim: Embedding width.
            hidden_dim: Width of the GRU state and of the attention context.
            pad_idx: Embedding ``padding_idx``.
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.output_dim = vocab_size
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.attention = BahdanauAttention(hidden_dim)
        self.gru = nn.GRU(embed_dim + hidden_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(hidden_dim * 2 + embed_dim, vocab_size)

    def forward(self, input_token: torch.Tensor, hidden: torch.Tensor, encoder_outputs: torch.Tensor, mask: torch.Tensor):
        """Decode one time step using attention over the encoder outputs.

        Returns:
            ``(prediction, hidden, attn_weights)`` — vocabulary logits, the
            updated hidden state, and the attention weights for this step.
        """
        token_embedding = self.embedding(input_token)
        step_embedding = self.dropout(token_embedding).unsqueeze(1)
        # The context is computed from the state *before* this step's GRU update.
        context, attn_weights = self.attention(hidden, encoder_outputs, mask)
        gru_input = torch.cat((step_embedding, context), dim=2)
        gru_output, hidden = self.gru(gru_input, hidden)
        # The output layer sees the GRU output, the context and the embedding.
        features = torch.cat((gru_output, context, step_embedding), dim=2).squeeze(1)
        logits = self.fc_out(features)
        return logits, hidden, attn_weights


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target step at a time.

    The decoder is called as ``decoder(input_token, hidden, encoder_outputs,
    mask)`` and must return ``(prediction, hidden, attn_weights)``; it must
    also expose ``output_dim`` (the width of its logits). The encoder's final
    hidden state is passed to the decoder unchanged.
    """

    def __init__(self, encoder: nn.Module, decoder: nn.Module, pad_idx: int, device: torch.device):
        """Store the sub-modules.

        Args:
            encoder: Module called as ``encoder(src, lengths)`` returning
                ``(encoder_outputs, hidden)``.
            decoder: Single-step decoder (see class docstring).
            pad_idx: Padding id of the *source* batch; used only to build the
                source mask handed to the decoder.
            device: Kept for backward compatibility; work tensors are created
                on the device of the ``src`` input.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.device = device
        self.latest_attentions = None

    def _encode(self, src: torch.Tensor, lengths: torch.Tensor):
        """Run the encoder and return ``(encoder_outputs, hidden, mask)``."""
        encoder_outputs, hidden = self.encoder(src, lengths)
        # Encoder outputs may be narrower than ``src`` (e.g. when the encoder
        # re-pads packed sequences only up to the longest real length), so the
        # mask is trimmed to the same width to keep the two aligned.
        mask = (src != self.pad_idx)[:, : encoder_outputs.size(1)]
        return encoder_outputs, hidden, mask

    def forward(self, src: torch.Tensor, lengths: torch.Tensor, decoder_inputs: torch.Tensor, teacher_forcing_ratio: float = 0.5):
        """Produce logits for every target step.

        Args:
            src: Batch-first padded source ids.
            lengths: Unpadded source lengths.
            decoder_inputs: Batch-first decoder input ids; column 0 is fed at
                the first step.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground-truth next token instead of
                the model's own argmax prediction.

        Returns:
            Tensor of shape ``(batch, decoder_inputs.size(1), output_dim)``.
            The per-step attention weights are stored in
            ``self.latest_attentions``.
        """
        encoder_outputs, hidden, mask = self._encode(src, lengths)
        batch_size = src.size(0)
        trg_len = decoder_inputs.size(1)
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, trg_len, vocab_size, device=src.device)
        input_token = decoder_inputs[:, 0]
        attentions = []
        for t in range(trg_len):
            prediction, hidden, attn_weights = self.decoder(input_token, hidden, encoder_outputs, mask)
            outputs[:, t, :] = prediction
            attentions.append(attn_weights)
            # Drawn on every step, including the last, so the global RNG
            # stream is consumed exactly once per decoded position.
            teacher_force = random.random() < teacher_forcing_ratio
            if teacher_force and t + 1 < trg_len:
                input_token = decoder_inputs[:, t + 1]
            else:
                input_token = prediction.argmax(dim=1)
        self.latest_attentions = attentions
        return outputs

    def greedy_decode(self, src: torch.Tensor, lengths: torch.Tensor, max_len: int, start_token_id: int, end_token_id: int) -> torch.Tensor:
        """Greedily decode a batch without gradients.

        Switches the module to eval mode (and leaves it there). Decoding stops
        after ``max_len`` steps or once every row has emitted
        ``end_token_id``; rows that finish early keep receiving tokens, so
        callers should cut each row at its first end token.

        Returns:
            CPU ``long`` tensor of shape ``(batch, steps)`` with the predicted
            ids (``steps`` is 0 when ``max_len`` is 0).
        """
        self.eval()
        with torch.no_grad():
            encoder_outputs, hidden, mask = self._encode(src, lengths)
            batch_size = src.size(0)
            input_token = torch.full((batch_size,), start_token_id, dtype=torch.long, device=src.device)
            finished = torch.zeros(batch_size, dtype=torch.bool, device=src.device)
            outputs = []
            for _ in range(max_len):
                prediction, hidden, attn_weights = self.decoder(input_token, hidden, encoder_outputs, mask)
                next_token = prediction.argmax(dim=1)
                outputs.append(next_token)
                finished |= next_token.eq(end_token_id)
                input_token = next_token
                if finished.all():
                    break
            if outputs:
                predictions = torch.stack(outputs, dim=1)
            else:
                predictions = torch.zeros((batch_size, 0), dtype=torch.long, device=src.device)
        return predictions.cpu()


## 학습/평가 유틸리티

- `corpus_bleu()`는 간단한 N-gram 정밀도를 사용한 BLEU 추정치입니다.
- `train_epoch()`은 손실 계산, Teacher Forcing, Grad Clipping을 수행하고, `evaluate_epoch()`은 Teacher Forcing 없이 손실과 BLEU를 계산합니다.
- `translate_sentence()`와 `show_predictions()`로 정성적 결과를 확인합니다.


In [30]:

def corpus_bleu(references: List[List[str]], hypotheses: List[List[str]], max_n: int = 4, smooth_eps: float = 1e-9) -> float:
    """Compute a simple corpus-level BLEU score with one reference per hypothesis.

    Clipped n-gram counts are pooled over the whole corpus, each precision is
    smoothed by adding ``smooth_eps`` to numerator and denominator, and the
    geometric mean is scaled by the brevity penalty. Empty sentences count as
    length 1 for the brevity penalty.

    Args:
        references: Tokenized reference sentences.
        hypotheses: Tokenized system outputs, aligned with ``references``.
        max_n: Highest n-gram order.
        smooth_eps: Additive smoothing constant for the precisions.

    Returns:
        The BLEU estimate, or ``0.0`` when there are no sentence pairs.
    """
    clipped = [0] * max_n
    totals = [0] * max_n
    ref_length = 0
    hyp_length = 0
    num_pairs = 0
    for ref, hyp in zip(references, hypotheses):
        num_pairs += 1
        ref_length += max(len(ref), 1)
        hyp_length += max(len(hyp), 1)
        for n in range(1, max_n + 1):
            ref_ngrams = Counter(tuple(ref[i:i + n]) for i in range(max(len(ref) - n + 1, 0)))
            hyp_ngrams = Counter(tuple(hyp[i:i + n]) for i in range(max(len(hyp) - n + 1, 0)))
            totals[n - 1] += sum(hyp_ngrams.values())
            for ng, count in hyp_ngrams.items():
                # Clip each hypothesis n-gram by its count in the reference.
                clipped[n - 1] += min(count, ref_ngrams.get(ng, 0))
    if num_pairs == 0:
        # With nothing to score every smoothed precision would be eps/eps = 1
        # and the brevity penalty exp(1), giving a "BLEU" above 1.
        return 0.0
    precisions = []
    for n in range(max_n):
        numerator = clipped[n] + smooth_eps
        denominator = totals[n] + smooth_eps
        precisions.append(numerator / denominator)
    geo_mean = math.exp(sum((1 / max_n) * math.log(p) for p in precisions))
    # hyp_length >= 1 here because every pair contributes at least 1.
    bp = 1.0 if hyp_length > ref_length else math.exp(1 - ref_length / hyp_length)
    return float(bp * geo_mean)


def train_epoch(model: Seq2Seq, dataloader: DataLoader, optimizer: torch.optim.Optimizer, criterion: nn.Module, teacher_forcing: float) -> float:
    """Train for one pass over ``dataloader`` and return the mean batch loss.

    Gradients are clipped to ``CONFIG["grad_clip"]`` (global norm) before
    every optimizer step.

    Args:
        model: Model to optimise; put into train mode here.
        dataloader: Yields dicts with ``src``, ``src_lengths``,
            ``decoder_inputs`` and ``decoder_targets``.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss applied to flattened logits and targets.
        teacher_forcing: Teacher-forcing ratio passed to the model.
    """
    model.train()
    running_loss = 0.0
    for batch in dataloader:
        src = batch["src"].to(device)
        # Lengths stay where the loader put them; only the id tensors move.
        src_lengths = batch["src_lengths"]
        decoder_inputs = batch["decoder_inputs"].to(device)
        decoder_targets = batch["decoder_targets"].to(device)

        optimizer.zero_grad()
        logits = model(src, src_lengths, decoder_inputs, teacher_forcing_ratio=teacher_forcing)
        flat_logits = logits.reshape(-1, logits.size(-1))
        flat_targets = decoder_targets.reshape(-1)
        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG["grad_clip"])
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(dataloader)


def evaluate_epoch(model: Seq2Seq, dataloader: DataLoader, criterion: nn.Module, tgt_vocab: Vocabulary) -> Tuple[float, float]:
    """Evaluate on ``dataloader`` and return ``(mean batch loss, BLEU)``.

    The loss is measured with teacher forcing disabled; BLEU compares greedy
    decodes (allowed up to 5 steps beyond the padded target width) against
    the decoded targets.
    """
    model.eval()
    running_loss = 0.0
    references = []
    hypotheses = []
    with torch.no_grad():
        for batch in dataloader:
            src = batch["src"].to(device)
            src_lengths = batch["src_lengths"]
            decoder_inputs = batch["decoder_inputs"].to(device)
            decoder_targets = batch["decoder_targets"].to(device)

            logits = model(src, src_lengths, decoder_inputs, teacher_forcing_ratio=0.0)
            flat_logits = logits.reshape(-1, logits.size(-1))
            running_loss += criterion(flat_logits, decoder_targets.reshape(-1)).item()

            preds = model.greedy_decode(
                src,
                src_lengths,
                max_len=decoder_targets.size(1) + 5,
                start_token_id=tgt_vocab.sos_id,
                end_token_id=tgt_vocab.eos_id,
            )
            for target_row, pred_row in zip(decoder_targets.cpu(), preds):
                references.append(tgt_vocab.decode(target_row.tolist()))
                hypotheses.append(tgt_vocab.decode(pred_row.tolist()))
    bleu = corpus_bleu(references, hypotheses)
    mean_loss = running_loss / len(dataloader)
    return mean_loss, bleu


def train_model(model: Seq2Seq, model_name: str, train_loader: DataLoader, valid_loader: DataLoader, tgt_vocab: Vocabulary, epochs: int) -> List[Dict[str, float]]:
    """Train ``model`` and restore the weights of its best-BLEU epoch.

    Uses Adam with ``CONFIG["learning_rate"]`` and a cross-entropy loss that
    ignores target padding. After every epoch the validation loss and BLEU
    are computed and one summary line is printed.

    Args:
        model: Model to train in place.
        model_name: Label used in the progress line.
        train_loader: Training batches.
        valid_loader: Validation batches.
        tgt_vocab: Target vocabulary (padding id and decoding).
        epochs: Number of epochs to run.

    Returns:
        One dict per epoch with ``epoch``, ``train_loss``, ``valid_loss``,
        ``valid_bleu`` and ``time`` (seconds for train + eval).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.pad_id)
    history = []
    best_bleu = -1.0
    best_state = None
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        train_loss = train_epoch(model, train_loader, optimizer, criterion, CONFIG["teacher_forcing"])
        valid_loss, valid_bleu = evaluate_epoch(model, valid_loader, criterion, tgt_vocab)
        elapsed = time.time() - start_time
        history.append(
            {
                "epoch": epoch,
                "train_loss": train_loss,
                "valid_loss": valid_loss,
                "valid_bleu": valid_bleu,
                "time": elapsed,
            }
        )
        print(f"[{model_name}] Epoch {epoch}/{epochs} | train {train_loss:.3f} | valid {valid_loss:.3f} | BLEU {valid_bleu:.3f} | {elapsed:.1f}s")
        if valid_bleu > best_bleu:
            best_bleu = valid_bleu
            # state_dict() hands out references to the live tensors, and
            # .cpu() returns the same tensor when it is already on the CPU.
            # Force a real copy so later optimizer steps cannot overwrite the
            # snapshot.
            best_state = {
                name: tensor.detach().to("cpu", copy=True)
                for name, tensor in model.state_dict().items()
            }
    if best_state is not None:
        model.load_state_dict(best_state)
    return history


def detokenize(tokens: List[str]) -> str:
    """Join tokens into one string separated by single spaces."""
    separator = " "
    return separator.join(tokens)


def translate_sentence(text: str, model: Seq2Seq, src_vocab: Vocabulary, tgt_vocab: Vocabulary, max_len: int = None) -> str:
    """Translate one Korean sentence with greedy decoding.

    Args:
        text: Raw Korean sentence.
        model: Trained model; ``greedy_decode`` is used for inference.
        src_vocab: Source vocabulary used for encoding.
        tgt_vocab: Target vocabulary used for decoding.
        max_len: Maximum number of decoding steps; a falsy value selects
            ``CONFIG["max_length"] + 5``.

    Returns:
        The space-joined predicted tokens, or ``""`` if ``text`` yields no
        tokens.
    """
    src_tokens = tokenize_ko(text)
    if not src_tokens:
        return ""

    src_ids = src_vocab.encode(src_tokens, add_sos=True, add_eos=True, max_length=CONFIG["max_length"])
    # Wrap in a batch of one; lengths are built without moving them to device.
    src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)
    lengths = torch.tensor([len(src_ids)], dtype=torch.long)

    if not max_len:
        max_len = CONFIG["max_length"] + 5
    pred_ids = model.greedy_decode(
        src_tensor,
        lengths,
        max_len=max_len,
        start_token_id=tgt_vocab.sos_id,
        end_token_id=tgt_vocab.eos_id,
    )
    predicted_tokens = tgt_vocab.decode(pred_ids[0].tolist())
    return detokenize(predicted_tokens)


def show_predictions(model: Seq2Seq, sentences: List[str], title: str) -> None:
    """Print the model's translation of each sentence under a titled header.

    Uses the module-level ``src_vocab`` and ``tgt_vocab``.
    """
    print(f"[{title}] 번역 예시")
    for sentence in sentences:
        translation = translate_sentence(sentence, model, src_vocab, tgt_vocab)
        report = (f"KO: {sentence}", f"EN(pred): {translation}", "-")
        for line in report:
            print(line)


def show_validation_samples(model: Seq2Seq, num_samples: int = 5) -> None:
    """Print source, reference and prediction for random validation examples.

    Draws from the module-level ``valid_examples`` using the global ``random``
    state and translates with the module-level vocabularies.

    Args:
        model: Trained model used for translation.
        num_samples: How many examples to show; capped at the number of
            available validation examples.
    """
    # random.sample raises ValueError when asked for more items than exist.
    count = min(num_samples, len(valid_examples))
    for sample in random.sample(valid_examples, count):
        pred = translate_sentence(sample["ko_text"], model, src_vocab, tgt_vocab)
        print(f"KO: {sample['ko_text']}")
        print(f"EN(gt): {sample['en_text']}")
        print(f"EN(pred): {pred}")
        print("=")


## Seq2Seq (Encoder-Decoder) 학습

- Attention 없이 기본 GRU Encoder/Decoder만 사용합니다.
- 학습 시간이 길면 `CONFIG['epochs']`나 `CONFIG['train_limit']`를 줄이면서 빠르게 확인해도 됩니다.


In [31]:

# Toggle for training the plain (attention-free) model in this cell.
RUN_SEQ2SEQ = True
seq2seq_model = None
seq2seq_history = []

if RUN_SEQ2SEQ:
    encoder = Encoder(
        vocab_size=src_vocab.size,
        embed_dim=CONFIG["embedding_dim"],
        hidden_dim=CONFIG["hidden_dim"],
        pad_idx=src_vocab.pad_id,
        num_layers=CONFIG["num_layers"],
        dropout=CONFIG["dropout"],
    )
    decoder = DecoderRNN(
        vocab_size=tgt_vocab.size,
        embed_dim=CONFIG["embedding_dim"],
        hidden_dim=CONFIG["hidden_dim"],
        pad_idx=tgt_vocab.pad_id,
        num_layers=CONFIG["num_layers"],
        dropout=CONFIG["dropout"],
    )
    # Seq2Seq uses pad_idx only to mask padding in the *source* batch, so it
    # must receive the source vocabulary's pad id (not the target's).
    seq2seq_model = Seq2Seq(encoder, decoder, src_vocab.pad_id, device).to(device)
    seq2seq_history = train_model(seq2seq_model, "Seq2Seq", train_loader, valid_loader, tgt_vocab, CONFIG["epochs"])
else:
    print("Seq2Seq 학습이 비활성화되었습니다.")


[Seq2Seq] Epoch 1/8 | train 5.916 | valid 5.755 | BLEU 0.024 | 65.3s
[Seq2Seq] Epoch 2/8 | train 5.206 | valid 5.563 | BLEU 0.035 | 64.6s
[Seq2Seq] Epoch 3/8 | train 4.853 | valid 5.507 | BLEU 0.046 | 64.9s
[Seq2Seq] Epoch 4/8 | train 4.601 | valid 5.427 | BLEU 0.051 | 64.5s
[Seq2Seq] Epoch 5/8 | train 4.389 | valid 5.406 | BLEU 0.058 | 64.9s
[Seq2Seq] Epoch 6/8 | train 4.205 | valid 5.389 | BLEU 0.061 | 64.7s
[Seq2Seq] Epoch 7/8 | train 4.044 | valid 5.411 | BLEU 0.064 | 64.8s
[Seq2Seq] Epoch 8/8 | train 3.884 | valid 5.423 | BLEU 0.066 | 64.8s


## Seq2Seq + Bahdanau Attention 학습

- Attention Decoder가 Encoder 출력 전체를 매 Step 참고하므로 긴 문장 번역에 유리합니다.
- 동일한 하이퍼파라미터를 유지한 채 Decoder만 교체하여 효과를 비교합니다.


In [32]:

# Toggle for training the attention model in this cell.
RUN_ATTENTION = True
attn_model = None
attn_history = []

if RUN_ATTENTION:
    encoder_attn = Encoder(
        vocab_size=src_vocab.size,
        embed_dim=CONFIG["embedding_dim"],
        hidden_dim=CONFIG["hidden_dim"],
        pad_idx=src_vocab.pad_id,
        num_layers=CONFIG["num_layers"],
        dropout=CONFIG["dropout"],
    )
    # NOTE(review): AttnDecoderRNN takes no num_layers argument and builds a
    # single-layer GRU, so CONFIG["num_layers"] > 1 would hand it an encoder
    # state with more layers than it expects — confirm before raising it.
    decoder_attn = AttnDecoderRNN(
        vocab_size=tgt_vocab.size,
        embed_dim=CONFIG["embedding_dim"],
        hidden_dim=CONFIG["hidden_dim"],
        pad_idx=tgt_vocab.pad_id,
        dropout=CONFIG["dropout"],
    )
    # Seq2Seq uses pad_idx only to mask padding in the *source* batch, so it
    # must receive the source vocabulary's pad id (not the target's).
    attn_model = Seq2Seq(encoder_attn, decoder_attn, src_vocab.pad_id, device).to(device)
    attn_history = train_model(attn_model, "Seq2Seq+Attention", train_loader, valid_loader, tgt_vocab, CONFIG["epochs"])
else:
    print("Attention 모델 학습이 비활성화되었습니다.")


[Seq2Seq+Attention] Epoch 1/8 | train 5.728 | valid 5.521 | BLEU 0.038 | 116.7s
[Seq2Seq+Attention] Epoch 2/8 | train 4.808 | valid 5.302 | BLEU 0.056 | 117.0s
[Seq2Seq+Attention] Epoch 3/8 | train 4.281 | valid 5.185 | BLEU 0.065 | 116.7s
[Seq2Seq+Attention] Epoch 4/8 | train 3.831 | valid 5.224 | BLEU 0.071 | 117.0s
[Seq2Seq+Attention] Epoch 5/8 | train 3.501 | valid 5.269 | BLEU 0.076 | 117.0s
[Seq2Seq+Attention] Epoch 6/8 | train 3.224 | valid 5.331 | BLEU 0.083 | 117.1s
[Seq2Seq+Attention] Epoch 7/8 | train 3.055 | valid 5.379 | BLEU 0.085 | 116.7s
[Seq2Seq+Attention] Epoch 8/8 | train 2.901 | valid 5.418 | BLEU 0.085 | 116.9s


## 번역 예시 확인

학습된 모델 가중치가 있다면 아래 셀을 실행해 검증 세트 또는 임의 문장의 번역 품질을 직접 살펴볼 수 있습니다.


In [33]:

# Hand-written Korean sentences used as a quick qualitative check.
custom_sentences = [
    "오늘 저녁에 시간 있어?",
    "비가 올 것 같으니까 우산을 챙겨.",
    "이번 프로젝트 일정은 어떻게 조정할까요?",
]

# Each model is only exercised if its training cell actually produced it.
if seq2seq_model is not None:
    show_predictions(seq2seq_model, custom_sentences, title="Seq2Seq")
    print("[Seq2Seq] 검증 샘플")
    show_validation_samples(seq2seq_model, num_samples=5)

# The validation samples are drawn independently for each model, so the two
# models are not necessarily shown the same sentences.
if attn_model is not None:
    show_predictions(attn_model, custom_sentences, title="Seq2Seq+Attention")
    print("[Seq2Seq+Attention] 검증 샘플")
    show_validation_samples(attn_model, num_samples=5)

# Neither training cell ran: tell the user to train a model first.
if seq2seq_model is None and attn_model is None:
    print("먼저 모델을 학습시켜 주세요.")


[Seq2Seq] 번역 예시
KO: 오늘 저녁에 시간 있어?
EN(pred): do you have to do today?
-
KO: 비가 올 것 같으니까 우산을 챙겨.
EN(pred): i think you can just a <unk>
-
KO: 이번 프로젝트 일정은 어떻게 조정할까요?
EN(pred): how can i ask for this week?
-
[Seq2Seq] 검증 샘플
KO: 케이스 포함 제품입니다.
EN(gt): This product includes a case.
EN(pred): it is a product product.
=
KO: >대한민국, 대한민국이 아니고 진짜 솔직히 제가 볼 때 진짜 최고 수준인 것 같아요.
EN(gt): >Korea, not Korea, but honestly, I think it's at the highest level.
EN(pred): i think it's a really <unk> but i think i really like to eat it in the
=
KO: 진화하는 라이프스타일에서 영감받은 라이트라이드 테니스 신발 컬렉션입니다.
EN(gt): A collection of lightride tennis shoes inspired by the evolving lifestyle.
EN(pred): the <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>
=
KO: >양복 사고 그랬지.
EN(gt): We bought suits.
EN(pred): i also <unk> the <unk> of the <unk>
=
KO: 컬러 선택도 가능하며 사이즈 선택도 가능합니다.
EN(gt): You can choose a color and a size.
EN(pred): it can be operated with a <unk>
=
[Seq2Seq+Attention] 번역 예시
KO: 오늘 저녁에 시간 있어?
EN(pred): do you have a meeting a

## 다음에 시도해 볼 아이디어

- SentencePiece/BPE 등을 적용해 희귀 단어 문제를 줄여 보기
- Encoder/Decoder 다층화, 양방향 Encoder, Dropout 비율 조정
- Teacher Forcing 비율을 Epoch에 따라 점감시키기
- BLEU 외에 chrF, ROUGE 등 다른 정량 지표 추가
