In [1]:
# !python -m spacy download en
# !python -m spacy download de

import pandas as pd

data = pd.read_csv("wmt14_translate_de-en_test.csv")

In [2]:
import spacy

spacy_en = spacy.load('en_core_web_sm')
spacy_de = spacy.load('de_core_news_sm')

# 독일어(Deutsch) 문장을 토큰화 하는 함수
def tokenize_de(text):
    return [token.text for token in spacy_de.tokenizer(text)]
de_token = data['de'].apply(tokenize_de)

# 영어(English) 문장을 토큰화 하는 함수
def tokenize_en(text):
    return [token.text for token in spacy_en.tokenizer(text)]
en_token = data['en'].apply(tokenize_en)

de_max_len = de_token.apply(len).max()
en_max_len = en_token.apply(len).max()
print("독일어 토큰 최대 길이:", de_max_len)
print("영어 토큰 최대 길이:", en_max_len)

독일어 토큰 최대 길이: 75
영어 토큰 최대 길이: 92


In [3]:
from collections import defaultdict

# 독일어(Deutsch) 사전 제작
de_dict = defaultdict(list)
de_dict["<unk>"] = 0
de_dict["<pad>"] = 1
de_dict["<sos>"] = 2
de_dict["<eos>"] = 3
idx = 4
for token_list in de_token:
    for token in token_list:
        if token not in de_dict:
            de_dict[token] = idx
            idx += 1
print("독일어 토큰 사전 크기:", len(de_dict))

# 영어(English) 토큰을 숫자 인덱스로 변환
en_dict = defaultdict(list)
en_dict["<unk>"] = 0
en_dict["<pad>"] = 1
en_dict["<sos>"] = 2
en_dict["<eos>"] = 3
idx = 4
for token_list in en_token:
    for token in token_list:
        if token not in en_dict:
            en_dict[token] = idx
            idx += 1
print("영어 토큰 사전 크기:", len(en_dict))

dict_en = {v: k for k, v in en_dict.items()}

독일어 토큰 사전 크기: 13941
영어 토큰 사전 크기: 10242


In [4]:
# 최대 사전 크기 (input_dim 값 설정)
VOCAB_SIZE = max(len(de_dict), len(en_dict))
# Maximum sequence length
MAX_LEN = max(de_max_len, en_max_len) + 2

# 독일어(Deutsch) 토큰을 숫자 인덱스로 변환
de_vocab = []
for token_list in de_token:
    tmp = [2] # 시작 토큰 추가
    for token in token_list:
        tmp.append(de_dict[token])
    tmp.append(3) # 종료 토큰 추가
    # 패딩 처리
    if len(tmp) < MAX_LEN:
        tmp += [1] * (MAX_LEN - len(tmp))
    de_vocab.append(tmp)

# 영어(English) 토큰을 숫자 인덱스로 변환
en_vocab = []
for token_list in en_token:
    tmp = [2] # 시작 토큰 추가
    for token in token_list:
        tmp.append(en_dict[token])
    tmp.append(3) # 종료 토큰 추가
    # 패딩 처리
    if len(tmp) < MAX_LEN:
        tmp += [1] * (MAX_LEN - len(tmp))
    en_vocab.append(tmp)

print(VOCAB_SIZE, MAX_LEN)

13941 94


In [5]:
import random
import numpy as np
import torch

seed = 2025
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True 

In [6]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

batch_size = 64
epochs = 100

input_dim = VOCAB_SIZE
seq_len = MAX_LEN
d_model = 512
num_heads = 8
num_layers = 6
dropout = 0.1
d_ffn=2048

betas = (0.9, 0.98)
eps = 1e-9

warmup_steps = 4000
lrate = lambda step: 0 if step == 0 else (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)

cuda


In [7]:
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(de_vocab, en_vocab, test_size = 0.2, random_state = seed)

x_train = torch.LongTensor(x_train)
x_valid = torch.LongTensor(x_valid)
y_train = torch.LongTensor(y_train)
y_valid = torch.LongTensor(y_valid)

In [8]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(x_train, y_train)
data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

valid_dataset = TensorDataset(x_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

In [9]:
import math
import torch
from torch import nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, dropout=dropout):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[: x.size(0)]
        return self.dropout(x)


class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


class Seq2SeqTransformer(nn.Module):
    def __init__(
        self,
        num_encoder_layers,
        num_decoder_layers,
        emb_size,
        max_len,
        nhead,
        src_vocab_size,
        tgt_vocab_size,
        dim_feedforward,
        dropout=dropout,
    ):
        super().__init__()
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            d_model=emb_size, max_len=max_len, dropout=dropout
        )
        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

    def forward(
        self,
        src,
        trg,
        src_mask,
        tgt_mask,
        src_padding_mask,
        tgt_padding_mask,
        memory_key_padding_mask,
    ):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src=src_emb,
            tgt=tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask
        )
        return self.generator(outs)

    def encode(self, src, src_mask):
        return self.transformer.encoder(
            self.positional_encoding(self.src_tok_emb(src)), src_mask
        )

    def decode(self, tgt, memory, tgt_mask):
        return self.transformer.decoder(
            self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask
        )

In [10]:
from torch import optim

model = Seq2SeqTransformer(
    num_encoder_layers=num_layers,
    num_decoder_layers=num_layers,
    emb_size=d_model,
    max_len=seq_len,
    nhead=num_heads,
    src_vocab_size=len(de_dict),
    tgt_vocab_size=len(en_dict),
    dim_feedforward=d_ffn,
).to(device)

optimizer = torch.optim.Adam(model.parameters(), betas=betas, eps=eps)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lrate)

# <pad> 토큰은 무시해야한다.
criterion = torch.nn.CrossEntropyLoss(ignore_index=1, label_smoothing=0.1)

for main_name, main_module in model.named_children():
    print(main_name)
    for sub_name, sub_module in main_module.named_children():
        print("└", sub_name)
        for ssub_name, ssub_module in sub_module.named_children():
            print("│  └", ssub_name)
            for sssub_name, sssub_module in ssub_module.named_children():
                print("│  │  └", sssub_name)



src_tok_emb
└ embedding
tgt_tok_emb
└ embedding
positional_encoding
└ dropout
transformer
└ encoder
│  └ layers
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  └ norm
└ decoder
│  └ layers
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  └ norm
generator


In [11]:
def generate_square_subsequent_mask(s):
    mask = (torch.triu(torch.ones((s, s), device=device)) == 1).transpose(0, 1)
    mask = (
        mask.float()
        .masked_fill(mask == 0, float("-inf"))
        .masked_fill(mask == 1, float(0.0))
    )
    return mask

def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)

    src_padding_mask = (src == 1).transpose(0, 1)
    tgt_padding_mask = (tgt == 1).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [12]:
import time

total_batch = len(data_loader)

for epoch in range(epochs):
    start_time = time.time()
    avg_cost = 0
    
    model.train()    
    for X, Y in data_loader:
        source_batch = X.transpose(0, 1).to(device)
        target_batch = Y.transpose(0, 1).to(device)

        target_input = target_batch[:-1, :]
        target_output = target_batch[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(
            source_batch, target_input
        )

        hypothesis = model(
            src=source_batch,
            trg=target_input,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_padding_mask=src_padding_mask,
            tgt_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=src_padding_mask,
        )

        optimizer.zero_grad()
        cost = criterion(hypothesis.reshape(-1, hypothesis.shape[-1]), target_output.reshape(-1))
        cost.backward()
        optimizer.step()
        scheduler.step()

        avg_cost += cost.item() / total_batch

    if (epoch + 1) % 10 == 0:
        print("#" * 30)
        print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, avg_cost))
        allocated = torch.cuda.memory_allocated() / (1024 ** 2)
        reserved = torch.cuda.memory_reserved() / (1024 ** 2)
        print(f"[GPU] Allocated: {allocated:.2f}MB | Reserved: {reserved:.2f}MB")
        print(f"[Time] {time.time() - start_time:.2f} sec")



##############################
[Epoch:   10] cost = 9.28551455
[GPU] Allocated: 1215.57MB | Reserved: 5480.00MB
[Time] 28.48 sec
##############################
[Epoch:   20] cost = 8.99073503
[GPU] Allocated: 1215.57MB | Reserved: 5480.00MB
[Time] 28.26 sec
##############################
[Epoch:   30] cost = 8.67281081
[GPU] Allocated: 1215.57MB | Reserved: 5480.00MB
[Time] 27.61 sec
##############################
[Epoch:   40] cost = 8.4457872
[GPU] Allocated: 1215.57MB | Reserved: 5480.00MB
[Time] 30.23 sec
##############################
[Epoch:   50] cost = 8.31056231
[GPU] Allocated: 1215.57MB | Reserved: 5480.00MB
[Time] 27.99 sec
##############################
[Epoch:   60] cost = 8.22065191
[GPU] Allocated: 1215.57MB | Reserved: 5480.00MB
[Time] 30.40 sec
##############################
[Epoch:   70] cost = 8.14255637
[GPU] Allocated: 1215.57MB | Reserved: 5480.00MB
[Time] 29.85 sec
##############################
[Epoch:   80] cost = 8.06369397
[GPU] Allocated: 1215.57MB | Reserv

In [15]:
import time
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# 배치 단위로 탐욕 디코딩(greedy decode)을 수행하는 함수
def greedy_decode_batch(model, src, src_mask, src_padding_mask, max_len, start_symbol):
    batch_size = src.size(1)
    device = src.device
    memory = model.encode(src, src_mask)
    # 모든 문장의 시작을 <sos> 토큰으로 초기화
    ys = torch.full((1, batch_size), start_symbol, dtype=torch.long, device=device)

    for i in range(max_len - 1):
        tgt_mask = (generate_square_subsequent_mask(ys.size(0)).type(torch.bool)).to(device)

        # 디코더를 통해 다음 토큰 예측
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)

        # 예측된 토큰을 결과 시퀀스에 추가
        ys = torch.cat([ys, next_word.unsqueeze(0)], dim=0)

    return ys.transpose(0, 1) # (batch_size, seq_len)으로 차원 복원


# --- 검증 시작 ---
model.eval()
total_valid_cost = 0
bleu_scores = []
smoothie = SmoothingFunction().method4

with torch.no_grad():
    for X, Y in valid_loader:
        # 텐서 차원 변환 및 디바이스로 이동
        src = X.transpose(0, 1).to(device)
        tgt = Y.transpose(0, 1).to(device)

        tgt_input = tgt[:-1, :]

        # 마스크 생성
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        # --- 1. Validation Loss 계산 ---
        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
        tgt_out = tgt[1:, :]
        cost = criterion(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        total_valid_cost += cost.item()

        # --- 2. BLEU 점수 계산을 위한 문장 생성 ---
        # <sos> 토큰 인덱스는 2
        predicted_sentences_indices = greedy_decode_batch(model, src, src_mask, src_padding_mask, max_len=MAX_LEN, start_symbol=2)

        # 배치 내 모든 문장에 대해 BLEU 점수 계산
        for i in range(X.size(0)):
            # 정답 문장 (Y)에서 <pad>, <sos> 토큰 제외
            target_sentence = [dict_en.get(token_id.item(), "<unk>") for token_id in Y[i] if token_id.item() not in [1, 2]]
            # 예측 문장 (predicted)에서 <pad>, <sos> 토큰 제외
            predicted_sentence = [dict_en.get(token_id.item(), "<unk>") for token_id in predicted_sentences_indices[i] if token_id.item() not in [1, 2]]

            # <eos> 토큰이 나오면 문장 끝으로 간주
            try:
                eos_idx = target_sentence.index('<eos>')
                target_sentence = target_sentence[:eos_idx]
            except ValueError: pass

            try:
                eos_idx = predicted_sentence.index('<eos>')
                predicted_sentence = predicted_sentence[:eos_idx]
            except ValueError: pass

            # BLEU 점수 계산
            bleu = sentence_bleu(
                [target_sentence], predicted_sentence,
                weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=smoothie
            )
            bleu_scores.append(bleu)

avg_valid_cost = total_valid_cost / len(valid_loader)
avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0

print("#" * 30)
print('[Valid] cost = {:>.9}'.format(avg_valid_cost))
print('[Valid] BLEU = {:.2f}'.format(avg_bleu * 100))
allocated = torch.cuda.memory_allocated() / (1024 ** 2)
reserved = torch.cuda.memory_reserved() / (1024 ** 2)
print(f"[GPU] Allocated: {allocated:.2f}MB | Reserved: {reserved:.2f}MB")
print(f"[Time] {time.time() - start_time:.2f} sec")
print("#" * 30)

##############################
[Valid] cost = 7.74307701
[Valid] BLEU = 0.26
[GPU] Allocated: 1460.12MB | Reserved: 5480.00MB
[Time] 4599.73 sec
##############################
