In [1]:
# !python -m spacy download en
# !python -m spacy download de

import pandas as pd

data = pd.read_csv("wmt14_translate_de-en_test.csv")

In [2]:
import spacy

spacy_en = spacy.load('en_core_web_sm')
spacy_de = spacy.load('de_core_news_sm')

# 독일어(Deutsch) 문장을 토큰화 하는 함수
def tokenize_de(text):
    return [token.text for token in spacy_de.tokenizer(text)]
de_token = data['de'].apply(tokenize_de)

# 영어(English) 문장을 토큰화 하는 함수
def tokenize_en(text):
    return [token.text for token in spacy_en.tokenizer(text)]
en_token = data['en'].apply(tokenize_en)

de_max_len = de_token.apply(len).max()
en_max_len = en_token.apply(len).max()
print("독일어 토큰 최대 길이:", de_max_len)
print("영어 토큰 최대 길이:", en_max_len)

독일어 토큰 최대 길이: 75
영어 토큰 최대 길이: 92


In [3]:
from collections import defaultdict

# 독일어(Deutsch) 사전 제작
de_dict = defaultdict(list)
de_dict["<unk>"] = 0
de_dict["<pad>"] = 1
de_dict["<sos>"] = 2
de_dict["<eos>"] = 3
idx = 4
for token_list in de_token:
    for token in token_list:
        if token not in de_dict:
            de_dict[token] = idx
            idx += 1
print("독일어 토큰 사전 크기:", len(de_dict))

# 영어(English) 토큰을 숫자 인덱스로 변환
en_dict = defaultdict(list)
en_dict["<unk>"] = 0
en_dict["<pad>"] = 1
en_dict["<sos>"] = 2
en_dict["<eos>"] = 3
idx = 4
for token_list in en_token:
    for token in token_list:
        if token not in en_dict:
            en_dict[token] = idx
            idx += 1
print("영어 토큰 사전 크기:", len(en_dict))

dict_en = {v: k for k, v in en_dict.items()}

독일어 토큰 사전 크기: 13941
영어 토큰 사전 크기: 10242


In [4]:
# 최대 사전 크기 (input_dim 값 설정)
VOCAB_SIZE = max(len(de_dict), len(en_dict))
# Maximum sequence length
MAX_LEN = max(de_max_len, en_max_len) + 2

# 독일어(Deutsch) 토큰을 숫자 인덱스로 변환
de_vocab = []
for token_list in de_token:
    tmp = [2] # 시작 토큰 추가
    for token in token_list:
        tmp.append(de_dict[token])
    tmp.append(3) # 종료 토큰 추가
    # 패딩 처리
    if len(tmp) < MAX_LEN:
        tmp += [1] * (MAX_LEN - len(tmp))
    de_vocab.append(tmp)

# 영어(English) 토큰을 숫자 인덱스로 변환
en_vocab = []
for token_list in en_token:
    tmp = [2] # 시작 토큰 추가
    for token in token_list:
        tmp.append(en_dict[token])
    tmp.append(3) # 종료 토큰 추가
    # 패딩 처리
    if len(tmp) < MAX_LEN:
        tmp += [1] * (MAX_LEN - len(tmp))
    en_vocab.append(tmp)

print(VOCAB_SIZE, MAX_LEN)

13941 94


In [5]:
import random
import numpy as np
import torch

seed = 2025
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True 

In [6]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

batch_size = 64
epochs = 100

input_dim = VOCAB_SIZE
seq_len = MAX_LEN
d_model = 512
num_heads = 8
num_layers = 6
dropout = 0.1
d_ffn=2048

betas = (0.9, 0.98)
eps = 1e-9

warmup_steps = 4000
lrate = lambda step: 0 if step == 0 else (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)

cuda


In [7]:
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(de_vocab, en_vocab, test_size = 0.2, random_state = seed)

x_train = torch.LongTensor(x_train)
x_valid = torch.LongTensor(x_valid)
y_train = torch.LongTensor(y_train)
y_valid = torch.LongTensor(y_valid)

In [8]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(x_train, y_train)
data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

valid_dataset = TensorDataset(x_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

In [None]:
import custom_transformer2
model = custom_transformer2.Transformer(input_dim, seq_len, d_model, num_heads, num_layers, dropout, d_ffn, en_dict['<pad>'], device).to(device)

optimizer = torch.optim.Adam(model.parameters(), betas=betas, eps=eps)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lrate)

# <pad> 토큰은 무시해야한다.
criterion = torch.nn.CrossEntropyLoss(ignore_index=1, label_smoothing=0.1)

In [10]:
import time

total_batch = len(data_loader)

for epoch in range(epochs):
    start_time = time.time()
    avg_cost = 0
    
    model.train()    
    for X, Y in data_loader:
        X = X.to(device)
        Y = Y.to(device)
        
        optimizer.zero_grad()
        hypothesis = model(X, Y[:, :-1])
        cost = criterion(hypothesis.transpose(1, 2), Y[:, 1:])
        cost.backward()
        optimizer.step()
        scheduler.step()

        avg_cost += cost.item() / total_batch

    if (epoch + 1) % 10 == 0:
        print("#" * 30)
        print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, avg_cost))
        allocated = torch.cuda.memory_allocated() / (1024 ** 2)
        reserved = torch.cuda.memory_reserved() / (1024 ** 2)
        print(f"[GPU] Allocated: {allocated:.2f}MB | Reserved: {reserved:.2f}MB")
        print(f"[Time] {time.time() - start_time:.2f} sec")

##############################
[Epoch:   10] cost = 9.66373975
[GPU] Allocated: 1335.86MB | Reserved: 5560.00MB
[Time] 27.91 sec
##############################
[Epoch:   20] cost = 9.33843056
[GPU] Allocated: 1335.86MB | Reserved: 5560.00MB
[Time] 31.69 sec
##############################
[Epoch:   30] cost = 8.9292417
[GPU] Allocated: 1335.86MB | Reserved: 5560.00MB
[Time] 31.50 sec
##############################
[Epoch:   40] cost = 8.63359871
[GPU] Allocated: 1335.86MB | Reserved: 5560.00MB
[Time] 31.50 sec
##############################
[Epoch:   50] cost = 8.46960271
[GPU] Allocated: 1335.86MB | Reserved: 5560.00MB
[Time] 31.58 sec
##############################
[Epoch:   60] cost = 8.35601381
[GPU] Allocated: 1335.86MB | Reserved: 5560.00MB
[Time] 31.32 sec
##############################
[Epoch:   70] cost = 8.23938568
[GPU] Allocated: 1335.86MB | Reserved: 5560.00MB
[Time] 31.37 sec
##############################
[Epoch:   80] cost = 8.11401133
[GPU] Allocated: 1335.86MB | Reserv

In [11]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoothie = SmoothingFunction().method4

model.eval()
start_time = time.time()
total_valid_cost = 0
bleu_scores = []

with torch.no_grad():
    for X, Y in valid_loader:
        X = X.to(device)
        Y = Y.to(device)
        
        # --- 1. Validation Loss 계산 (Teacher Forcing 방식 유지) ---
        # 모델에 정답을 알려주며 다음 단어 예측 능력을 평가
        hypothesis_for_loss = model(X, Y[:, :-1])
        cost = criterion(hypothesis_for_loss.transpose(1, 2), Y[:, 1:])
        total_valid_cost += cost.item() / len(valid_loader)

        # --- 2. BLEU 점수 계산을 위한 문장 생성 (Auto-Regressive 방식) ---
        # 모델 스스로 문장을 생성하는 능력을 평가

        # <sos> 토큰으로 시작
        trg_input = torch.full((batch_size, 1), 2, dtype=torch.long, device=device)

        for _ in range(MAX_LEN - 1):
            hypothesis_for_bleu = model(X, trg_input)
            pred_token = hypothesis_for_bleu.argmax(dim=-1)[:, -1].unsqueeze(1)
            trg_input = torch.cat((trg_input, pred_token), dim=1)

        # 🔁 디코딩 후 BLEU 계산
        for i in range(batch_size):
            predicted_sentence = trg_input[i].cpu()
            target_sentence = Y[i, 1:].cpu()

            pred_tokens = [dict_en.get(token_id.item(), "<unk>") for token_id in predicted_sentence]
            target_tokens = [dict_en.get(token_id.item(), "<unk>") for token_id in target_sentence]
            
            # <eos> 토큰 이후는 잘라내어 더 정확한 BLEU 점수 계산 (개선 사항)
            try:
                eos_idx = pred_tokens.index('<eos>')
                pred_tokens = pred_tokens[:eos_idx]
            except ValueError: pass # <eos>가 없는 경우 그대로 사용
            try:
                eos_idx = target_tokens.index('<eos>')
                target_tokens = target_tokens[:eos_idx]
            except ValueError: pass

            bleu = sentence_bleu(
                [target_tokens], pred_tokens,
                weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=smoothie
            )
            bleu_scores.append(bleu)

avg_bleu = sum(bleu_scores) / len(bleu_scores)
print("#" * 30)
print('[Valid] cost = {:>.9}'.format(total_valid_cost))
print('[Valid] BLEU = {:.2f}'.format(avg_bleu * 100))
allocated = torch.cuda.memory_allocated() / (1024 ** 2)
reserved = torch.cuda.memory_reserved() / (1024 ** 2)
print(f"[GPU] Allocated: {allocated:.2f}MB | Reserved: {reserved:.2f}MB")
print(f"[Time] {time.time() - start_time:.2f} sec")
print("#" * 30)

##############################
[Valid] cost = 7.8903177
[Valid] BLEU = 1.28
[GPU] Allocated: 1969.02MB | Reserved: 5560.00MB
[Time] 121.44 sec
##############################
