In [None]:
# =============================================
# [6/9] 셀프 어텐션(Self-Attention) & Transformer
# =============================================
# 목표: 트랜스포머의 핵심인 멀티헤드 셀프어텐션의 구조를 직접 구현하며 이해합니다.

import torch
import torch.nn as nn
import math

# --- 1. Scaled Dot-Product Attention 구현 ---
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    The module holds no learnable parameters.
    """

    def forward(self, Q, K, V, mask=None):
        """Compute attention weights from ``Q``/``K`` and apply them to ``V``.

        Args:
            Q: Query tensor, expected as (batch_size, n_heads, seq_len, d_k).
            K: Key tensor, same layout as ``Q``; its last dimension is
                used as ``d_k`` for the scaling factor.
            V: Value tensor, same layout as ``Q``.
            mask: Optional tensor broadcastable to the score matrix
                ``Q @ K^T``. Positions where ``mask == 0`` are excluded
                from attention.

        Returns:
            A ``(context, attention)`` tuple: the weighted sum of ``V`` and
            the softmax-normalised attention weights.
        """
        d_k = K.size(-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            # Fill with the most negative *finite* value of the score dtype
            # instead of a hard-coded -1e9: -1e9 is outside the float16
            # range, so the constant breaks half-precision runs. Staying
            # finite (rather than -inf) keeps a fully masked row free of
            # NaNs; such a row simply becomes a uniform distribution.
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)

        attention = torch.softmax(scores, dim=-1)
        context = torch.matmul(attention, V)
        return context, attention

# --- 2. Multi-Head Attention 구현 ---
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``ScaledDotProductAttention``.

    Inputs are projected with three linear layers, split into ``n_heads``
    heads of width ``d_k = d_model // n_heads``, attended per head, merged
    back to ``d_model`` features and passed through an output projection.

    Args:
        d_model: Feature size of the inputs and of the output.
        n_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``n_heads``.
    """

    def __init__(self, d_model=512, n_heads=8):
        super().__init__()
        # Fail at construction time: with a non-divisible pair the head
        # split below no longer covers all d_model features and the error
        # only shows up later as an obscure shape mismatch in forward().
        if d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
            )
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)

        self.attention = ScaledDotProductAttention()

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq_len, d_model) to (batch, n_heads, seq_len, d_k)."""
        return projected.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

    def forward(self, Q, K, V, mask=None):
        """Run multi-head attention.

        Args:
            Q: Query input of shape (batch_size, q_len, d_model).
            K: Key input of shape (batch_size, k_len, d_model).
            V: Value input of shape (batch_size, k_len, d_model).
            mask: Optional mask forwarded unchanged to the attention module;
                it must broadcast against the per-head score tensor of shape
                (batch_size, n_heads, q_len, k_len). Zeros mark blocked
                positions.

        Returns:
            Tensor of shape (batch_size, q_len, d_model).
        """
        batch_size = Q.size(0)

        # 1. Linear projections, then split the feature axis into heads.
        q_s = self._split_heads(self.W_Q(Q), batch_size)
        k_s = self._split_heads(self.W_K(K), batch_size)
        v_s = self._split_heads(self.W_V(V), batch_size)

        # 2. Attention on all heads at once; the weights are not needed here.
        context, _ = self.attention(q_s, k_s, v_s, mask)

        # 3. Merge the heads back into one feature axis. transpose() returns
        #    a non-contiguous view, so contiguous() is required before view().
        context = context.transpose(1, 2).contiguous()
        context = context.view(batch_size, -1, self.n_heads * self.d_k)
        return self.W_O(context)

# --- 3. 작은 Transformer 모델 빌드 (개념적 구조) ---
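# NOTE: the EncoderLayer below already includes the FeedForward Network and LayerNorm; a full model additionally needs token embeddings, Positional Encoding, and a stack of such layers.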
class EncoderLayer(nn.Module):
    """One Transformer encoder layer (post-norm variant).

    Applies multi-head self-attention followed by a position-wise
    feed-forward network; each sub-layer is wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Feature size of the layer input and output.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model=512, n_heads=8, d_ff=2048, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # A single Dropout module is reused for both sub-layers; it holds
        # no state, so sharing it is equivalent to having two.
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        """Encode ``src``.

        Args:
            src: Input of shape (batch_size, seq_len, d_model).
            src_mask: Optional attention mask passed through to the
                self-attention sub-layer; zeros mark blocked positions.

        Returns:
            Tensor with the same shape as ``src``.
        """
        # Multi-Head Attention -> Add & Norm
        attn_out = self.self_attn(src, src, src, src_mask)
        src = self.norm1(src + self.dropout(attn_out))
        # FeedForward -> Add & Norm
        ffn_out = self.ffn(src)
        src = self.norm2(src + self.dropout(ffn_out))
        return src

# Cell footer: confirm the definitions above and list the follow-up exercises.
# One print call with a newline separator emits exactly the same text as
# printing each message on its own.
print(
    "ScaledDotProductAttention, MultiHeadAttention, EncoderLayer가 정의되었습니다.",
    "\n[실습 과제]",
    "1. PositionalEncoding 클래스를 구현해보세요. (sin, cos 함수 사용)",
    "2. DecoderLayer를 구현해보세요. (Masked Self-Attention + Encoder-Decoder Attention)",
    "3. 이 구성 요소들을 조립해 전체 Transformer 모델을 완성하고, Seq2Seq 과제에 적용해보세요.",
    sep="\n",
)