# 1. Scaled Dot-Product Attention

In [1]:
import numpy as np
from scipy.special import softmax

# Toy input embeddings: three tokens with three features each
input_vectors = np.array([[1, 0, 1], [0, 2, 0], [1, 1, 1]])

# Projection weights (identity matrices, chosen arbitrarily for the demo)
W_q = np.eye(3, dtype=int)  # query projection
W_k = np.eye(3, dtype=int)  # key projection
W_v = np.eye(3, dtype=int)  # value projection

# Project the inputs into queries, keys and values
Q, K, V = (input_vectors @ weight for weight in (W_q, W_k, W_v))

# Scaled dot-product attention
dk = K.shape[-1]  # dimensionality of each key vector
attention_scores = (Q @ K.T) / np.sqrt(dk)  # similarity scores, scaled by sqrt(dk)
attention_weights = softmax(attention_scores, axis=-1)  # each row becomes a distribution over keys
output = attention_weights @ V  # weighted sum of the value vectors

In [2]:
# Inspect the results: the projection weights are identity matrices, so Q, K and V all equal input_vectors
Q, K, V

(array([[1, 0, 1],
        [0, 2, 0],
        [1, 1, 1]]),
 array([[1, 0, 1],
        [0, 2, 0],
        [1, 1, 1]]),
 array([[1, 0, 1],
        [0, 2, 0],
        [1, 1, 1]]))

In [3]:
# Attention weights produced by the row-wise softmax (axis=1) over the scaled scores
attention_weights

array([[0.4319371 , 0.1361258 , 0.4319371 ],
       [0.07021749, 0.70697728, 0.22280523],
       [0.26445846, 0.26445846, 0.47108308]])

In [4]:
# Attention output: the attention weights matrix-multiplied with V
output

array([[0.8638742 , 0.7041887 , 0.8638742 ],
       [0.29302272, 1.63675979, 0.29302272],
       [0.73554154, 1.        , 0.73554154]])

In [6]:
# Same product written with the @ operator; the result should match `output` computed with np.matmul
attention_weights@V

array([[0.8638742 , 0.7041887 , 0.8638742 ],
       [0.29302272, 1.63675979, 0.29302272],
       [0.73554154, 1.        , 0.73554154]])

In [11]:
# Re-derive the softmax by hand (exponentiate, then normalise each row) to cross-check scipy's result
exp_scores = np.exp(attention_scores)
exp_scores / exp_scores.sum(axis=1, keepdims=True)

array([[0.4319371 , 0.1361258 , 0.4319371 ],
       [0.07021749, 0.70697728, 0.22280523],
       [0.26445846, 0.26445846, 0.47108308]])

# 2. Multi-Head Attention

In [33]:
import torch
import torch.nn as nn

class MultiHeadAttentionWithMask(nn.Module):
    """
    Multi-head self-attention with a look-ahead (causal) mask.

    The forward pass returns the intermediate score tensors alongside the
    attention output so the effect of the mask can be inspected.

    :param d_model: size of the model/embedding dimension.
    :param num_heads: number of attention heads; must divide d_model evenly.
    :raises ValueError: if num_heads is not positive or does not divide d_model.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Without this check, an uneven split makes `view` in split_heads either
        # fail later or silently reshape to a wrong sequence length.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads  # per-head feature size

        # Linear projections for queries, keys and values
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

    def create_look_ahead_mask(self, size, device=None, dtype=None):
        """
        Build a (size, size) look-ahead mask.

        Entries strictly above the main diagonal are 1 (positions to mask);
        everything else is 0.

        :param size: sequence length.
        :param device: optional device for the mask (defaults to torch's default device).
        :param dtype: optional dtype for the mask (defaults to torch's default dtype).
        :return: tensor of shape (size, size).
        """
        mask = torch.triu(torch.ones(size, size, device=device, dtype=dtype), diagonal=1)
        return mask

    def split_heads(self, x, batch_size):
        """
        Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth).
        """
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def forward(self, q, k, v):
        """
        Run masked multi-head attention.

        :param q: query input; projected and reshaped as (batch, seq, d_model).
        :param k: key input, same layout as q.
        :param v: value input, same layout as q.
        :return: tuple of (scaled scores before masking, scaled scores after
                 masking, the look-ahead mask, attention output of shape
                 (batch, seq, d_model)).
        """
        batch_size = q.size(0)

        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        matmul_qk = torch.matmul(q, k.transpose(-2, -1))

        # Scores before the mask is applied
        dk = torch.tensor(self.depth, dtype=torch.float32, device=matmul_qk.device)
        scale = torch.sqrt(dk)
        attention_before_mask = matmul_qk / scale

        # Mask area: created on the same device/dtype as the scores so the
        # addition below also works when the inputs are not on the CPU.
        mask_area = self.create_look_ahead_mask(
            q.size(2), device=matmul_qk.device, dtype=matmul_qk.dtype
        )

        # Scores after the mask is applied. The large negative offset is added
        # to the raw scores and then scaled, as before, but without mutating
        # matmul_qk in place.
        attention_after_mask = (matmul_qk + mask_area * -1e9) / scale

        attention_weights = nn.functional.softmax(attention_after_mask, dim=-1)

        output = torch.matmul(attention_weights, v)

        # Merge the heads back: (batch, heads, seq, depth) -> (batch, seq, d_model)
        output = output.permute(0, 2, 1, 3).contiguous()
        output = output.view(batch_size, -1, self.d_model)

        return attention_before_mask, attention_after_mask, mask_area, output

# Model hyperparameters
d_model, num_heads = 64, 4

# Dummy input tensors (q, k, v), drawn in that order
batch_size, seq_len = 2, 10
q, k, v = (torch.rand(batch_size, seq_len, d_model) for _ in range(3))

# Instantiate the masked multi-head attention module
multi_head_attn_with_mask = MultiHeadAttentionWithMask(d_model, num_heads)

# Run the attention with the look-ahead mask applied
(
    scores_before_mask,
    scores_after_mask,
    mask_area,
    output,
) = multi_head_attn_with_mask(q, k, v)

In [32]:
# Look-ahead mask returned by the forward pass: 1 marks entries above the diagonal (key index > query index)
mask_area

tensor([[0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [39]:
import torch

# Set the print line width so wide tensor rows are not wrapped
torch.set_printoptions(linewidth=140)

In [40]:
scores_before_mask[0][0] # scores before the mask is applied (first batch item, first head)

tensor([[-1.5456e-02,  4.1445e-02,  3.5639e-04, -4.4191e-02, -3.3841e-03,  7.5077e-02,  6.1469e-02, -5.1129e-02,  2.8336e-02, -2.6438e-02],
        [-7.7577e-02, -1.1581e-02, -6.0852e-02, -8.6383e-02, -6.6699e-02,  1.4562e-02, -2.7794e-03, -1.0405e-01, -2.9481e-02, -7.7760e-02],
        [-1.8181e-02,  3.8231e-02, -5.2480e-04,  1.3417e-02, -1.0679e-05,  3.1950e-02,  4.4274e-02, -3.6149e-02,  4.6171e-02,  5.4952e-03],
        [-1.9081e-01, -1.4831e-01, -1.5368e-01, -2.3596e-01, -1.9699e-01, -5.2835e-02, -7.0669e-02, -2.3505e-01, -1.1223e-01, -1.6724e-01],
        [-8.6314e-03,  3.8139e-02, -2.5298e-02, -4.7316e-04, -8.1381e-02,  4.9788e-02,  4.4812e-02, -4.4572e-02,  9.3385e-03, -6.0012e-02],
        [-1.4960e-01, -8.4267e-02, -1.3960e-01, -1.3799e-01, -1.1551e-01, -9.7467e-03, -3.0405e-02, -1.5251e-01, -6.4841e-02, -1.2215e-01],
        [-1.7652e-01, -7.3223e-02, -1.5140e-01, -1.8178e-01, -1.2315e-01, -3.0111e-02, -1.1649e-01, -1.8518e-01, -1.1341e-01, -1.4087e-01],
        [-2.1848e-02

In [41]:
scores_after_mask[0][0]  # scores after the mask is applied (first batch item, first head)

tensor([[-1.5456e-02, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08],
        [-7.7577e-02, -1.1581e-02, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08],
        [-1.8181e-02,  3.8231e-02, -5.2480e-04, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08],
        [-1.9081e-01, -1.4831e-01, -1.5368e-01, -2.3596e-01, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08],
        [-8.6314e-03,  3.8139e-02, -2.5298e-02, -4.7316e-04, -8.1381e-02, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08],
        [-1.4960e-01, -8.4267e-02, -1.3960e-01, -1.3799e-01, -1.1551e-01, -9.7467e-03, -2.5000e+08, -2.5000e+08, -2.5000e+08, -2.5000e+08],
        [-1.7652e-01, -7.3223e-02, -1.5140e-01, -1.8178e-01, -1.2315e-01, -3.0111e-02, -1.1649e-01, -2.5000e+08, -2.5000e+08, -2.5000e+08],
        [-2.1848e-02

# 3. Positional Encoding

In [2]:
import numpy as np
# Set NumPy's print line width so wide array rows are wrapped less often
np.set_printoptions(linewidth=200)

In [4]:
import torch

# Positions 0..9 as a column vector: unsqueeze(1) turns shape (10,) into (10, 1)
torch.arange(10).unsqueeze(1)

tensor([[0],
        [1],
        [2],
        [3],
        [4],
        [5],
        [6],
        [7],
        [8],
        [9]])

In [6]:
# NumPy equivalent: positions 0..9 reshaped into a (10, 1) column vector
np.arange(10).reshape(-1,1)

array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]])

In [18]:
import torch
import math

max_len = 10
d_model = 64
# Frequency terms 10000^(-2i/d_model) for the even indices 2i = 0, 2, ..., d_model-2,
# computed as exp(2i * -ln(10000) / d_model)
torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0)/d_model))

tensor([1.0000e+00, 7.4989e-01, 5.6234e-01, 4.2170e-01, 3.1623e-01, 2.3714e-01,
        1.7783e-01, 1.3335e-01, 1.0000e-01, 7.4989e-02, 5.6234e-02, 4.2170e-02,
        3.1623e-02, 2.3714e-02, 1.7783e-02, 1.3335e-02, 1.0000e-02, 7.4989e-03,
        5.6234e-03, 4.2170e-03, 3.1623e-03, 2.3714e-03, 1.7783e-03, 1.3335e-03,
        1.0000e-03, 7.4989e-04, 5.6234e-04, 4.2170e-04, 3.1623e-04, 2.3714e-04,
        1.7783e-04, 1.3335e-04])

In [22]:
position = np.exp(np.arange(0, d_model, 2) * (-np.log(10000)/d_model))

In [23]:
div_term = np.arange(max_len).reshape(-1, 1)

In [24]:
np.sin(position*div_term)

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 8.41470985e-01,  6.81561350e-01,  5.33168440e-01,  4.09308924e-01,  3.10983593e-01,  2.34921076e-01,  1.76892186e-01,  1.32957266e-01,  9.98334166e-02,  7.49191579e-02,  5.62044992e-02,
         4.21571532e-02,  3.16175064e-02,  2.37115146e-02,  1.77818569e-02,  1.33348191e-02,  9.99983333e-03,  7.49887181e-03,  5.62338361e-03,  4.21695254e-03,  3.16227239e-03,  2.37137148e-03,
         1.77827847e-03,  1.33352104e-03,

In [26]:
def positional_encoding(max_len, d_model):
    """
    Build the sinusoidal positional-encoding table for the given maximum
    length (max_len) and model dimension (d_model).

    Even columns hold sin(pos * w_i) and odd columns hold cos(pos * w_i),
    where w_i = 10000^(-2i / d_model).

    :param max_len: maximum sequence length.
    :param d_model: model dimension. May be odd, in which case the last
                    column holds a sine term with no matching cosine.
    :return: numpy array of shape (max_len, d_model) containing the encoding.
    """

    # Sequence positions and per-pair frequencies
    position = np.arange(max_len).reshape(-1, 1)  # each position in the sequence
    div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0)/d_model))  # frequencies
    angles = position * div_term  # shape (max_len, ceil(d_model / 2))

    pos_enc = np.zeros((max_len, d_model))  # zero matrix of size max_len x d_model

    # Sine values go to even indices, cosine values to odd indices
    pos_enc[:, 0::2] = np.sin(angles)
    # With an odd d_model there is one fewer odd column than even column,
    # so trim the angles to the number of odd columns.
    pos_enc[:, 1::2] = np.cos(angles[:, :d_model // 2])

    return pos_enc

# Positional encoding for a sequence with maximum length 10 and model dimension 64
pos_encoding = positional_encoding(10, 64)

array([[ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,  1.        ,  0.        ,  1.        ],
       [ 0.84147098,  0.54030231,  0.68156135,  0.73176098,  0.53316844,  0.84600911,  0.40930892,  0.91239586],
       [ 0.90929743, -0.41614684,  0.99748   ,  0.07094825,  0.90213071,  0.43146283,  0.74690354,  0.66493241],
       [ 0.14112001, -0.9899925 ,  0.77827252, -0.62792665,  0.99325317, -0.11596614,  0.95363446,  0.30096729],
       [-0.7568025 , -0.65364362,  0.14153892, -0.98993269,  0.77847174, -0.62767965,  0.99328073, -0.11572978],
       [-0.95892427,  0.28366219, -0.5711272 , -0.82086157,  0.3239352 , -0.94607927,  0.858896  , -0.51215004],
       [-0.2794155 ,  0.96017029, -0.97739612, -0.21141624, -0.23036747, -0.97310371,  0.57402557, -0.81883737],
       [ 0.6569866 ,  0.75390225, -0.85931347,  0.51144927, -0.71372117, -0.70042994,  0.18858111, -0.98205762],
       [ 0.98935825, -0.14550003, -0.28022801,  0.95993347, -0.97726175, -0.2120365 , -0.2299043

In [27]:
# Positional-encoding result: its shape and the first 8 columns
pos_encoding.shape, pos_encoding[:, :8]

((10, 64),
 array([[ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,  1.        ,  0.        ,  1.        ],
        [ 0.84147098,  0.54030231,  0.68156135,  0.73176098,  0.53316844,  0.84600911,  0.40930892,  0.91239586],
        [ 0.90929743, -0.41614684,  0.99748   ,  0.07094825,  0.90213071,  0.43146283,  0.74690354,  0.66493241],
        [ 0.14112001, -0.9899925 ,  0.77827252, -0.62792665,  0.99325317, -0.11596614,  0.95363446,  0.30096729],
        [-0.7568025 , -0.65364362,  0.14153892, -0.98993269,  0.77847174, -0.62767965,  0.99328073, -0.11572978],
        [-0.95892427,  0.28366219, -0.5711272 , -0.82086157,  0.3239352 , -0.94607927,  0.858896  , -0.51215004],
        [-0.2794155 ,  0.96017029, -0.97739612, -0.21141624, -0.23036747, -0.97310371,  0.57402557, -0.81883737],
        [ 0.6569866 ,  0.75390225, -0.85931347,  0.51144927, -0.71372117, -0.70042994,  0.18858111, -0.98205762],
        [ 0.98935825, -0.14550003, -0.28022801,  0.95993347, -0.97726175, -0.

In [28]:
# Full positional-encoding table (all 64 columns)
pos_encoding

array([[ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00, 

# 4. Cross Attention

In [29]:
import torch
import torch.nn.functional as F

def cross_attention(decoder_output, encoder_output, mask=None):
    """
    Perform scaled dot-product cross attention.

    The decoder output is used as the query and the encoder output as both
    key and value.

    :param decoder_output: query matrix from the decoder,
                           (batch_size, target_seq_len, d_model)
    :param encoder_output: key/value matrix from the encoder,
                           (batch_size, source_seq_len, d_model)
    :param mask: optional mask; positions where the mask equals 0 are blocked.
                 Must be broadcastable to the score shape
                 (batch_size, target_seq_len, source_seq_len). A mask with an
                 extra singleton head axis,
                 (batch_size, 1, target_seq_len, source_seq_len), is also
                 accepted and squeezed.
    :return: tuple of (attention output, attention weights). Query rows whose
             every source position is blocked get all-zero weights and output.
    """
    d_model = decoder_output.size(-1)

    # Decoder output acts as the query; encoder output as key and value
    query = decoder_output
    key = value = encoder_output

    # Scaled dot-product attention scores
    scores = torch.matmul(query, key.transpose(-2, -1)) / (d_model ** 0.5)

    # Apply the mask when one is provided
    blocked = None
    if mask is not None:
        # A (batch, 1, tgt, src) mask would otherwise broadcast the scores to
        # (batch, batch, tgt, src); drop the singleton head axis first.
        if mask.dim() == scores.dim() + 1 and mask.size(1) == 1:
            mask = mask.squeeze(1)
        blocked = mask == 0
        scores = scores.masked_fill(blocked, float('-inf'))

    # Softmax turns the scores into a probability distribution over the source
    attention_weights = F.softmax(scores, dim=-1)

    if blocked is not None:
        # A row of only -inf scores makes softmax return NaN; give such rows
        # zero weight instead so NaN does not propagate into the output.
        fully_blocked = blocked.all(dim=-1, keepdim=True)
        attention_weights = attention_weights.masked_fill(fully_blocked, 0.0)

    # Apply the attention weights to the values
    output = torch.matmul(attention_weights, value)

    return output, attention_weights