# Types of Attention Mechanisms
- ScaledDotProductAttention
- DotProductAttention
- AdditiveAttention
- LocationawareAttention
- MultiheadLocationawareAttention
- MultiheadAttention
- RelativeMultiheadAttention
- CustomizingAttention
- CrossAttention
- GlobalAttention
- HardAttention
- SoftAttention
- HierarchicalAttention
- LocalAttention
- MaskedAttention
- SelfAttention
- CausalSelfAttention


In [2]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import numpy as np
from typing import Optional, Tuple

## ScaledDotProductAttention
- Attention is all you need 논문에서 제시
- query와 key의 dot product 계산 후 sqrt(attention dim)으로 나누고, softmax 적용

### Inputs: query, key, value, mask
- **query** (batch, q_len, d_model): tensor containing projection vector for decoder.
- **key** (batch, k_len, d_model): tensor containing projection vector for encoder.
- **value** (batch, v_len, d_model): tensor containing features of the encoded input sequence.
- **mask** (-): tensor containing indices to be masked

### Returns: context, attn
- **context**: tensor containing the context vector from attention mechanism.
- **attn**: tensor containing the attention (alignment) from the encoder outputs.

In [None]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention as proposed in "Attention Is All You Need".

    Computes ``softmax(query @ key^T / sqrt(dim)) @ value``.

    Args:
        dim: Attention dimension used for scaling; must be positive.

    Inputs: query, key, value, mask
        - **query** (batch, q_len, d_model): projection vectors for the decoder.
        - **key** (batch, k_len, d_model): projection vectors for the encoder.
        - **value** (batch, k_len, d_model): features of the encoded input.
        - **mask** (optional): boolean-like tensor broadcastable to
          (batch, q_len, k_len); positions that evaluate to True are
          excluded from attention.

    Returns: context, attn
        - **context** (batch, q_len, d_model): attention-weighted sum of ``value``.
        - **attn** (batch, q_len, k_len): attention (alignment) weights.
    """

    def __init__(self, dim: int):
        super().__init__()
        if dim <= 0:
            raise ValueError(f"dim must be positive, got {dim}")
        # Scaling by sqrt(dim) keeps the dot products from growing with the
        # feature size, which would otherwise saturate the softmax.
        self.sqrt_dim = math.sqrt(dim)

    def forward(
        self,
        query: Tensor,
        key: Tensor,
        value: Tensor,
        mask: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Tensor]:
        # (batch, q_len, d_model) x (batch, d_model, k_len) -> (batch, q_len, k_len)
        score = torch.bmm(query, key.transpose(1, 2)) / self.sqrt_dim

        if mask is not None:
            # -inf scores become exactly zero weight after the softmax.
            score = score.masked_fill(mask.bool(), float("-inf"))

        attn = F.softmax(score, dim=-1)
        context = torch.bmm(attn, value)
        return context, attn

## Causal Self Attention
- self attention의 한 종류로, 주로 autoregressive 모델에서 사용됨
- self attention에 시간적 제약 추가
    - 특정 위치에서의 예측이 그 위치 이전의 정보에만 의존하도록 보장함

In [5]:
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention built on ``F.scaled_dot_product_attention``.

    A single linear layer produces query, key and value together; the result
    is split into heads, attended, merged back and passed through an output
    projection followed by dropout.

    Args:
        num_heads: Number of attention heads; must divide ``embed_dimension``.
        embed_dimension: Size of the input and output feature dimension.
        bias: Whether the two linear projections use a bias term.
        is_casual: When True, each position attends only to itself and earlier
            positions (causal mask). The spelling is kept for backward
            compatibility with existing callers.
        dropout: Dropout probability, used for the attention weights during
            training and for the output projection.

    Input:  x of shape (batch_size, seq_len, embed_dimension)
    Output: tensor of shape (batch_size, seq_len, embed_dimension)
    """

    def __init__(self, num_heads:int, embed_dimension:int, bias:bool=False, is_casual:bool=False, dropout:float=0.0):
        super().__init__()
        assert embed_dimension % num_heads == 0, "embed_dimension must be divisible by num_heads"

        # Fused qkv projection
        # input: (batch_size, seq_len, embed_dimension)
        # output: (batch_size, seq_len, 3 * embed_dimension)
        self.c_attn = nn.Linear(embed_dimension, 3 * embed_dimension, bias=bias)
        # Output projection applied after the heads are merged.
        self.c_proj = nn.Linear(embed_dimension, embed_dimension, bias=bias)

        # regularization
        self.dropout = dropout
        self.resid_dropout = nn.Dropout(dropout)
        self.num_heads = num_heads
        self.embed_dimension = embed_dimension
        self.is_casual = is_casual

    def forward(self, x):
        batch_size = x.size(0)
        head_dim = self.embed_dimension // self.num_heads

        # (batch_size, seq_len, 3 * embed_dimension), split into three equal
        # parts along the last axis.
        query, key, value = self.c_attn(x).chunk(3, -1)

        # (batch_size, seq_len, embed_dimension)
        #   -> (batch_size, num_heads, seq_len, head_dim)
        query = query.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)

        # Only dropout depends on train/eval mode. The causal mask is part of
        # the model definition and must also hold at inference time, otherwise
        # positions would attend to future tokens in eval mode.
        attn_dropout = self.dropout if self.training else 0.0

        y = F.scaled_dot_product_attention(
            query,
            key,
            value,
            attn_mask=None,
            dropout_p=attn_dropout,
            is_causal=self.is_casual,
        )

        # Merge the heads back. reshape (instead of view) copies when the
        # transposed tensor is not contiguous, which depends on the backend.
        y = y.transpose(1, 2).reshape(batch_size, -1, self.num_heads * head_dim)

        return self.resid_dropout(self.c_proj(y))
        
        
        
        
        


In [7]:
# Demo: build an 8-head causal self-attention module in half precision on the
# GPU, switch it to evaluation mode and print its structure.
num_heads = 8
heads_per_dim = 64  # multiplied by num_heads to get the embedding width
embed_dimension = num_heads * heads_per_dim
dtype = torch.float16

model = CausalSelfAttention(
    num_heads=num_heads,
    embed_dimension=embed_dimension,
    bias=False,
    is_casual=True,
    dropout=0.1,
)
model = model.to("cuda").to(dtype)
model = model.eval()
print(model)

CausalSelfAttention(
  (c_attn): Linear(in_features=512, out_features=1536, bias=False)
  (c_proj): Linear(in_features=512, out_features=512, bias=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)
