## 예제 2.1 토큰화 코드

In [1]:
# Split the sentence into tokens on whitespace
input_text = "나는 최근 파리 여행을 다녀왔다"
input_text_list = input_text.split()
print("input_text_list: ", input_text_list)

# Build the token -> id and id -> token dictionaries.
# Ids are assigned per *unique* token, in first-seen order, so every id is
# smaller than len(str2idx). Enumerating the raw token list instead would give
# a repeated word an id >= len(str2idx), which is the size later used for the
# embedding table.
str2idx = {word: idx for idx, word in enumerate(dict.fromkeys(input_text_list))}
idx2str = {idx: word for word, idx in str2idx.items()}
print("str2idx: ", str2idx)
print("idx2str: ", idx2str)

# Convert each token to its token id
input_ids = [str2idx[word] for word in input_text_list]
print("input_ids: ", input_ids)

input_text_list:  ['나는', '최근', '파리', '여행을', '다녀왔다']
str2idx:  {'나는': 0, '최근': 1, '파리': 2, '여행을': 3, '다녀왔다': 4}
idx2str:  {0: '나는', 1: '최근', 2: '파리', 3: '여행을', 4: '다녀왔다'}
input_ids:  [0, 1, 2, 3, 4]


## 예제 2.2 토큰 아이디에서 벡터로 변환

In [2]:
import torch
import torch.nn as nn

embedding_dim = 16  # size of each token embedding vector
# Embedding table with one row per entry of str2idx
embed_layer = nn.Embedding(len(str2idx), embedding_dim)

# Look up one embedding vector per token id
input_embeddings = embed_layer(torch.tensor(input_ids)) # (5, 16)
print('input_embeddings', input_embeddings)
print('input_embeddings.shape', input_embeddings.shape)
# Insert a leading dimension of size 1 at position 0
input_embeddings = input_embeddings.unsqueeze(0) # (1, 5, 16)
print('input_embeddings', input_embeddings)
print('input_embeddings.shape', input_embeddings.shape)

input_embeddings tensor([[ 0.2049,  1.6807,  1.5765, -0.5395,  0.9899,  0.5722, -0.2818, -0.3771,
          0.0328, -1.6833, -1.5398, -0.5732, -0.1802, -1.3813, -0.4298,  1.5461],
        [-0.3825,  1.4663, -1.0194,  1.5348,  0.4648,  0.8077,  0.7148,  0.9094,
         -0.7804,  0.5306,  0.3723, -1.8589,  0.2746,  1.0328, -2.4390,  0.4086],
        [-0.1172,  0.3957, -0.1310, -0.2463, -0.2879,  0.7252, -0.6387, -0.9032,
         -1.6478, -0.0396,  0.9037,  0.0452, -0.5477,  0.9914, -0.2411, -0.8526],
        [-1.1236, -1.2650,  1.2597,  0.9548, -0.5562,  1.2302,  0.3720,  0.1331,
         -1.2442,  0.4147, -0.3238, -0.0697,  0.2225, -0.0969, -1.6555, -2.0065],
        [-0.3631, -2.3431,  0.0504, -2.0693, -1.2970, -1.1414, -0.6474,  0.2287,
          0.9482,  0.4899,  0.3273,  0.2623,  2.4651, -0.5680,  0.8162,  0.6339]],
       grad_fn=<EmbeddingBackward0>)
input_embeddings.shape torch.Size([5, 16])
input_embeddings tensor([[[ 0.2049,  1.6807,  1.5765, -0.5395,  0.9899,  0.5722, -0.281

In [3]:
embedding_dim = 16  # size of each embedding vector
max_position = 12  # number of rows of the position embedding table built from it

## 예제 2.3 절대적 위치 인코딩

In [4]:
# Create the token embedding layer
embed_layer = nn.Embedding(len(str2idx), embedding_dim)
# Create the position encoding layer (one learned vector per position id)
position_embed_layer = nn.Embedding(max_position, embedding_dim)

# Position ids 0 .. len(input_ids) - 1, with a leading dimension of size 1
position_ids = torch.arange(len(input_ids), dtype=torch.long).unsqueeze(0)
print('position_ids', position_ids)
# Look up one position vector per position id
position_encodings = position_embed_layer(position_ids)
print('position_encodings', position_encodings)
print('position_encodings.shape', position_encodings.shape)



position_ids tensor([[0, 1, 2, 3, 4]])
position_encodings tensor([[[ 0.2800, -1.4892,  1.5191,  1.3917,  0.1355, -1.4985,  1.3526,
           0.8153,  0.9171,  1.1009, -0.4183, -0.5849,  0.3299,  1.3712,
           0.7591, -0.3041],
         [ 1.5170,  0.6659, -0.0108, -1.4093,  0.9824, -2.4438,  1.4747,
           0.1785,  0.3609, -1.6568,  3.4831,  0.3396, -1.4861, -0.5981,
           0.4219, -0.1289],
         [ 0.0146, -0.4801,  0.0749, -1.0300, -0.3913, -0.9857,  0.3914,
          -0.5196,  0.8458, -0.7562, -1.1063, -0.8546, -0.3067, -0.3044,
           0.6212, -1.3468],
         [-0.2744,  2.3860,  0.1830,  0.4471, -0.0629,  2.4600, -0.3933,
          -0.7084,  0.6466,  0.0467,  0.5754, -0.2558, -0.5307,  1.3259,
           0.5275,  0.9467],
         [-0.5554, -0.3552, -0.2682,  2.3147, -0.9738, -0.0857, -0.3732,
           1.1452,  0.3385,  2.0273,  0.1792, -0.0590, -0.4788, -0.2977,
          -0.7366,  2.1764]]], grad_fn=<EmbeddingBackward0>)
position_encodings.shape torch.Size([1, 5, 16])

In [5]:
# Look up one embedding vector per token id
token_embeddings = embed_layer(torch.tensor(input_ids)) # (5, 16)
print('token_embeddings', token_embeddings)
print('token_embeddings.shape', token_embeddings.shape)

# Insert a leading dimension of size 1 at position 0
token_embeddings = token_embeddings.unsqueeze(0) # (1, 5, 16)
print('token_embeddings', token_embeddings)
print('token_embeddings.shape', token_embeddings.shape)


token_embeddings tensor([[-1.7694e+00,  8.9674e-01, -1.3924e+00, -2.3082e-01, -8.2124e-02,
         -6.2122e-01,  1.6840e+00,  3.0954e-01,  7.1563e-02, -2.0688e-01,
         -1.6112e-01,  1.7311e-01, -9.4804e-01,  1.1038e-01, -1.2019e+00,
         -7.7155e-01],
        [ 6.9043e-01, -4.7758e-01,  5.5456e-02, -9.6146e-01, -3.1735e-01,
          1.2977e+00,  6.2255e-02, -1.1639e+00, -5.6206e-01,  4.4180e-01,
          6.4554e-01,  3.0120e-01,  2.6968e-01, -2.4253e-01,  5.6468e-01,
          2.7834e-01],
        [ 1.1892e+00, -8.9980e-02,  2.3783e-02, -1.1387e+00, -1.4679e+00,
          4.9899e-01,  1.4221e+00, -4.2517e-01,  1.6667e-01,  6.8872e-01,
         -7.3018e-01, -2.5622e+00, -1.1939e+00,  1.2343e+00, -3.6834e-01,
          4.9050e-01],
        [ 1.8684e-01, -1.8169e+00,  4.1210e-01, -1.6667e-03, -1.8055e-01,
         -9.3585e-01, -6.5065e-01,  6.7471e-02, -2.6076e-01, -1.8144e+00,
          5.0001e-01,  1.4602e-01, -2.2640e+00, -8.8636e-02, -1.2958e+00,
         -1.5845e-01],
   

In [6]:
# Add the token embeddings and the position encodings to form the final input embeddings
input_embeddings = token_embeddings + position_encodings
print('input_embeddings', input_embeddings)
print('input_embeddings.shape', input_embeddings.shape)

input_embeddings tensor([[[-1.4895, -0.5925,  0.1266,  1.1609,  0.0534, -2.1197,  3.0366,
           1.1248,  0.9886,  0.8940, -0.5794, -0.4117, -0.6181,  1.4816,
          -0.4428, -1.0756],
         [ 2.2075,  0.1883,  0.0447, -2.3708,  0.6651, -1.1461,  1.5370,
          -0.9855, -0.2011, -1.2150,  4.1286,  0.6408, -1.2164, -0.8407,
           0.9865,  0.1495],
         [ 1.2038, -0.5701,  0.0987, -2.1688, -1.8592, -0.4867,  1.8135,
          -0.9447,  1.0125, -0.0675, -1.8365, -3.4168, -1.5006,  0.9299,
           0.2528, -0.8563],
         [-0.0876,  0.5691,  0.5951,  0.4454, -0.2435,  1.5242, -1.0440,
          -0.6409,  0.3859, -1.7676,  1.0754, -0.1097, -2.7947,  1.2373,
          -0.7683,  0.7882],
         [-0.9832, -2.3812,  0.9108,  2.2962, -0.4176,  0.2116, -0.9426,
           2.2611,  0.3590,  1.5026,  0.8748,  1.9629, -1.3917, -1.2113,
          -0.0289,  1.6123]]], grad_fn=<AddBackward0>)
input_embeddings.shape torch.Size([1, 5, 16])


## 예제 2.4 쿼리, 키, 값 벡터를 만드는 nn.Linear 층

In [7]:
head_dim = 16  # output size of each of the three linear layers below

# Linear layers used to compute the queries, keys and values
weight_q = nn.Linear(embedding_dim, head_dim)
weight_k = nn.Linear(embedding_dim, head_dim)
weight_v = nn.Linear(embedding_dim, head_dim)
# Apply the three layers to the same input embeddings
querys = weight_q(input_embeddings) # (1, 5, 16)
keys = weight_k(input_embeddings) # (1, 5, 16)
values = weight_v(input_embeddings) # (1, 5, 16)

## 예제 2.5. 스케일 점곱 방식의 어텐션

In [8]:
from math import sqrt
import torch.nn.functional as F

def compute_attention(querys, keys, values, is_causal=False):
    """Scaled dot-product attention.

    The query/key similarity matrix is divided by the square root of the
    size of the last query dimension, turned into weights with a softmax
    over the last axis, and used to average ``values``.

    ``is_causal`` is accepted but not used by this version: no mask is
    applied whatever its value.
    """
    scale = sqrt(querys.size(-1))
    similarity = querys @ keys.transpose(-2, -1) / scale
    attention_weights = F.softmax(similarity, dim=-1)
    return attention_weights @ values

In [9]:
import numpy as np

# Two 2x2 matrices, A and B
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Matrix product written with the @ operator
C = A @ B
print(C)

# The same product computed through np.dot, for comparison
C_manual = np.dot(A, B)
print(C_manual)



[[19 22]
 [43 50]]
[[19 22]
 [43 50]]


## 예제 2.6. 어텐션 연산의 입력과 출력

In [10]:
# Shape of the input before attention is applied
print("원본 입력 형태: ", input_embeddings.shape)

# Apply attention to the query, key and value tensors computed earlier
after_attention_embeddings = compute_attention(querys, keys, values)

# Shape after attention, for comparison with the input shape
print("어텐션 적용 후 형태: ", after_attention_embeddings.shape)
# Expected: both printed shapes are torch.Size([1, 5, 16])

원본 입력 형태:  torch.Size([1, 5, 16])
어텐션 적용 후 형태:  torch.Size([1, 5, 16])


## 예제 2.7. 어텐션 연산을 수행하는 AttentionHead 클래스

In [11]:
class AttentionHead(nn.Module):
  """Single attention head.

  Holds one linear layer each for queries, keys and values and feeds the
  three projected tensors to compute_attention.
  """

  def __init__(self, token_embed_dim, head_dim, is_causal=False):
    super().__init__()
    self.is_causal = is_causal
    # One linear layer per role; all map token_embed_dim -> head_dim.
    self.weight_q = nn.Linear(token_embed_dim, head_dim)  # query projection
    self.weight_k = nn.Linear(token_embed_dim, head_dim)  # key projection
    self.weight_v = nn.Linear(token_embed_dim, head_dim)  # value projection

  def forward(self, querys, keys, values):
    """Project the three inputs and return the attention output."""
    projected_q = self.weight_q(querys)
    projected_k = self.weight_k(keys)
    projected_v = self.weight_v(values)
    return compute_attention(
        projected_q,
        projected_k,
        projected_v,
        is_causal=self.is_causal,
    )

# Single-head self-attention: the same tensor is passed as query, key and value input
attention_head = AttentionHead(embedding_dim, embedding_dim)
after_attention_embeddings = attention_head(input_embeddings, input_embeddings, input_embeddings)

## 예제 2.8. 멀티 헤드 어텐션 구현

In [12]:
class MultiheadAttention(nn.Module):
  """Multi-head attention built on compute_attention.

  Queries, keys and values are each projected to ``d_model`` features, split
  into ``n_head`` heads of ``d_model // n_head`` features, attended per head,
  merged back and passed through a final linear layer.

  Args:
    token_embed_dim: size of the last dimension of the inputs.
    d_model: size of the projections; must be divisible by ``n_head``.
    n_head: number of attention heads.
    is_causal: default causal flag handed to compute_attention.

  Raises:
    ValueError: if ``d_model`` is not divisible by ``n_head``.
  """

  def __init__(self, token_embed_dim, d_model, n_head, is_causal=False):
    super().__init__()
    if d_model % n_head != 0:
      raise ValueError(
          f"d_model ({d_model}) must be divisible by n_head ({n_head})"
      )
    self.n_head = n_head
    self.head_dim = d_model // n_head
    self.is_causal = is_causal
    self.weight_q = nn.Linear(token_embed_dim, d_model)
    self.weight_k = nn.Linear(token_embed_dim, d_model)
    self.weight_v = nn.Linear(token_embed_dim, d_model)
    self.concat_linear = nn.Linear(d_model, d_model)

  def _split_heads(self, projected):
    """Reshape (B, T, d_model) into (B, n_head, T, head_dim)."""
    # Each tensor is split with its OWN sequence length, so keys/values may
    # be longer or shorter than the queries (cross-attention).
    batch_size, seq_len, _ = projected.size()
    return projected.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)

  def forward(self, querys, keys, values, is_causal=None):
    """Return the attention output with shape (B, T_query, d_model).

    Args:
      querys: tensor of shape (B, T_query, token_embed_dim).
      keys: tensor of shape (B, T_key, token_embed_dim).
      values: tensor of shape (B, T_key, token_embed_dim).
      is_causal: per-call override of the causal flag; ``None`` (default)
        uses the value given to the constructor.
    """
    if is_causal is None:
      is_causal = self.is_causal
    batch_size, query_len, _ = querys.size()
    querys = self._split_heads(self.weight_q(querys))
    keys = self._split_heads(self.weight_k(keys))
    values = self._split_heads(self.weight_v(values))
    attention = compute_attention(querys, keys, values, is_causal)
    # Merge the heads using the projection width (n_head * head_dim == d_model),
    # not the input feature size, so token_embed_dim may differ from d_model.
    output = attention.transpose(1, 2).contiguous().view(
        batch_size, query_len, self.n_head * self.head_dim
    )
    output = self.concat_linear(output)
    return output

n_head = 4  # number of attention heads
mh_attention = MultiheadAttention(embedding_dim, embedding_dim, n_head)
# Self-attention: the same tensor is passed as query, key and value input
after_attention_embeddings = mh_attention(input_embeddings, input_embeddings, input_embeddings)
after_attention_embeddings.shape

torch.Size([1, 5, 16])

## 예제 2.9. 층 정규화 코드

In [13]:
# Layer normalization over the last dimension (size embedding_dim)
norm = nn.LayerNorm(embedding_dim)
norm_x = norm(input_embeddings)
norm_x.shape # torch.Size([1, 5, 16])

# Mean and standard deviation of the normalized values along the last dimension.
# The std shows as 1.0328 (= sqrt(16/15)) rather than 1 because Tensor.std uses the
# unbiased (n - 1) estimator by default while LayerNorm normalizes with the biased variance.
norm_x.mean(dim=-1).data, norm_x.std(dim=-1).data

# (tensor([[ 2.2352e-08, -1.1176e-08, -7.4506e-09, -3.9116e-08, -1.8626e-08]]),
#  tensor([[1.0328, 1.0328, 1.0328, 1.0328, 1.0328]]))

(tensor([[ 0.0000e+00, -1.4901e-08,  3.3528e-08, -1.1176e-08,  0.0000e+00]]),
 tensor([[1.0328, 1.0328, 1.0328, 1.0328, 1.0328]]))

## 예제 2.10. 피드 포워드 층 코드

In [14]:
class PreLayerNormFeedForward(nn.Module):
  """Feed-forward block that applies layer normalization to its input first.

  The normalized input goes through linear1 -> GELU -> dropout1 -> linear2,
  the result is added back to the normalized input, and dropout2 is applied
  to that sum.
  """

  def __init__(self, d_model, dim_feedforward, dropout):
    super().__init__()
    # Expansion and projection layers
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    # Dropout after the activation and after the residual sum
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    # Non-linearity and input normalization
    self.activation = nn.GELU()
    self.norm = nn.LayerNorm(d_model)

  def forward(self, src):
    """Return a tensor with the same shape as ``src``."""
    normed = self.norm(src)
    hidden = self.linear1(normed)
    hidden = self.activation(hidden)
    hidden = self.dropout1(hidden)
    projected = self.linear2(hidden)
    return self.dropout2(normed + projected)

## 예제 2.11. 인코더 층

In [15]:
class TransformerEncoderLayer(nn.Module):
  """One encoder layer: self-attention with a residual connection, then feed-forward.

  The input is layer-normalized before attention; the attention output goes
  through dropout and is added to the un-normalized input. The sum is then
  handed to PreLayerNormFeedForward.
  """

  def __init__(self, d_model, nhead, dim_feedforward, dropout):
    super().__init__()
    # Multi-head attention over d_model features
    self.attn = MultiheadAttention(d_model, d_model, nhead)
    # Normalization applied before attention
    self.norm1 = nn.LayerNorm(d_model)
    # Dropout applied to the attention output
    self.dropout1 = nn.Dropout(dropout)
    # Feed-forward block
    self.feed_forward = PreLayerNormFeedForward(d_model, dim_feedforward, dropout)

  def forward(self, src):
    """Run the layer on ``src`` and return the feed-forward output."""
    normed = self.norm1(src)
    attended = self.dropout1(self.attn(normed, normed, normed))
    # Residual connection around the attention sub-layer
    residual = src + attended
    return self.feed_forward(residual)

## 예제 2.12. 인코더 구현

In [16]:
import copy
def get_clones(module, N):
  """Return an nn.ModuleList holding N independent deep copies of ``module``."""
  return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class TransformerEncoder(nn.Module):
  """Stack of ``num_layers`` deep copies of ``encoder_layer``.

  Args:
    encoder_layer: module called as ``encoder_layer(x)``; it is deep-copied,
      so the stacked layers do not share parameters with it or each other.
    num_layers: number of copies to stack.
    norm: optional module applied to the output of the last layer. Defaults
      to ``None`` (no final normalization).
  """

  def __init__(self, encoder_layer, num_layers, norm=None):
    super().__init__()
    self.layers = get_clones(encoder_layer, num_layers)
    self.num_layers = num_layers
    # Taken from the constructor argument rather than from a module-level
    # variable that happens to be called ``norm``.
    self.norm = norm

  def forward(self, src):
    """Pass ``src`` through every layer in order and return the result."""
    output = src
    for mod in self.layers:
      output = mod(output)
    if self.norm is not None:
      output = self.norm(output)
    return output

## 예제 2.13. 디코더에서 어텐션 연산(마스크 어텐션)

In [17]:
def compute_attention(querys, keys, values, is_causal=False):
    """Scaled dot-product attention with an optional causal mask.

    Args:
        querys: tensor of shape (..., T_query, dim).
        keys: tensor of shape (..., T_key, dim).
        values: tensor of shape (..., T_key, dim_v).
        is_causal: when true, query position i only attends to key
            positions <= i.

    Returns:
        Tensor of shape (..., T_query, dim_v).
    """
    dim_k = querys.size(-1)
    scores = querys @ keys.transpose(-2, -1) / sqrt(dim_k)  # (..., T_query, T_key)
    if is_causal:
        # Read the sequence lengths from the last two axes of the scores so
        # the mask is correct for both (B, T, dim) and (B, head, T, dim)
        # inputs; a fixed axis index would pick the feature size for 3-D input.
        query_length, key_length = scores.shape[-2:]
        # Build the mask on the same device as the scores.
        causal_mask = torch.ones(
            query_length, key_length, dtype=torch.bool, device=scores.device
        ).tril(diagonal=0)
        scores = scores.masked_fill(~causal_mask, float("-inf"))
    weights = F.softmax(scores, dim=-1)  # (..., T_query, T_key)
    return weights @ values

## 예제 2.14. 크로스 어텐션이 포함된 디코더 층

In [18]:
class TransformerDecoderLayer(nn.Module):
  """One decoder layer: self-attention, cross-attention, then feed-forward.

  Args:
    d_model: feature size of the inputs and of both attention modules.
    nhead: number of attention heads.
    dim_feedforward: hidden size of the feed-forward block.
    dropout: dropout probability used throughout the layer.
  """

  def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
    super().__init__()
    self.self_attn = MultiheadAttention(d_model, d_model, nhead)
    self.multihead_attn = MultiheadAttention(d_model, d_model, nhead)
    self.feed_forward = PreLayerNormFeedForward(d_model, dim_feedforward, dropout)

    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)

  def forward(self, tgt, encoder_output, is_causal=True):
    """Run the layer.

    Args:
      tgt: decoder-side input.
      encoder_output: encoder result, used as key and value input of the
        cross-attention.
      is_causal: whether the self-attention over ``tgt`` is causally masked.
    """
    # Self-attention. The causal flag belongs here: it is handed over through
    # the module's is_causal attribute, which MultiheadAttention.forward reads
    # when it calls compute_attention.
    # NOTE(review): the residual below adds the *normalized* tensor, unlike
    # TransformerEncoderLayer, which adds the un-normalized input - confirm
    # this is intended.
    x = self.norm1(tgt)
    self.self_attn.is_causal = is_causal
    x = x + self.dropout1(self.self_attn(x, x, x))
    # Cross-attention: every target position may look at the whole encoder
    # output, so no causal flag is passed.
    x = self.norm2(x)
    x = x + self.dropout2(self.multihead_attn(x, encoder_output, encoder_output))
    # Feed-forward
    x = self.feed_forward(x)
    return x

## 예제 2.15. 디코더 구현

In [19]:
import copy
def get_clones(module, N):
  """Return an nn.ModuleList holding N independent deep copies of ``module``."""
  return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class TransformerDecoder(nn.Module):
  """Stack of ``num_layers`` deep copies of ``decoder_layer``.

  Args:
    decoder_layer: module called as ``decoder_layer(tgt, src)``; it is
      deep-copied, so the stacked layers do not share parameters.
    num_layers: number of copies to stack.
  """

  def __init__(self, decoder_layer, num_layers):
    super().__init__()
    self.layers = get_clones(decoder_layer, num_layers)
    self.num_layers = num_layers

  def forward(self, tgt, src):
    """Pass ``tgt`` through every layer in order; ``src`` is given to each layer unchanged."""
    output = tgt
    for mod in self.layers:
      # Chain the layers: each one consumes the previous layer's output.
      # Feeding ``tgt`` every time would discard all but the last layer.
      output = mod(output, src)
    return output