# Tokenization

In [2]:
# Tokenization with whitespaces
input = '나는 최근 파리 여행을 다녀왔다'
input_list = input.split()
print('input_list:', input_list)

input_list: ['나는', '최근', '파리', '여행을', '다녀왔다']


In [3]:
# Tokens Dictionaries
word2idx = {word:idx for idx, word in enumerate(input_list)}
idx2word = {idx:word for idx, word in enumerate(input_list)}

print('word2idx:', word2idx)
print('idx2word:', idx2word)

word2idx: {'나는': 0, '최근': 1, '파리': 2, '여행을': 3, '다녀왔다': 4}
idx2word: {0: '나는', 1: '최근', 2: '파리', 3: '여행을', 4: '다녀왔다'}


In [4]:
# Convert Tokens to Token IDs
input_ids = [word2idx[word] for word in input_list]
print('input_ids:', input_ids)

input_ids: [0, 1, 2, 3, 4]


# Token Embeddings

In [7]:
import torch
import torch.nn as nn

In [25]:
embedding_dim = 16
embedding_layer = nn.Embedding(
    len(word2idx),
    embedding_dim
) # (5, 16)

input_embeddings = embedding_layer(torch.tensor(input_ids)) # (5, 16)
input_embeddings = input_embeddings.unsqueeze(0) # (1, 5, 16)
print(input_embeddings.shape)

torch.Size([1, 5, 16])


# Absolute Positioning Encoding

In [14]:
embedding_dim = 16
max_position = 12

embedding_layer = nn.Embedding(len(word2idx), embedding_dim) # (5, 16)
positioning_embedding_layer = nn.Embedding(max_position, embedding_dim) # (12, 16)

positioning_ids = torch.arange(len(input_ids), dtype=torch.long).unsqueeze(0) # (1, 5)
positioning_encodings = positioning_embedding_layer(positioning_ids) # (1, 5, 16)

token_embeddings = embedding_layer(torch.tensor(input_ids)) # (5, 16)
token_embeddings = token_embeddings.unsqueeze(0) # (1, 5, 16)

input_embeddings = token_embeddings + positioning_encodings
print(input_embeddings.shape)


torch.Size([1, 5, 16])


# `nn.Linear` Layer creating query, key, and vector

In [30]:
head_dim = 16

weight_q = nn.Linear(embedding_dim, head_dim) # Input Size 16 - Linear Function - Output Size 16 
weight_k = nn.Linear(embedding_dim, head_dim) # Input Size 16 - Linear Function - Output Size 16
weight_v = nn.Linear(embedding_dim, head_dim) # Input Size 16 - Linear Function - Output Size 16

queries = weight_q(input_embeddings) # (1, 5, 16)
keys = weight_k(input_embeddings) # (1, 5, 16)
values = weight_v(input_embeddings) # (1, 5, 16)

print(queries.shape)
print(keys.shape)
print(values.shape)

torch.Size([1, 5, 16])
torch.Size([1, 5, 16])
torch.Size([1, 5, 16])


# Attention with Scaled Dot Product

In [31]:
from math import sqrt
import torch.nn.functional as F

def compute_attention(queries, keys, values, is_causal=False):
    embedding_dim = queries.size(-1) # 16
    scores = queries @ keys.transpose(-2, -1) / sqrt(embedding_dim)
    weights = F.softmax(scores, dim=-1)
    return weights @ values

# Input Embeddings & Output Embeddings while Computing Attention

In [32]:
print('Input Embeddings:', input_embeddings.shape)

output_embeddings = compute_attention(queries, keys, values)

print('Output Embeddings:', output_embeddings.shape)

Input Embeddings: torch.Size([1, 5, 16])
Output Embeddings: torch.Size([1, 5, 16])


# `AttentionHead` Class

In [35]:
class AttentionHead(nn.Module):
    
    def __init__(self, embedding_dim, head_dim, is_causal=False):
        super().__init__()
        self.is_causal = is_causal
        self.weight_q = nn.Linear(embedding_dim, head_dim) # Input Size 16 - Linear Function - Output Size 16
        self.weight_k = nn.Linear(embedding_dim, head_dim) # Input Size 16 - Linear Function - Output Size 16
        self.weight_v = nn.Linear(embedding_dim, head_dim) # Input Size 16 - Linear Function - Output Size 16
    
    def forward(self, queries, keys, values):
        output_embeddings = compute_attention(
            self.weight_q(queries),
            self.weight_k(keys),
            self.weight_v(values),
            is_causal=self.is_causal
        )
        return output_embeddings

attention_head = AttentionHead(embedding_dim, embedding_dim)
output_embeddings = attention_head(input_embeddings, input_embeddings, input_embeddings)
print(attention_head)
print(output_embeddings.shape)

AttentionHead(
  (weight_q): Linear(in_features=16, out_features=16, bias=True)
  (weight_k): Linear(in_features=16, out_features=16, bias=True)
  (weight_v): Linear(in_features=16, out_features=16, bias=True)
)
torch.Size([1, 5, 16])


# Multi-head Attention

In [38]:
class MultiheadAttention(nn.Module):
    
    def __init__(self, embedding_dim, head_dim, n_head, is_causal=False):
        super().__init__()
        self.n_head = n_head
        self.is_causal = is_causal
        self.weight_q = nn.Linear(embedding_dim, head_dim) # Input Size 16 - Linear Function - Output Size 16
        self.weight_k = nn.Linear(embedding_dim, head_dim) # Input Size 16 - Linear Function - Output Size 16
        self.weight_v = nn.Linear(embedding_dim, head_dim) # Input Size 16 - Linear Function - Output Size 16
        self.concat_linear = nn.Linear(head_dim, head_dim)
    
    def forward(self, queries, keys, values):
        B, T, C = queries.size()
        queries = self.weight_q(queries).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        keys = self.weight_q(keys).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        values = self.weight_q(values).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        attention = compute_attention(queries, keys, values, self.is_causal)
        output_embeddings = attention.transpose(1, 2).contiguous().view(B, T, C)
        output_embeddings = self.concat_linear(output_embeddings)
        return output_embeddings

n_head = 4
multihead_attention = MultiheadAttention(embedding_dim, embedding_dim, n_head)
output_embeddings = multihead_attention(input_embeddings, input_embeddings, input_embeddings)
print(output_embeddings.shape)

torch.Size([1, 5, 16])


# Layer Normalization

In [50]:
norm = nn.LayerNorm(embedding_dim) # 16
norm_x = norm(input_embeddings) # (1, 5, 16)
print('norm_x shape:', norm_x.shape)

print('Mean of All the 16 Dim Values:', norm_x.mean(dim=1).data)
print('Std of All the 16 Dim Values:', norm_x.std(dim=1).data)

norm_x shape: torch.Size([1, 5, 16])
Mean of All the 16 Dim Values: tensor([[ 0.3371,  0.5774,  0.5453, -0.6474, -0.1695, -0.5298,  0.5652, -0.2789,
         -0.2505, -0.1851,  0.5560,  0.2587, -0.2790,  0.3025, -0.5617, -0.2402]])
Std of All the 16 Dim Values: tensor([[0.7775, 0.8474, 0.6012, 0.4478, 1.1062, 1.2426, 0.7826, 0.6867, 0.7281,
         1.0024, 0.9099, 1.3914, 1.5595, 1.0045, 0.7462, 1.5360]])


# Feed-forward Layer

In [51]:
class PreLayerNormFeedForward(nn.Module):

    def __init__(self, head_dim, feedforward_dim, dropout):
        super().__init__()
        self.linear1 = nn.Linear(head_dim, feedforward_dim)
        self.linear2 = nn.Linear(feedforward_dim, head_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = nn.GELU()
        self.norm = nn.LayerNorm(head_dim)
    
    def forward(self, src):
        x = self.norm(src)
        x = x + self.linear2(self.dropout1(self.activation(self.linear1(x))))
        x = self.dropout2(x)
        return x

# Encoder Layer

In [52]:
class TransformerEncoderLayer(nn.Module):
    
    def __init__(self, head_dim, n_head, feedforward_dim, dropout):
        super().__init__()
        self.attention = MultiheadAttention(head_dim, head_dim, n_head)
        self.norm1 = nn.LayerNorm(head_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.feed_forward = PreLayerNormFeedForward(head_dim, feedforward_dim, dropout)
    
    def forward(self, src):
        norm_x = self.norm1(src)
        attention_output = self.attention(norm_x, norm_x, norm_x)
        x = src + self.dropout1(attention_output)
        x = self.feed_forward(x)
        return x

# Encoder

In [54]:
import copy

def get_clones(module, N):
    return nn.ModuleList(
        [copy.deepcopy(module) for i in range(N)]
    )

class TransformerEncoder(nn.Module):
    
    def __init__(self, encoder_layer, layers_cnt):
        super().__init__()
        self.layers = get_clones(encoder_layer, layers_cnt)
        self.layers_cnt = layers_cnt
        self.norm = norm
    
    def forward(self, src):
        output = src
        for mod in self.layers:
            output = mod(output)
        return output

# Mask Attention in Decoder

In [55]:
def compute_attention(queries, keys, values, is_causal=False):

    embedding_dim = queries.size(-1) # 16
    scores = queries @ keys.transpose(-2, -1) / sqrt(embedding_dim) # (1, 5, 5)

    if is_causal:
        query_len = queries.size(2)
        key_len = keys.size(2)
        temp_mask = torch.ones(query_len, key_len, dtype=torch.bool).tril(diagonal=0)
        scores = scores.masked_fill(temp_mask==False, float('-inf'))
    
    weights = F.softmax(scores, dim=-1) # (1, 5, 5)

    return weights @ values # (1, 5, 16)

# Cross-attention in Decoder Layer

In [56]:
class TransformerDecoderLayer(nn.Module):

    def __init__(self, head_dim, n_head, feedforward_dim=2048, dropout=0.1):
        super().__init__()
        self.self_attention = MultiheadAttention(head_dim, head_dim, n_head)
        self.multihead_attention = MultiheadAttention(head_dim, head_dim, n_head)
        self.feed_forward = PreLayerNormFeedForward(head_dim, feedforward_dim, dropout)

        self.norm1 = nn.LayerNorm(head_dim)
        self.norm2 = nn.LayerNorm(head_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, target, encoder_output, is_causal=True):
        
        # Self Attention
        x = self.norm1(target)
        x = x + self.dropout1(self.self_attention(x, x, x, is_causal=is_causal))
        # Cross-attention
        x = self.norm2(x)
        x = x + self.dropout2(self.multihead_attention(x, encoder_output, encoder_output))
        # Feed-forward
        x = self.feed_forward(x)

        return x

# Decoder

In [57]:
import copy

def get_clones(module, N):
    return nn.ModuleList(
        [copy.deepcopy(module) for i in range(N)]
    )

class TransformerDecoder(nn.Module):

    def __init__(self, decoder_layer, layers_cnt):
        super().__init__()
        self.layers = get_clones(decoder_layer, layers_cnt)
        self.layers_cnt = layers_cnt
    
    def forward(self, target, src):
        output = target
        for mod in self.layers:
            output = mod(target, src)
        return output