In [9]:
!pip install -r requirements.txt

Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [10]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

In [11]:
# Some convenience helper functions used throughout the notebook

def is_interactive_notebook():
    """Return True when this code is running as the top-level ``__main__`` module."""
    module_name = __name__
    return "__main__" == module_name

def show_example(fn, args=()):
    """Run ``fn(*args)`` and return its result when examples are enabled.

    Args:
        fn: Callable to invoke.
        args: Positional arguments unpacked into ``fn``. The default is an
            immutable empty tuple so no mutable object is shared between calls.

    Returns:
        The value returned by ``fn(*args)`` when the module runs as
        ``__main__`` and ``RUN_EXAMPLES`` is truthy; otherwise ``None``.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)
    return None

def execute_example(fn, args=()):
    """Run ``fn(*args)`` for its side effects when examples are enabled.

    Args:
        fn: Callable to invoke.
        args: Positional arguments unpacked into ``fn``. The default is an
            immutable empty tuple so no mutable object is shared between calls.

    Returns:
        Always ``None``; the result of ``fn`` is discarded.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)

class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing.

    The base-class constructor is not called; the instance only carries a
    single parameter group with a learning rate of 0.
    """

    def __init__(self):
        self.param_groups = [dict(lr=0)]

    def step(self):
        """No-op; returns ``None``."""
        return None

    def zero_grad(self, set_to_none=False):
        """No-op; ``set_to_none`` is accepted but ignored."""
        return None

class DummyScheduler:
    """Scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """No-op; returns ``None``."""
        return None


## Architecture

In [12]:
class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder wrapper that wires its sub-components together.

    The constructor stores five collaborators:
        encoder:   called as ``encoder(src_embed(src), src_mask)``
        decoder:   called as ``decoder(tgt_embed(tgt), memory, src_mask, tgt_mask)``
        src_embed: applied to ``src`` before encoding
        tgt_embed: applied to ``tgt`` before decoding
        generator: stored on the module; not invoked by any method of this class
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        # Assignment order is kept stable so sub-module registration order does not change.
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Encode the masked source, then decode the masked target against it."
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        "Embed ``src`` and run it through the encoder."
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        "Embed ``tgt`` and run it through the decoder together with ``memory``."
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

In [13]:
class Generator(nn.Module):
    "Linear projection from ``d_model`` to ``vocab`` followed by a log-softmax."

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(in_features=d_model, out_features=vocab)

    def forward(self, x):
        "Project ``x`` and return log-probabilities over the last dimension."
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)

## Encoder and Decoder Stacks

#### Encoder 

The encoder is composed of a stack of N=6 identical layers. 

In [14]:
def clones(module, N):
    "Return an ``nn.ModuleList`` holding N independent deep copies of ``module``."
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy gives every replica its own parameters
        replicas.append(copy.deepcopy(module))
    return replicas

In [17]:
class Encoder(nn.Module):
    "Stack of N layers followed by a final layer normalization."

    def __init__(self, layer, N):
        """
        args:
            layer: prototype layer handed to `clones`; it must expose a `size` attribute,
                   which is used to build the final LayerNorm
            N: number of layers requested from `clones`
        """
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """
        Pass the input (and mask) through each layer in turn, then normalize.

        args:
            x: input sequence of embeddings
            mask: forwarded unchanged to every layer (there it indicates which
                  elements of the sequence the self-attention sublayer may attend to)
        """
        hidden = x
        for block in self.layers:
            # calling the module dispatches to that layer's `forward`
            hidden = block(hidden, mask)
        # the output of the last layer is normalized once more
        return self.norm(hidden)

In [18]:
class LayerNorm(nn.Module):
    """Layer normalization with a learnable per-feature scale and shift.

    args:
        features: size of the last dimension of the input tensor
        eps     : small constant added to the standard deviation to avoid division by zero
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # trainable scale (starts at 1) and shift (starts at 0), one value per feature
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` over its last dimension, then scale and shift.

        Computes a_2 * (x - mean) / (std + eps) + b_2. The statistics are taken
        over dim -1 because that is the dimension of length `features` that
        a_2 and b_2 broadcast against. mean and std are recomputed from the
        input on every call; only a_2 and b_2 are parameters.
        """
        last_dim = -1
        mu = x.mean(last_dim, keepdim=True)
        sigma = x.std(last_dim, keepdim=True)
        centered = x - mu
        return self.a_2 * centered / (sigma + self.eps) + self.b_2

In [19]:
class SublayerConnection(nn.Module):
    """
    Residual wrapper around an arbitrary sublayer.
    The input is normalized first, passed through the sublayer, run through
    dropout, and finally added back onto the un-normalized input.

    args:
        size: input embedding size (passed to LayerNorm)
        dropout: dropout rate
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Return x + dropout(sublayer(norm(x))); the sublayer must preserve the size of x."
        normed = self.norm(x)
        transformed = sublayer(normed)
        return x + self.dropout(transformed)

In [20]:
# single layer of the encoder 
class EncoderLayer(nn.Module):
    """Single encoder layer: self-attention followed by a feed-forward sublayer.

    args:
        size: input embedding size
        self_attn: self-attention module, called as self_attn(query, key, value, mask)
        feed_forward: feed-forward module, called with a single tensor
        dropout: dropout rate handed to each SublayerConnection
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        # Encoder.__init__ reads `layer.size` to build its final LayerNorm, so the
        # layer has to expose it (DecoderLayer does the same).
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # two residual wrappers: [0] for self-attention, [1] for feed-forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)

    def forward(self, x, mask):
        """Follow Figure 1 (left) for connections.

        Applies the self-attention sublayer and then the feed-forward sublayer,
        each through its own SublayerConnection.
        """
        # query, key and value are all the (normalized) input: self-attention
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

#### Decoder 

The decoder is also composed of a stack of N = 6 identical layers. 

In [21]:
class Decoder(nn.Module):
    "Stack of N decoder layers followed by a final layer normalization."

    def __init__(self, layer, N):
        """
        args:
            layer: prototype layer handed to `clones`; must expose a `size` attribute
            N: number of layers requested from `clones`
        """
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Feed x through every layer (memory and both masks are passed along unchanged), then normalize."
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [22]:
# single layer of decoder 
class DecoderLayer(nn.Module):
    """Single decoder layer: self-attention, attention over `memory`, then feed-forward.

    args:
        size: input embedding size
        self_attn: attention module applied to the decoder input itself
        src_attn: attention module applied with `memory` as key and value
        feed_forward: feed-forward module, called with a single tensor
        dropout: dropout rate handed to each SublayerConnection
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # three residual wrappers, one per sublayer
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."

        def masked_self_attention(normed):
            # query, key and value all come from the decoder side
            return self.self_attn(normed, normed, normed, tgt_mask)

        def memory_attention(normed):
            # queries come from the decoder, keys and values from `memory`
            return self.src_attn(normed, memory, memory, src_mask)

        x = self.sublayer[0](x, masked_self_attention)
        x = self.sublayer[1](x, memory_attention)
        return self.sublayer[2](x, self.feed_forward)

In [24]:
# this method return matrix of (1, size, size) 
def subsequent_mask(size):
    """Mask out subsequent positions.

    Returns a boolean tensor of shape (1, size, size) whose entry [0, i, j]
    is True when j <= i and False when j > i.
    """
    # torch.triu keeps the upper-triangular part of each matrix; diagonal=1 starts
    # one step above the main diagonal, so the main diagonal itself is zeroed.
    strictly_upper = torch.triu(torch.ones(1, size, size), diagonal=1)
    # Comparing against 0 yields a bool tensor whatever the dtype of the
    # intermediate, so no integer cast is needed before the comparison.
    return strictly_upper == 0

In [29]:
def example_mask():
    """Build an Altair heat-map of the 20x20 subsequent-position mask.

    Every (x, y) cell of the mask becomes one single-row DataFrame with the
    columns "Subsequent Mask", "Window" (y) and "Masking" (x); the rows are
    concatenated and drawn as a rectangle chart.

    Returns:
        The interactive ``alt.Chart`` object.
    """
    size = 20
    # Build the mask once and reuse it for every cell instead of rebuilding it
    # inside the comprehension.
    mask = subsequent_mask(size)[0]

    LS_data = pd.concat(
        [
            pd.DataFrame(
                {
                    # flatten(): turn the single selected element into a 1-D array
                    "Subsequent Mask": mask[x, y].flatten(),
                    "Window": y,
                    "Masking": x,
                }
            )
            for y in range(size)
            for x in range(size)
        ]
    )

    return (
        alt.Chart(LS_data)
        .mark_rect()
        .properties(height=250, width=250)
        .encode(
            alt.X("Window:O"),
            alt.Y("Masking:O"),
            alt.Color("Subsequent Mask:Q", scale=alt.Scale(scheme="viridis")),
        )
        .interactive()
    )

show_example(example_mask)

#### Attention 

An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.

We call this particular attention "Scaled Dot-Product Attention". The input consists of queries and keys of dimension $d_k$, and values of dimension $d_v$. We compute the dot products of the query with all keys, divide each by $\sqrt{d_k}$, and apply a softmax function to obtain the weights on the values. 


In practice, we compute the attention function on a set of queries simultaneously, packed together into matrix Q. The keys and values are also packed together into matrices K and V. We compute the matrix of output as: 

$Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V$

In [30]:
def attention(query, key, value, mask=None, dropout=None):
    """Compute 'Scaled Dot Product Attention'.

    args:
        query, key, value: tensors; the size of the last dimension of `query` is used as d_k
        mask: optional tensor; wherever it equals 0 the score is replaced by -1e9
              so the softmax assigns (effectively) zero weight to that position
        dropout: optional callable applied to the attention weights

    returns:
        (output, weights): the weighted sum of `value` and the attention
        weights that produced it (after dropout, if dropout was given)
    """
    d_k = query.shape[-1]
    scale = math.sqrt(d_k)

    # swap the last two dimensions of key so that query @ key^T lines up
    key_t = key.transpose(-2, -1)
    scores = torch.matmul(query, key_t) / scale

    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    # normalize over the last dimension: one distribution per query position
    weights = torch.softmax(scores, dim=-1)

    if dropout is not None:
        weights = dropout(weights)

    output = torch.matmul(weights, value)
    return output, weights

The two most commonly used attention functions are additive attention and dot-product (multiplicative) attention. Dot-product attention is identical to this algorithm, except for the scaling factor of $\frac{1}{\sqrt{d_k}}$. Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code. 

$d_k$ 가 작을 때는 두 방법의 성능이 비슷하지만, $d_k$ 가 클 때는 additive attention 이 scale 하지 않은 dot-product attention 보다 성능이 잘 나온다. $d_k$ 가 클 때는 dot product 의 크기가 커지면서 softmax function 이 아주 작은 gradient 를 갖는 영역으로 밀려나게 된다. 이를 방지하기 위해서 dot product 를 $\frac{1}{\sqrt{d_k}}$ 로 scale 해준다.  

Multi-head attention은 모델이 다른 representation 정보를 합칠 수 있게 해준다. 

$MultiHead(Q, K, V) = Concat(head_1, ..., head_h)W^O$

where $head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)$


여기에서 h = 8 개의 parallel attention layers (heads) 를 적용한다. $d_k = d_v = d_{model}/h = 64$

Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality.

In [None]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # we assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)  # p = dropout rate

    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None: 
            # Same mask applied to all h heads. 
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = [
            lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key))
        ]
