In [28]:
!pip install -r requirements.txt

Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [29]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


# Suppress all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

In [30]:
# Some convenience helper functions used throughout the notebook

def is_interactive_notebook():
    """Return True when this code is executing as the top-level ``__main__`` module."""
    running_as_main = "__main__" == __name__
    return running_as_main

def show_example(fn, args=[]):
    """Call ``fn(*args)`` and return its result, but only when examples are enabled.

    Examples are enabled when the code runs as ``__main__`` and the module-level
    ``RUN_EXAMPLES`` flag is truthy; otherwise nothing is called and None is returned.

    Args:
        fn: callable to invoke.
        args: positional arguments unpacked into ``fn`` (never mutated here).
    """
    if __name__ != "__main__" or not RUN_EXAMPLES:
        return None
    return fn(*args)

def execute_example(fn, args=[]):
    """Call ``fn(*args)`` for its side effects when examples are enabled.

    Same gating as ``show_example`` (``__main__`` and ``RUN_EXAMPLES``), but the
    result of ``fn`` is discarded and None is always returned.

    Args:
        fn: callable to invoke.
        args: positional arguments unpacked into ``fn`` (never mutated here).
    """
    enabled = __name__ == "__main__" and RUN_EXAMPLES
    if not enabled:
        return
    fn(*args)

class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing."""

    def __init__(self):
        # The base-class constructor is not invoked; only ``param_groups`` is
        # populated, with a single group whose learning rate is 0.
        self.param_groups = [dict(lr=0)]

    def step(self):
        """No-op; returns None."""
        return None

    def zero_grad(self, set_to_none=False):
        """No-op; ``set_to_none`` is accepted and ignored."""
        return None

class DummyScheduler:
    """Learning-rate scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """No-op; returns None."""
        return None


# Part 1: Model Architecture

In [31]:
class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder container that wires five components together.

    Args:
        encoder: called as ``encoder(src_embed(src), src_mask)``.
        decoder: called as ``decoder(tgt_embed(tgt), memory, src_mask, tgt_mask)``.
        src_embed: applied to ``src`` before encoding.
        tgt_embed: applied to ``tgt`` before decoding.
        generator: stored on the model; it is not called by ``forward``.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        # Assignment order is kept as-is because it fixes the order of
        # ``parameters()`` for registered sub-modules.
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src``, then decode ``tgt`` against the encoder output."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run it through the decoder together with ``memory``."""
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

In [32]:
class Generator(nn.Module):
    """Linear projection to vocabulary size followed by a log-softmax.

    Args:
        d_model: size of the last dimension of the input.
        vocab: size of the last dimension of the output.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities over the last dimension of ``proj(x)``."""
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)

## Encoder and Decoder Stacks

#### Encoder 

The encoder is composed of a stack of N=6 identical layers. 

In [33]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)

In [34]:
class Encoder(nn.Module):
    """Stack of ``N`` cloned layers followed by a final ``LayerNorm``.

    Args:
        layer: prototype layer; it is deep-copied ``N`` times via ``clones`` and
            must expose a ``size`` attribute used to size the final norm.
        N: number of copies in the stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` through every layer in order, then normalize the result.

        Args:
            x: input passed to the first layer.
            mask: forwarded unchanged to each layer as its second argument.
        """
        hidden = x
        for block in self.layers:
            # Calling the module dispatches to that layer's ``forward``.
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [35]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learned scale and shift.

    Args:
        features: length of the last dimension of the input; sizes both parameters.
        eps: small constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # Trainable scale (starts at ones) and shift (starts at zeros).
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Return ``a_2 * (x - mean) / (std + eps) + b_2``.

        The mean and standard deviation are taken along the last dimension
        (``-1``) of ``x`` with ``keepdim=True`` so they broadcast back against
        ``x``. ``x.std`` is called with its default arguments.
        """
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        scaled = self.a_2 * (x - mu)
        return scaled / (sigma + self.eps) + self.b_2

In [36]:
class SublayerConnection(nn.Module):
    """
    Residual connection around a sublayer, with layer norm applied first.

    The norm is applied to the sublayer's input (not to the sum), and dropout
    is applied to the sublayer's output before it is added back to ``x``.

    Args:
        size: feature size passed to ``LayerNorm``.
        dropout: dropout probability.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``.

        Args:
            x: input tensor.
            sublayer: callable taking the normalized input; its output must be
                addable to ``x``.
        """
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

In [37]:
# One layer of the encoder stack.
class EncoderLayer(nn.Module):
    """Encoder layer: a self-attention sublayer followed by a feed-forward sublayer.

    Args:
        size: feature size, stored as ``self.size`` and passed to ``SublayerConnection``.
        self_attn: attention module called as ``self_attn(x, x, x, mask)``.
        feed_forward: module called as ``feed_forward(x)``.
        dropout: dropout probability for both sublayer connections.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # Two residual wrappers: index 0 for attention, index 1 for feed-forward.
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply self-attention, then the feed-forward network, each inside its
        own ``SublayerConnection``."""

        def attend(normed):
            # Query, key and value are all the same tensor.
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)

#### Decoder 

Decoder is also composed of a stack of N = 6 identical layers. 

In [38]:
class Decoder(nn.Module):
    """Stack of ``N`` cloned decoder layers followed by a final ``LayerNorm``.

    Args:
        layer: prototype layer; deep-copied ``N`` times via ``clones`` and must
            expose a ``size`` attribute used to size the final norm.
        N: number of copies in the stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Feed ``x`` through every layer in order, then normalize the result.

        ``memory``, ``src_mask`` and ``tgt_mask`` are forwarded unchanged to
        each layer.
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [39]:
# One layer of the decoder stack.
class DecoderLayer(nn.Module):
    """Decoder layer: self-attention, attention over ``memory``, then feed-forward.

    Args:
        size: feature size, stored as ``self.size`` and passed to ``SublayerConnection``.
        self_attn: attention module called as ``self_attn(x, x, x, tgt_mask)``.
        src_attn: attention module called as ``src_attn(x, memory, memory, src_mask)``.
        feed_forward: module called as ``feed_forward(x)``.
        dropout: dropout probability for all three sublayer connections.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # Three residual wrappers, one per sublayer, used in order 0, 1, 2.
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run the three sublayers in sequence, each inside its own
        ``SublayerConnection``."""

        def attend_to_self(normed):
            # Query, key and value are all the decoder-side tensor.
            return self.self_attn(normed, normed, normed, tgt_mask)

        def attend_to_memory(normed):
            # Queries come from the decoder; keys and values are ``memory``.
            return self.src_attn(normed, memory, memory, src_mask)

        hidden = self.sublayer[0](x, attend_to_self)
        hidden = self.sublayer[1](hidden, attend_to_memory)
        return self.sublayer[2](hidden, self.feed_forward)

In [40]:
# Returns a boolean tensor of shape (1, size, size).
def subsequent_mask(size):
    """Build a mask that hides subsequent positions.

    Entry ``[0, i, j]`` is True when ``j <= i`` (the position itself and
    everything before it) and False when ``j > i``.
    """
    shape = (1, size, size)
    # torch.triu with diagonal=1 keeps only the entries strictly above the main
    # diagonal (the main diagonal itself is zeroed); the result is cast to uint8.
    strictly_upper = torch.ones(shape).triu(diagonal=1).to(torch.uint8)
    # Invert: True wherever the strictly-upper triangle is zero.
    return strictly_upper.eq(0)

In [41]:
def example_mask():
    """Build an Altair heatmap of ``subsequent_mask(20)``.

    Each cell of the 20x20 mask becomes one row of a DataFrame with columns
    "Subsequent Mask" (the mask value), "Window" (column index) and "Masking"
    (row index); the rows are then drawn as a rectangle chart.

    Returns:
        The interactive Altair chart.
    """
    size = 20
    # Build the mask once; it does not change between cells.
    mask = subsequent_mask(size)[0]

    LS_data = pd.concat(
        [
            pd.DataFrame(
                {
                    # flatten() turns the single mask entry into a 1-D, one-element tensor.
                    "Subsequent Mask": mask[x, y].flatten(),
                    "Window": y,
                    "Masking": x,
                }
            )
            for y in range(size)
            for x in range(size)
        ]
    )

    return (
        alt.Chart(LS_data)
        .mark_rect()
        .properties(height=250, width=250)
        .encode(
            alt.X("Window:O"),
            alt.Y("Masking:O"),
            alt.Color("Subsequent Mask:Q", scale=alt.Scale(scheme="viridis")),
        )
        .interactive()
    )

# Render the mask chart; show_example only calls the function when run as __main__ with RUN_EXAMPLES set.
show_example(example_mask)

#### Attention 

An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.

We call this particular attention "Scaled Dot-Product Attention". The input consists of queries and keys of dimension $d_k$, and values of dimension $d_v$. We compute the dot products of the query with all keys, divide each by $\sqrt{d_k}$, and apply a softmax function to obtain the weights on the values.


In practice, we compute the attention function on a set of queries simultaneously, packed together into matrix Q. The keys and values are also packed together into matrices K and V. We compute the matrix of output as: 

$Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V$

In [42]:
def attention(query, key, value, mask=None, dropout=None):
    """Compute scaled dot-product attention.

    Args:
        query, key, value: tensors whose last two dimensions take part in the
            matrix products; the size of ``query``'s last dimension is d_k.
            (The notebook's multi-head caller passes
            (batch, heads, seq_len, d_k) tensors.)
        mask: optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` are excluded from attention.
        dropout: optional callable applied to the attention probabilities.

    Returns:
        A tuple ``(output, p_attn)``: the probability-weighted sum of ``value``
        and the attention probabilities themselves.
    """
    scale = math.sqrt(query.size(-1))
    # Swap the last two dimensions of ``key`` so that query @ key^T is defined.
    key_t = key.transpose(-2, -1)
    scores = torch.matmul(query, key_t) / scale

    if mask is not None:
        # A very large negative score makes softmax assign ~0 probability there.
        scores = scores.masked_fill(mask == 0, -1e9)

    # Normalize over the last dimension (the key positions).
    weights = scores.softmax(dim=-1)

    if dropout is not None:
        weights = dropout(weights)

    weighted_values = torch.matmul(weights, value)
    return weighted_values, weights

The two most commonly used attention functions are additive attention and dot-product (multiplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of $\frac{1}{\sqrt{d_k}}$. Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code.

$d_k$ 가 작을 때는 두 방법의 성능이 비슷하지만, $d_k$ 가 클 때는 additive attention 이 scale 하지 않은 dot-product attention 보다 성능이 잘 나온다. $d_k$ 가 클 때, dot product 의 규모가 커지고, softmax function을 아주 작은 gradient 를 갖게 된다. 이를 방지하기 위해서 dot product를 $\frac{1}{\sqrt{d_k}}$ 로 scale 해준다.  

Multi-head attention은 모델이 다른 representation 정보를 합칠 수 있게 해준다. 

$MultiHead(Q, K, V) = Concat(head_1, ..., head_h)W^O$

where $head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)$


여기에서 h = 8 개의 parallel attention layers (heads) 를 적용한다. $d_k = d_v = d_{model}/h = 64$

Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality.

In [43]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from four ``d_model -> d_model`` linear layers."""

    def __init__(self, h, d_model, dropout=0.1):
        """Take in the number of heads and the model size.

        Args:
            h: number of heads; must divide ``d_model`` evenly.
            d_model: model feature size.
            dropout: dropout probability applied to the attention probabilities.
        """
        super().__init__()
        assert d_model % h == 0
        # Per-head size; d_v is assumed to always equal d_k.
        self.d_k = d_model // h
        self.h = h
        # Four projections: indices 0-2 are applied to query, key and value;
        # the last one is the output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # Attention probabilities from the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, then merge the heads and project.

        Args:
            query, key, value: tensors whose first dimension is the batch size
                and whose last dimension is ``d_model``.
            mask: optional mask; a dimension is inserted at position 1 so the
                same mask broadcasts over all heads.

        Returns:
            Output of the final linear layer, with last dimension ``h * d_k``.
        """
        if mask is not None:
            mask = mask.unsqueeze(1)

        nbatches = query.size(0)

        def split_heads(projection, tensor):
            # d_model -> (h, d_k), then move the head axis in front of the
            # sequence axis: (nbatches, h, seq_len, d_k).
            projected = projection(tensor)
            return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        # 1) Linear projections, one per input, in query/key/value order.
        q_heads = split_heads(self.linears[0], query)
        k_heads = split_heads(self.linears[1], key)
        v_heads = split_heads(self.linears[2], value)

        # 2) Attention over all heads at once; the probabilities are kept on self.attn.
        context, self.attn = attention(
            q_heads, k_heads, v_heads, mask=mask, dropout=self.dropout
        )

        # 3) Undo the transpose, make the memory contiguous so view() is legal,
        #    and merge the heads back into one feature axis of size h * d_k.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(nbatches, -1, self.h * self.d_k)

        return self.linears[-1](merged)


#### Applications of Attention in our Model 

1) In “encoder-decoder attention” layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models.

2) The encoder contains self-attention layers. In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder.

3) Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to −∞) all values in the input of the softmax which correspond to illegal connections.


## Position-wise Feed-Forward Networks

In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. This consists of two linear transformations with a ReLU activation in between.

$FFN(x) = max(0, xW_1 + b_1)W_2 + b_2$

While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. The dimensionality of input and output is $d_{model}=512$, and the inner-layer has dimensionality $d_{ff} = 2048$ 


In [44]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: ``w_2(dropout(relu(w_1(x))))``.

    Args:
        d_model: size of the last dimension of the input and of the output.
        d_ff: size of the hidden layer between the two linear maps.
        dropout: dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)   # input -> hidden
        self.w_2 = nn.Linear(d_ff, d_model)   # hidden -> output
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the network to ``x``; only the last dimension is transformed."""
        hidden = self.w_1(x).relu()
        # Dropout acts on the hidden activations, before the second linear map.
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

## Embeddings and Softmax 

Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension $d_{model}$. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation. In the embedding layers, we multiply those weights by $\sqrt{d_{model}}$. 



In [51]:
class Embeddings(nn.Module):
    """Embedding lookup table whose output is scaled by ``sqrt(d_model)``.

    Args:
        d_model: size of each embedding vector.
        vocab: number of rows in the lookup table.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the indices in ``x`` and scale the vectors.

        The result has the shape of ``x`` with a trailing ``d_model`` axis appended.
        """
        scale = math.sqrt(self.d_model)
        vectors = self.lut(x)
        return vectors * scale

## Positional Encoding 


Since our model contains no recurrence and no convolution, in order for the model to make use of **the order of the sequence**, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the bottoms of the encoder and decoder stacks. The positional encodings have the same dimension $d_{model}$ as the embeddings, so that the two can be summed. There are many choices of positional encodings, learned and fixed. 

In this work, we use sine and cosine functions of different frequencies: 

$PE_{(pos, 2i)} = \sin(pos/10000^{2i/d_{model}})$

$PE_{(pos, 2i+1)} = \cos(pos/10000^{2i/d_{model}})$

where $pos$ is the position and $i$ is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from $2\pi$ to $10000 \cdot 2\pi$. We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset $k$, $PE_{pos+k}$ can be represented as a linear function of $PE_{pos}$.

In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of $P_{drop}=0.1$.




In [46]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input, then apply dropout.

    Args:
        d_model: size of the last dimension of the input.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Positions 0 .. max_len - 1 as a column of shape (max_len, 1).
        positions = torch.arange(0, max_len).unsqueeze(1)
        # 1 / 10000^(2i / d_model) for every even index 2i, computed in log space.
        inv_freq = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = positions * inv_freq

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)  # even feature indices
        table[:, 1::2] = torch.cos(angles)  # odd feature indices

        # Stored as a buffer named "pe" with shape (1, max_len, d_model).
        self.register_buffer("pe", table.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``.

        The encodings are marked as not requiring gradients.
        """
        seq_len = x.size(1)
        encoding = self.pe[:, :seq_len].requires_grad_(False)
        return self.dropout(x + encoding)

In [47]:
def example_positional():
    """Plot four dimensions (4-7) of the positional encoding over 100 positions.

    A ``PositionalEncoding(20, 0)`` is applied to an all-zero (1, 100, 20)
    input, so the plotted values are the encodings themselves.

    Returns:
        The interactive Altair line chart.
    """
    pe = PositionalEncoding(20, 0)
    y = pe.forward(torch.zeros(1, 100, 20))

    # One frame per plotted dimension, later stacked into a single table.
    frames = []
    for dim in (4, 5, 6, 7):
        frames.append(
            pd.DataFrame(
                {
                    "embedding": y[0, :, dim],
                    "dimension": dim,
                    "position": list(range(100)),
                }
            )
        )
    data = pd.concat(frames)

    chart = alt.Chart(data).mark_line().properties(width=800)
    chart = chart.encode(x="position", y="embedding", color="dimension:N")
    return chart.interactive()

# Render the positional-encoding chart; show_example only calls the function when run as __main__ with RUN_EXAMPLES set.
show_example(example_positional)


We also experimented with using learned positional embeddings instead, and found that the two versions produced nearly identical results. We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.

## Full Model

Here we define a function from hyperparameters to a full model. 

In [48]:

def make_model(
        src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    """Helper: construct a full encoder-decoder model from hyperparameters.

    Args:
        src_vocab: size of the source vocabulary (rows of the source embedding).
        tgt_vocab: size of the target vocabulary (rows of the target embedding
            and output size of the generator).
        N: number of layers in both the encoder and the decoder.
        d_model: model feature size used by embeddings, attention and
            feed-forward layers.
        d_ff: hidden size of the feed-forward layers.
        h: number of attention heads.
        dropout: dropout probability used throughout the model, including the
            attention layers.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension have
        been Xavier-uniform initialized.
    """
    c = copy.deepcopy
    # Pass ``dropout`` through explicitly; otherwise the attention layers would
    # silently keep their own default rate regardless of the caller's setting.
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # Each component receives its own deep copy of the prototypes so that no
    # parameters are shared between them.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),  # source embedding + positions
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),  # target embedding + positions
        Generator(d_model, tgt_vocab),
    )

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg; 1-D parameters (biases,
    # layer-norm scale and shift) keep their default initialization.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

## Inference

Here we make a forward step to generate a prediction of the model. We try to use our transformer to memorize the input. As you will see the output is randomly generated due to the fact that the model is not trained yet. In the next tutorial we will build the training function and try to train our model to memorize the numbers from 1 to 10. 

In [53]:
def inference_test():
    """Greedily decode nine tokens from a freshly built, untrained model and print them.

    A small model (vocabulary 11 on both sides, 2 layers) is put in eval mode,
    the fixed source sequence 1..10 is encoded once, and the target sequence is
    grown one token at a time starting from a single 0.
    """
    test_model = make_model(11, 11, 2)
    test_model.eval()

    src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    src_mask = torch.ones(1, 1, 10)  # every source position is visible

    memory = test_model.encode(src, src_mask)
    # Target so far: shape (1, 1), same dtype as src, holding a single 0.
    ys = torch.zeros(1, 1).type_as(src)

    for _ in range(9):
        # Mask sized to the current target length, cast to src's dtype.
        step_mask = subsequent_mask(ys.size(1)).type_as(src.data)
        out = test_model.decode(memory, src_mask, ys, step_mask)
        # Score only the last position and take the highest-scoring index.
        prob = test_model.generator(out[:, -1])
        best = torch.max(prob, dim=1)[1]
        next_word = best.data[0]
        appended = torch.empty(1, 1).type_as(src.data).fill_(next_word)
        ys = torch.cat([ys, appended], dim=1)

    print("Example Untrained Model Prediction:", ys)


def run_tests():
    """Run ``inference_test`` ten times; each run prints one predicted sequence."""
    remaining = 10
    while remaining > 0:
        inference_test()
        remaining -= 1


# Run the untrained-model demo; show_example only calls the function when run as __main__ with RUN_EXAMPLES set.
show_example(run_tests)

Example Untrained Model Prediction: tensor([[0, 9, 3, 9, 3, 9, 3, 9, 3, 6]])
Example Untrained Model Prediction: tensor([[ 0, 10,  2,  8,  7, 10, 10,  8,  7, 10]])
Example Untrained Model Prediction: tensor([[0, 5, 1, 6, 3, 0, 5, 5, 1, 6]])
Example Untrained Model Prediction: tensor([[0, 6, 2, 2, 5, 2, 5, 2, 7, 6]])
Example Untrained Model Prediction: tensor([[0, 8, 8, 8, 4, 8, 4, 8, 3, 6]])
Example Untrained Model Prediction: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Example Untrained Model Prediction: tensor([[0, 8, 7, 3, 7, 0, 8, 7, 3, 7]])
Example Untrained Model Prediction: tensor([[0, 8, 0, 8, 0, 0, 0, 0, 0, 0]])
Example Untrained Model Prediction: tensor([[0, 6, 2, 2, 2, 2, 2, 2, 2, 2]])
Example Untrained Model Prediction: tensor([[ 0,  7,  2,  5,  2, 10,  7,  1,  5,  1]])


# Part 2: Model Training 

This section describes the training regime for our models. 

We stop for a quick interlude to introduce some of the tools needed to train a standard encoder-decoder model. First we define a batch object that holds the src and target sentences for training, as well as constructing the masks.

## Batches and Masking 

In [None]:
class Batch:
    """Object for holding a batch of data with mask during training.

    Attributes set for every batch:
        src: the source tensor as given.
        src_mask: True where ``src != pad``, with an extra axis inserted at -2.

    Attributes set only when ``tgt`` is given:
        tgt: ``tgt`` without its last column (intended as the decoder input).
        tgt_y: ``tgt`` without its first column, i.e. ``tgt`` shifted left by
            one (intended as the prediction targets).
        tgt_mask: result of ``make_std_mask(self.tgt, pad)``.
        ntokens: count of entries in ``tgt_y`` that differ from ``pad``.
    """

    def __init__(self, src, tgt=None, pad=2):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if tgt is None:
            return
        self.tgt = tgt[:, :-1]
        self.tgt_y = tgt[:, 1:]
        self.tgt_mask = self.make_std_mask(self.tgt, pad)
        self.ntokens = (self.tgt_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """Create a mask to hide padding and future words.

        The padding mask (True where ``tgt != pad``) is AND-ed with
        ``subsequent_mask`` so each position can only see non-padding
        positions at or before itself.
        """
        not_pad = (tgt != pad).unsqueeze(-2)
        # subsequent_mask is True on and below the diagonal, False above it.
        no_peek = subsequent_mask(tgt.size(-1)).type_as(not_pad.data)
        return not_pad & no_peek
