### In this demonstration, we will walk through how a BERT encoder and decoder are structured

In [1]:
from transformers import AutoTokenizer
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [50]:
text = "time flies like an arrow"
inputs = tokenizer(text, 
                   return_tensors= "pt", 
                   add_special_tokens = True)
text, inputs.input_ids

('time flies like an arrow',
 tensor([[  101,  2051, 10029,  2066,  2019,  8612,   102]]))

In [51]:
inputs

{'input_ids': tensor([[  101,  2051, 10029,  2066,  2019,  8612,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

### Scaled Dot-Product Attention

![image.png](attachment:image.png)

In [52]:
from torch import nn
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_ckpt) # model_ckpt = 'bert-base-uncased'
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
config.vocab_size, config.hidden_size

(30522, 768)

#### Note:
`nn.Embedding` is a bijective mapping an integer indicating each unique "vocab" into $\mathbf{R}^{\text{hidden size}}$

In [53]:
inputs_embeds = token_emb(inputs.input_ids)
inputs_embeds, inputs_embeds.size() # one sentence with 5 vocabs where each vocab is represented in R^768

(tensor([[[ 0.8298, -1.4387,  0.8415,  ..., -0.1652,  1.8568,  1.0731],
          [-0.9533, -0.5467, -1.3910,  ...,  0.0978, -1.3665, -0.4827],
          [-0.0334,  1.4064,  1.8789,  ...,  0.9220,  0.4580,  0.0293],
          ...,
          [-1.3034, -0.8980, -0.3262,  ...,  0.3359, -1.0844, -0.5058],
          [ 0.1540,  0.3592, -1.4273,  ..., -0.3512, -1.4690,  0.9891],
          [ 0.2375, -0.2011, -1.4063,  ..., -0.4454, -0.0785, -0.3534]]],
        grad_fn=<EmbeddingBackward0>),
 torch.Size([1, 7, 768]))

In [54]:
import torch
from math import sqrt

query = key = value = inputs_embeds
dim_k = key.size(-1) # 768
scores = torch.bmm(query, key.transpose(1,2))/sqrt(dim_k)
scores, scores.size()

(tensor([[[27.9374,  0.5832,  0.1873, -1.2411,  1.1076,  0.4711,  0.7189],
          [ 0.5832, 28.5478, -0.3939, -1.2933,  0.4488, -0.8020,  0.8273],
          [ 0.1873, -0.3939, 28.1409, -0.5918,  2.5694, -0.9842, -2.0848],
          [-1.2411, -1.2933, -0.5918, 29.8803, -0.7456, -0.2377, -0.6547],
          [ 1.1076,  0.4488,  2.5694, -0.7456, 31.4436,  0.6445, -0.1621],
          [ 0.4711, -0.8020, -0.9842, -0.2377,  0.6445, 25.9169, -0.4546],
          [ 0.7189,  0.8273, -2.0848, -0.6547, -0.1621, -0.4546, 26.1233]]],
        grad_fn=<DivBackward0>),
 torch.Size([1, 7, 7]))

In [55]:
import torch.nn.functional as F
weights = F.softmax(scores, dim = -1) 

attn_outputs = torch.bmm(weights, value)
attn_outputs, attn_outputs.size()

(tensor([[[ 0.8298, -1.4387,  0.8415,  ..., -0.1652,  1.8568,  1.0731],
          [-0.9533, -0.5467, -1.3910,  ...,  0.0978, -1.3665, -0.4827],
          [-0.0334,  1.4064,  1.8789,  ...,  0.9220,  0.4580,  0.0293],
          ...,
          [-1.3034, -0.8980, -0.3262,  ...,  0.3359, -1.0844, -0.5058],
          [ 0.1540,  0.3592, -1.4273,  ..., -0.3512, -1.4690,  0.9891],
          [ 0.2375, -0.2011, -1.4063,  ..., -0.4454, -0.0785, -0.3534]]],
        grad_fn=<BmmBackward0>),
 torch.Size([1, 7, 768]))

In [56]:
### a function for scaled-dot product

def scaled_dot_product_attention(query, key, value):
    assert query.size(-1) == key.size(-1)
    assert query.size(-1) == value.size(-1)
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2))/sqrt(dim_k)
    weights = F.softmax(scores, dim  = -1)
    return torch.bmm(weights, value)

### Multi-headed attention

![image.png](attachment:image.png)

#### Part I nn.Linear for V, K and Q => h x scaled dot-product attention

In [57]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__() 
        """
        After each token are embedded, they will go throguh Linear layer to reduced the dimension
        from R^{hidden size} to R^{hidden_size // number of heads}
        When you see the numbers chosen for hidden_size and the number of heads,
        the hidden size is typically a multiple of the number of heads
        """
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim) 
        
    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(                           # h x scaled dot-product attention
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state)   # nn.liear of V, K and Q
        )
        return attn_outputs

#### Part II. concatenate => nn.Linear

In [58]:
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size         # dim_k, q, v = 768
        num_heads = config.num_attention_heads # h           = 6
        head_dim  = embed_dim // num_heads     # each head dimension = 128
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        ) # ModuleList of 6 Attention Heads
        self.output_linear = nn.Linear(embed_dim, embed_dim)
        
    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim = -1) # concatenate outputs from 6 single heads
        x = self.output_linear(x) # go through extra linear layer
        return x
        

In [59]:
multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(inputs_embeds)
attn_output, attn_output.size()

(tensor([[[ 0.0118, -0.2353, -0.1709,  ...,  0.1693, -0.0639,  0.1826],
          [-0.0111, -0.1212, -0.2021,  ...,  0.1086,  0.0108,  0.1257],
          [-0.0168, -0.0826, -0.0624,  ...,  0.0730, -0.0143,  0.2149],
          ...,
          [ 0.0319, -0.1980, -0.1510,  ...,  0.1011, -0.0837,  0.2684],
          [-0.0146, -0.1353, -0.1556,  ...,  0.0692,  0.0315,  0.1220],
          [ 0.0546, -0.1126, -0.2485,  ...,  0.1429, -0.0539,  0.2016]]],
        grad_fn=<ViewBackward0>),
 torch.Size([1, 7, 768]))

### The Feed-Forward Layer

![image.png](attachment:image.png)

In [60]:
config.hidden_size, config.intermediate_size, config.hidden_dropout_prob

(768, 3072, 0.1)

In [61]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.gelu     = nn.GELU()
        self.dropout  = nn.Dropout(config.hidden_dropout_prob)
    
    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

In [62]:
device = torch.device('cpu')

In [63]:
feed_forward = FeedForward(config).to(device)
ff_outputs   = feed_forward(attn_outputs.to(device))
ff_outputs, ff_outputs.size()

(tensor([[[-0.0000, -0.1465,  0.1293,  ..., -0.1408,  0.0333,  0.2117],
          [-0.0000, -0.1654, -0.0141,  ..., -0.1167,  0.1713, -0.2765],
          [ 0.0007,  0.0472,  0.1031,  ...,  0.1707,  0.1554,  0.2026],
          ...,
          [ 0.0141, -0.3168, -0.1804,  ...,  0.2389,  0.0075,  0.4167],
          [-0.0293, -0.1542,  0.1083,  ..., -0.2492,  0.3071,  0.1469],
          [ 0.1234, -0.0876, -0.0932,  ..., -0.1141, -0.3408, -0.0000]]],
        grad_fn=<MulBackward0>),
 torch.Size([1, 7, 768]))

In [64]:
ff_outputs

tensor([[[-0.0000, -0.1465,  0.1293,  ..., -0.1408,  0.0333,  0.2117],
         [-0.0000, -0.1654, -0.0141,  ..., -0.1167,  0.1713, -0.2765],
         [ 0.0007,  0.0472,  0.1031,  ...,  0.1707,  0.1554,  0.2026],
         ...,
         [ 0.0141, -0.3168, -0.1804,  ...,  0.2389,  0.0075,  0.4167],
         [-0.0293, -0.1542,  0.1083,  ..., -0.2492,  0.3071,  0.1469],
         [ 0.1234, -0.0876, -0.0932,  ..., -0.1141, -0.3408, -0.0000]]],
       grad_fn=<MulBackward0>)

### Layer Normalization (PreLayer)

PreLayer Normalization is different from the one introduced in the paper "Attention is all you need"

In [65]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, conifg):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention    = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)
    
    def forward(self, x):
        # Apply layer normalization and then copy input into q, k, v
        hidden_state = self.layer_norm_1(x)
        # Apply attention with a skip connection
        x = x + self.attention(hidden_state)
        # Apply feed-forward layer with a skip connection
        x = self.layer_norm_2(x)
        x = self.feed_forward(x)
        return x

In [66]:
encoder_layer = TransformerEncoderLayer(config).to(device)

In [67]:
encoded_input = encoder_layer(inputs_embeds.to(device))

In [68]:
encoded_input

tensor([[[ 0.0247, -0.0098, -0.0626,  ...,  0.0043,  0.3185,  0.0000],
         [-0.1243,  0.2204,  0.3312,  ...,  0.4814,  0.0472,  0.3123],
         [-0.0678, -0.2136, -0.1762,  ..., -0.0580,  0.1348,  0.2905],
         ...,
         [ 0.0887, -0.1957, -0.1547,  ...,  0.3081,  0.2865, -0.2823],
         [ 0.1967, -0.0968, -0.2297,  ...,  0.0259,  0.2965, -0.2234],
         [ 0.1779,  0.1446,  0.0771,  ...,  0.1336,  0.2598, -0.2073]]],
       grad_fn=<MulBackward0>)

### Appendix I: Positional Embeddings

![image.png](attachment:image.png)

Note that we are applying the PreLayer Normalization, which first normalizes embedded sequence before handing them into the multi-headed attention layers.

In [69]:
config.max_position_embeddings

512

In [70]:
torch.arange(5, dtype = torch.long).unsqueeze(0)

tensor([[0, 1, 2, 3, 4]])

In [71]:
class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size,
                                            config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                               config.hidden_size) 
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps = 1e-12 ) #default eps = 1e-5
        self.dropout = nn.Dropout()
    
    def forward(self, input_ids):
        # Create position IDs for input sequence
        seq_length = input_ids.size(1) # adding pads to batch embedded inputs, they all have the same seq_length
        position_ids = torch.arange(seq_length, dtype = torch.long).unsqueeze(0)
        # Create token and position embeddings
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        # Combine token and position embeddings
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
        

In [72]:
embedding_layer = Embeddings(config)
embedding_layer(inputs.input_ids.to(torch.device('cpu')))

tensor([[[-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.5153,  1.7909],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000],
         [ 0.0000,  0.0000, -0.0000,  ..., -0.0000,  2.0999, -1.7728],
         ...,
         [-0.0000, -0.0000, -0.0000,  ..., -1.3130, -0.8731,  0.0000],
         [-0.0000, -1.6717, -0.0000,  ..., -1.4543,  0.0000, -0.0000],
         [-2.1617, -3.0126,  3.0466,  ..., -0.0000,  0.0000,  0.0000]]],
       grad_fn=<MulBackward0>)

### Putting all together

In [73]:
config.num_hidden_layers

12

In [74]:
class TransformerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        
    def forward(self, x):
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x)
        return x

In [75]:
encoder = TransformerEncoder(config)
encoder(inputs.input_ids)

tensor([[[ 0.2823, -0.0174,  0.1372,  ..., -0.1051, -0.1766,  0.3754],
         [ 0.1859,  0.0237,  0.0733,  ..., -0.1469, -0.0533,  0.3927],
         [ 0.2076, -0.0299,  0.0288,  ..., -0.0551, -0.1325,  0.0000],
         ...,
         [ 0.2470, -0.0254,  0.1746,  ..., -0.1589, -0.1601,  0.3860],
         [ 0.2324,  0.0172,  0.0837,  ..., -0.0216, -0.1657,  0.3680],
         [ 0.3542, -0.0452,  0.0000,  ..., -0.1646, -0.1627,  0.3050]]],
       grad_fn=<MulBackward0>)

In [106]:
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

### What is it all like when using Pytorch's TransformerEncoder?

In [130]:
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch import nn


encoder_layer = nn.TransformerEncoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.num_hidden_layers)
embeddings = Embeddings(config)
src = embeddings(inputs.input_ids)
out = transformer_encoder(src)

In [131]:
out.size()

torch.Size([1, 7, 768])

### How to implement Decoder Layer

In [139]:
def scaled_dot_product_attention(query, key, value, mask = None):
    assert query.size(-1) == key.size(-1)
    assert query.size(-1) == value.size(-1)
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2))/sqrt(dim_k)
    if mask is not None:
        scores = scores.masked_fill(mask== 0, float("-inf"))
    weights = F.softmax(scores, dim  = -1)
    return torch.bmm(weights, value)