# 0. Install dependencies

In [3]:
!pip install bertviz

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting boto3 (from bertviz)
  Downloading boto3-1.35.44-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.36.0,>=1.35.44 (from boto3->bertviz)
  Downloading botocore-1.35.44-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->bertviz)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3->bertviz)
  Downloading s3transfer-0.10.3-py3-none-any.whl.metadata (1.7 kB)
Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading boto3-1.35.44-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.35.44-py3-none-any.whl (12.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# 0-1. import modules

In [4]:
from transformers import AutoTokenizer
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

# 1. Encoder

## 1-1. Scaled Dot-Product Attention

### Attention visualization

In [5]:
# Load the tokenizer and bertviz's neuron-view BertModel from the same checkpoint.
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BertModel.from_pretrained(model_ckpt)
text1 = 'time flies like an arrow'
# Render the bertviz neuron view of text1 for layer 0, head 8.
show(model, 'bert', tokenizer, text1, display_mode='light', layer=0, head=8)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

100%|██████████| 433/433 [00:00<00:00, 292453.08B/s]
100%|██████████| 440473133/440473133 [00:13<00:00, 33021709.41B/s]
  state_dict = torch.load(resolved_archive_file, map_location='cpu')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
# Same view (layer 0, head 8) for a sentence where 'flies' has a different
# meaning, to compare against text1.
text2 = 'fruit flies like an apple'
show(model, 'bert', tokenizer, text2, display_mode='light', layer=0, head=8)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### tokenize

In [7]:
inputs = tokenizer(text1, return_tensors='pt', add_special_tokens=False)  # exclude [CLS] and [SEP]
inputs.input_ids  # one vocabulary id per token

tensor([[ 2051, 10029,  2066,  2019,  8612]])

### dense embedding -> token embedding is still context-independent

In [8]:
from torch import nn
from transformers import AutoConfig
# Load the config.json that belongs to the bert-base-uncased checkpoint:
#   - it supplies the hyperparameters used below (vocab_size, hidden_size)
#   - it also carries further metadata about the model

config = AutoConfig.from_pretrained(model_ckpt)
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)  # lookup table: token id -> vector
token_emb

Embedding(30522, 768)

In [9]:
# Look up the embedding vector of every token id in the input.
inputs_embeds = token_emb(inputs.input_ids)
inputs_embeds.size()  # [batch_size, seq_len, hidden_dim]

torch.Size([1, 5, 768])

### calculate attention score

In [10]:
import torch
from math import sqrt

# Self-attention: query, key and value are all the same token embeddings.
query = key = value = inputs_embeds
dim_k = key.size(-1)
# Batched matrix product (the batch dimension is carried along):
#   key.transpose(1, 2) -> [batch_size, hidden_dim, seq_len]
#   bmm with query      -> [batch_size, seq_len, seq_len]

scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)  # scale by sqrt(dim_k)
scores.size()

torch.Size([1, 5, 5])

### apply softmax

In [11]:
import torch.nn.functional as F

# Turn the scores into attention weights along the last (key) dimension.
weights = F.softmax(scores, dim=-1)
print(weights.size())
# Each row of weights should sum to 1.
weights.sum(dim=-1)

torch.Size([1, 5, 5])


tensor([[1., 1., 1., 1., 1.]], grad_fn=<SumBackward1>)

In [12]:
# Weighted sum of the value vectors, per batch element:
# [5, 5] @ [5, 768] -> [5, 768] (768 = hidden_dim)
attn_outputs = torch.bmm(weights, value)
attn_outputs.shape

torch.Size([1, 5, 768])

### Wrap it in a function

In [13]:
def scale_dot_product_attention_for_encoder(query, key, value, mask=None):
    """Compute scaled dot-product attention.

    Args:
        query: tensor of shape [batch_size, q_len, dim_k].
        key: tensor of shape [batch_size, k_len, dim_k].
        value: tensor of shape [batch_size, k_len, dim_v].
        mask: optional tensor broadcastable to [batch_size, q_len, k_len].
            Positions where ``mask == 0`` are excluded from attention.
            With ``None`` (the default) every position is attended to.

    Returns:
        Tensor of shape [batch_size, q_len, dim_v]: for each query position,
        the softmax-weighted sum of the value vectors.
    """
    dim_k = key.size(-1)
    # Dividing by sqrt(dim_k) keeps the dot products in a range where the
    # softmax does not saturate.
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    if mask is not None:
        # -inf scores become exactly 0 after the softmax.
        scores = scores.masked_fill(mask == 0, float('-inf'))
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

## 1-2. Multi-Head Attention

### class declaration

In [14]:
class AttentionHead(nn.Module):
    """A single self-attention head for the encoder.

    The input is projected to queries, keys and values by three separate
    linear layers and then passed through scaled dot-product attention.
    """

    def __init__(self, embed_dim, head_dim):
        super().__init__()

        # Each projection maps [batch_size, seq_len, embed_dim]
        # to [batch_size, seq_len, head_dim].
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        """Attend over hidden_state, using it as query, key and value source."""
        queries = self.q(hidden_state)
        keys = self.k(hidden_state)
        values = self.v(hidden_state)
        return scale_dot_product_attention_for_encoder(queries, keys, values)

In [15]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, followed by a linear mix.

    The hidden size is split evenly across the heads; their outputs are
    concatenated back to the hidden size and passed through one more
    linear layer of the same width.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size  # 768 for bert-base-uncased
        head_count = config.num_attention_heads  # 12
        per_head_size = hidden_size // head_count  # 64
        head_modules = []
        for _ in range(head_count):
            head_modules.append(AttentionHead(hidden_size, per_head_size))
        self.heads = nn.ModuleList(head_modules)
        # Same-width projection applied after concatenation: it adds learnable
        # weights on top of the attention results, so the model can re-weight
        # and combine what the individual heads produced without changing
        # the dimension.
        self.output_linear = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_state):
        """Return a tensor of shape [batch_size, seq_len, embed_dim]."""
        per_head_outputs = [head(hidden_state) for head in self.heads]
        concatenated = torch.cat(per_head_outputs, dim=-1)
        return self.output_linear(concatenated)

### initializing & test

In [16]:
# Smoke test: multi-head attention keeps the shape of its input.
multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(inputs_embeds)
attn_output.size()

torch.Size([1, 5, 768])

### visualization

In [18]:
from bertviz import head_view
from transformers import AutoModel

# NOTE(review): this rebinds `model`, which earlier held bertviz's neuron-view
# BertModel; re-running the earlier `show(model, ...)` cells after this one
# would then receive this AutoModel instead — confirm that is acceptable.
model = AutoModel.from_pretrained(model_ckpt, output_attentions=True)

sentence_a = "time flies like an arrow"
sentence_b = "fruit flies like an apple"
# Encode both sentences as one pair and collect the attention outputs.
viz_inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt')
attention = model(**viz_inputs).attentions
# Number of positions with token_type_id 0, used as the index where sentence B starts.
sentence_b_start = (viz_inputs.token_type_ids == 0).sum(dim=1)
tokens = tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0])

# Show only head 8.
head_view(attention, tokens, sentence_b_start, heads=[8])

<IPython.core.display.Javascript object>

## 1-3. Feed-Forward Layer

In [19]:
class FeedForward(nn.Module):
    """Feed-forward block: Linear -> GELU -> Linear -> Dropout.

    The first linear layer widens hidden_size to intermediate_size, the
    second one maps back to hidden_size, so the output shape equals the
    input shape.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        inner_size = config.intermediate_size
        self.linear_1 = nn.Linear(hidden_size, inner_size)
        self.linear_2 = nn.Linear(inner_size, hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        """Apply the block to the last dimension of x."""
        activated = self.gelu(self.linear_1(x))
        projected = self.linear_2(activated)
        return self.dropout(projected)

In [20]:
# Smoke test: feed the multi-head attention result (`attn_output`) through the
# feed-forward block; the shape must be preserved.
# The earlier `attn_outputs` (plural) is the raw single-head result computed
# by hand and is not the output of the multi-head layer.
feed_forward = FeedForward(config)
ff_outputs = feed_forward(attn_output)
ff_outputs.size()

torch.Size([1, 5, 768])

## 1-4. Norm (pre-layer normalization)

In [21]:
class TransformerEncoderLayer(nn.Module):
    """One encoder layer with pre-layer normalization.

    Layer norm is applied to the input of each sub-layer (attention, then
    feed-forward); each sub-layer's output is added back to its
    un-normalized input through a skip connection.
    """

    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        """Return a tensor with the same shape as x."""
        # Attention sub-layer: normalize first, then add the result to the input.
        attended = self.attention(self.layer_norm_1(x))
        x = x + attended
        # Feed-forward sub-layer, same normalize-then-add pattern.
        transformed = self.feed_forward(self.layer_norm_2(x))
        return x + transformed

In [22]:
# Smoke test: the encoder layer's output has the same shape as its input.
encoder_layer = TransformerEncoderLayer(config)
inputs_embeds.shape, encoder_layer(inputs_embeds).size()

(torch.Size([1, 5, 768]), torch.Size([1, 5, 768]))

## 1-5. position embedding

In [23]:
class Embeddings(nn.Module):
    """Token embeddings plus learned position embeddings.

    The two embeddings are summed element-wise, layer-normalized and passed
    through dropout.

    Args:
        config: object providing ``vocab_size``, ``hidden_size``,
            ``max_position_embeddings`` and ``hidden_dropout_prob``.
            ``layer_norm_eps`` is used when present, otherwise 1e-12.
    """

    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layer_norm = nn.LayerNorm(
            config.hidden_size, eps=getattr(config, 'layer_norm_eps', 1e-12)
        )
        # Use the configured dropout rate, as FeedForward does; a bare
        # nn.Dropout() would silently fall back to p=0.5.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids):
        """Embed token ids.

        Args:
            input_ids: integer tensor of shape [batch_size, seq_len].

        Returns:
            Tensor of shape [batch_size, seq_len, hidden_size].
        """
        # Position ids 0..seq_len-1, created on the same device as the input
        # so the lookup also works when the module lives on a GPU.
        seq_length = input_ids.size(1)
        position_ids = torch.arange(
            seq_length, dtype=torch.long, device=input_ids.device
        ).unsqueeze(0)
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        # Element-wise sum (not a concatenation); the position embeddings
        # broadcast over the batch dimension.
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

# Smoke test: token ids [batch_size, seq_len] -> embeddings [batch_size, seq_len, hidden_dim].
embedding_layer = Embeddings(config)
embedding_layer(inputs.input_ids).size()

torch.Size([1, 5, 768])

## 1-6. Transformer Encoder

In [24]:
class TransformerEncoder(nn.Module):
    """Embedding layer followed by a stack of encoder layers.

    The stack holds config.num_hidden_layers TransformerEncoderLayer modules
    that are applied one after another.
    """

    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        stack = []
        for _ in range(config.num_hidden_layers):
            stack.append(TransformerEncoderLayer(config))
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Embed the token ids in x and run them through every layer in order."""
        hidden = self.embeddings(x)
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden)
        return hidden

In [25]:
# Smoke test: the full encoder maps token ids to one hidden vector per token.
encoder = TransformerEncoder(config)
encoder(inputs.input_ids).size()

torch.Size([1, 5, 768])

# 2. Decoder

## 2-1. Masked Multi-Head Attention

### get mask matrix

In [26]:
# Lower-triangular mask: row i holds ones in columns 0..i and zeros after,
# with a leading dimension of size 1 added by unsqueeze(0).
seq_len = inputs.input_ids.size(-1)
mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0)
mask[0]

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [27]:
# Replace the scores at masked-out positions (mask == 0) with -inf;
# masked_fill returns a new tensor, `scores` itself is left unchanged.
scores.masked_fill(mask == 0, -float('inf'))

tensor([[[24.1238,    -inf,    -inf,    -inf,    -inf],
         [ 0.9738, 29.0553,    -inf,    -inf,    -inf],
         [-0.6139,  0.0586, 28.6357,    -inf,    -inf],
         [-0.4496, -0.1232, -0.1859, 28.0106,    -inf],
         [-1.0547, -2.1772,  0.6620, -0.4505, 27.1762]]],
       grad_fn=<MaskedFillBackward0>)

### Wrap it in a function

In [28]:
def scale_dot_product_attention_for_decoder(query, key, value, mask=None):
    """Scaled dot-product attention with an optional mask.

    Args:
        query: tensor of shape [batch_size, q_len, dim_k].
        key: tensor of shape [batch_size, k_len, dim_k].
        value: tensor of shape [batch_size, k_len, dim_v].
        mask: optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` get a score of -inf and therefore a weight
            of 0 after the softmax.

    Returns:
        Tensor of shape [batch_size, q_len, dim_v].
    """
    scale = sqrt(key.size(-1))
    logits = torch.bmm(query, key.transpose(1, 2)) / scale
    if mask is not None:
        blocked = mask == 0
        logits = logits.masked_fill(blocked, float('-inf'))  # for decoder
    attn = F.softmax(logits, dim=-1)
    return torch.bmm(attn, value)

In [29]:
class AttentionHead(nn.Module):
    """A single attention head for the decoder.

    NOTE: this re-declares the name ``AttentionHead`` used in the encoder
    section. Any module constructed after this cell has run picks up this
    definition. With ``mask=None`` it computes the same result as the
    encoder head.
    """

    def __init__(self, embed_dim, head_dim):
        super().__init__()

        # linear transformation -> [batch_size, seq_len, head_dim]
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state, mask=None):
        """Attend over hidden_state.

        Args:
            hidden_state: tensor of shape [batch_size, seq_len, embed_dim].
            mask: optional tensor broadcastable to [batch_size, seq_len, seq_len];
                positions where ``mask == 0`` are not attended to. Without
                this argument the masking branch of the attention function
                could never be reached through the head.

        Returns:
            Tensor of shape [batch_size, seq_len, head_dim].
        """
        return scale_dot_product_attention_for_decoder(
            self.q(hidden_state),
            self.k(hidden_state),
            self.v(hidden_state),
            mask=mask,
        )

In [30]:
class MaskedMultiHeadAttention(nn.Module):
    """Multi-head self-attention that applies an attention mask.

    When no mask is given, a lower-triangular (causal) mask is built so that
    position i only attends to positions 0..i.
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size  # 768
        num_heads = config.num_attention_heads  # 12
        head_dim = embed_dim // num_heads  # 64
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        # Same-width projection applied after concatenation: it adds learnable
        # weights on top of the attention results, so the model can re-weight
        # and combine what the individual heads produced without changing
        # the dimension.
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state, mask=None):
        """Run masked attention over hidden_state.

        Args:
            hidden_state: tensor of shape [batch_size, seq_len, embed_dim].
            mask: optional tensor broadcastable to [batch_size, seq_len, seq_len];
                positions where ``mask == 0`` are not attended to. Defaults
                to a causal lower-triangular mask.

        Returns:
            Tensor of shape [batch_size, seq_len, embed_dim].
        """
        if mask is None:
            seq_len = hidden_state.size(1)
            mask = torch.tril(
                torch.ones(seq_len, seq_len, device=hidden_state.device)
            ).unsqueeze(0)
        # Each head's q/k/v projections are applied here directly so that the
        # mask can be handed to the attention function.
        head_outputs = [
            scale_dot_product_attention_for_decoder(
                h.q(hidden_state), h.k(hidden_state), h.v(hidden_state), mask=mask
            )
            for h in self.heads
        ]
        x = torch.cat(head_outputs, dim=-1)
        # linear transformation -> [batch_size, seq_len, embed_dim(hidden_dim)]
        return self.output_linear(x)

## 2-2. Transformer Decoder

In [31]:
class TransformerDecoderLayer(nn.Module):
    """One decoder layer with pre-layer normalization.

    The layer consists of a MaskedMultiHeadAttention sub-layer and a
    FeedForward sub-layer, each wrapped in a skip connection.
    """

    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        # Build the attention module from the config itself. Wrapping it in
        # MultiHeadAttention(...) would pass a module where a config is
        # expected and fail on `config.hidden_size`.
        self.attention = MaskedMultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        """Return a tensor with the same shape as x."""
        # Normalize first; the normalized tensor is the source of q, k and v.
        hidden_state = self.layer_norm_1(x)
        # Attention output added to the un-normalized input (skip connection).
        x = x + self.attention(hidden_state)
        # Feed-forward sub-layer with its own norm and skip connection.
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x

In [32]:
class TransformerDecoder(nn.Module):
    """Embedding layer followed by a stack of decoder layers.

    The stack holds config.num_hidden_layers TransformerDecoderLayer modules
    that are applied one after another.
    """

    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        stack = []
        for _ in range(config.num_hidden_layers):
            stack.append(TransformerDecoderLayer(config))
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Embed the token ids in x and run them through every layer in order."""
        hidden = self.embeddings(x)
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden)
        return hidden