In [1]:
from transformers import AutoTokenizer
!pip install bertviz==1.0.0
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

Collecting bertviz==1.0.0
  Downloading bertviz-1.0.0-py3-none-any.whl.metadata (10 kB)
Downloading bertviz-1.0.0-py3-none-any.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertviz
Successfully installed bertviz-1.0.0


In [2]:
# Checkpoint name shared by the tokenizer, the config and the model loads in this notebook.
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Example sentence reused by the visualization and tokenization cells below.
text = "time flies like an arrow"

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [3]:
# BertModel here is bertviz's own implementation (imported from
# bertviz.transformers_neuron_view), which the neuron view requires.
model = BertModel.from_pretrained(model_ckpt)
# Render the neuron view for the example sentence with layer=0 and head=8 selected.
show(model, "bert", tokenizer, text, display_mode="light", layer=0, head=8)

100%|██████████| 433/433 [00:00<00:00, 385345.56B/s]
100%|██████████| 440473133/440473133 [00:11<00:00, 39145575.68B/s]
  state_dict = torch.load(resolved_archive_file, map_location='cpu')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Simulating the Self-attention Mechanism 

In [4]:
# Tokenize without [CLS]/[SEP] so the tensor holds only the five word tokens.
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
inputs.input_ids

tensor([[ 2051, 10029,  2066,  2019,  8612]])

In [5]:
# Map the ids back to their vocabulary tokens to confirm the tokenization.
tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

['time', 'flies', 'like', 'an', 'arrow']

In [6]:
from torch import nn
from transformers import AutoConfig

# Load the config.json associated with the checkpoint; it carries the model
# hyperparameters used below (vocab_size, hidden_size, ...).
config = AutoConfig.from_pretrained(model_ckpt)
print(f"the vocab size: {config.vocab_size}, embed size: {config.hidden_size}")

the vocab size: 30522, embed size: 768


In [7]:
# Number of attention heads; MultiHeadAttention below builds one AttentionHead per head.
config.num_attention_heads 

12

In [8]:
# Inner width of the feed-forward block (read by FeedForward below).
config.intermediate_size

3072

In [9]:
# Freshly initialized embedding table sized from the config; it does not
# reuse the pretrained checkpoint weights.
token_embed = nn.Embedding(config.vocab_size, config.hidden_size)
token_embed

Embedding(30522, 768)

In [10]:
# Look up one embedding vector per token id -> (batch, seq_len, hidden_size).
input_embeds = token_embed(inputs.input_ids)
input_embeds.size()

torch.Size([1, 5, 768])

In [11]:
import torch
from math import sqrt

# For simplicity query, key and value are all the raw embeddings; a real
# attention head applies a separate learnable projection to each of them.
query = key = value = input_embeds
dim_k = key.size(-1)
# Dot product of every token with every token, scaled by sqrt(dim_k).
scores = torch.bmm(query, key.transpose(1,2)) / sqrt(dim_k)
scores.size()

torch.Size([1, 5, 5])

In [12]:
import torch.nn.functional as F 

# Softmax over the last (key) axis: every query row becomes a probability
# distribution over the tokens it attends to.
weights = F.softmax(scores, dim = -1)
weights.sum(dim=-1) # each row sums to 1, as expected

tensor([[1., 1., 1., 1., 1.]], grad_fn=<SumBackward1>)

In [13]:
# (1, 5, 5) weights @ (1, 5, 768) values: the inner dimensions already match,
# so no transpose is needed. Each output row is a weighted average of the value rows.
attn_output = torch.bmm(weights, value)
attn_output.shape

torch.Size([1, 5, 768])

## Multi-head self-attention Implementation Experiments

In [14]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
from math import sqrt

def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute scaled dot-product attention for batched 3-D tensors.

    Args:
        query: Tensor of shape (batch, q_len, dim_k).
        key: Tensor of shape (batch, k_len, dim_k).
        value: Tensor of shape (batch, k_len, dim_v).
        mask: Optional tensor broadcastable to (batch, q_len, k_len).
            Positions where ``mask == 0`` are excluded from attention
            (e.g. a lower-triangular matrix for causal attention). When
            omitted, every query attends to every key, exactly as before.

    Returns:
        Tensor of shape (batch, q_len, dim_v): for each query position, the
        attention-weighted average of the value vectors.

    Note:
        A query row whose positions are *all* masked yields NaN, because the
        softmax of a row consisting only of -inf is undefined.
    """
    dim_k = key.size(-1)  # length of each key/query vector
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    if mask is not None:
        # exp(-inf) == 0, so masked positions receive zero weight after softmax.
        scores = scores.masked_fill(mask == 0, float("-inf"))
    weights = F.softmax(scores, dim=-1)  # normalize over the key axis
    return torch.bmm(weights, value)


class AttentionHead(nn.Module):
    """A single self-attention head with its own query/key/value projections."""

    def __init__(self, embed_dim, head_dim):
        super().__init__()
        # Each projection maps (..., embed_dim) -> (..., head_dim), i.e. it
        # projects the embeddings onto a smaller, head-specific space.
        self.query = nn.Linear(embed_dim, head_dim)
        self.key = nn.Linear(embed_dim, head_dim)
        self.value = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        """Project ``hidden_state`` three ways and attend over the result."""
        q_proj = self.query(hidden_state)
        k_proj = self.key(hidden_state)
        v_proj = self.value(hidden_state)
        return scaled_dot_product_attention(q_proj, k_proj, v_proj)
    

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention built from independent ``AttentionHead`` modules.

    Args:
        config: Object exposing ``hidden_size`` and ``num_attention_heads``.

    Raises:
        ValueError: If ``num_attention_heads`` is not positive or does not
            evenly divide ``hidden_size``. Without this check the concatenated
            head outputs would not match the width of the output projection
            and the failure would only surface later as a shape error in
            ``forward``.
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"hidden_size ({embed_dim}) must be a multiple of "
                f"num_attention_heads ({num_heads})"
            )
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        # Mixes the concatenated heads; output width equals the input width.
        self.output_layer = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        """Run every head on ``hidden_state`` and project the concatenation."""
        # Concatenate along the feature axis: num_heads * head_dim == embed_dim.
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_layer(x)
        return x


In [15]:
# Sanity check: multi-head attention preserves the input shape.
# NOTE: this rebinds `attn_output` from the earlier single-head experiment.
multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(input_embeds)
attn_output.size() # same shape as the input embeddings

torch.Size([1, 5, 768])

In [16]:
from bertviz import head_view
from transformers import AutoModel 
# NOTE: this rebinds `model`, replacing the bertviz BertModel loaded earlier
# with the standard transformers model configured to return attention weights.
model = AutoModel.from_pretrained(model_ckpt, 
output_attentions=True) 
sentence_a = "time flies like an arrow"
sentence_b = "fruit flies like a banana" 
# Encode the two sentences together as a single sentence-pair input.
viz_inputs = tokenizer(sentence_a, sentence_b, 
return_tensors='pt')
# Attention weights returned by the forward pass, consumed by head_view below.
attention = model(**viz_inputs).attentions

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [17]:
# Inspect the encoded pair: token_type_ids is 0 for sentence A and 1 for sentence B.
viz_inputs

{'input_ids': tensor([[  101,  2051, 10029,  2066,  2019,  8612,   102,  5909, 10029,  2066,
          1037, 15212,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [18]:
# Show the tokens, including the [CLS]/[SEP] markers added for the sentence pair.
tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0])

['[CLS]',
 'time',
 'flies',
 'like',
 'an',
 'arrow',
 '[SEP]',
 'fruit',
 'flies',
 'like',
 'a',
 'banana',
 '[SEP]']

In [19]:
# Count the tokens with token_type_ids == 0 (sentence A plus its special
# tokens); that count is the index at which sentence B starts.
sentence_b_start = (viz_inputs.token_type_ids == 0).sum(dim=1)
tokens = tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0]) 
# Render the attention head view restricted to head 8.
head_view(attention, tokens, sentence_b_start, heads=[8])

<IPython.core.display.Javascript object>

##  The Feed-Forward Layer

In [20]:
import torch.nn as nn

class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Linear -> Dropout.

    Args:
        config: Object exposing ``hidden_size``, ``intermediate_size`` and
            ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        hidden_dim, inner_dim = config.hidden_size, config.intermediate_size
        self.linear_1 = nn.Linear(hidden_dim, inner_dim)  # expand
        self.linear_2 = nn.Linear(inner_dim, hidden_dim)  # project back
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        """Apply the block to ``x``; the last dimension is preserved."""
        activated = self.gelu(self.linear_1(x))
        return self.dropout(self.linear_2(activated))


In [21]:
 # Feed the multi-head attention output through the feed-forward block;
 # the shape is unchanged.
 feed_forward = FeedForward(config)
 ff_outputs = feed_forward(attn_output)
 ff_outputs.size() 

torch.Size([1, 5, 768])

## Transformer Encoder Layer

In [22]:
class TransformerEncoderLayer(nn.Module):
    """One encoder block using the pre-layer-normalization arrangement.

    Each sub-layer (attention, then feed-forward) normalizes its input first
    and adds its output back onto the un-normalized residual stream.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.layer_norm_1 = nn.LayerNorm(width)
        self.layer_norm_2 = nn.LayerNorm(width)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Attention sub-layer with skip connection.
        after_attention = x + self.attention(self.layer_norm_1(x))
        # Feed-forward sub-layer with skip connection.
        normed = self.layer_norm_2(after_attention)
        return after_attention + self.feed_forward(normed)

In [23]:
# Sanity check: a full encoder layer maps the embeddings to a tensor of the same shape.
encoder_layer = TransformerEncoderLayer(config)
input_embeds.shape, encoder_layer(input_embeds).size() # input and output shapes match

(torch.Size([1, 5, 768]), torch.Size([1, 5, 768]))

### Incorporating Positional Embeddings

In [24]:
# we have used learnable positional embeddings for this implementation
class Embeddings(nn.Module):
    """Token plus learned positional embeddings, then LayerNorm and dropout.

    Args:
        config: Object exposing ``vocab_size``, ``hidden_size``,
            ``max_position_embeddings`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.pos_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)  # eps for numerical stability
        # Use the configured rate, like the other modules in this notebook;
        # a bare nn.Dropout() would silently fall back to p=0.5.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids):
        """Embed ``input_ids`` of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, hidden_size).
        """
        seq_length = input_ids.size(1)
        # Build the position ids on the same device as the inputs so the
        # lookup also works when the module and inputs live on a GPU.
        position_ids = torch.arange(
            seq_length, dtype=torch.long, device=input_ids.device
        ).unsqueeze(0)

        token_embeddings = self.token_embeddings(input_ids)
        pos_embeddings = self.pos_embeddings(position_ids)  # broadcast over the batch

        embeddings = token_embeddings + pos_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings

In [25]:
 # Sanity check: the embedding layer yields one hidden_size vector per token.
 embedding_layer = Embeddings(config)
 embedding_layer(inputs.input_ids).size()

torch.Size([1, 5, 768])

## Encoder Part

In [26]:
class TransformerEncoder(nn.Module):
    """Embedding layer followed by ``config.num_hidden_layers`` encoder layers."""

    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        encoder_blocks = [
            TransformerEncoderLayer(config)
            for _ in range(config.num_hidden_layers)
        ]
        self.layers_stack = nn.ModuleList(encoder_blocks)

    def forward(self, x):
        """Embed the token ids in ``x`` and pass them through every layer in order."""
        hidden = self.embeddings(x)
        for encoder_block in self.layers_stack:
            hidden = encoder_block(hidden)
        return hidden
    

In [32]:
# Sanity check: the full encoder stack returns one hidden state per input token.
encoder = TransformerEncoder(config)
encoder(inputs.input_ids).size() # forward pass through the encoder

torch.Size([1, 5, 768])

## Adding a Classification Head

In [28]:
class TransformerForSequenceClassification(nn.Module):
    """Encoder with a dropout + linear classification head on the first token.

    Args:
        config: Object exposing the encoder settings plus
            ``hidden_dropout_prob`` and ``num_labels``.
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        """Return one score per label for each sequence in ``x``."""
        # Use the hidden state of the first position as the sequence summary
        # (a convention, not a requirement).
        first_token_state = self.encoder(x)[:, 0, :]
        return self.classifier(self.dropout(first_token_state))

In [29]:
# Set the number of output classes before building the classifier.
# NOTE: this mutates the shared `config` object in place.
config.num_labels = 3
encoder_classifier = TransformerForSequenceClassification(config)
encoder_classifier(inputs.input_ids).size() # one row of 3 scores per input sequence

torch.Size([1, 3])

### Adjusting for Masked Multi-head Attention

In [38]:
seq_len = inputs.input_ids.size(1)
# Lower-triangular matrix of ones: row i has ones in columns 0..i, so position i
# may only attend to itself and earlier positions. unsqueeze(0) adds a leading
# batch dimension.
mask = torch.tril(torch.ones(seq_len, seq_len), diagonal = 0).unsqueeze(0) 
mask

tensor([[[1., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 1.]]])

In [41]:
print(scores)
# Replace masked positions with -inf: exp(-inf) == 0, so after the softmax the
# corresponding attention weights are zero. masked_fill returns a new tensor;
# `scores` itself is left unchanged.
scores.masked_fill(mask == 0, -float('inf'))

tensor([[[28.1745,  1.5482, -2.7964,  0.3549, -0.1260],
         [ 1.5482, 25.7138, -1.6975, -0.9447, -0.1193],
         [-2.7964, -1.6975, 29.6646, -0.3023, -0.1539],
         [ 0.3549, -0.9447, -0.3023, 29.8381, -1.1742],
         [-0.1260, -0.1193, -0.1539, -1.1742, 27.4790]]],
       grad_fn=<DivBackward0>)


tensor([[[28.1745,    -inf,    -inf,    -inf,    -inf],
         [ 1.5482, 25.7138,    -inf,    -inf,    -inf],
         [-2.7964, -1.6975, 29.6646,    -inf,    -inf],
         [ 0.3549, -0.9447, -0.3023, 29.8381,    -inf],
         [-0.1260, -0.1193, -0.1539, -1.1742, 27.4790]]],
       grad_fn=<MaskedFillBackward0>)