In [1]:
### Library for hf datasets
import datasets
from icecream import ic
import sys
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sn
import torch
import transformers as transformers
import numpy as np
## Directory the kernel was started in, and its parent directory.
BASE_DIR: Path = Path.cwd()
ROOT_DIR: Path = BASE_DIR.parent

  from .autonotebook import tqdm as notebook_tqdm


## Visualize Attention

In [7]:
from transformers import AutoTokenizer, AutoModel
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

- `layer=0` and `head=8` select the first encoder layer and its 9th attention head (both indices are zero-based). The BERT base model has 12 encoder layers with 12 attention heads per layer.
- Earlier layers focus more on local patterns

In [13]:
## Checkpoint name and the example sentence to visualise.
model_ckpt = "bert-base-uncased"
text = "time flies like an arrow"

## The tokenizer comes from transformers; the model class is the one imported
## from bertviz.transformers_neuron_view.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BertModel.from_pretrained(model_ckpt)

## Render the bertviz neuron view for the sentence with layer 0 / head 8 selected.
show(
    model,
    "bert",
    tokenizer,
    text,
    display_mode="light",
    layer=0,
    head=8,
)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Attention from scratch

In [16]:
## Tokenize the sentence into PyTorch tensors. add_special_tokens=False means
## only the words of the sentence are encoded, so five words give five ids.
inputs = tokenizer(
    text,
    add_special_tokens=False,
    return_tensors="pt",
)
inputs

{'input_ids': tensor([[ 2051, 10029,  2066,  2019,  8612]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

### Sample embedding layer - This is a lookup layer

In [None]:
## Toy embedding table with 4 rows of 12 values each.
## The seed is set first so the random initialisation (and the row shown below) is repeatable.
torch.manual_seed(42)
emb = torch.nn.Embedding(num_embeddings=4, embedding_dim=12)
## Looking up index 1 simply returns row 1 of the weight matrix.
emb(torch.tensor([1]))

tensor([[-0.7279, -0.5594, -0.7688,  0.7624,  1.6423, -0.1596, -0.4974,  0.4396,
         -0.7581,  1.0783,  0.8008,  1.6806]], grad_fn=<EmbeddingBackward0>)

### Embedding layer for BERT

In [113]:
from transformers import AutoConfig
## Read vocabulary size and hidden size from the checkpoint's configuration.
bert_config = AutoConfig.from_pretrained(model_ckpt)
## Seed before creating the layer so its random weights are repeatable.
torch.manual_seed(42)
token_emb = torch.nn.Embedding(
    num_embeddings=bert_config.vocab_size,
    embedding_dim=bert_config.hidden_size,
)
## Look up one embedding vector per input id.
input_embeds = token_emb(inputs.input_ids)
## The resulting shape is (batch_size, sequence_length, embedding_size).
input_embeds.shape

torch.Size([1, 5, 768])

### Decompose into Q, K, V

In [115]:
from math import sqrt
import torch.nn.functional as F

## Self-attention on the raw embeddings: query, key and value are the same tensor.
query, key, value = (input_embeds, input_embeds, input_embeds)
## Width of each query/key vector (last dimension); used to scale the dot products.
size = query.size()[-1]
## Batch matrix multiplication: transposing dims 1 and 2 of `key` gives, for
## each batch element, a (sequence x sequence) matrix of query-key dot products.
## Dividing by sqrt(size) keeps the logits from growing with the embedding
## width; without it the softmax below saturates towards one-hot rows.
scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(size)
print(scores.size())
## Softmax over the last dimension turns each row of scores into attention weights.
weights = F.softmax(scores, dim=-1)
## For each token the weights sum to 1.
print(weights.sum(dim=-1))
## Multiply the attention weights by the values (weighted sum of value vectors).
attn_outputs = torch.bmm(weights, value)
print(attn_outputs.shape)

torch.Size([1, 5, 5])
tensor([[1., 1., 1., 1., 1.]], grad_fn=<SumBackward1>)
torch.Size([1, 5, 768])


In [133]:
## Wrap scaled dot-product attention in a function
def scaled_dot_product_attention(query, key, value):
    """Compute scaled dot-product attention for batched 3-D tensors.

    Args:
        query: tensor of shape (batch, query_len, dim).
        key: tensor of shape (batch, key_len, dim).
        value: tensor of shape (batch, key_len, value_dim).

    Returns:
        Tensor of shape (batch, query_len, value_dim): for every query
        position, the softmax-weighted sum of the value vectors.
    """
    scale = query.shape[-1] ** 0.5
    # (batch, query_len, key_len) matrix of scaled query-key dot products.
    similarity = torch.bmm(query, key.transpose(1, 2)) / scale
    # Normalise over the key axis so each query's weights sum to 1.
    attention = F.softmax(similarity, dim=-1)
    return torch.bmm(attention, value)

## Multi-head attention
- Motivation: to capture different kinds of relationships between tokens, we can use several sets of linear projections and then concatenate their outputs. Each set of projections (called an attention head) can capture a different relationship.

#### Basics of a linear layer
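- The goal is to project a 5-dimensional vector into a 2-dimensional vector, i.e. to map an input of shape D x 5 to an output of shape D x 2. Mathematically the weight matrix is 5 x 2, but PyTorch stores it in transposed form, as (out_features, in_features) = 2 x 5 (see `l1.weight.shape` below).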

In [172]:
from torch import nn
## Linear layer mapping 5 input features to 2 output features, with a bias term.
l1 = nn.Linear(in_features=5, out_features=2, bias=True)
## The weight is stored as (out_features, in_features).
print(l1.weight.shape)
random_tensor = torch.randn(1, 2, 5)  ## batch x sequence x embedding
## The layer acts on the last dimension only: (1, 2, 5) -> (1, 2, 2).
l1(random_tensor)


torch.Size([2, 5])


tensor([[[-0.2612,  0.7212],
         [ 0.0855,  0.9757]]], grad_fn=<ViewBackward0>)

In [None]:
from torch import nn
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(query, key, value):
    """Attend over `value` using softmax-normalised, scaled query-key scores.

    All three arguments are batched 3-D tensors: `query` is
    (batch, query_len, dim), `key` is (batch, key_len, dim) and `value` is
    (batch, key_len, value_dim). The result is (batch, query_len, value_dim).
    """
    key_t = key.transpose(1, 2)
    # Dot products divided by sqrt(dim): Batch x sequence x sequence.
    logits = torch.bmm(query, key_t) / (query.shape[-1] ** 0.5)
    # Each row of attention weights sums to 1 over the key positions.
    probs = F.softmax(logits, dim=-1)
    return torch.bmm(probs, value)

class AttentionHead(nn.Module):
    """A single self-attention head: project the hidden states, then attend.

    Args:
        embed_dim: size of the last dimension of the incoming hidden states.
        head_dim: size of the projected query, key and value vectors.
    """

    def __init__(self, embed_dim, head_dim):
        super().__init__()
        # Separate learned projections for queries, keys and values.
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        """Project `hidden_state` to queries, keys and values and apply
        scaled dot-product attention to them."""
        queries = self.q(hidden_state)
        keys = self.k(hidden_state)
        values = self.v(hidden_state)
        return scaled_dot_product_attention(queries, keys, values)

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        

tensor([[[-0.9723, -0.4978, -0.6432],
         [-0.8106, -0.4961, -0.8941],
         [-0.8387, -0.5562, -0.9311],
         [-0.8215, -0.6042, -1.0176]]], grad_fn=<BmmBackward0>)