In [1]:
# Make IPython display the value of every top-level expression in a cell,
# not only the last one. Later cells rely on this to show several results
# (e.g. a tensor and then its shape) from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [3]:
from torch import nn
from transformers import AutoTokenizer, AutoConfig

import torch

In [4]:
# Checkpoint name used to load the tokenizer and the model config.
model_ckpt = 'bert-base-uncased'
# Sample sentence that is tokenized and embedded in the following cells.
text = 'time flies like an arrow'

In [5]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Bare expression: displays the tokenizer's repr for inspection.
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [6]:
# Load the checkpoint's configuration; `vocab_size` and `hidden_size` are read
# from it later to size the embedding layer.
config = AutoConfig.from_pretrained(model_ckpt)
# Bare expression: displays the full configuration for inspection.
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [7]:
# Tokenize the sample sentence into PyTorch tensors. add_special_tokens=False
# keeps [CLS]/[SEP] out of the ids, so there is exactly one id per word here.
inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False)
# Displays the token ids, shape (1, 5) for the five-word sentence.
inputs.input_ids

tensor([[ 2051, 10029,  2066,  2019,  8612]])

In [8]:
# Embedding lookup table: one `hidden_size`-dimensional vector per vocabulary id.
# NOTE: this layer is freshly constructed, so its weights are randomly
# initialised (not the pretrained BERT embeddings). No seed is set in the cells
# shown, so the values displayed below will differ between runs.
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
token_emb

Embedding(30522, 768)

In [9]:
# Look up one embedding vector per token id: (batch, seq_len) -> (batch, seq_len, hidden_size).
input_embs = token_emb(inputs.input_ids)
# Bare expression: displays the embedding tensor itself.
input_embs

print(f'the shape of the input embeddings is: {input_embs.shape}')

tensor([[[-1.4554,  0.2239, -0.6744,  ..., -0.6691, -0.8009, -0.3083],
         [-0.9770, -0.8626,  0.7488,  ...,  0.7846,  1.5331,  0.1193],
         [ 0.7315, -1.0357,  0.8945,  ...,  0.1678,  0.3803, -1.1836],
         [ 1.4281, -1.2846, -0.7726,  ...,  0.6359,  1.2404,  1.8382],
         [-1.1415,  0.8932, -1.5255,  ..., -0.1771, -0.8013, -0.4303]]],
       grad_fn=<EmbeddingBackward0>)

the shape of the input embeddings is: torch.Size([1, 5, 768])


In [10]:
# Use the token embeddings directly as query, key and value.
# NOTE: no projection is applied here -- all three names refer to the very same
# tensor, which is why the attention scores computed next are symmetric.
from math import sqrt

query = key = value = input_embs
# Size of the last (embedding) dimension; used as the scaling factor for the scores.
dim_k = key.size(-1)

# Displayed for inspection only (result is not stored): swaps the sequence and
# embedding dimensions, giving shape (batch, hidden_size, seq_len).
key.transpose(1, 2)

tensor([[[-1.4554, -0.9770,  0.7315,  1.4281, -1.1415],
         [ 0.2239, -0.8626, -1.0357, -1.2846,  0.8932],
         [-0.6744,  0.7488,  0.8945, -0.7726, -1.5255],
         ...,
         [-0.6691,  0.7846,  0.1678,  0.6359, -0.1771],
         [-0.8009,  1.5331,  0.3803,  1.2404, -0.8013],
         [-0.3083,  0.1193, -1.1836,  1.8382, -0.4303]]],
       grad_fn=<TransposeBackward0>)

In [11]:
# Scaled dot product: batch-multiply the queries by the transposed keys, then
# divide by sqrt(dim_k). The result has shape (batch, seq_len, seq_len).
scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
# Bare expression: displays the raw (pre-softmax) attention scores.
scores

print(f'the size of the attentions scores tensor is: {scores.size()}')

tensor([[[ 2.8150e+01, -2.8010e-01, -4.4082e-01,  9.4643e-01,  3.2643e-01],
         [-2.8010e-01,  2.6411e+01, -1.0638e+00, -1.9945e-02,  8.0360e-01],
         [-4.4082e-01, -1.0638e+00,  2.6834e+01, -1.3272e-01,  1.5849e+00],
         [ 9.4643e-01, -1.9945e-02, -1.3272e-01,  2.8542e+01,  9.0373e-01],
         [ 3.2643e-01,  8.0360e-01,  1.5849e+00,  9.0373e-01,  2.6120e+01]]],
       grad_fn=<DivBackward0>)

the size of the scores(attentions scores) matrix is: torch.Size([1, 5, 5])


In [56]:
# Apply softmax over the last dimension so that each query's scores become
# attention weights that sum to 1.

import torch.nn.functional as F

weights = F.softmax(scores, dim=-1)
# Sanity check (displayed): summing over the last dimension should give all ones.
weights.sum(dim=-1)

# Bare expression: displays the attention weights themselves.
weights

print(f'the shape of the attention weights tensor is: {weights.shape}')

tensor([[1., 1., 1., 1., 1.]], grad_fn=<SumBackward1>)

tensor([[[1.0000e+00, 4.4993e-13, 3.8312e-13, 1.5340e-12, 8.2518e-13],
         [2.5606e-12, 1.0000e+00, 1.1695e-12, 3.3214e-12, 7.5681e-12],
         [1.4275e-12, 7.6564e-13, 1.0000e+00, 1.9425e-12, 1.0823e-11],
         [1.0357e-12, 3.9404e-13, 3.5202e-13, 1.0000e+00, 9.9241e-13],
         [6.2787e-12, 1.0118e-11, 2.2102e-11, 1.1184e-11, 1.0000e+00]]],
       grad_fn=<SoftmaxBackward0>)

the shape of the attention weights tensor is: torch.Size([1, 5, 5])


In [57]:
# Multiply the attention weights by the values: each output position is the
# weighted sum of all value vectors, giving shape (batch, seq_len, hidden_size).

attention_outputs = torch.bmm(weights, value)
# Bare expression: displays the attention output tensor.
attention_outputs

print(f'the shape of the attention output is: {attention_outputs.shape}')

tensor([[[-1.4554,  0.2239, -0.6744,  ..., -0.6691, -0.8009, -0.3083],
         [-0.9770, -0.8626,  0.7488,  ...,  0.7846,  1.5331,  0.1193],
         [ 0.7315, -1.0357,  0.8945,  ...,  0.1678,  0.3803, -1.1836],
         [ 1.4281, -1.2846, -0.7726,  ...,  0.6359,  1.2404,  1.8382],
         [-1.1415,  0.8932, -1.5255,  ..., -0.1771, -0.8013, -0.4303]]],
       grad_fn=<BmmBackward0>)

the shape of the attention output is: torch.Size([1, 5, 768])


In [58]:
# implementation of scaled dot-product attention
def scaled_dot_product_attention(query, key, value):
    """Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    All inputs must be 3-D tensors because `torch.bmm` is used.

    Args:
        query: tensor of shape (batch, seq_q, dim_k).
        key: tensor of shape (batch, seq_k, dim_k).
        value: tensor of shape (batch, seq_k, dim_v).

    Returns:
        Tensor of shape (batch, seq_q, dim_v); each output row is the
        attention-weighted sum of the value vectors.
    """
    # Scale by sqrt of the key dimension to keep the dot products in a range
    # where softmax does not saturate.
    dim_k = key.size(-1)
    attention_scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    # Normalise over the key axis so each query's weights sum to 1.
    attention_weights = F.softmax(attention_scores, dim=-1)
    # The original returned `attention_weigths` (misspelled), which raised
    # NameError on every call.
    return torch.bmm(attention_weights, value)