## BERT Model

In [None]:
!pip install transformers

In [12]:
import torch
from transformers import BertModel, BertTokenizer

In [11]:
# Fetch the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Cell output: number of entries in the tokenizer's vocabulary.
len(bert_tokenizer.vocab)

30522

In [3]:
# Instantiate BertModel from the 'bert-base-uncased' checkpoint.
model = BertModel.from_pretrained("bert-base-uncased")

# Show the full module tree (embeddings followed by the stacked encoder layers).
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [7]:
# At the top level the model consists of an embeddings module followed by an
# encoder that stacks multiple layers; this cell shows the embeddings module.
print("BERT Embeddings Layer:", model.embeddings, sep="\n")

BERT Embeddings Layer:
BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [8]:
print("First BERT Encoder Layer:")
print(model.encoder.layer[0])

# Specific components within an encoder layer.
# Each encoder layer has two main components: a self-attention mechanism and a
# feed-forward neural network.
print("\nSelf-Attention Mechanism of the First Layer:")
print(model.encoder.layer[0].attention)

# The feed-forward network is split across two submodules of the layer:
# `intermediate` (Linear 768 -> 3072 followed by GELU) and `output`
# (Linear 3072 -> 768, then LayerNorm and dropout). Printing only `output`
# would show just the second half, so both are displayed.
print("\nFeed-Forward Neural Network of the First Layer:")
print(model.encoder.layer[0].intermediate)
print(model.encoder.layer[0].output)

First BERT Encoder Layer:
BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

Self-Attention Mechanism of the First Layer:
BertAtten

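Let's encode a single word and obtain its embeddings. Note that the tokenizer wraps the word in the special `[CLS]` and `[SEP]` tokens, so the model actually receives three tokens.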

In [13]:
# Tokenize the example word and get the encoded fields back as PyTorch tensors.
text = "hello"
inputs = bert_tokenizer(text, return_tensors="pt")

# Load the checkpoint again, this time asking the model to return the hidden
# states of every layer from its forward pass.
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

In [17]:
# Inspect the tokenizer output: input_ids, token_type_ids and attention_mask tensors.
inputs

{'input_ids': tensor([[ 101, 7592,  102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]])}

In [19]:
# Map the ids back to text to see which tokens the tokenizer produced
# (the word plus the special tokens added around it).
bert_tokenizer.batch_decode(inputs['input_ids'])

['[CLS] hello [SEP]']

In [26]:
# Run the embeddings module and the first encoder layer by hand, so the result
# can be compared with the first per-layer hidden state of a full forward pass.
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

with torch.no_grad():
    # Input embeddings for the token ids, shape (batch, seq_len, hidden).
    embeddings = model.embeddings(input_ids)

    # The encoder layers add the mask to the attention scores, so they expect
    # the "extended" additive mask (0 where a token may be attended to, a large
    # negative value where it may not) rather than the tokenizer's 0/1 mask.
    # Passing the raw mask only gives correct results when nothing is padded.
    extended_attention_mask = model.get_extended_attention_mask(
        attention_mask, input_ids.shape
    )

    # Pass embeddings to the first encoder layer; element 0 of the returned
    # tuple is the layer's hidden-state output.
    output_first_layer = model.encoder.layer[0](
        embeddings, attention_mask=extended_attention_mask
    )[0]

print("Output of the first encoder layer:")
print(output_first_layer)

Output of the first encoder layer:
tensor([[[ 0.1923,  0.0070, -0.0734,  ..., -0.1187,  0.0510,  0.0204],
         [ 0.2661,  0.2085,  0.6496,  ...,  0.2059,  0.5710, -0.5848],
         [-0.3075,  0.2466,  0.0656,  ..., -0.3652,  0.6310, -0.0507]]])


In [20]:
# Run the whole model on the tokenized input; gradients are not needed for
# inspection, so autograd tracking is disabled for the call.
with torch.no_grad():
    outputs = model(**inputs)

# Hidden states collected during the forward pass, one entry per layer.
hidden_states = outputs.hidden_states

In [21]:
# Entry 0 of hidden_states holds the embeddings fed into the first encoder layer.
print("Input Embeddings (Layer 0):")
print(hidden_states[0].shape)
print(hidden_states[0])

# The remaining entries are the outputs of the encoder layers, in order.
for layer_idx in range(1, len(hidden_states)):
    print(f"\nOutput of Layer {layer_idx}:")
    print(hidden_states[layer_idx])



Input Embeddings (Layer 0):
torch.Size([1, 3, 768])
tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [ 0.3739, -0.0156, -0.2456,  ..., -0.0317,  0.5514, -0.5241],
         [-0.4815, -0.0189,  0.0092,  ..., -0.2806,  0.3895, -0.2815]]])

Output of Layer 1:
tensor([[[ 0.1923,  0.0070, -0.0734,  ..., -0.1187,  0.0510,  0.0204],
         [ 0.2661,  0.2085,  0.6496,  ...,  0.2059,  0.5710, -0.5848],
         [-0.3075,  0.2466,  0.0656,  ..., -0.3652,  0.6310, -0.0507]]])

Output of Layer 2:
tensor([[[ 0.0482, -0.1791, -0.1576,  ...,  0.0085,  0.0731,  0.0772],
         [ 0.4012,  0.6425,  1.1682,  ...,  0.7173,  0.2574, -0.9423],
         [-0.3227,  0.1739,  0.1891,  ..., -0.1468,  0.5731, -0.0023]]])

Output of Layer 3:
tensor([[[ 0.0161, -0.2921, -0.0109,  ...,  0.2190,  0.0788,  0.3306],
         [ 0.3857,  0.4602,  1.3417,  ...,  0.6322, -0.2826, -1.0969],
         [-0.0960, -0.0680,  0.1347,  ...,  0.0404,  0.0955,  0.0109]]])

Output of Layer 4:
tensor([

BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)