# **Position Embedding en BERT**

In [1]:
!pip install transformers
!pip install torch
!pip install -U scikit-learn



In [2]:
from transformers import BertModel, BertTokenizer

# Pretrained checkpoint shared by the tokenizer and the model.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# Display the embedding sub-module: word, position and token-type embedding
# tables followed by LayerNorm and dropout.
model.embeddings

2023-09-26 18:23:07.961559: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-26 18:23:08.122643: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [3]:
# Example sentence reused by the following cells.
secuencia_ejemplo = "We lived above the clouds"

# return_tensors='pt' makes the tokenizer return a PyTorch tensor directly.
tokenizer.encode(secuencia_ejemplo, return_tensors='pt')

tensor([[ 101, 2057, 2973, 2682, 1996, 8044,  102]])

In [4]:
# Context-free word embedding for each token of the example sequence.
input_ids = tokenizer.encode(secuencia_ejemplo, return_tensors='pt')
model.embeddings.word_embeddings(input_ids)

tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [-0.0564,  0.0063, -0.0526,  ...,  0.0193,  0.0267, -0.0234],
         [ 0.0055, -0.0347,  0.0068,  ..., -0.0516, -0.0400,  0.0004],
         ...,
         [-0.0446,  0.0061, -0.0022,  ..., -0.0363, -0.0004, -0.0306],
         [-0.0291, -0.0159, -0.0204,  ..., -0.0452, -0.0196, -0.0080],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [5]:
# Note that the first and last rows match the first and last rows of the
# previous output: they belong to the reserved [CLS] and [SEP] tokens, whose
# context-free word embeddings are the same for every input.
other_input_ids = tokenizer.encode('I am a man', return_tensors='pt')
model.embeddings.word_embeddings(other_input_ids)

tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [-0.0211,  0.0059, -0.0179,  ...,  0.0163,  0.0122,  0.0073],
         [-0.0437, -0.0150,  0.0029,  ..., -0.0282,  0.0474, -0.0448],
         [ 0.0152,  0.0082,  0.0043,  ..., -0.0031, -0.0055,  0.0189],
         [-0.0077, -0.0312, -0.0070,  ...,  0.0076, -0.0427, -0.0426],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [6]:
# 512 embeddings, one per position, for input sequences of at most 512 tokens.
model.embeddings.position_embeddings

Embedding(512, 768)

In [7]:
import torch

# Position indices 0..6, one per token of the 7-token example sequence.
# torch.arange(7) yields the same int64 tensor as torch.LongTensor(range(7)).
torch.arange(7)

tensor([0, 1, 2, 3, 4, 5, 6])

In [8]:
# Position embeddings for the 7 positions of the example sequence.
position_ids = torch.arange(7)  # same indices as torch.LongTensor(range(7))
model.embeddings.position_embeddings(position_ids)

tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
          6.8312e-04,  1.5441e-02],
        [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
          2.9753e-02, -5.3247e-03],
        [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
          1.8741e-02, -7.3140e-03],
        ...,
        [-5.6087e-03, -1.0445e-02, -7.2288e-03,  ...,  2.0837e-02,
          3.5402e-03,  4.7708e-03],
        [-3.0871e-03, -1.8956e-02, -1.8930e-02,  ...,  7.4045e-03,
          2.0183e-02,  3.4077e-03],
        [ 6.4257e-03, -1.7664e-02, -2.2067e-02,  ...,  6.7531e-04,
          1.1108e-02,  3.7521e-03]], grad_fn=<EmbeddingBackward0>)

In [9]:
# 2 embeddings: one for A and one for B (the two input segments).
model.embeddings.token_type_embeddings

Embedding(2, 768)

In [10]:
# Token-type id 0 for each of the 7 tokens of the example sequence.
# Same int64 tensor as torch.LongTensor([0]*7).
torch.zeros(7, dtype=torch.long)

tensor([0, 0, 0, 0, 0, 0, 0])

In [11]:
# Every token gets the same embedding because all token-type ids are 0.
token_type_ids = torch.zeros(7, dtype=torch.long)  # same as torch.LongTensor([0]*7)
model.embeddings.token_type_embeddings(token_type_ids)

tensor([[ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        ...,
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086]],
       grad_fn=<EmbeddingBackward0>)

In [12]:
# Rebuild the input embedding by hand: sum the word, position and token-type
# embeddings, then apply layer normalization (LayerNorm is a normalization
# layer, not a feed-forward layer). The result matches what
# model.embeddings(...) returns in the next cell.
input_ids = tokenizer.encode(secuencia_ejemplo, return_tensors='pt')

# Derive the sequence length from the encoded input instead of hard-coding 7,
# so the cell keeps working if the example sentence changes.
seq_len = input_ids.shape[1]

model.embeddings.LayerNorm(
    model.embeddings.word_embeddings(input_ids)
    + model.embeddings.position_embeddings(torch.arange(seq_len))
    + model.embeddings.token_type_embeddings(torch.zeros(seq_len, dtype=torch.long))
)

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [-0.6375,  0.5415, -0.8719,  ...,  0.8028,  1.1306, -0.3939],
         [ 0.1881, -0.1682,  0.2943,  ..., -0.4347, -0.1062,  0.0591],
         ...,
         [-0.6588,  0.3324,  0.1625,  ..., -0.1498,  0.2525, -0.3350],
         [-0.1675, -0.0255, -0.1687,  ..., -0.3345,  0.3529,  0.2077],
         [-0.1481, -0.2948, -0.1690,  ..., -0.5009,  0.2544, -0.0700]]],
       grad_fn=<NativeLayerNormBackward0>)

In [13]:
# model.embeddings returns the input embedding that is fed to the first encoder.
input_ids = tokenizer.encode(secuencia_ejemplo, return_tensors='pt')
model.embeddings(input_ids)

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [-0.6375,  0.5415, -0.8719,  ...,  0.8028,  1.1306, -0.3939],
         [ 0.1881, -0.1682,  0.2943,  ..., -0.4347, -0.1062,  0.0591],
         ...,
         [-0.6588,  0.3324,  0.1625,  ..., -0.1498,  0.2525, -0.3350],
         [-0.1675, -0.0255, -0.1687,  ..., -0.3345,  0.3529,  0.2077],
         [-0.1481, -0.2948, -0.1690,  ..., -0.5009,  0.2544, -0.0700]]],
       grad_fn=<NativeLayerNormBackward0>)

In [14]:
# Shape of the input embedding for the example sequence.
input_ids = tokenizer.encode(secuencia_ejemplo, return_tensors='pt')
input_embedding = model.embeddings(input_ids)
input_embedding.shape

torch.Size([1, 7, 768])