**Arquitectura práctica de BERT**

In [1]:
!pip install transformers
!pip install torch
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m01[0m
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.1 threadpoolctl-3.2.0


In [2]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

2023-09-25 18:50:59.484661: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-25 18:50:59.566636: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
#Creamos un modelo BERT-base
model = BertModel.from_pretrained('bert-base-uncased')

In [4]:
#Obtenemos todos los parámetros del modelo como una lista de tuplas.
named_params = list(model.named_parameters())

In [5]:
print('El modelo BERT tiene {:} diferentes parámetros con nombre.\n' .format(len(named_params)))

print( '=== Capa de Incrustación ===\n' )
for p in named_params [0:5]:
      print ("{:<55} {:>12}" .format(p[0], str(tuple(p)[1].size())))

print('\n=== Primer Codificador ===\n')
for p in named_params[5:21]:
      print ("{:<55} {:>12}" .format(p[0], str(tuple(p)[1].size())))

print('\n=== Capa Salida ===\n')
for p in named_params[-2:]:
      print ("{:<55} {:>12}" .format(p[0], str(tuple(p)[1].size())))

El modelo BERT tiene 199 diferentes parámetros con nombre.

=== Capa de Incrustación ===

embeddings.word_embeddings.weight                       torch.Size([30522, 768])
embeddings.position_embeddings.weight                   torch.Size([512, 768])
embeddings.token_type_embeddings.weight                 torch.Size([2, 768])
embeddings.LayerNorm.weight                             torch.Size([768])
embeddings.LayerNorm.bias                               torch.Size([768])

=== Primer Codificador ===

encoder.layer.0.attention.self.query.weight             torch.Size([768, 768])
encoder.layer.0.attention.self.query.bias               torch.Size([768])
encoder.layer.0.attention.self.key.weight               torch.Size([768, 768])
encoder.layer.0.attention.self.key.bias                 torch.Size([768])
encoder.layer.0.attention.self.value.weight             torch.Size([768, 768])
encoder.layer.0.attention.self.value.bias               torch.Size([768])
encoder.layer.0.attention.output.dens

In [6]:
# El pooler es una capa separada lineal y activada por tanh que actúa sobre la representación del token [CLS]
# Esta salida agrupada se usa a menudo como una representación de la oración completa.

In [7]:
#cargar el tokenizador bert-base uncased
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
codificado = tokenizer.encode ('You must stop now [SEP] the light is red')  # tokeniza una secuencia simple
print (codificado)

[101, 2017, 2442, 2644, 2085, 102, 1996, 2422, 2003, 2417, 102]


In [9]:
tokenizer.decode (codificado)

'[CLS] you must stop now [SEP] the light is red [SEP]'

In [None]:
#Ejecutar los tokens a través del modelo
#1 Convertimos los tokens de la sentencia en un tensor (será de tamaño (8,))
#2 Unsqueeze una primera dimensión para simular lotes. La forma resultante es (1, 8)
response = model(torch.tensor(tokenizer.encode('You must stop at the signal')).unsqueeze(0))

In [None]:
#Incrustación para cada token, siendo el primero el token [CLS]
response.last_hidden_state

tensor([[[-0.0404,  0.0731, -0.0454,  ..., -0.1092,  0.0687,  0.3676],
         [-0.3538, -0.4443,  0.2865,  ...,  0.2542,  0.0527, -0.1061],
         [ 0.4359,  0.2655,  0.2471,  ...,  0.1396, -0.2317,  0.4675],
         ...,
         [-0.2060, -0.4944,  0.0676,  ..., -0.2488,  0.1107, -0.0121],
         [ 0.1167, -0.0951, -0.2409,  ..., -0.0901, -0.0830,  0.0446],
         [ 0.6370, -0.0453, -0.5646,  ...,  0.0691, -0.5480, -0.3318]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
response

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0404,  0.0731, -0.0454,  ..., -0.1092,  0.0687,  0.3676],
         [-0.3538, -0.4443,  0.2865,  ...,  0.2542,  0.0527, -0.1061],
         [ 0.4359,  0.2655,  0.2471,  ...,  0.1396, -0.2317,  0.4675],
         ...,
         [-0.2060, -0.4944,  0.0676,  ..., -0.2488,  0.1107, -0.0121],
         [ 0.1167, -0.0951, -0.2409,  ..., -0.0901, -0.0830,  0.0446],
         [ 0.6370, -0.0453, -0.5646,  ...,  0.0691, -0.5480, -0.3318]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.8201, -0.2552, -0.0986,  0.6649,  0.0891, -0.0435,  0.8439,  0.1667,
         -0.0937, -1.0000, -0.0602,  0.2733,  0.9764, -0.0422,  0.8642, -0.4555,
          0.1762, -0.5287,  0.2783, -0.5646,  0.4909,  0.9959,  0.5069,  0.1917,
          0.3634,  0.6161, -0.4522,  0.8926,  0.9348,  0.6213, -0.6211,  0.2280,
         -0.9726, -0.2186, -0.4746, -0.9881,  0.2360, -0.7169,  0.0265, -0.0632,
         -0.8467,  0.2509,  0.99

In [None]:
#Esta capa se entrena sobre la incrustación del token CLS

response.pooler_output.shape

torch.Size([1, 768])

In [None]:
model.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [None]:
#toma la representación del codificador final del token CLS
CLS_embedding = response.last_hidden_state[:,0,:].unsqueeze(0)

CLS_embedding.shape

torch.Size([1, 1, 768])

In [None]:
#Ejecutar la incrustación de CLS a través del agrupador da el mismo resultado que "pooler_output"
(model.pooler(CLS_embedding) == response.pooler_output).all()

tensor(True)

In [None]:
total_params = 0
for p in model.parameters():
  if len(p.shape) == 2:
    total_params += p.shape[0] * p.shape[1]

print(f'Total Parameters: {total_params:,}') #Esto nos devolverá los 110 M de parametros de este modelob

Total Parameters: 109,360,128


In [None]:
30522 * 768

23440896