# Tokens and Token Embeddings

In [None]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint and
# display it as the cell result.
checkpoint_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
tokenizer

In [None]:
from transformers import BertModel

# Load the pretrained BERT encoder and print its module tree for inspection.
checkpoint_name = "bert-base-uncased"
model = BertModel.from_pretrained(checkpoint_name)
print(f"model: {model}")

In [3]:
import torch

# Tokenize one sentence; return_tensors="pt" yields PyTorch tensors with a
# leading batch dimension (see the recorded `input_ids` below: one row of ids).
text = "Using transformers is easy!"
encoded_input = tokenizer(text, return_tensors="pt")
print("encoded_input:", encoded_input)

# This cell only inspects the forward result, so run it without autograd:
# no computation graph is recorded and no activations are kept for backward.
with torch.no_grad():
    output = model(**encoded_input)

# Note: the label says "output", but what is printed is the shape of
# `last_hidden_state`, not the whole output object.
print("output:", output.last_hidden_state.shape)

encoded_input: {'input_ids': tensor([[  101,  2478, 19081,  2003,  3733,   999,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
output: torch.Size([1, 7, 768])


In [4]:
# Enumerate every named parameter tensor of the model next to its shape.
for name, param in model.named_parameters():
    print(f"{name} {param.shape}")

embeddings.word_embeddings.weight torch.Size([30522, 768])
embeddings.position_embeddings.weight torch.Size([512, 768])
embeddings.token_type_embeddings.weight torch.Size([2, 768])
embeddings.LayerNorm.weight torch.Size([768])
embeddings.LayerNorm.bias torch.Size([768])
encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
encoder.layer.0.attention.self.query.bias torch.Size([768])
encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
encoder.layer.0.attention.self.key.bias torch.Size([768])
encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
encoder.layer.0.attention.self.value.bias torch.Size([768])
encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
encoder.layer.0.attention.output.dense.bias torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
encoder.layer.0.intermediate.dense.weight torch.Size([3072, 768])
encoder.layer.0.inter

In [5]:
# Display the word-embedding layer held by the model's `embeddings` submodule.
model.embeddings.word_embeddings

Embedding(30522, 768, padding_idx=0)

In [6]:
# Display the shape of the word-embedding weight matrix
# (the recorded output below is torch.Size([30522, 768])).
model.embeddings.word_embeddings.weight.shape

torch.Size([30522, 768])

In [7]:
from torch import nn

# Build a standalone embedding lookup layer on top of BERT's word-embedding
# weight matrix, then display the resulting layer.
pretrained_weights = model.embeddings.word_embeddings.weight
embedding = nn.Embedding.from_pretrained(pretrained_weights)
embedding

Embedding(30522, 768)

In [8]:
# The tokenizer output is batched (one row per sentence); take row 0 to get
# the 1-D tensor of token ids for the single encoded sentence.
batch_input_ids = encoded_input["input_ids"]
input_ids = batch_input_ids[0]
input_ids

tensor([  101,  2478, 19081,  2003,  3733,   999,   102])

In [9]:
# Walk through the token ids of the encoded sentence one at a time and show,
# for each: the raw id, the text it decodes to, and the shape of its embedding.
# The loop variable is `token_id` rather than `id` so the builtin `id()` is
# not shadowed.
for token_id in input_ids:
    print("id:", token_id)
    print(tokenizer.decode(token_id))
    print(embedding(token_id).shape)
    print("----------" * 10)  # 100-character separator between tokens

id: tensor(101)
[CLS]
torch.Size([768])
----------------------------------------------------------------------------------------------------
id: tensor(2478)
using
torch.Size([768])
----------------------------------------------------------------------------------------------------
id: tensor(19081)
transformers
torch.Size([768])
----------------------------------------------------------------------------------------------------
id: tensor(2003)
is
torch.Size([768])
----------------------------------------------------------------------------------------------------
id: tensor(3733)
easy
torch.Size([768])
----------------------------------------------------------------------------------------------------
id: tensor(999)
!
torch.Size([768])
----------------------------------------------------------------------------------------------------
id: tensor(102)
[SEP]
torch.Size([768])
----------------------------------------------------------------------------------------------------


In [10]:
import torch

# Look up the embedding for a single token id, passed as a 1-element tensor
# so the result keeps a leading dimension (recorded shape: [1, 768]).
# torch.tensor(..., dtype=torch.long) replaces the legacy torch.LongTensor
# constructor; the resulting tensor is the same.
# NOTE(review): `id` shadows the builtin `id()`. The name is kept because
# cells outside this view may still read it -- rename once that is confirmed.
id = torch.tensor([2478], dtype=torch.long)

# Run the lookup once and reuse it for both the printed shape and the display.
vector = embedding(id)
print(vector.shape)
vector

torch.Size([1, 768])


tensor([[-5.1966e-02,  1.1857e-02, -4.5572e-03, -1.0487e-02, -9.5375e-03,
          9.9534e-03, -5.0752e-02,  2.4708e-02,  3.8804e-03, -5.9030e-02,
         -1.3676e-02, -2.4394e-02, -2.0969e-03,  4.6606e-02, -5.9757e-02,
          1.0715e-02,  8.4250e-04, -1.1981e-03, -2.2384e-02, -6.7643e-02,
         -3.3886e-02,  3.2578e-02,  2.9374e-02, -9.1450e-03,  2.3834e-03,
         -1.9332e-02, -9.7808e-03, -2.7400e-02, -4.3215e-02, -7.6628e-03,
          4.8815e-02,  3.2702e-03,  4.0998e-02, -3.0004e-02,  3.5387e-03,
          2.0872e-02, -5.6711e-02,  2.6018e-03, -4.7690e-02, -7.8423e-02,
         -1.1707e-03, -1.8451e-02,  1.2292e-02, -5.7301e-02,  1.2055e-03,
         -2.4245e-02,  3.3236e-02, -5.9467e-02,  2.2343e-02, -3.4456e-02,
         -6.4358e-02, -2.3962e-02,  1.6150e-02, -1.3999e-02, -3.0813e-02,
          3.5889e-02, -3.7521e-02, -1.6993e-02,  8.3541e-03,  3.2487e-02,
         -2.2775e-02, -3.8754e-03,  2.1007e-02, -3.2956e-02, -2.1765e-02,
         -2.4722e-02,  3.9563e-02, -3.