In [5]:
import torch
from transformers import BertTokenizer, BertModel

# Checkpoint name shared by the tokenizer and the encoder loaded below
model_name = 'bert-base-uncased'

# Load the pre-trained encoder and its tokenizer from that checkpoint
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [6]:
# Sentence whose tokens we want to embed
text = "cat is cute"

# Split the sentence into tokens with the tokenizer loaded earlier
tokens = tokenizer.tokenize(text)

# Map each token to its vocabulary ID, wrap the sequence in the [CLS] ... [SEP]
# special tokens, and build the tensor in one step so `input_ids` is only ever
# bound to the final tensor
input_ids = torch.tensor(
    [
        tokenizer.cls_token_id,
        *tokenizer.convert_tokens_to_ids(tokens),
        tokenizer.sep_token_id,
    ]
)
input_ids

tensor([  101,  4937,  2003, 10140,   102])

In [7]:
# Forward pass with gradient tracking disabled (inference only)
with torch.no_grad():
    # Indexing with None prepends a batch dimension of size 1
    outputs = model(input_ids[None])

# The first element of the model output holds the per-token hidden states
bert_embeddings = outputs[0]

# One contextual embedding vector per input token, special tokens included
print(bert_embeddings)


tensor([[[ 0.0119,  0.0805,  0.0208,  ..., -0.1908,  0.1084,  0.2260],
         [-0.0158, -0.3965,  0.5580,  ..., -0.3887,  0.6841,  0.6038],
         [-0.4054, -0.2355, -0.0674,  ..., -0.0769, -0.0490,  0.8397],
         [-0.5243, -0.6484, -0.3189,  ...,  0.5408,  0.4001,  0.0300],
         [ 0.7199,  0.0324, -0.1972,  ...,  0.1617, -0.5833, -0.2165]]])
