In [1]:
from transformers import BertTokenizer, BertModel
import torch

# Checkpoint identifier, defined once so the tokenizer and the model below are
# always loaded from the same checkpoint and cannot drift apart when edited.
MODEL_NAME = 'bert-base-uncased'

# Load the pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Example text
text = "Here is an example sentence."

# Tokenization with special tokens, returned as PyTorch tensors.
# add_special_tokens=True adds [CLS] at the beginning and [SEP] at the end.
encoded_input = tokenizer(text, add_special_tokens=True, return_tensors='pt')

# Inspect the three fields of the encoded input
print("Input IDs:", encoded_input['input_ids'])
print("Token type IDs:", encoded_input['token_type_ids'])
print("Attention mask:", encoded_input['attention_mask'])

# Map the IDs of the first (and only) sequence in the batch back to token
# strings, so the inserted special tokens are visible.
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])
print("Tokens:", tokens)

# Load the model from the same checkpoint as the tokenizer
model = BertModel.from_pretrained(MODEL_NAME)

# Run a forward pass; gradients are not needed for feature extraction,
# so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(**encoded_input)

# Get the [CLS] token representation: index 0 along the token axis of the
# last hidden state, for every sequence in the batch.
cls_embedding = outputs.last_hidden_state[:, 0, :]
print("Shape of [CLS] embedding:", cls_embedding.shape)


Input IDs: tensor([[ 101, 2182, 2003, 2019, 2742, 6251, 1012,  102]])
Token type IDs: tensor([[0, 0, 0, 0, 0, 0, 0, 0]])
Attention mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1]])
Tokens: ['[CLS]', 'here', 'is', 'an', 'example', 'sentence', '.', '[SEP]']
Shape of [CLS] embedding: torch.Size([1, 768])


In [None]:
# Sentence-pair tokenization: both sentences are handed to the tokenizer in a
# single call, each as its own argument.
sentence_a = "How are you?"
sentence_b = "I am fine."

# `text` is the first sequence and `text_pair` the second; with
# add_special_tokens=True the tokenizer inserts the special tokens itself.
encoded_pair = tokenizer(
    text=sentence_a,
    text_pair=sentence_b,
    add_special_tokens=True,
    return_tensors='pt',
)

# Convert the IDs of the first (and only) sequence in the batch back to tokens
pair_tokens = tokenizer.convert_ids_to_tokens(encoded_pair['input_ids'][0])
print(f"Tokens for sentence pair: {pair_tokens}")