In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Step 1: Choose a text
text = "It is our choices, Harry, that show what we truly are, far more than our abilities."

# Step 2: Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Tokenize the text.
# `tokens` holds only the word pieces of the text itself and is used for the
# report in Step 5. The encoded model input (`token_ids`) additionally carries
# the special tokens the tokenizer inserts by default ([CLS] ... [SEP] for BERT),
# so it is two positions longer than `tokens`.
tokens = tokenizer.tokenize(text)
token_ids = tokenizer(text, return_tensors="pt")

# One label per model input position, special tokens included, so that labels
# line up row-for-row with the embeddings produced below. Labelling with
# `tokens` directly would shift every label by one position (the first word
# piece would be drawn on the [CLS] point).
point_labels = tokenizer.convert_ids_to_tokens(token_ids["input_ids"][0].tolist())

# Step 3: Get embeddings
# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(**token_ids)
    # Drop the batch dimension (a single sentence was encoded): one row per input position.
    embeddings = outputs.last_hidden_state.squeeze(0)

# Reduce dimensions using PCA
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings.numpy())

# Step 4: Plot the embeddings
plt.figure(figsize=(8, 6))
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], color='blue')
for label, (x, y) in zip(point_labels, reduced_embeddings):
    plt.annotate(label, (x, y))
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("Token Embeddings Visualization")
plt.show()

# Step 5: Print token analysis
# Reports the word pieces of the text only (special tokens are not counted).
print(f"Total Tokens: {len(tokens)}")
print("Tokens:", tokens)
