<H1>A Note for understanding how to convert a sentence to an embedded sentence</H1>

In [1]:
from transformers import AutoTokenizer
import sys
import io
import os

# 載入 tokenizer（這是 all-MiniLM-L6-v2 使用的）
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# print(tokenizer.all_special_tokens)
# print(tokenizer.all_special_ids)
# 一個句子
sentence = "I love Marvel movies"
inputs = tokenizer(sentence, return_tensors='pt')
print(inputs)
# 分詞
tokens = tokenizer.tokenize(sentence)
print(f"原始句子: '{sentence}'")
print(f"分詞結果: {tokens}")

token_ids = tokenizer.encode(sentence)
print(f"\nToken IDs: {token_ids}")

# 可以看看每個 token 對應什麼 ID
for token, token_id in zip(tokens, token_ids[1:-1]):  # 去掉特殊 token
    print(f"  '{token}' → {token_id}")

{'input_ids': tensor([[ 101, 1045, 2293, 8348, 5691,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
原始句子: 'I love Marvel movies'
分詞結果: ['i', 'love', 'marvel', 'movies']

Token IDs: [101, 1045, 2293, 8348, 5691, 102]
  'i' → 1045
  'love' → 2293
  'marvel' → 8348
  'movies' → 5691


Token ID to token embeddings

In [47]:
from transformers import AutoTokenizer, AutoModel
import torch

model_name = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

text = "I love Marvel movies"

# tokenize
inputs = tokenizer(text, return_tensors='pt')
print("Input token:", tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]))
print("Input IDs:", inputs["input_ids"])

# 直接從模型的 word_embeddings 抓 embedding
embedding_layer = model.embeddings.word_embeddings

token_ids = inputs["input_ids"][0]  # shape [seq_len]
print(token_ids)
# 取得 embedding
token_embeddings = embedding_layer(token_ids)

print("Token embeddings shape:", token_embeddings, token_embeddings.shape)


Input token: ['[CLS]', 'i', 'love', 'marvel', 'movies', '[SEP]']
Input IDs: tensor([[ 101, 1045, 2293, 8348, 5691,  102]])
tensor([ 101, 1045, 2293, 8348, 5691,  102])
Token embeddings shape: tensor([[-0.0176, -0.0076,  0.0471,  ..., -0.0545,  0.0076, -0.0617],
        [-0.0448, -0.0583, -0.0020,  ..., -0.0494, -0.0888, -0.0592],
        [-0.0354, -0.0099,  0.0123,  ..., -0.0080,  0.0671,  0.0367],
        [-0.0699, -0.0008,  0.0990,  ..., -0.1430, -0.0016, -0.0637],
        [-0.0076, -0.0655, -0.0421,  ...,  0.0339,  0.0667,  0.0039],
        [ 0.0332, -0.0085, -0.0400,  ...,  0.0207, -0.0034, -0.0004]],
       grad_fn=<EmbeddingBackward0>) torch.Size([6, 384])


In [48]:
# 方法：Mean Pooling（平均池化）
# 把所有 token 的向量平均起來

# 先移除特殊 token 的 attention mask
attention_mask = inputs['attention_mask']
print(f"\nAttention Mask: {attention_mask}", "shape= :", attention_mask.shape)

# 只對有效的 token 做平均（排除 padding）
mask = attention_mask.squeeze(0).unsqueeze(1)  # (6, 1)
print(f"Mask 形狀: {mask.shape}")

# 加權平均
masked_embeddings = token_embeddings * mask
sum_embeddings = torch.sum(masked_embeddings, dim=0)  # 在 token 維度上加總
sum_mask = torch.clamp(mask.sum(dim=0), min=1e-9)     # 避免除以 0
print("sum_mask shape = ", sum_mask.shape)
# 最終的句子向量
sentence_embedding = sum_embeddings / sum_mask
sentence_embedding = sentence_embedding / sentence_embedding.norm(p=2) #normalization
sentence_embedding = sentence_embedding.to("cuda")
print(f"\n最終句子向量形狀: {sentence_embedding.shape}")
print(f"最終句子向量前 10 個值: {sentence_embedding[:10]}")


Attention Mask: tensor([[1, 1, 1, 1, 1, 1]]) shape= : torch.Size([1, 6])
Mask 形狀: torch.Size([6, 1])
sum_mask shape =  torch.Size([1])

最終句子向量形狀: torch.Size([384])
最終句子向量前 10 個值: tensor([-0.0428, -0.0454,  0.0224,  0.0221, -0.0025,  0.0668,  0.0689,  0.0861,
         0.0192, -0.0113], device='cuda:0', grad_fn=<SliceBackward0>)


<H3>Cosine Similarity Test</H3>
<H4>Cosine Similarity Range	Meaning</H4>
<p>0.85 ~ 1.0:	Very similar, almost semantically identical<br>  
0.7 ~ 0.85:	Highly related, meaning is close<br>  
0.5 ~ 0.7:	Moderately similar, some semantic overlap<br>  
0.3 ~ 0.5:	Low similarity, weak semantic relation<br>  
< 0.3:	Almost unrelated, little to no semantic connection</p>

In [18]:
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F

In [49]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

documents = [
    "Iron Man is a Marvel superhero film about Tony Stark",
    "The recipe for chocolate cake is very simple",
    "Avengers Endgame is an epic superhero movie",
    "Python is a programming language",
    "Captain America fights for justice"
]


query = ["Who is the main character in Iron Man?","Who is Tony stark?"]


# 文檔 embeddings
doc_embeddings = model.encode(documents, convert_to_tensor=True)  # shape [5, embedding_dim]


# query embedding
query_embedding = model.encode(query, convert_to_tensor=True)  # shape [embedding_dim]
# print(sentence_embedding.unsqueeze(0))

# cosine similarity: query 與每個 document
# cos_scores = F.cosine_similarity(query_embedding.unsqueeze(0), doc_embeddings)
cos_scores = F.cosine_similarity(sentence_embedding.unsqueeze(0), doc_embeddings)
# 排序
top_idx = torch.argmax(cos_scores)

print(f"Query: {query}\n")
for i, doc in enumerate(documents):
    print(f"Doc {i}: {doc}  ->  Cosine similarity: {cos_scores[i]:.4f}")

print(f"\nMost similar document: {documents[top_idx]} (score={cos_scores[top_idx]:.4f})")


Query: ['Who is the main character in Iron Man?', 'Who is Tony stark?']

Doc 0: Iron Man is a Marvel superhero film about Tony Stark  ->  Cosine similarity: 0.1851
Doc 1: The recipe for chocolate cake is very simple  ->  Cosine similarity: 0.0458
Doc 2: Avengers Endgame is an epic superhero movie  ->  Cosine similarity: 0.2421
Doc 3: Python is a programming language  ->  Cosine similarity: 0.0806
Doc 4: Captain America fights for justice  ->  Cosine similarity: 0.1441

Most similar document: Avengers Endgame is an epic superhero movie (score=0.2421)
