In [1]:
from transformers import AutoTokenizer, AutoModel
from transformers import BitsAndBytesConfig
import torch
import json
import numpy as np

# Single source of truth for the checkpoint so the tokenizer and the weights
# are always loaded from the same place. Replace with a local directory path
# if the checkpoint is already on disk.
MODEL_ID = "Qwen/Qwen3-Embedding-8B"

# 8-bit (LLM.int8) quantization config.
# NOTE: `bnb_8bit_compute_dtype` / `bnb_8bit_use_double_quant` are not
# BitsAndBytesConfig options (compute dtype and double quantization exist only
# as `bnb_4bit_*` settings), so passing them had no effect. They are removed
# here so the config states exactly what is applied.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load tokenizer + model. `device_map="auto"` lets the loader decide where the
# weights are placed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
)

2025-09-29 01:35:32.943592: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
from tqdm import tqdm

# Inference only: switch the model to eval mode.
model.eval()

# Load the dataset: one JSON object per line, each carrying a "question" field.
questions = []
with open("gsm8k_test_public.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        questions.append(json.loads(line)["question"])
if not questions:
    raise ValueError("gsm8k_test_public.jsonl contains no questions")

# Generate embeddings in batches.
embeddings = []
batch_size = 8
for i in tqdm(range(0, len(questions), batch_size), desc="Embedding questions"):
    batch = questions[i:i+batch_size]
    # Follow the model's device instead of hard-coding "cuda"
    # (the model was loaded with device_map="auto").
    inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Last-token pooling. Qwen3-Embedding is a causal decoder, so position 0
    # only sees the first token (or a pad token under left padding) and is not
    # a usable sentence vector; the last real token has attended to the whole
    # question. Multiplying the mask by the position index and taking argmax
    # yields the index of the last non-pad token for either padding side.
    mask = inputs["attention_mask"].to(hidden.device)
    positions = torch.arange(mask.shape[1], device=hidden.device)
    last_idx = (mask * positions).argmax(dim=1)
    rows = torch.arange(hidden.shape[0], device=hidden.device)
    # Move each batch to the CPU right away so results do not pile up on the GPU.
    embeddings.append(hidden[rows, last_idx].cpu())

embeddings = torch.cat(embeddings, dim=0)  # [num_questions, hidden_dim]

# Save (vectors are stored un-normalized, in the dtype the model produced).
np.save("gsm8k_test_public_embeddings.npy", embeddings.cpu().numpy())
print("Saved embeddings:", embeddings.shape)

Embedding questions: 100%|██████████| 17/17 [00:06<00:00,  2.59it/s]

Saved embeddings: torch.Size([132, 4096])





In [4]:
from tqdm import tqdm

# Inference only: switch the model to eval mode.
model.eval()

# Load the dataset: one JSON object per line, each carrying a "question" field.
questions = []
with open("gsm8k_train.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        questions.append(json.loads(line)["question"])
if not questions:
    raise ValueError("gsm8k_train.jsonl contains no questions")

# Generate embeddings in batches.
embeddings = []
batch_size = 8
for i in tqdm(range(0, len(questions), batch_size), desc="Embedding questions"):
    batch = questions[i:i+batch_size]
    # Follow the model's device instead of hard-coding "cuda"
    # (the model was loaded with device_map="auto").
    inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Last-token pooling. Qwen3-Embedding is a causal decoder, so position 0
    # only sees the first token (or a pad token under left padding) and is not
    # a usable sentence vector; the last real token has attended to the whole
    # question. Multiplying the mask by the position index and taking argmax
    # yields the index of the last non-pad token for either padding side.
    mask = inputs["attention_mask"].to(hidden.device)
    positions = torch.arange(mask.shape[1], device=hidden.device)
    last_idx = (mask * positions).argmax(dim=1)
    rows = torch.arange(hidden.shape[0], device=hidden.device)
    # Move each batch to the CPU right away so results do not pile up on the GPU.
    embeddings.append(hidden[rows, last_idx].cpu())

embeddings = torch.cat(embeddings, dim=0)  # [num_questions, hidden_dim]

# Save (vectors are stored un-normalized, in the dtype the model produced).
np.save("gsm8k_train_embeddings.npy", embeddings.cpu().numpy())
print("Saved embeddings:", embeddings.shape)

Embedding questions: 100%|██████████| 935/935 [05:19<00:00,  2.92it/s]


Saved embeddings: torch.Size([7473, 4096])


In [None]:
from tqdm import tqdm

# Inference only: switch the model to eval mode.
model.eval()

# Load the dataset: one JSON object per line, each carrying a "question" field.
questions = []
with open("gsm8k_train_self-instruct.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        questions.append(json.loads(line)["question"])
if not questions:
    raise ValueError("gsm8k_train_self-instruct.jsonl contains no questions")

# Generate embeddings in batches.
embeddings = []
batch_size = 8
for i in tqdm(range(0, len(questions), batch_size), desc="Embedding questions"):
    batch = questions[i:i+batch_size]
    # Follow the model's device instead of hard-coding "cuda"
    # (the model was loaded with device_map="auto").
    inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Last-token pooling. Qwen3-Embedding is a causal decoder, so position 0
    # only sees the first token (or a pad token under left padding) and is not
    # a usable sentence vector; the last real token has attended to the whole
    # question. Multiplying the mask by the position index and taking argmax
    # yields the index of the last non-pad token for either padding side.
    mask = inputs["attention_mask"].to(hidden.device)
    positions = torch.arange(mask.shape[1], device=hidden.device)
    last_idx = (mask * positions).argmax(dim=1)
    rows = torch.arange(hidden.shape[0], device=hidden.device)
    # Move each batch to the CPU right away so results do not pile up on the GPU.
    embeddings.append(hidden[rows, last_idx].cpu())

embeddings = torch.cat(embeddings, dim=0)  # [num_questions, hidden_dim]

# Save (vectors are stored un-normalized, in the dtype the model produced).
# NOTE(review): the output name drops "train_self-" from the input name; kept
# unchanged because downstream code may already load this path.
np.save("gsm8k_instruct_embeddings.npy", embeddings.cpu().numpy())
print("Saved embeddings:", embeddings.shape)