In [1]:
import csv
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Qwen3 tokenizer and model.
# NOTE(review): the recorded run of this cell failed with
# "No module named 'torch.distributed.tensor'" -- presumably the installed
# PyTorch is older than what the installed transformers release expects;
# confirm the torch/transformers versions before re-running.
model_name = "Qwen3-0.6B"  # passed verbatim to from_pretrained (presumably a local checkpoint directory -- confirm)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
)

# Place the model explicitly on a single device. The loader is no longer given
# device_map="auto": letting accelerate dispatch the weights and then calling
# .to(device) on top of that is redundant at best and conflicts with the
# dispatch hooks when layers are sharded or offloaded.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# eval() disables dropout for deterministic feature extraction; assigning the
# result also keeps the notebook from printing the full module repr.
model = model.to(device).eval()


RuntimeError: Failed to import transformers.models.qwen3.modeling_qwen3 because of the following error (look up to see its traceback):
No module named 'torch.distributed.tensor'

In [None]:
# Run configuration
# -- batching / tokenisation limits
max_length = 512   # maximum number of tokens kept per text (longer inputs are truncated)
batch_size = 8     # number of texts encoded per forward pass

# -- file locations (relative to the notebook's working directory)
output_file = '../features/weibo_qwen3_encoder_features.csv'
input_file = '../WeiboData/weibo_train_data.txt'


In [None]:
# Encode one batch of texts and write the resulting feature rows
def write_batch(writer, uid_list, mid_list, content_list):
    """Encode a batch of texts and append one CSV row per text.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``max_length``.

    Args:
        writer: object with a ``writerow`` method (e.g. ``csv.writer``).
        uid_list: user ids, aligned element-wise with ``content_list``.
        mid_list: message ids, aligned element-wise with ``content_list``.
        content_list: list of text strings to encode.

    Each written row is ``[uid, mid, "v0 v1 ... vN"]``, where the vector is
    the attention-mask-weighted mean of the final-layer hidden states.
    """
    if not content_list:
        return  # nothing to encode

    inputs = tokenizer(content_list, return_tensors="pt", padding=True,
                       truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        # `base_model` is the decoder stack without the LM head; the causal-LM
        # wrapper has no `transformer` attribute. The attention mask must be
        # passed so padding positions are not attended to.
        outputs = model.base_model(input_ids=inputs['input_ids'],
                                   attention_mask=inputs['attention_mask'])
        hidden_states = outputs.last_hidden_state  # final-layer hidden states

        # A causal decoder has no [CLS] token: position 0 can only attend to
        # itself, so its state carries no information about the rest of the
        # text. Pool over all real (non-padding) tokens instead.
        mask = inputs['attention_mask'].unsqueeze(-1).to(torch.float32)
        token_sum = (hidden_states.to(torch.float32) * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1.0)  # guard against all-padding rows
        sentence_vectors = token_sum / token_count

    # Pooling is done in float32 above: NumPy has no bfloat16 dtype, so a
    # half-precision checkpoint could not be converted with .numpy() directly.
    sentence_vectors = sentence_vectors.cpu().numpy()
    for uid, mid, vec in zip(uid_list, mid_list, sentence_vectors):
        vec_str = ' '.join(map(str, vec))
        writer.writerow([uid, mid, vec_str])


In [None]:
# Stream the input file line by line and extract features batch by batch
uids, mids, contents = [], [], []
total_lines = 1229619  # hard-coded progress-bar total (presumably the line count of input_file -- confirm)

with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['uid', 'mid', 'content_feature'])

    with open(input_file, 'r', encoding='utf-8') as f, \
            tqdm(total=total_lines, desc="Extracting Qwen3 encoder features", unit="line") as pbar:
        for line in f:
            # Count every line exactly once, including the ones skipped
            # below, so the bar ends at the number of lines actually read.
            pbar.update(1)

            # Tab-separated record; fields 0, 1 and 6 are uid, mid and content.
            # strip() also removes a trailing tab, so a record whose last
            # field is empty yields fewer than 7 parts and is skipped.
            parts = line.strip().split('\t')
            if len(parts) < 7:
                continue
            uid, mid, content = parts[0], parts[1], parts[6]
            uids.append(uid)
            mids.append(mid)
            contents.append(content)

            # Flush as soon as a full batch has been collected
            if len(contents) >= batch_size:
                write_batch(writer, uids, mids, contents)
                uids, mids, contents = [], [], []

        # Flush the final, partially filled batch. Its lines were already
        # counted in the loop, so the progress bar is not advanced again.
        if contents:
            write_batch(writer, uids, mids, contents)
