# 使用BERT提取文本特征，仅针对content

In [1]:
import csv
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer and model, then place the model on the chosen device.
# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModel.from_pretrained("bert-base-chinese")
model.to(device)
model.eval()  # evaluation mode: this notebook only runs forward passes
device

device(type='cuda')

## 1、读取一些内容测试效果

In [None]:
# Collect the content field of the first 5 well-formed lines of the file.
file_path = '../WeiboData/weibo_train_data.txt'
contents = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) < 7:
            # Fewer than 7 tab-separated fields: no content column, skip.
            continue
        contents.append(parts[6])  # content is the 7th field
        if len(contents) >= 5:
            break


In [10]:
# Display the sampled texts collected above as the cell output.
contents

['丽江旅游(sz002033)#股票##炒股##财经##理财##投资#推荐包赢股，盈利对半分成，不算本金，群：46251412',
 '#丁辰灵的红包#挣钱是一种能力，抢红包拼的是技术。我抢到了丁辰灵 和@阚洪岩 一起发出的现金红包，幸福感爆棚！情人节，一起来和粉丝红包约个会吧╮ (￣ 3￣) ╭http://t.cn/RZDIVjf',
 '淘宝网这些傻逼。。。气的劳资有火没地儿发~尼玛，你们都瞎了',
 '看点不能说的，你们都懂[笑cry]',
 '111多张']

In [7]:
# Extract the BERT [CLS] feature vector for each sampled text.
features = []
with torch.no_grad():
    for i, text in enumerate(contents):
        # Tokenize the text, then move the tensors to the model's device:
        # the model was moved to `device`, so CPU inputs fail under CUDA.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)

        # The [CLS] vector is the hidden state of the first token. Copy it
        # back to the CPU before converting, since NumPy cannot read GPU memory.
        cls_vector = outputs.last_hidden_state[:, 0, :].squeeze(0).cpu().numpy()
        features.append(cls_vector)

        print(f"\n微博 {i+1} 内容：{text[:30]}...")
        # The value printed is the shape, so the label says so.
        print(f"特征向量形状：{cls_vector.shape}")


微博 1 内容：丽江旅游(sz002033)#股票##炒股##财经##理财#...
特征向量前10维：(768,)

微博 2 内容：#丁辰灵的红包#挣钱是一种能力，抢红包拼的是技术。我抢到了丁...
特征向量前10维：(768,)

微博 3 内容：淘宝网这些傻逼。。。气的劳资有火没地儿发~尼玛，你们都瞎了...
特征向量前10维：(768,)

微博 4 内容：看点不能说的，你们都懂[笑cry]...
特征向量前10维：(768,)

微博 5 内容：111多张...
特征向量前10维：(768,)


## 2、正式提取特征，BERT

In [None]:
# File paths: the tab-separated training data that is read, and the CSV file
# the extracted features are written to.
input_file = '../WeiboData/weibo_train_data.txt'
output_file = '../features/weibo_train_bert_features.csv'


In [4]:
# Extraction parameters.
# batch_size: number of texts accumulated before one call to write_batch.
# max_length: passed to the tokenizer as max_length (with truncation enabled).
batch_size = 32
max_length = 512


In [3]:
# Process texts in batches to speed up extraction.
# Buffers for the uid, mid and content values of the batch being accumulated.
uids, mids, contents = [], [], []
# NOTE(review): `batch` is not referenced anywhere in the visible code —
# confirm it is unused before removing it.
batch = []

def write_batch(writer, uids, mids, contents):
    """Encode a batch of texts with the model and write one row per text.

    Each row is ``[uid, mid, feature]``, where ``feature`` is the hidden
    state of the first ([CLS]) token rendered as space-separated numbers.
    Uses the module-level ``tokenizer``, ``model``, ``device`` and
    ``max_length``.

    Args:
        writer: Object with a ``writerow`` method, e.g. a ``csv.writer``.
        uids: Values for the first column, one per text.
        mids: Values for the second column, one per text.
        contents: Texts to encode, parallel to ``uids`` and ``mids``.

    Raises:
        ValueError: If ``uids``, ``mids`` and ``contents`` differ in length.
    """
    # zip() below stops at the shortest input, so a length mismatch would
    # silently pair feature vectors with the wrong ids. Fail loudly instead.
    if not (len(uids) == len(mids) == len(contents)):
        raise ValueError(
            "uids, mids and contents must have the same length, got "
            f"{len(uids)}, {len(mids)} and {len(contents)}"
        )
    if not contents:
        # Nothing to encode; avoid calling the tokenizer on an empty batch.
        return

    inputs = tokenizer(contents, return_tensors="pt", padding=True,
                       truncation=True, max_length=max_length)
    # Inputs must live on the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        cls_vectors = outputs.last_hidden_state[:, 0, :]  # one vector per text

    # Move to the CPU before converting to NumPy for serialisation.
    cls_vectors = cls_vectors.cpu().numpy()
    for uid, mid, vec in zip(uids, mids, cls_vectors):
        vec_str = ' '.join(map(str, vec))
        writer.writerow([uid, mid, vec_str])


In [6]:
# Stream the input file, encode the content field in batches, and write the
# features to the output CSV.
#
# Start from empty buffers. `contents` is also assigned by the sampling cell
# earlier in this notebook, and a previous interrupted run can leave entries
# behind; stale items would shift the texts relative to their uid/mid.
uids, mids, contents = [], [], []

with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['uid', 'mid', 'content_feature'])

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Processing"):
            parts = line.strip().split('\t')
            if len(parts) < 7:
                # Fewer than 7 tab-separated fields: no content column, skip.
                continue
            uid, mid, content = parts[0], parts[1], parts[6]
            uids.append(uid)
            mids.append(mid)
            contents.append(content)

            if len(contents) >= batch_size:
                write_batch(writer, uids, mids, contents)
                uids, mids, contents = [], [], []

        # Flush the final, partially filled batch.
        if contents:
            write_batch(writer, uids, mids, contents)

Processing: 178297it [00:00, 351051.41it/s]
