## 设置headers

### 注意：请先在 SiliconFlow 上自行申请 API Key，并将其填入下方 `Authorization` 字段（替换 `<your api>`）后再运行

In [None]:
# HTTP headers sent with every embeddings request.
headers = {}
# Bearer token; replace the placeholder with your own API key before running.
headers["Authorization"] = "Bearer <your api>"
# The request body is sent as JSON.
headers["Content-Type"] = "application/json"

## 按批次进行嵌入（默认每批 50 条，可通过 `batch_size` 调整）

### 运行前需要修改输入文件（CSV）与输出文件（JSON）的路径

In [None]:
import pandas as pd
import requests
import json
import time

# Batch-embed the documents of a CSV file through the SiliconFlow embeddings
# endpoint and append the results to a JSON-lines file.
#
# Reads the 'key' and 'content_cutted' columns of the CSV. Relies on `headers`
# (request headers carrying the API key) being defined before this code runs.

# Load the documents to embed.
df = pd.read_csv('processed_data.csv')

# API settings.
url = "https://api.siliconflow.cn/v1/embeddings"
model_name = "Qwen/Qwen3-Embedding-8B"
# Seconds to wait for the server; without a timeout a stalled connection
# would block the whole run indefinitely.
request_timeout = 120

# Output file (one JSON object per line). It is opened in append mode, so
# re-running the script adds to whatever an earlier run already wrote.
# NOTE(review): this path points at the filesystem root - adjust as needed.
output_path = '/embedding_results.json'

# Per-batch successful results and the keys of every document that failed.
success_results = []
failed_keys = []

# Process the documents in batches of at most batch_size rows.
batch_size = 50  # tune to your needs
for start_idx in range(0, len(df), batch_size):
    # Rows belonging to the current batch.
    batch_df = df.iloc[start_idx:start_idx + batch_size]
    # Index of the last row actually present; the final batch may be short.
    end_idx = start_idx + len(batch_df) - 1

    batch_keys = batch_df['key'].tolist()
    input_texts = batch_df['content_cutted'].tolist()

    payload = {
        "model": model_name,
        "input": input_texts,
    }

    # Any failure (network error, timeout, non-JSON body, malformed payload)
    # marks the whole batch as failed and the run continues with the next one.
    try:
        response = requests.post(
            url, json=payload, headers=headers, timeout=request_timeout
        )
        response_data = response.json()

        if response.status_code == 200 and 'data' in response_data:
            embeddings = [item['embedding'] for item in response_data['data']]
            # zip() would silently truncate on a count mismatch and leave
            # documents without an embedding, so fail the batch instead.
            if len(embeddings) != len(batch_keys):
                raise ValueError(
                    f"返回的嵌入数量({len(embeddings)})与文档数量({len(batch_keys)})不一致"
                )
            for key, embedding in zip(batch_keys, embeddings):
                success_results.append({"key": key, "embedding": embedding})
            print(f"成功处理批次: {start_idx}-{end_idx}")
        else:
            # The API answered but did not return embeddings.
            failed_keys.extend(batch_keys)
            print(f"处理失败批次: {start_idx}-{end_idx}, 错误信息: {response_data}")

    except Exception as e:
        failed_keys.extend(batch_keys)
        print(f"请求失败批次: {start_idx}-{end_idx}, 错误信息: {str(e)}")

    # Persist this batch immediately so an interrupted run keeps its progress.
    if success_results:
        with open(output_path, 'a', encoding='utf-8') as f:
            f.writelines(json.dumps(result) + '\n' for result in success_results)

    # Start the next batch with an empty result list.
    success_results = []

    # Pause between requests to avoid hitting the API rate limit.
    time.sleep(1)

# Summary: only the number of failed documents is reported.
if failed_keys:
    print(f"处理失败的文档数量: {len(failed_keys)}")
else:
    print("所有文档均已成功处理！")

print("嵌入处理完毕，失败的文档key已输出。")