In [1]:
import json
import pandas as pd
from tqdm import tqdm
# Dataset name; it is reused to derive the input and output file names.
dataset = 'Yelp'
# Keep only the first two CSV columns and label them user_id / item_id.
df = pd.read_csv(
    f'csv/{dataset}.csv',
    usecols=[0, 1],
    names=['user_id', 'item_id'],
)

In [2]:
# Load the item-feature lookup for this dataset; it is indexed by str(item_id)
# when the user histories are assembled.
with open(f'{dataset}_feature.json', mode='r') as feature_fp:
    feature = json.loads(feature_fp.read())

In [4]:
# Largest user id in the interaction table; the history loop iterates range(index_u + 1).
index_u=df['user_id'].max()

In [None]:
# Build the history text of every user (the input for the preference prompt).
# Group the interactions once instead of re-filtering the whole frame for each
# user id; groupby keeps the original row order inside each group, so the slice
# below selects the same items a per-user boolean mask would.
items_by_user = {
    uid: items.values for uid, items in df.groupby('user_id')['item_id']
}

user_h = {}
for user in tqdm(range(index_u + 1)):
    # Up to 10 items immediately preceding the user's last 2 interactions
    # (the last 2 are excluded -- presumably held out for evaluation; confirm).
    # A user id with no rows gets an empty history, i.e. an empty string.
    history = items_by_user.get(user, ())[-(10 + 2):-2]
    user_h[user] = '\n\n'.join(feature[str(h)] for h in history)

In [None]:
# Number of users for which a history string was built.
len(user_h)

In [None]:
# Spot-check: show the joined history text stored for user id 19.
print(user_h[19])

In [None]:
# Instruction prefix; each user's history text is appended right after "History:".
prompt = """Based on the titles and features of the items the user has interacted with in chronological order, summarize the user's preferences directly as concise and precise keywords, separated by commas, without any additional explanation.
Example output: Culture & History, Seafood Cuisine, Japanese Cuisine, Gardening Enthusiast
History:\n"""


def _build_request(uid, u_history):
    """Return one batch request (a chat-completions POST) for a single user."""
    return {
        "custom_id": str(uid),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt + u_history},
            ],
        },
    }


def _flush_requests(entries, index):
    """Write ``entries`` as JSON Lines to the ``index``-th request file."""
    with open(f'{dataset}_userrequest_{index}.jsonl', 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry, ensure_ascii=False) + '\n' for entry in entries)


data_list = []
file_count = 1

for uid, u_history in tqdm(user_h.items()):
    data_list.append(_build_request(uid, u_history))
    # Keep accumulating until a chunk of 30000 requests is complete.
    if len(data_list) < 30000:
        continue
    _flush_requests(data_list, file_count)
    file_count += 1
    data_list = []

# Write whatever remains after the last full chunk.
if data_list:
    _flush_requests(data_list, file_count)

In [2]:
import os
from pathlib import Path
from openai import OpenAI

# Read the API key from the environment instead of hardcoding it in the
# notebook. An empty-string key is sent as-is and fails authentication, whereas
# a missing key (None) makes the client report the problem at construction time.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)


In [6]:
# Sequence number ("xuhao") of the request/result file handled by the cells below.
xuhao=1

In [None]:

# Upload one of the request files written by the request-building cell, which
# names them f'{dataset}_userrequest_{n}.jsonl'; xuhao selects which one.
file_object = client.files.create(
    file=Path(f"{dataset}_userrequest_{xuhao}.jsonl"),
    purpose="batch",
)

print(file_object.model_dump_json())
print(file_object.id)  # id of the uploaded file
file_object_id = file_object.id

In [None]:
# Submit a Batch job over the uploaded request file.
batch = client.batches.create(
    completion_window="24h",  # per the original note: only 24h is supported; the job times out after that
    endpoint="/v1/chat/completions",  # fixed value for chat-completion batches
    input_file_id=file_object_id,  # id returned by the file upload
)
batch_id = batch.id
print(batch)
print(batch_id)  # id of the Batch job

In [None]:
# Query the Batch job. batch_id is set by the submission cell; to check a job
# from an earlier session, assign its saved id to batch_id before running this.
batch = client.batches.retrieve(batch_id)
print(batch)
print(batch.error_file_id)
print(batch.output_file_id)  # id of the output file
error_file_id = batch.error_file_id
output_file_id = batch.output_file_id

In [None]:
# error_file_id is None when the batch has produced no error file (no request
# failed, or the batch has not finished), so only download when one exists.
if error_file_id is None:
    print("No error file is available for this batch (no failed requests, or the batch has not finished).")
else:
    content = client.files.content(file_id=error_file_id)
    print(content.text)
    content.write_to_file(f"resulterror_{xuhao}.jsonl")

In [None]:
# Download the batch output, echo it, and save it under the current sequence number.
result_path = f"result_{xuhao}.jsonl"
content = client.files.content(file_id=output_file_id)
print(content.text)
content.write_to_file(result_path)

In [None]:
import json

def parse_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of the ``.jsonl`` file to read (opened as UTF-8).

    Returns:
        list: One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank / whitespace-only lines (e.g. an empty line at the end of
        # the file), which json.loads would otherwise reject.
        return [json.loads(line) for line in file if line.strip()]

def extract_custom_id_and_content(parsed_data, extracted_data):
    """Copy each record's first-choice message text into ``extracted_data``.

    Args:
        parsed_data: Iterable of decoded result records (dicts). Each must hold
            ``response -> body -> choices[0] -> message -> content``.
        extracted_data: Dict updated in place, keyed by the record's
            ``custom_id`` (``None`` when the record has no such key). A later
            record with the same key overwrites the earlier value.

    Raises:
        KeyError, IndexError, TypeError: If a record lacks the expected nested
            structure; records processed before it stay in ``extracted_data``.
    """
    for record in parsed_data:
        key = record.get("custom_id")
        first_choice = record["response"]["body"]["choices"][0]
        extracted_data[key] = first_choice["message"]["content"]

def save_to_json(data, output_path):
    """Serialize ``data`` as indented JSON and write it to ``output_path``.

    Args:
        data: JSON-serializable object to store.
        output_path: Destination file; created or overwritten, written as UTF-8.

    Non-ASCII characters are written verbatim (not ``\\u``-escaped) and nested
    levels are indented by 4 spaces.
    """
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write(json.dumps(data, ensure_ascii=False, indent=4))

# The three downloaded result files to merge: result_1.jsonl .. result_3.jsonl.
jsonl_files = [f'result_{i}.jsonl' for i in range(1, 4)]

# custom_id -> generated text, accumulated across all result files.
all_extracted_data = {}
for jsonl_file in jsonl_files:
    extract_custom_id_and_content(parse_jsonl(jsonl_file), all_extracted_data)

output_file_path = 'merged_data.json'
save_to_json(all_extracted_data, output_file_path)

print(f"合并的数据已保存到 {output_file_path}")