# Use the OpenAI Batch API to generate reasons and save money

In [None]:
import requests
import json

# Set up your API key and common headers
API_KEY = ""
HEADERS = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json",
    "User-Agent": "OpenAI Python"
}

def upload_batch_file(file_path):
    """
    Uploads the input file for the batch processing job.

    Args:
        file_path: Path of the local file to upload; it is also sent as the
            file name in the multipart form.

    Returns:
        The ``id`` field of the JSON response, or None (after printing the
        response) when the response contains no ``id``.
    """
    url = "https://api.openai.com/v1/files"
    # Only the Authorization header is passed here (not the shared HEADERS,
    # which pin Content-Type to JSON) because this is a multipart upload.
    auth_headers = {"Authorization": f"Bearer {API_KEY}"}
    # The context manager guarantees the file handle is closed, even when the
    # request raises.
    with open(file_path, 'rb') as batch_file:
        files = {
            'purpose': (None, 'batch'),
            'file': (file_path, batch_file),
        }
        response = requests.post(url, headers=auth_headers, files=files)
    response_data = response.json()
    try:
        return response_data['id']
    except KeyError:
        print("Error uploading file:", response_data)
        return None

def create_batch(input_file_id, endpoint, completion_window="24h", metadata=None):
    """
    Submits a batch job for a previously uploaded input file.

    Returns the ``id`` field of the JSON response, or None (after printing
    the response) when the response contains no ``id``.
    """
    payload = {
        "input_file_id": input_file_id,
        "endpoint": endpoint,
        "completion_window": completion_window,
    }
    # Falsy metadata (None or empty) is left out of the request body.
    if metadata:
        payload["metadata"] = metadata
    reply = requests.post(
        "https://api.openai.com/v1/batches",
        headers=HEADERS,
        data=json.dumps(payload),
    ).json()
    try:
        return reply['id']
    except KeyError:
        print("Error creating batch:", reply)
        return None

def check_batch_status(batch_id):
    """
    Fetches the record of the given batch job and returns the decoded JSON
    response.
    """
    return requests.get(
        f"https://api.openai.com/v1/batches/{batch_id}",
        headers=HEADERS,
    ).json()

def retrieve_batch_results(output_file_id, output_file_path):
    """
    Downloads the content of a batch output file and writes it to
    ``output_file_path``.

    On any status code other than 200 the decoded JSON response is printed
    and nothing is written.
    """
    content_url = f"https://api.openai.com/v1/files/{output_file_id}/content"
    response = requests.get(content_url, headers=HEADERS, stream=True)
    if response.status_code != 200:
        print("Error retrieving results:", response.json())
        return
    # Stream the body to disk in 8 KiB chunks.
    with open(output_file_path, 'wb') as out_file:
        for chunk in response.iter_content(chunk_size=8192):
            out_file.write(chunk)
    print(f"Results saved to {output_file_path}")

def cancel_batch(batch_id):
    """
    Requests cancellation of the given batch job and returns the decoded
    JSON response.
    """
    return requests.post(
        f"https://api.openai.com/v1/batches/{batch_id}/cancel",
        headers=HEADERS,
    ).json()

def list_batches(limit=10):
    """
    Lists batch processing jobs, passing ``limit`` as a query parameter, and
    returns the decoded JSON response.
    """
    listing_url = f"https://api.openai.com/v1/batches?limit={limit}"
    return requests.get(listing_url, headers=HEADERS).json()


## Convert to Batch API format

In [6]:
# Dataset name used to build the prompt file paths.
datasets_name = "Amazon_Books_small"
# Output-format instruction text, kept as one string split over several lines.
output_format_prompt = (
    "Please give the reasons in the format of list without explaining "
    "and prevent any possible data leakage, "
    "for example do not show the movie name."
)

In [57]:
# Convert the prompt JSON file into a JSONL file in the Batch API request format
import copy
import json
# Skeleton of one Batch API request line; custom_id and messages are
# overwritten for every prompt below.
template = {
    "custom_id": "request-1",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello world!"},
        ],
        "max_tokens": 500,
        "n": 1,
        "temperature": 0,
    },
}

idx = 0  # number of request lines written
output_file_name = f"prompts_for_generating_explanations/get_reasons_for_LLMAPI_{datasets_name}_batchapi.jsonl"
with open(output_file_name, 'w', encoding='utf8') as f_write, \
        open(f"prompts_for_generating_explanations/get_reasons_for_LLMAPI_{datasets_name}.json", 'r', encoding='utf8') as f:
    data = json.load(f)
    # One working copy is reused for every line: each request is serialized
    # immediately after its fields are set.
    tmp_template = copy.deepcopy(template)
    for user_id, user_item_data in data.items():
        for item_id, prompt in user_item_data['prompt'].items():
            # Surface entries holding more than one prompt; only the first
            # one is used for the request.
            if len(prompt) > 1:
                print(prompt)
            tmp_template['custom_id'] = f"{user_id}-{item_id}"
            user_message = {"role": "user", "content": prompt[0] + " " + output_format_prompt}
            tmp_template['body']['messages'] = [user_message]
            f_write.write(json.dumps(tmp_template, ensure_ascii=False) + '\n')
            idx += 1

In [58]:
# The combined request file exceeds the Batch API token limit, so split it
# into chunk files of 5000 lines each.
import os

chunk_size = 5000
with open(output_file_name, 'r', encoding='utf8') as f:
    lines = f.readlines()

# Chunk files live in "<top-level dir>/split/<original file name>". The path
# is computed once, outside the loop, so `new_file_path` is defined even when
# the input file is empty.
new_file_path = output_file_name.split("/")[0]+"/split/"+output_file_name.split("/")[-1]
# Create the target directory up front; open() fails if it does not exist.
os.makedirs(os.path.dirname(new_file_path), exist_ok=True)

for i in range(0, len(lines), chunk_size):
    # The chunk name keeps everything before the first "." of the path and
    # appends the chunk index.
    with open(new_file_path.split(".")[0]+f"_{i//chunk_size}.jsonl", 'w', encoding='utf8') as f_write:
        f_write.writelines(lines[i:i+chunk_size])


## Run the Batch API

In [101]:
import os
# Select chunk 3 of the split request files; its results are stored next to
# it with an "_output" suffix.
processed_file_name = f'{new_file_path.split(".")[0]}_3.jsonl'
output_file_name = f'{processed_file_name.split(".")[0]}_output.jsonl'
print(f"Processing file: {processed_file_name}")
print(f"Output file: {output_file_name}")
# Flag an output file that is already present on disk.
if os.path.exists(output_file_name):
    print(f"Output file already exists: {output_file_name}")

Processing file: prompts_for_generating_explanations/split/get_reasons_for_LLMAPI_Amazon_Books_small_batchapi_3.jsonl
Output file: prompts_for_generating_explanations/split/get_reasons_for_LLMAPI_Amazon_Books_small_batchapi_3_output.jsonl


In [102]:
import sys
sys.path.append('..')
from openai_batch_api import upload_batch_file, list_batches, create_batch, check_batch_status, retrieve_batch_results, cancel_batch

In [103]:
# Upload the selected chunk file. upload_batch_file returns None (after
# printing the API response) when the response carries no file ID.
file_id = upload_batch_file(processed_file_name)
if file_id is None:
    # Stop here instead of reporting "None" as an ID and passing it on to
    # batch creation.
    raise RuntimeError(f"Upload failed for {processed_file_name}")
print(f"Uploaded file name {processed_file_name}")
print(f"Uploaded file ID: {file_id}")

Uploaded file name prompts_for_generating_explanations/split/get_reasons_for_LLMAPI_Amazon_Books_small_batchapi_3.jsonl
Uploaded file ID: file-L8ETpy8PN23Xi9t56xtGbVGt


In [105]:
# Create the batch job for the uploaded file. create_batch returns None
# (after printing the API response) when the response carries no batch ID.
batch_id = create_batch(
    input_file_id=file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)
if batch_id is None:
    # Stop here so later cells do not query or cancel a batch named "None".
    raise RuntimeError(f"Batch creation failed for input file {file_id}")
print(f"Created batch ID: {batch_id}")

Created batch ID: batch_672cbf4dabd0819096148ec3e42ef063


In [107]:
# Fetch the batch's current record and pretty-print it
batch_status = check_batch_status(batch_id)
status_text = json.dumps(batch_status, indent=2)
print(status_text)

{
  "id": "batch_672cbf4dabd0819096148ec3e42ef063",
  "object": "batch",
  "endpoint": "/v1/chat/completions",
  "errors": null,
  "input_file_id": "file-L8ETpy8PN23Xi9t56xtGbVGt",
  "completion_window": "24h",
  "status": "completed",
  "output_file_id": "file-eSLzsQ2Qw2znDXuzUUtVGkXB",
  "error_file_id": null,
  "created_at": 1730985805,
  "in_progress_at": 1730985807,
  "expires_at": 1731072205,
  "finalizing_at": 1730987008,
  "completed_at": 1730987411,
  "failed_at": null,
  "expired_at": null,
  "cancelling_at": null,
  "cancelled_at": null,
  "request_counts": {
    "total": 4844,
    "completed": 4844,
    "failed": 0
  },
  "metadata": null
}


In [108]:
# Download the results only if the batch has completed. This is a single
# status check, not a polling loop: re-run the cell until it succeeds.
batch_status = check_batch_status(batch_id)
if batch_status.get('status') != 'completed':
    print(batch_status.get('status'))
    print("Batch is not yet completed.")
else:
    output_file_id = batch_status.get('output_file_id')
    retrieve_batch_results(output_file_id, output_file_name)

Results saved to prompts_for_generating_explanations/split/get_reasons_for_LLMAPI_Amazon_Books_small_batchapi_3_output.jsonl


In [56]:
# Request cancellation of the current batch and pretty-print the reply
cancel_response = cancel_batch(batch_id)
cancel_text = json.dumps(cancel_response, indent=2)
print(cancel_text)

{
  "id": "batch_672c9106637081909b03021ac330604d",
  "object": "batch",
  "endpoint": "/v1/chat/completions",
  "errors": null,
  "input_file_id": "file-iQKoE2DYoYE6uiVzDzeTkZ1z",
  "completion_window": "24h",
  "status": "cancelling",
  "output_file_id": null,
  "error_file_id": null,
  "created_at": 1730973958,
  "in_progress_at": 1730973959,
  "expires_at": 1731060358,
  "finalizing_at": null,
  "completed_at": null,
  "failed_at": null,
  "expired_at": null,
  "cancelling_at": 1730974479,
  "cancelled_at": null,
  "request_counts": {
    "total": 5000,
    "completed": 4240,
    "failed": 0
  },
  "metadata": null
}


In [22]:
# List up to 10 batch jobs and pretty-print the reply
batches = list_batches(limit=10)
batches_text = json.dumps(batches, indent=2)
print(batches_text)

{
  "object": "list",
  "data": [
    {
      "id": "batch_672c813c746881908f033c65391ae742",
      "object": "batch",
      "endpoint": "/v1/chat/completions",
      "errors": {
        "object": "list",
        "data": [
          {
            "code": "token_limit_exceeded",
            "message": "Enqueued token limit reached for gpt-4o-mini-2024-07-18 in organization org-onfIZYZQRYpQGMvatBPCy5kn. Limit: 2,000,000 enqueued tokens. Please try again once some in_progress batches have been completed.",
            "param": null,
            "line": null
          }
        ]
      },
      "input_file_id": "file-Dzt1ddESddlwSpETeC7hlk61",
      "completion_window": "24h",
      "status": "failed",
      "output_file_id": null,
      "error_file_id": null,
      "created_at": 1730969916,
      "in_progress_at": null,
      "expires_at": 1731056316,
      "finalizing_at": null,
      "completed_at": null,
      "failed_at": 1730969921,
      "expired_at": null,
      "cancelling_at": nu

## Convert to the same format

In [4]:
import json
def get_user_item_reasons_pair(raw_batch_data: dict):
    """
    Extract the custom ID and the generated text from one batch output record.

    Returns a ``(custom_id, content)`` tuple, where ``content`` is the message
    content of the first choice in the record's response body.
    """
    custom_id = raw_batch_data["custom_id"]
    first_message = raw_batch_data["response"]["body"]["choices"][0]["message"]
    return custom_id, first_message["content"]

# Merge the per-chunk batch outputs (chunks 0-3) into one file with a JSON
# object {"union_id", "reasons"} per line.
idx = 0  # number of records written
# The context manager closes the writer when the loop ends, so buffered
# records are flushed to disk.
with open("openai_reasons_cache/" + "openai_reasons_amazonbooks.txt", 'w', encoding='utf8') as out_writter:
    for i in range(4):
        raw_batch_file_path = f"prompts_for_generating_explanations/split/get_reasons_for_LLMAPI_Amazon_Books_small_batchapi_{i}_output.jsonl"
        with open(raw_batch_file_path, 'r', encoding='utf8') as f:
            for line in f:
                # Skip blank lines.
                if not line.strip():
                    continue
                raw_batch_data = json.loads(line)
                union_ids, reasons = get_user_item_reasons_pair(raw_batch_data)
                out_writter.write(json.dumps({"union_id":union_ids, "reasons":reasons}, ensure_ascii=False)+"\n")
                idx += 1
print(idx)

19844
