# OpenAI Model Prompting

This notebook provides code for manually prompting the OpenAI models in batches.
The script `prompt_openai_model.py` provides a shortened version of this code that can be run from the console (compatible with the `nohup` command).

In [None]:
%%capture output
%run ./04_prompt_creation.ipynb

In [None]:
import pandas as pd
import os
import json
import time

from openai import OpenAI

## Prompting Options

In [None]:
# OpenAI model identifiers that can be prompted; `model` selects the one used for this run.
models = ["gpt-3.5-turbo-0125", "gpt-4.1-nano-2025-04-14", "gpt-4.1-mini-2025-04-14", "gpt-4.1-2025-04-14"]
model = models[3]

# Run options. `chunking` and `only_text` select the input dataframe below;
# together with `ai_prompt` and `suit_prompt` they also determine the output
# directories used by later cells.
chunking = "1024_20"
only_text = True
ai_prompt = False
suit_prompt = True

# Location of the pickled input dataframe for the selected options.
path = f"../data/dfs/{'only_text_' if only_text else ''}{chunking}/ReferenceErrorDetection_data_with_chunk_info.pkl"
print(path)

# read the dataframe from a pickle file
df = pd.read_pickle(path)

## Creating Batch Files

In [None]:
# Character length of every prompt written by create_batch_files (appended across calls).
prompt_chars = []

def create_batch_files(df, model, number_files=1, ignore_ids=None, ai_prompt=False, suit_prompt=False):
    """Write one Batch-API chat-completion request per eligible row into JSONL files.

    A row is eligible when its 'Reference Article Downloaded' value is 'Yes' and
    its index is not listed in ``ignore_ids``. The request for a row is written
    to the file at position ``index % number_files``, so the dataframe index
    must be integer-like.

    All regular files already present in the output directory are deleted first.
    The length of each generated prompt is appended to the module-level
    ``prompt_chars`` list.

    Args:
        df: Dataframe whose rows are passed to ``create_prompt``.
        model: Model name placed in each request body and in the output path.
        number_files: Number of JSONL files to distribute the requests over.
        ignore_ids: Iterable of dataframe indices to skip (default: none).
        ai_prompt: Adds an 'AI_prompt' component to the output directory.
        suit_prompt: Adds a 'suit_prompt' component to the output directory.
            Within this function both flags only affect the output directory.

    Returns:
        Paths of the batch files that contain at least one request.

    Raises:
        ValueError: If ``number_files`` is smaller than 1.
    """
    from contextlib import ExitStack

    if number_files < 1:
        raise ValueError("number_files must be at least 1")

    # A set gives constant-time membership checks; None replaces the mutable default.
    ignored = set(ignore_ids) if ignore_ids is not None else set()

    output_dir = f"../data/batch_files/{'only_text_' if only_text else ''}{chunking}/{model}{'/AI_prompt/' if ai_prompt else ''}{'/suit_prompt/' if suit_prompt else ''}"
    # Empty the folder if it exists
    if os.path.exists(output_dir):
        for filename in os.listdir(output_dir):
            file_path = os.path.join(output_dir, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
    os.makedirs(output_dir, exist_ok=True)

    output_files = [
        os.path.join(output_dir, f"prompt_batch_{i}.jsonl")
        for i in range(number_files)
    ]

    # Open every output file once (truncating it) instead of reopening it per request.
    with ExitStack() as stack:
        handles = [stack.enter_context(open(output_file, "w")) for output_file in output_files]

        for index, row in df.iterrows():
            if row['Reference Article Downloaded'] != 'Yes' or index in ignored:
                continue

            prompt = create_prompt(row)
            prompt_chars.append(len(prompt))

            json_sequence = {
                "custom_id": f"request-{index}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": [
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    "temperature": 0,
                }
            }

            handles[index % number_files].write(json.dumps(json_sequence) + "\n")

    # Files are closed (and flushed) here, so their sizes are reliable.
    # Remove empty output files from list
    return [output_file for output_file in output_files if os.path.getsize(output_file) > 0]

In [None]:
# Directory and file in which the responses for the current configuration are accumulated.
responses_dir = f"../data/batch_responses/{'only_text_' if only_text else ''}{chunking}{'/AI_prompt/' if ai_prompt else ''}{'/suit_prompt/' if suit_prompt else ''}"
os.makedirs(responses_dir, exist_ok=True)
responses_dict_path = f"{responses_dir}/{model}_responses_dict_batch.json"

# Load responses saved by a previous run, if any. JSON object keys are always
# strings, so they are converted back to integer ids here; this keeps the keys
# consistent with the integer ids under which new responses are stored later.
responses_dict = {}
try:
    with open(responses_dict_path, 'r') as file:
        responses_dict = {int(key): value for key, value in json.load(file).items()}
except FileNotFoundError:
    pass

# Ids that already have a saved response and do not need to be prompted again.
ids_to_ignore = list(responses_dict)

print(ids_to_ignore)

In [None]:
# Distribute the pending requests over up to 5 batch files, skipping the ids in
# `ids_to_ignore`; the returned list contains only the non-empty files.
batch_file_paths = create_batch_files(df, model, 5, ids_to_ignore, ai_prompt=ai_prompt, suit_prompt=suit_prompt)
batch_file_paths

## Batched Model Prompting

Only one batch request is sent at a time, and its completion is awaited before the next batch is started. Otherwise, the OpenAI API token limit can easily be exceeded, which leads to the cancellation of all requests and a required waiting period before trying again.

This can make prompting the OpenAI models slow, but this sequential batched processing is still faster (and cheaper) than prompting each citation separately.

An OpenAI API key needs to be generated and put into a file called `open_ai_key.txt` in the parent directory of this notebook for the following code to work.

In [None]:
# Read the content of open_ai_key.txt (one directory above this notebook) into a
# variable; surrounding whitespace such as a trailing newline is stripped.
with open('../open_ai_key.txt', 'r') as file:
    open_ai_key = file.read().strip()

In [None]:
def check_batch(batch_id, client):
    """Retrieve a batch once, print its status, and report whether it has finished.

    Args:
        batch_id: Id of the batch to look up.
        client: Client object providing ``batches.retrieve(batch_id)``.

    Returns:
        The retrieved batch if it is in a terminal state ('completed', 'failed',
        'expired' or 'cancelled'), otherwise ``None``.
    """
    batch = client.batches.retrieve(batch_id)
    print(f"{batch_id} - Current status: {batch.status}")
    if batch.status == 'in_progress':
        print(f"{batch.request_counts.completed} / {batch.request_counts.total} completed")

    # 'expired' and 'cancelled' are terminal as well; treating them as still
    # running would make callers that poll until a batch is returned wait forever.
    if batch.status in ('completed', 'failed', 'expired', 'cancelled'):
        return batch
    return None

In [None]:
# Module-level state shared with prompt_model_in_batches(), which declares these
# three names as globals. Re-running this cell resets them to empty lists.
batch_input_files = []
batch_creation_responses = []
batches = []

# API client used by the batch cells below, authenticated with the key read above.
client = OpenAI(api_key=open_ai_key)

def prompt_model_in_batches():
    """Upload each batch file and run the resulting batch jobs one after another.

    Reads the module-level ``batch_file_paths`` and ``client``. For every
    non-empty file it uploads the file, starts a batch job for the
    '/v1/chat/completions' endpoint, and polls ``check_batch`` every 60 seconds
    until a batch is returned.

    Results are recorded in the module-level lists ``batch_input_files``,
    ``batch_creation_responses`` and ``batches`` (completed batches only). The
    lists are only appended to, so no ``global`` declaration is needed.

    Processing stops at the first batch that finishes in a state other than
    'completed'; the remaining files are not submitted.
    """
    for batch_file_path in batch_file_paths:
        # Creating input file
        if os.stat(batch_file_path).st_size == 0:
            print(f"Skipping empty file: {batch_file_path}")
            continue
        # The context manager closes the handle once the upload call returns.
        with open(batch_file_path, "rb") as batch_file:
            batch_input_file = client.files.create(
                file=batch_file,
                purpose="batch"
            )
        print(batch_input_file)
        batch_input_files.append(batch_input_file)

        # Starting batch job
        batch_creation_response = client.batches.create(
            input_file_id=batch_input_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h"
        )
        batch_creation_responses.append(batch_creation_response)
        batch_id = batch_creation_response.id
        print("Started: " + batch_id)

        time.sleep(5)
        # Check the status of the created batch until it is finished
        while True:
            batch = check_batch(batch_id, client)
            if batch:
                break
            time.sleep(60)

        if batch.status != "completed":
            # Do not submit further batches after an unsuccessful one.
            print(f"Batch {batch_id} ended with status '{batch.status}'; stopping.")
            return
        batches.append(batch)

In [None]:
%%time

prompt_model_in_batches()

## Checking Open Batches

The following code can be used to view currently running batches and cancel them if needed.

In [None]:
client = OpenAI(api_key=open_ai_key)

# NOTE: despite its name, `current_millis` holds the current Unix time in
# seconds (time.time() truncated to an int); `recently` is the time 24 hours ago.
current_millis = int(time.time())
recently = current_millis - 24 * 60 * 60

# Keep the batches created within the last 24 hours, regardless of their status,
# and collect the ids of those that are currently 'in_progress'.
# NOTE(review): assumes `created_at` is a Unix timestamp in seconds - confirm.
open_batches = client.batches.list()
relevant_open_batches = [batch for batch in open_batches if batch.created_at >= recently]
in_progress_batch_ids = [batch.id for batch in relevant_open_batches if batch.status == 'in_progress']

In [None]:
in_progress_batch_ids

In [None]:
list(relevant_open_batches)

In [None]:
client.files.content(relevant_open_batches[0].output_file_id).text

In [None]:
# client.batches.cancel("batch_67e3cf592eb081908cd64e5e1dc55fa0")

## Waiting for Batch Completion

The following function can be used to wait for an existing batch to be finished by polling for the batch status with a given interval.

In [None]:
def wait_for_batch_completion(batch_id, client, interval=10):
    """Block until ``check_batch`` reports the batch as finished.

    Args:
        batch_id: Id of the batch to wait for.
        client: Client object passed through to ``check_batch``.
        interval: Seconds to sleep between two status checks.

    Returns:
        The batch object returned by ``check_batch``.
    """
    while True:
        batch = check_batch(batch_id, client)
        # Identity check: only a literal None means "not finished yet".
        if batch is not None:
            return batch
        time.sleep(interval)

In [None]:
batch = wait_for_batch_completion("batch_68efa5f819d4819081b526560a95616b", client, interval=60)

In [None]:
batch = check_batch("batch_68efa5f819d4819081b526560a95616b", client)

In [None]:
batches = [batch]

## Save model_responses

The responses of the completed batches are saved to a JSON file in the folder `data/batch_responses`.
Additionally, each response is added to its corresponding row of the dataframe.

In [None]:
# Save the responses of completed batches into responses_dict, keyed by the
# integer id taken from each response's custom_id ("request-<id>").
try:
    responses_dict
except NameError:
    # The cell that loads previously saved responses has not been run.
    responses_dict = {}

for batch in batches:
    # `batches` may contain None (check_batch returns None for unfinished batches).
    if batch is None or batch.status != "completed":
        continue
    if not batch.output_file_id:
        print(f"Batch {batch.id} has no output file; skipping.")
        continue
    model_responses = client.files.content(batch.output_file_id).text

    # Parse the model_responses (one JSON object per line) into a list of objects
    responses_list = [json.loads(line) for line in model_responses.splitlines() if line.strip()]

    for response in responses_list:
        responses_dict[int(response['custom_id'].split('-')[1])] = response

# Sort once after all batches have been merged instead of after every response.
responses_dict = dict(sorted(responses_dict.items(), key=lambda item: int(item[0])))

In [None]:
# Save responses_dict to a JSON file (overwrites the existing file).
# Integer keys are written as strings, because JSON object keys are always strings.
with open(responses_dict_path, 'w') as file:
    json.dump(responses_dict, file, indent=4)

### Save Responses to Dataframe

In [None]:
# Create a new column in the dataframe to store the responses
if 'Model Classification' not in df.columns:
    df['Model Classification'] = None

# Rows for which no usable response is available; they keep their current value.
missing_ids = []

# Iterate through the dataframe
for index, row in df.iterrows():
    if row['Reference Article Downloaded'] != 'Yes':
        continue

    # Keys are ints for freshly added responses and may be strings when the
    # dictionary was loaded from JSON, so both forms are tried.
    entry = responses_dict.get(index)
    if entry is None:
        entry = responses_dict.get(str(index))

    try:
        model_response = entry['response']['body']['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        # No response for this row, or the request did not yield a message.
        missing_ids.append(index)
        continue

    # Save the response to the new column
    df.at[index, 'Model Classification'] = model_response

if missing_ids:
    print(f"No usable response for {len(missing_ids)} row(s): {missing_ids}")

In [None]:
# Store the dataframe with the model responses as pickle and Excel files.
# NOTE(review): unlike the batch-file and batch-response paths, this path does
# not include `suit_prompt`, so runs with and without it write to the same
# files - confirm whether that is intended.
dfs_path = f"../data/dfs/{'only_text_' if only_text else ''}{chunking}/{model}/{'AI_prompt/' if ai_prompt else ''}"
os.makedirs(dfs_path, exist_ok=True)
df.to_pickle(f"{dfs_path}ReferenceErrorDetection_data_with_prompt_results.pkl")
df.to_excel(f"{dfs_path}ReferenceErrorDetection_data_with_prompt_results.xlsx", index=False)