# Prompting the models to classify the statements

## GPT 3.5 and GPT 4.1

In [1]:
%%capture output
%run ./04_prompt_creation.ipynb

In [2]:
import pandas as pd

# Run configuration shared by every later cell of this notebook.
# Chunking variant; it selects the data sub-folder below.
# NOTE(review): presumably "<chunk size>_<overlap>" — confirm against the chunking notebook.
chunking = "1024_20"
# When True, the 'only_text_' prefixed data folders are used.
only_text = True
# Prompt variant switches read by create_batch_files: ai_prompt takes precedence,
# suit_prompt is only consulted when ai_prompt is False.
ai_prompt = False
suit_prompt = True

path = f"../data/dfs/{'only_text_' if only_text else ''}{chunking}/ReferenceErrorDetection_data_with_chunk_info.pkl"
print(path)

# read the dataframe from a pickle file
# NOTE(review): unpickling executes arbitrary code — only load pickles produced by this project.
df = pd.read_pickle(path)

../data/dfs/only_text_1024_20/ReferenceErrorDetection_data_with_chunk_info.pkl


In [3]:
df.head()

Unnamed: 0,Source,Citing Article ID,Citing Article DOI,Citing Article Title,Citing Article Retracted,Citing Article Downloaded,Domain,Citation ID,Statement with Citation,Corrected Statement,...,Reference Article PDF Available,Reference Article Retracted,Reference Article Downloaded,Label,Explanation,Error Type,Added,Previously Partially Substantiated,Top_3_Chunk_IDs,Top_3_Chunk_Texts
0,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,cit001_1,Others have aimed to reduce irreversibility or...,Others have aimed to reduce irreversibility or...,...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,,"[2d702f6c-a1b1-4e0d-bc5b-efc4a257f7e3, 55221b8...","[-en, maintenance personnels can check the mot..."
1,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,cit001_2,Some researchers have also studied various hea...,Some researchers have also studied various hea...,...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,,"[118c524c-3f14-4fe2-8d06-ce6d6570e788, d5cf0f4...",[Introduction\nThe mixture composed of nanopar...
2,PubPeer,c002,10.1155/2022/4601350,Oxidative Potential and Nanoantioxidant Activi...,Yes,Yes,Chemistry,cit002_1,The relative content of total flavonoids in th...,The relative content of total flavonoids in th...,...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,,"[cb961e2a-5ede-4186-b1ec-33297d140cdd, 9d6cf7b...",[This is the simple industrial flow. The strip...
3,PubPeer,c003,10.1155/2022/2408685,The Choice of Anesthetic Drugs in Outpatient H...,Yes,Yes,Medicine,cit003_1,Research has shown that remimazolam tosylate e...,Research has shown that remimazolam tosylate e...,...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,,"[d18a377e-c8dc-47ab-988f-0f8655f1fdc4, cd058e6...",[Low perioperative levels of NK activity are a...
4,PubPeer,c004,10.1155/2022/4783847,A Fault-Tolerant Structure for Nano-Power Comm...,Yes,Yes,Engineering,cit004_1,if the efficiency of the routing algorithm is ...,If the efficiency of the routing algorithm is ...,...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,,"[dba1e500-ca49-4b33-8d1d-106beafbf1b3, a230068...","[As can be seen from the figure, the most freq..."


### Prompting the models (batch processing)

In [4]:
import os
import json

prompt_chars = []

def create_batch_files(df, model, number_files=1, ignore_ids=None, ai_prompt=False, suit_prompt=False):
    """Write the chat-completion requests for `df` into JSONL batch input files.

    One request is generated for every row whose 'Reference Article Downloaded'
    value is 'Yes' and whose index is not listed in `ignore_ids`. Requests are
    distributed over the files by `index % number_files`, so the dataframe
    index must be integer-like.

    The output directory is derived from the notebook-level `only_text` and
    `chunking` settings plus `model` and the prompt variant. Existing files in
    that directory are deleted first.

    Args:
        df: Dataframe with one citation per row.
        model: Model name stored in each request body and used in the path.
        number_files: Number of JSONL files to spread the requests over (>= 1).
        ignore_ids: Optional iterable of dataframe indices to skip.
        ai_prompt: Build prompts with `create_prompt_ai_improved`.
        suit_prompt: Build prompts with `create_prompt_suitability`
            (only consulted when `ai_prompt` is False).

    Returns:
        The paths of the files that ended up non-empty.

    Raises:
        ValueError: If `number_files` is smaller than 1.

    Side effects:
        Appends the character length of every generated prompt to the global
        `prompt_chars` list.
    """
    global prompt_chars

    # Validate before touching the file system so a bad argument cannot wipe the directory.
    if number_files < 1:
        raise ValueError("number_files must be at least 1")

    # A set gives constant-time membership tests; None replaces the mutable default list.
    skip_ids = set(ignore_ids) if ignore_ids is not None else set()

    output_dir = f"../data/batch_files/{'only_text_' if only_text else ''}{chunking}/{model}{'/AI_prompt/' if ai_prompt else ''}{'/suit_prompt/' if suit_prompt else ''}"
    # Empty the folder if it exists
    if os.path.exists(output_dir):
        for filename in os.listdir(output_dir):
            file_path = os.path.join(output_dir, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
    os.makedirs(output_dir, exist_ok=True)

    output_files = [os.path.join(output_dir, f"prompt_batch_{i}.jsonl") for i in range(number_files)]
    # Requests are collected per target file and written once at the end,
    # instead of reopening a file for every single row.
    lines_per_file = [[] for _ in output_files]

    for index, row in df.iterrows():
        if row['Reference Article Downloaded'] != 'Yes' or index in skip_ids:
            continue

        if ai_prompt:
            prompt = create_prompt_ai_improved(row)
        elif suit_prompt:
            prompt = create_prompt_suitability(row)
        else:
            prompt = create_prompt(row)

        prompt_chars.append(len(prompt))

        json_sequence = {
            # The dataframe index is recoverable from the custom_id when responses come back.
            "custom_id": f"request-{index}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                "temperature": 0,
            }
        }
        lines_per_file[index % number_files].append(json.dumps(json_sequence) + "\n")

    # Every file is (re)created, even when empty, matching the previous on-disk result.
    for output_file, lines in zip(output_files, lines_per_file):
        with open(output_file, "w") as f:
            f.writelines(lines)

    # Remove empty output files from list
    return [file for file in output_files if os.path.getsize(file) > 0]

In [5]:
import json

# Model snapshots that can be prompted; the entry at position 3 is used for this run.
models = [
    "gpt-3.5-turbo-0125",
    "gpt-4.1-nano-2025-04-14",
    "gpt-4.1-mini-2025-04-14",
    "gpt-4.1-2025-04-14",
]
model = models[3]

# Folder with the collected batch responses for the current configuration.
responses_dir = f"../data/batch_responses/{'only_text_' if only_text else ''}{chunking}{'/AI_prompt/' if ai_prompt else ''}{'/suit_prompt/' if suit_prompt else ''}"
os.makedirs(responses_dir, exist_ok=True)
responses_dict_path = f"{responses_dir}/{model}_responses_dict_batch.json"

# Reuse responses saved by an earlier run; their ids are excluded from new batch files.
responses_dict = {}
try:
    with open(responses_dict_path, 'r') as file:
        responses_dict = json.load(file)
except FileNotFoundError:
    # Nothing saved yet for this model/configuration: keep the empty dict.
    pass

# JSON object keys are strings, so convert them back to integer row ids.
ids_to_ignore = [int(key) for key in responses_dict.keys()]

print(ids_to_ignore)

[]


In [None]:
batch_file_paths = create_batch_files(df, model, 5, ids_to_ignore, ai_prompt=ai_prompt, suit_prompt=suit_prompt)
batch_file_paths

['../data/batch_files/only_text_1024_20/gpt-4.1-2025-04-14/suit_prompt/prompt_batch_0.jsonl',
 '../data/batch_files/only_text_1024_20/gpt-4.1-2025-04-14/suit_prompt/prompt_batch_1.jsonl',
 '../data/batch_files/only_text_1024_20/gpt-4.1-2025-04-14/suit_prompt/prompt_batch_2.jsonl',
 '../data/batch_files/only_text_1024_20/gpt-4.1-2025-04-14/suit_prompt/prompt_batch_3.jsonl',
 '../data/batch_files/only_text_1024_20/gpt-4.1-2025-04-14/suit_prompt/prompt_batch_4.jsonl']

In [8]:
# Prompt length statistics in characters: (shortest, longest, mean).
# prompt_chars stays empty when create_batch_files skipped every row
# (e.g. all ids are already in ids_to_ignore), so guard min()/max()/division.
prompt_chars.sort()
(min(prompt_chars), max(prompt_chars), sum(prompt_chars) / len(prompt_chars)) if prompt_chars else None

(4614, 21384, 15754.493927125506)

In [9]:
# Read the OpenAI API key from ../open_ai_key.txt into open_ai_key.
# strip() removes surrounding whitespace such as a trailing newline.
# NOTE(review): make sure this key file is excluded from version control.
with open('../open_ai_key.txt', 'r') as file:
    open_ai_key = file.read().strip()

In [25]:
def check_batch(batch_id, client):
    """Fetch a batch once, print its status and report whether it has finished.

    Args:
        batch_id: Id of the batch to look up.
        client: OpenAI client exposing `batches.retrieve`.

    Returns:
        The retrieved batch object when its status is terminal
        ('completed', 'failed', 'expired' or 'cancelled'), otherwise None.
        Callers must inspect `batch.status` to tell success from failure.
    """
    batch = client.batches.retrieve(batch_id)
    print(f"{batch_id} - Current status: {batch.status}")
    if batch.status == 'in_progress':
        print(f"{batch.request_counts.completed} / {batch.request_counts.total} completed")

    # 'expired' and 'cancelled' never change again either; treating them as
    # "still running" would make every polling loop wait forever.
    if batch.status in ('completed', 'failed', 'expired', 'cancelled'):
        return batch
    return None

In [27]:
import time
from openai import OpenAI

# Module-level accumulators; prompt_model_in_batches declares them `global`.
# File objects returned by client.files.create for each uploaded batch file.
batch_input_files = []
# NOTE(review): presumably meant to hold the client.batches.create responses — verify it is actually filled.
batch_creation_responses = []
# Batch objects that reached the status "completed".
batches = []

# OpenAI API client, authenticated with the key held in open_ai_key.
client = OpenAI(api_key=open_ai_key)

def prompt_model_in_batches():
    """Upload each batch file and run it as an OpenAI batch job, one job at a time.

    For every non-empty path in the global `batch_file_paths` the file is
    uploaded, a batch job for `/v1/chat/completions` is created, and the job is
    polled via `check_batch` until it finishes before the next file is handled.

    Side effects on module-level lists:
        batch_input_files: receives every uploaded file object.
        batch_creation_responses: receives every batch creation response.
        batches: receives every batch that finished with status "completed".

    Processing stops at the first batch that ends in any state other than
    "completed"; the remaining files are not submitted.
    """
    global batch_input_files
    global batch_creation_responses
    global batches

    for batch_file_path in batch_file_paths:
        # Creating input file
        if os.stat(batch_file_path).st_size == 0:
            print(f"Skipping empty file: {batch_file_path}")
            continue
        # The context manager closes the upload handle instead of leaking it.
        with open(batch_file_path, "rb") as batch_file:
            batch_input_file = client.files.create(
                file=batch_file,
                purpose="batch"
            )
        print(batch_input_file)
        batch_input_files.append(batch_input_file)

        # Starting batch job
        batch_creation_response = client.batches.create(
            input_file_id=batch_input_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h"
        )
        batch_creation_responses.append(batch_creation_response)
        batch_id = batch_creation_response.id
        print("Started: " + batch_id)

        time.sleep(5)
        # Check the status of the created batch until it is completed
        while True:
            batch = check_batch(batch_id, client)
            if batch:
                if batch.status == "completed":
                    batches.append(batch)
                    break
                # Any other finished state ends the run; say why instead of returning silently.
                print(f"Stopping: batch {batch_id} ended with status '{batch.status}': {getattr(batch, 'errors', None)}")
                return
            time.sleep(60)

GPT 3.5 Turbo:
- 256_20 only_text: 27:22 min
- 1024_20: 10:10 min
- 1024_20 only_text: 34 min (batches 8 and 9 were not successful due to the token limit)

GPT 4.1 Nano:
- 256_20: 35:10 min
- 256_20 only_text: 22:40 min
- 1024_20: 15:55 min

GPT 4.1 Mini:
- 256_20: 19:50 min
- 256_20 only_text: 21:40 min
- 1024_20: 28:26 min
- 1024_20 only_text: 28:20 min (batches 8 and 9 were not successful due to the token limit)

GPT 4.1

In [11]:
%%time

prompt_model_in_batches()

FileObject(id='file-HiDs61ARanxWXE116mcake', bytes=3995614, created_at=1760542533, filename='prompt_batch_0.jsonl', object='file', purpose='batch', status='processed', expires_at=1763134533, status_details=None)
Started: batch_68efbf46c92c819080a5760f7db4b031
batch_68efbf46c92c819080a5760f7db4b031 - Current status: validating
batch_68efbf46c92c819080a5760f7db4b031 - Current status: failed
CPU times: user 80.9 ms, sys: 16 ms, total: 96.9 ms
Wall time: 1min 8s


### Check all open batches

In [34]:
import time
from openai import OpenAI
client = OpenAI(api_key=open_ai_key)

# NOTE: despite its name, current_millis holds epoch *seconds* (time.time() returns seconds).
current_millis = int(time.time())
# Cut-off timestamp: 24 hours (24 * 60 * 60 seconds) before now.
recently = current_millis - 24 * 60 * 60

# Keep only batches created within the last 24 hours, then pick those still in progress.
open_batches = client.batches.list()
relevant_open_batches = [batch for batch in open_batches if batch.created_at >= recently]
in_progress_batch_ids = [batch.id for batch in relevant_open_batches if batch.status == 'in_progress']

In [35]:
in_progress_batch_ids

['batch_68efa5f819d4819081b526560a95616b']

In [32]:
list(relevant_open_batches)

[Batch(id='batch_68efbf46c92c819080a5760f7db4b031', completion_window='24h', created_at=1760542534, endpoint='/v1/chat/completions', input_file_id='file-HiDs61ARanxWXE116mcake', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4.1-2025-04-14 in organization org-6SCiN9rjR6tU38WJ0DavgNRs. Limit: 900,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list'), expired_at=None, expires_at=1760628934, failed_at=1760542596, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0), model='gpt-4.1-2025-04-14', usage={'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0, 'input_tokens_details': {'cached_tokens': 0}, 'output_tokens_details': {'reasoning_tokens': 0}}),
 Bat

In [33]:
client.files.content(relevant_open_batches[0].output_file_id).text

ValueError: Expected a non-empty value for `file_id` but received None

In [None]:
# client.batches.cancel("batch_67e3cf592eb081908cd64e5e1dc55fa0")

ConflictError: Error code: 409 - {'error': {'message': "Cannot cancel a batch with status 'completed'.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

### Check the batch status

In [28]:
import time

def wait_for_batch_completion(batch_id, client, interval=10, timeout=None):
    """Poll a batch with `check_batch` until it reports a finished batch.

    Args:
        batch_id: Id of the batch to wait for.
        client: OpenAI client passed through to `check_batch`.
        interval: Seconds to sleep between two status checks.
        timeout: Optional maximum number of seconds to wait. None (the
            default) waits indefinitely, as before.

    Returns:
        The batch object returned by `check_batch`.

    Raises:
        TimeoutError: If `timeout` is set and elapses before the batch finishes.
    """
    # A monotonic clock is unaffected by system clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        batch = check_batch(batch_id, client)
        if batch is not None:
            return batch
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(f"Batch {batch_id} did not finish within {timeout} seconds")
        time.sleep(interval)

In [36]:
batch = wait_for_batch_completion("batch_68efa5f819d4819081b526560a95616b", client, interval=60)

batch_68efa5f819d4819081b526560a95616b - Current status: in_progress
36 / 49 completed


KeyboardInterrupt: 

In [14]:
batch = check_batch("batch_68efa5f819d4819081b526560a95616b", client)

batch_68efa5f819d4819081b526560a95616b - Current status: in_progress


In [16]:
print(batch)

None


In [12]:
batches = [batch]

In [None]:
print(batch)

Batch(id='batch_68678417bf4c819085e0d77c012d5a7a', completion_window='24h', created_at=1751614487, endpoint='/v1/chat/completions', input_file_id='file-TubVrLSjHxewNncTaHNShi', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1751614569, error_file_id='file-39tX7oztfkdnqiNEpYDUMb', errors=None, expired_at=None, expires_at=1751700887, failed_at=None, finalizing_at=1751614556, in_progress_at=1751614490, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=25, total=25))


### Save model_responses

In [None]:
import json

# Collect the responses of all completed batches into responses_dict,
# keyed by the integer row id encoded in each custom_id ("request-<id>").
try:
    # Entries loaded from JSON have string keys; use integer keys throughout
    # so the same id can never be stored twice.
    responses_dict = {int(key): value for key, value in responses_dict.items()}
except NameError:
    # The cell that loads previously saved responses was not run in this session.
    responses_dict = {}

for batch in batches:
    if batch is None or batch.status != "completed":
        continue
    if not batch.output_file_id:
        # A batch can complete with every request failed; then there is no output file.
        print(f"Batch {batch.id} has no output file (error_file_id={getattr(batch, 'error_file_id', None)}); skipping")
        continue
    model_responses = client.files.content(batch.output_file_id).text

    # Parse the model_responses into a list of objects (one JSON document per non-empty line)
    responses_list = [json.loads(line) for line in model_responses.splitlines() if line.strip()]

    for response in responses_list:
        responses_dict[int(response['custom_id'].split('-')[1])] = response

# Sort once after all batches are merged instead of after every single response.
responses_dict = dict(sorted(responses_dict.items(), key=lambda item: int(item[0])))

In [12]:
import json

# Save responses_dict to a JSON file (overwrites responses_dict_path).
# json.dump writes integer keys as strings, which is why the loading cell
# converts the keys back with int().
with open(responses_dict_path, 'w') as file:
    json.dump(responses_dict, file, indent=4)

#### Save responses to DataFrame

In [13]:
# Create a new column in the dataframe to store the responses
if 'Model Classification' not in df.columns:
    df['Model Classification'] = None

# Rows for which no usable response exists (e.g. their batch failed) are
# reported below instead of aborting the whole cell with a KeyError.
missing_response_ids = []

# Iterate through the dataframe
for index, row in df.iterrows():
    if row['Reference Article Downloaded'] != 'Yes':
        continue

    # Keys are ints for freshly collected responses and strings when loaded from JSON.
    response = responses_dict.get(index, responses_dict.get(str(index)))
    try:
        model_response = response['response']['body']['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        # Missing entry, or an entry without a chat completion in its body.
        missing_response_ids.append(index)
        continue

    # Save the response to the new column
    df.at[index, 'Model Classification'] = model_response

if missing_response_ids:
    print(f"No usable model response for {len(missing_response_ids)} row(s): {missing_response_ids}")

In [14]:
# Write the dataframe to a per-model folder, once as pickle and once as Excel
# (the Excel export drops the index because of index=False).
# NOTE(review): unlike the batch_files / batch_responses paths, this path has no
# 'suit_prompt/' segment, so a run with suit_prompt=True is written to the same
# location as a default-prompt run — confirm this is intended.
dfs_path = f"../data/dfs/{'only_text_' if only_text else ''}{chunking}/{model}/{'AI_prompt/' if ai_prompt else ''}"
os.makedirs(dfs_path, exist_ok=True)
df.to_pickle(f"{dfs_path}ReferenceErrorDetection_data_with_prompt_results.pkl")
df.to_excel(f"{dfs_path}ReferenceErrorDetection_data_with_prompt_results.xlsx", index=False)