# Prompting the models to classify the statements

In [1]:
%%capture output
%run ./06_0_prompt_creation.ipynb

In [4]:
import pandas as pd

embedding = "te3l" # / "te3s"
grobid_model = "full_model"
no_prev_chunking = True

annotated = False
corrected_statements = False

path = f"../data/dfs{'/annotated_data' if annotated else ''}{'/corrected_statements' if corrected_statements else ''}/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}/ReferenceErrorDetection_data_with_chunk_info.pkl"
print(path)

# read the dataframe from a pickle file
df = pd.read_pickle(path)

../data/dfs/te3l_no_prev_chunking/full_model/ReferenceErrorDetection_data_with_chunk_info.pkl


In [5]:
df.head()

Unnamed: 0,Source,Citing Article ID,Citing Article DOI,Citing Article Title,Citing Article Retracted,Citing Article Downloaded,Domain,Statement with Citation,Reference Article ID,Reference Article DOI,Reference Article Title,Reference Article Abstract,Reference Article PDF Available,Reference Article Retracted,Reference Article Downloaded,Label,Explanation,Top_3_Chunk_IDs,Top_3_Chunk_Texts
0,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Others have aimed to reduce irreversibility or...,r001,10.1155/2021/2087027,A Fault Analysis Method for Three-Phase Induct...,The fault prediction and abductive fault diagn...,Yes,No,Yes,Unsubstantiate,Irrelevant,"[d5149dde-3f61-477d-acd9-0c8f97b154b5, f834613...",[Automatic implementation of fuzzy reasoning s...
1,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Some researchers have also studied various hea...,r002,10.1016/j.physa.2018.12.031,Develop 24 dissimilar ANNs by suitable archite...,The artificial neural network optimization met...,Yes,No,Yes,Unsubstantiate,Irrelevant,"[dc4cd1b8-1ae4-4c15-be29-a4eb6e4270ba, 6210cc4...",[Heat transfer improvement of water/single-wal...
2,PubPeer,c002,10.1155/2022/4601350,Oxidative Potential and Nanoantioxidant Activi...,Yes,Yes,Chemistry,The relative content of total flavonoids in th...,r003,10.1088/1742-6596/1937/1/012038,Lipid Data Acquisition for devices Treatment o...,"Recently, the widespread deployment of smart p...",Yes,No,Yes,Unsubstantiate,Irrelevant,"[2eeaa48b-a7ec-4674-87ac-511b439139c8, 0d45e20...",[The correspondence curve for our photochemica...
3,PubPeer,c003,10.1155/2022/2408685,The Choice of Anesthetic Drugs in Outpatient H...,Yes,Yes,Medicine,Research has shown that remimazolam tosylate e...,r004,10.1186/s12871-018-0543-3,"Effect of propofol on breast cancer cell, the ...",Breast cancer is the second leading cause of c...,Yes,No,Yes,Unsubstantiate,Irrelevant,"[c6131293-c506-4347-8441-06e480444093, 821ac75...",[Determination of the median effective concent...
4,PubPeer,c004,10.1155/2022/4783847,A Fault-Tolerant Structure for Nano-Power Comm...,Yes,Yes,Engineering,if the efficiency of the routing algorithm is ...,r005,10.36410/jcpr.2022.23.3.312,Analysis and research hotspots of ceramic mate...,"From the perspective of scientometrics, comb t...",Yes,No,Yes,Unsubstantiate,Irrelevant,"[65b14485-4faa-441e-9aac-9967bad1f603, 8b03de6...","[In the table, China's intermediary centrality..."


## Prompting the models (batch processing)

In [14]:
import os
import json

def create_batch_files(df, model, number_files=1, ignore_ids=[]):
    output_dir = f"../data/batch_files/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}"
    os.makedirs(output_dir, exist_ok=True)

    output_files = []
    for i in range(number_files):
        output_file = os.path.join(output_dir, f"prompt_batch_{model}_{i}.jsonl")
        # If the file already exists, empty it
        if os.path.exists(output_file):
            open(output_file, "w").close()
        output_files.append(output_file)
    
    for index, row in df.iterrows():
        if row['Reference Article Downloaded'] == 'Yes' and index not in ignore_ids:
            prompt = create_prompt(row)
            json_sequence = {
                "custom_id": f"request-{index}", 
                "method": "POST", 
                "url": "/v1/chat/completions", 
                "body": {
                    "model": model, 
                    "messages": [
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    "temperature": 0,
                }
            }

            output_file = output_files[index % number_files]
            with open(output_file, "a") as f:
                f.write(json.dumps(json_sequence) + "\n")
                
    # Remove empty output files from list
    output_files = [file for file in output_files if os.path.getsize(file) > 0]
    
    return output_files

In [8]:
import json

models = ["gpt-3.5-turbo-0125", "gpt-4-0125-preview", "gpt-4o-2024-05-13", "gpt-4-turbo-preview"]
model = models[2]

try:
    with open(f'../data/responses_dict_batch_{model}.json', 'r') as file:
        responses_dict = json.load(file)
    ids_to_ignore = [int(key) for key in responses_dict.keys()]
except FileNotFoundError:
    ids_to_ignore = []

print(ids_to_ignore)

[0, 1, 2, 3, 20, 21, 22, 23, 40, 41, 42, 43, 60, 61, 62, 63, 80, 81, 82, 83, 100, 101, 102, 103, 120, 121, 122, 123, 140, 141, 142, 143, 160, 161, 162, 163, 180, 181, 182, 183, 200, 201, 202, 203, 220, 221, 222, 223, 240, 241, 242, 243]


In [9]:
batch_file_paths = create_batch_files(df, model, 20, ids_to_ignore)
batch_file_paths

['../data/batch_files/te3l_no_prev_chunking/full_model/prompt_batch_gpt-4o-2024-05-13_4.jsonl',
 '../data/batch_files/te3l_no_prev_chunking/full_model/prompt_batch_gpt-4o-2024-05-13_5.jsonl',
 '../data/batch_files/te3l_no_prev_chunking/full_model/prompt_batch_gpt-4o-2024-05-13_6.jsonl',
 '../data/batch_files/te3l_no_prev_chunking/full_model/prompt_batch_gpt-4o-2024-05-13_7.jsonl',
 '../data/batch_files/te3l_no_prev_chunking/full_model/prompt_batch_gpt-4o-2024-05-13_8.jsonl',
 '../data/batch_files/te3l_no_prev_chunking/full_model/prompt_batch_gpt-4o-2024-05-13_9.jsonl',
 '../data/batch_files/te3l_no_prev_chunking/full_model/prompt_batch_gpt-4o-2024-05-13_10.jsonl',
 '../data/batch_files/te3l_no_prev_chunking/full_model/prompt_batch_gpt-4o-2024-05-13_11.jsonl',
 '../data/batch_files/te3l_no_prev_chunking/full_model/prompt_batch_gpt-4o-2024-05-13_12.jsonl',
 '../data/batch_files/te3l_no_prev_chunking/full_model/prompt_batch_gpt-4o-2024-05-13_13.jsonl',
 '../data/batch_files/te3l_no_prev_c

In [10]:
# Read the content of open_ai_key.txt into a variable
with open('../open_ai_key.txt', 'r') as file:
    open_ai_key = file.read().strip()

In [11]:
def check_batch(batch_id, client):
    batch = client.batches.retrieve(batch_id)
    print(f"{batch_id} - Current status: {batch.status}")
    
    if batch.status == 'completed' or batch.status == 'failed':
        return batch
    return None

In [12]:
import time
from openai import OpenAI

batch_input_files = []
batch_creation_responses = []
batches = []

client = OpenAI(api_key=open_ai_key)

def prompt_model_in_batches():
    global batch_input_files
    global batch_creation_responses
    global batches

    for batch_file_path in batch_file_paths:
        # Creating input file
        if os.stat(batch_file_path).st_size == 0:
            print(f"Skipping empty file: {batch_file_path}")
            continue
        batch_input_file = client.files.create(
            file=open(batch_file_path, "rb"),
            purpose="batch"
        )
        print(batch_input_file)
        batch_input_files.append(batch_input_file)

        # Starting batch job
        batch_input_file_id = batch_input_file.id
        batch_creation_response = client.batches.create(
            input_file_id=batch_input_file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h"
        )
        print("Started: " + batch_creation_response.id)

        time.sleep(5)
        # Check the status of the created batch until it is completed
        while True:
            batch_id = batch_creation_response.id
            batch = check_batch(batch_id, client)
            if batch:
                if batch.status == "failed":
                    return
                elif batch.status == "completed":
                    batches.append(batch)
                    break
            time.sleep(30)

In [13]:
prompt_model_in_batches()

FileObject(id='file-1PSQzk27aTqCLHxxKqTp7i', bytes=78631, created_at=1742993668, filename='prompt_batch_gpt-4o-2024-05-13_4.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)
Started: batch_67e3f904c67c81908dfe7f867b6f2e45
batch_67e3f904c67c81908dfe7f867b6f2e45 - Current status: failed


### Check all open batches

In [18]:
from openai import OpenAI
client = OpenAI(api_key=open_ai_key)

open_batches = client.batches.list(limit=5)

In [19]:
list(open_batches)

[Batch(id='batch_67e3f904c67c81908dfe7f867b6f2e45', completion_window='24h', created_at=1742993668, endpoint='/v1/chat/completions', input_file_id='file-1PSQzk27aTqCLHxxKqTp7i', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-2024-05-13 in organization org-6SCiN9rjR6tU38WJ0DavgNRs. Limit: 90,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list'), expired_at=None, expires_at=1743080068, failed_at=1742993670, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0)),
 Batch(id='batch_67e3f7ceae3c819088917e7b6928d06e', completion_window='24h', created_at=1742993358, endpoint='/v1/chat/completions', input_file_id='file-NiajatLWPoMPnU5oVGkcVn', object='batch', st

In [62]:
client.batches.cancel("batch_67e3cf592eb081908cd64e5e1dc55fa0")

ConflictError: Error code: 409 - {'error': {'message': "Cannot cancel a batch with status 'completed'.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

### Check the batch status

In [18]:
import time

def wait_for_batch_completion(batch_id, client, interval=10):
    while True:
        batch = check_batch(batch_id, client)
        if batch != None:
            return batch
        time.sleep(interval)

In [19]:
batch = wait_for_batch_completion("batch_67e3f623e6a4819090cd55dbd2881b80", client)

batch_67e3f623e6a4819090cd55dbd2881b80 - Current status: completed


In [36]:
import json

# save responds of completed batches
for batch in batches:
    if batch.status != "completed":
        continue
    model_responses = client.files.content(batch.output_file_id).text

    # Parse the model_responses into a list of objects
    responses_list = [json.loads(line) for line in model_responses.splitlines()]

    try:
        for response in responses_list:
            responses_dict[int(response['custom_id'].split('-')[1])] = response
            responses_dict = dict(sorted(responses_dict.items(), key=lambda item: int(item[0])))
    except NameError:
        responses_dict = {int(response['custom_id'].split('-')[1]): response for response in responses_list}

In [37]:
responses_dict

{'0': {'id': 'batch_req_67e3f66ca9e08190af68289b2a2932e0',
  'custom_id': 'request-0',
  'response': {'status_code': 200,
   'request_id': 'ffe57df5bc3436abb99959553d87203c',
   'body': {'id': 'chatcmpl-BFKNvTfovSy3j9ve8zmC0b5EDWmcP',
    'object': 'chat.completion',
    'created': 1742992979,
    'model': 'gpt-4o-2024-05-13',
    'choices': [{'index': 0,
      'message': {'role': 'assistant',
       'content': '```json\n{\n  "label": "Unsubstantiate",\n  "explanation": "The reference article focuses on fault analysis and diagnosis methods for three-phase induction motors using spiking neural P systems and related techniques. It does not discuss reducing irreversibility or optimizing energy-consumed devices, which is the subject of the statement in the citing article."\n}\n```',
       'refusal': None,
       'annotations': []},
      'logprobs': None,
      'finish_reason': 'stop'}],
    'usage': {'prompt_tokens': 1172,
     'completion_tokens': 75,
     'total_tokens': 1247,
     'pr

In [38]:
import json

# Save responses_dict to a JSON file
with open(f'../data/responses_dict_batch_{model}.json', 'w') as file:
    json.dump(responses_dict, file, indent=4)

#### Save responds to Dataframe

In [49]:
# Create a new column in the dataframe to store the responses
if 'Model Classification' not in df.columns:
    df['Model Classification'] = None

# Iterate through the dataframe
for index, row in df.iterrows():
    if row['Reference Article Downloaded'] == 'Yes':
        print(responses_dict)
        model_response = responses_dict[index]['response']['body']['choices'][0]['message']['content']
        
        # Save the response to the new column
        df.at[index, 'Model Classification'] = model_response

{0: {'id': 'batch_req_67e3cf8de7b88190ad3b1448870ebab2', 'custom_id': 'request-0', 'response': {'status_code': 200, 'request_id': 'f0520eab9a6260230996aaa5b8327b81', 'body': {'id': 'chatcmpl-BFHn1hPe6dsjo8DQBzB4L44OSa2P7', 'object': 'chat.completion', 'created': 1742983003, 'model': 'gpt-4-0125-preview', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '```json\n{\n  "label": "Unsubstantiate",\n  "explanation": "The reference article focuses on a fault analysis method for three-phase induction motors using modified fuzzy reasoning spiking neural P systems, which is unrelated to reducing irreversibility or optimizing energy-consumed devices as mentioned in the citing article\'s statement."\n}\n```', 'refusal': None, 'annotations': []}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 1188, 'completion_tokens': 70, 'total_tokens': 1258, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tok

KeyError: 1

In [27]:
df.to_pickle(f"../data/dfs/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}/ReferenceErrorDetection_data_with_prompt_results_batched.pkl")

## Prompting the models (no batching)

In [9]:
# Read the content of open_ai_key.txt into a variable
with open('../open_ai_key.txt', 'r') as file:
    open_ai_key = file.read().strip()

In [10]:
from openai import OpenAI
client = OpenAI(api_key=open_ai_key)

def send_prompt(prompt, model):
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0,
        timeout=30,

    )
    return completion.choices[0].message.content

In [11]:
models = ["gpt-3.5-turbo-0125", "gpt-4-0125-preview", "gpt-4o-2024-05-13", "gpt-4-turbo-preview"]
model = models[2]

send_prompt(example_prompt, model)

'```json\n{\n  "label": "Unsubstantiate",\n  "explanation": "The reference article \'DeepCleave: a deep learning predictor for caspase and matrix metalloprotease substrates and cleavage sites\' focuses on deep learning models for predicting protease-specific substrates and cleavage sites. It does not address issues related to arranging classes or summarizing grades in a high-level management context, which is the subject of the statement in the citing article."\n}\n```'

In [37]:
# path = f"../data/dfs/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}/ReferenceErrorDetection_data_with_prompt_results.pkl"
# df = pd.read_pickle(path)

In [12]:
ids_to_prompt = []
ids_not_to_prompt = []

In [None]:
def prompting_model(df, model, save_intermediate_results=False):
    print(f"Prompting model: {model}")

    # Create a new column in the dataframe to store the responses
    if 'Model Classification' not in df.columns:
        df['Model Classification'] = None

    # Iterate through the dataframe
    for index, row in df.iterrows():
        if row['Reference Article Downloaded'] == 'Yes':
            if len(ids_to_prompt) != 0 and row['Reference Article ID'] not in ids_to_prompt:
                continue

            if len(ids_not_to_prompt) != 0 and row['Reference Article ID'] in ids_not_to_prompt:
                continue

            print(f"Processing: " + row['Reference Article ID'])

            # Create the prompt
            prompt = create_prompt(row)
            
            # Send the prompt and get the response
            response = send_prompt(prompt, model)
            
            # Save the response to the new column
            df.at[index, 'Model Classification'] = response

            if save_intermediate_results and index % 10 == 0:
                df.to_pickle(f"../data/dfs/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}/ReferenceErrorDetection_data_with_prompt_results_{model}_intermed.pkl")
    return df

In [15]:
df2 = prompting_model(df, model, True)

Prompting model: gpt-4o-2024-05-13
Processing: r001
Processing: r002
Processing: r003
Processing: r004
Processing: r005
Processing: r006
Processing: r007
Processing: r008
Processing: r009
Processing: r010
Processing: r011
Processing: r012
Processing: r013
Processing: r013
Processing: r014
Processing: r015
Processing: r005
Processing: r017
Processing: r018
Processing: r019
Processing: r020
Processing: r021
Processing: r022
Processing: r023
Processing: r024
Processing: r013
Processing: r025
Processing: r026
Processing: r027
Processing: r028
Processing: r029
Processing: r030
Processing: r031
Processing: r032
Processing: r033
Processing: r034
Processing: r035
Processing: r036
Processing: r037
Processing: r038
Processing: r039
Processing: r040
Processing: r041
Processing: r042
Processing: r043
Processing: r044
Processing: r045
Processing: r046
Processing: r047
Processing: r048
Processing: r049
Processing: r050
Processing: r051
Processing: r052
Processing: r053
Processing: r051
Processing: r

In [16]:
df2.to_pickle(f"../data/dfs/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}/ReferenceErrorDetection_data_with_prompt_results_{model}.pkl")