# ========== INSTALL DEPENDENCIES ==========


In [None]:
!pip install openai --upgrade

# ========== IMPORT LIBRARIES ==========


In [None]:
import os
import json
import numpy as np
from openai import OpenAI

# ========== LOAD PREDICTED DATA ==========


In [None]:
# Read the JSON file holding the GLiNER output (records carrying XML-tagged
# text) and collect the tagged text of every record, in file order.
with open('file-path', mode='r', encoding='utf-8') as src:
    data = json.loads(src.read())

pred = []
for record in data:
    pred.append(record['tagged_text'])


# ========== CONFIG ==========


In [None]:
# OpenAI client used for every API call below.
# The key is read from the OPENAI_API_KEY environment variable so that a real
# secret never has to be written into (and committed with) this notebook.
# The literal placeholder is kept only as a fallback for when the variable
# is not set.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "YOUR-API-KEY")
)

# Chat model requested for every task in the batch.
CHAT_MODEL = 'gpt-4.1'


# ========== SYSTEM PROMPT ==========


In [None]:
# System message sent with every request: it defines the correction task, the
# annotation rules, the glossary of allowed labels and a set of few-shot
# examples.
# The examples must obey the rules they illustrate: the corrected output keeps
# every word of the input (only tags change), every tag is properly closed,
# and all corrections are quoted the same way.
SYSTEM_PROMPT = """You are a biomedical named entity recognition (NER) expert. Your task is to review, correct, and complete the entity annotations in the following text using inline XML-style tags.

Instructions:

* The input text already contains XML-style tags (e.g., <RADIOISOTOPE>uranium 238</RADIOISOTOPE>).
* Verify each existing tag:
  * Ensure the entity label is correct.
  * Correct any mislabeling.
* Tag any missing entities using only the valid labels from the glossary below.
* Return only the corrected and fully tagged version of the text in valid XML format — no extra text or explanation.

Annotation Rules:

* Use only labels from the glossary below.
* Exclude determiners, pronouns, and punctuation from inside tags.
* Tags must not cross paragraph boundaries.
* Do not tag generic terms like “virus”, “bactérie”, or any pronouns.
* Do not invent or use tags that are not present in the glossary below.
* Ensure all XML is valid: tags must be correctly opened and closed.

Glossary of Valid Entity Labels and Definitions:

* <DOC_AUTHOR> — Document author(s).
* <DOC_SOURCE> — The source or publisher of the document (e.g., 'AFP', 'Reuters').
* <INF_DISEASE> — Infectious diseases (caused by bacteria, viruses, fungi, parasites, etc.).
* <NON_INF_DISEASE> — Non-infectious diseases (e.g., diabetes, cancer).
* <PATHOGEN> — The infectious agent itself (bacterium, virus, parasite, etc.).
* <DIS_REF_TO_PATH> — A disease name used to refer to the pathogen.
* <PATH_REF_TO_DIS> — A pathogen name used to refer to the disease.
* <RADIOISOTOPE> — A radioactive form of an element (e.g., polonium, uranium-238).
* <TOXIC_C_AGENT> — Inorganic toxic chemicals (e.g., chlorine gas).
* <EXPLOSIVE> — Any explosive substance or compound.
* <BIO_TOXIN> — Organic chemical toxins from biological sources (e.g., ricin, botulinum toxin).
* <LOCATION> — Named geographic places (countries, cities, rivers, etc.).
* <ORGANIZATION> — Institutions or agencies with social/legal identity (e.g., WHO, Institut Pasteur).
* <LOC_REF_TO_ORG> — Place name used to refer to an organization.
* <ORG_REF_TO_LOC> — Organization name used to refer to the place it is located.
* <ABS_DATE> — Exact date (e.g., “15 mars 2020”).
* <REL_DATE> — Relative date (e.g., “hier”, “lundi dernier”).
* <DOC_DATE> — Document publication date.
* <ABS_PERIOD> — Exact period (e.g., “mars 2020”, “du 1er au 3 mai”).
* <REL_PERIOD> — Relative period (e.g., “les 3 derniers jours”).
* <FUZZY_PERIOD> — Vague time period (e.g., “ces dernières années”, “depuis plusieurs mois”).

**Examples:**

Input:
"La réunion a eu lieu le 12 avril 2020."
→ Correction:
"La réunion a eu lieu le <ABS_DATE>12 avril 2020</ABS_DATE>."

Input:
"Ces dernières années, les cas ont augmenté."
→ Correction:
"<FUZZY_PERIOD>Ces dernières années</FUZZY_PERIOD>, les cas ont augmenté."

Input:
"<LOCATION>Paris</LOCATION> a annoncé un plan d'urgence sanitaire."
→ Correction:
"<LOC_REF_TO_ORG>Paris</LOC_REF_TO_ORG> a annoncé un plan d'urgence sanitaire."

Input:
"Les tests ont été menés entre mars et juin 2021."
→ Correction:
"Les tests ont été menés entre <ABS_PERIOD>mars et juin 2021</ABS_PERIOD>."

Input:
"Le <PATHOGEN>virus</PATHOGEN> peut causer des dommages importants."
→ Correction:
"Le virus peut causer des dommages importants." // Do not tag generic terms like 'virus' when unspecific.

Input:
"Un accident a eu lieu dans la centrale nucléaire de <LOCATION>Tchernobyl</LOCATION>."
→ Correction:
"Un accident a eu lieu dans la <ORG_REF_TO_LOC>centrale nucléaire de Tchernobyl</ORG_REF_TO_LOC>."

Input:
"Le <PATHOGEN>paludisme</PATHOGEN> est causé par un parasite."
→ Correction:
"Le <DIS_REF_TO_PATH>paludisme</DIS_REF_TO_PATH> est causé par un parasite."

Input:
"Le <PATHOGEN>VIH</PATHOGEN> est une infection virale chronique."
→ Correction:
"Le <PATH_REF_TO_DIS>VIH</PATH_REF_TO_DIS> est une infection virale chronique."

Only output the corrected and completed XML-tagged version of the text. Do not include any additional explanation."""

# ========== BUILD VERIFICATION TASKS FOR OPENAI API ==========


In [None]:
# One Batch API request per predicted text. The position of the text in
# `pred` is embedded in `custom_id` ("task-<position>") so that a response
# can later be matched back to the record it came from.
tasks = [
    {
        "custom_id": f"task-{position}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": CHAT_MODEL,
            "temperature": 0,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"{tagged}"},
            ],
        },
    }
    for position, tagged in enumerate(pred)
]

# ========== SAVE BATCH TASKS TO JSONL ==========


In [None]:
# Path of the JSONL file that is uploaded as the batch input.
file_name = "batch_evalLLM.jsonl"

# Write one JSON object per line. The encoding is pinned to UTF-8 because
# ensure_ascii=False emits accented characters verbatim; relying on the
# platform default encoding could raise UnicodeEncodeError or produce a file
# that is not valid UTF-8.
with open(file_name, 'w', encoding='utf-8') as file:
    for obj in tasks:
        file.write(json.dumps(obj, ensure_ascii=False) + '\n')

# ========== SUBMIT BATCH TO OPENAI API ==========


In [None]:
# Upload the JSONL request file with purpose "batch".
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns instead of being left open for the life of the kernel.
with open(file_name, "rb") as batch_fh:
    batch_file = client.files.create(
        file=batch_fh,
        purpose="batch"
    )

In [None]:
# Start a batch job over the uploaded file, targeting the chat-completions
# endpoint with a 24-hour completion window.
batch_job = client.batches.create(
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id=batch_file.id,
)

In [None]:
# Re-fetch the batch job by id to refresh its state, then print the whole
# object. Re-run this cell to poll the job.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

In [None]:
# Show the status held by the last retrieved batch_job object (displayed as
# the cell's output in a notebook).
batch_job.status

# ========== RETRIEVE AND SAVE LLM OUTPUT ==========


In [None]:
# Download the raw bytes of the batch output file.
# `output_file_id` is None until the job has produced output, so fail with an
# explicit message instead of passing None to the files API.
result_file_id = batch_job.output_file_id
if result_file_id is None:
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status}); "
        "retrieve the batch again once it has completed."
    )
result = client.files.content(result_file_id).content

In [None]:
# Local path where the downloaded batch output is stored.
result_file_name = "batch_job_results_evalLLM.jsonl"

# Persist the downloaded bytes unchanged (binary mode, no re-encoding).
with open(result_file_name, mode='wb') as out_fh:
    out_fh.write(result)

# ========== LOAD, PARSE, AND RECONSTRUCT FINAL OUTPUT ==========


In [None]:
# Parse the saved batch output: one JSON object per non-empty line.
# The file is decoded explicitly as UTF-8 so accented characters are not
# mangled on platforms whose default encoding differs. Blank lines (e.g. a
# trailing newline) are skipped rather than passed to json.loads.
results = []
with open(result_file_name, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue
        results.append(json.loads(line))

In [None]:
# Pair every model response with the record it was generated from.
# Batch output lines are not guaranteed to come back in input order, so the
# responses are sorted by the index embedded in custom_id ("task-<index>");
# the final list therefore follows the order of `data`.
# The corrected text is held in `corrected` so the downloaded bytes stored in
# `result` are not overwritten.
l = []
for res in sorted(results, key=lambda r: int(r["custom_id"].split("-")[1])):
    idx = int(res["custom_id"].split("-")[1])
    corrected = res['response']['body']['choices'][0]['message']['content']
    l.append({
        "text": data[idx]['text'],
        "prediction": corrected
    })


# ========== SAVE FINAL VERIFICATION OUTPUT ==========


In [None]:
# Serialize the collected text/prediction pairs as indented, non-ASCII-escaped
# JSON and write them to the final output file.
with open('Verified-output.json', mode='w', encoding='utf-8') as out_fh:
    out_fh.write(json.dumps(l, ensure_ascii=False, indent=4))