# ========== INSTALL DEPENDENCIES ==========

In [None]:
!pip install sentence_transformers

In [None]:
!pip install openai --upgrade

# ========== IMPORT LIBRARIES ==========


In [None]:
import os
import json
import numpy as np
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ========== CONFIGURATION ==========


In [None]:
# OpenAI client. The key is taken from the OPENAI_API_KEY environment variable so
# that it does not have to be written into (and shared with) the notebook; the
# placeholder below is only used as a fallback when the variable is not set.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "YOUR-API-KEY")
)
# Model name handed to SentenceTransformer for the retrieval embeddings.
EMBED_MODEL = 'paraphrase-multilingual-MiniLM-L12-v2'
# Model name placed in the "model" field of every batch request body.
CHAT_MODEL  = 'gpt-4.1'
# Number of nearest training examples meant to serve as few-shot demonstrations.
K           = 10

# ========== SYSTEM PROMPT DEFINITION ==========


In [None]:
# System prompt for the annotation task: it states the task, the annotation rules,
# the label glossary and the output constraints.
# The literal ends with the "EXAMPLES" header; the task-building cell appends the
# retrieved few-shot INPUT/OUTPUT pairs right after it
# (SYSTEM_PROMPT + "\n\n" + joined examples).
# Every character below is sent to the model, so edit the text with care.
SYSTEM_PROMPT = ("""You are an expert French medical annotator.

═════════ TASK ═════════
1. Read the input French text.
2. Insert XML tags **in-line** around every entity mention according to the label definitions provided.
   → Example: Le <INF_DISEASE>paludisme</INF_DISEASE> est endémique.
3. Return **only** the full annotated text. No commentary, no metadata.

→ Think step by step **internally**, but output only the final tagged text.

═════════ ANNOTATION RULES ═════════
• Use only the labels from the glossary.
• Exclude determiners, pronouns, and punctuation from entity spans.
• If an entity is **discontinuous**, tag **each contiguous part separately** with the same label and shared `ent_id`.
   → Ex: les <PATHOGEN ent_id="P1"><PATHOGEN ent_id="P2">virus</PATHOGEN></PATHOGEN> de la <PATHOGEN ent_id="P1">dengue</PATHOGEN> et du <PATHOGEN ent_id="P2">chikungunya</PATHOGEN>
    <ORGANIZATION ent_id="O1">Agence régionale de santé</ORGANIZATION> (<ORGANIZATION ent_id="O2">ARS</ORGANIZATION>) <ORGANIZATION ent_id="O1"><ORGANIZATION ent_id="O2">d’Île de France</ORGANIZATION></ORGANIZATION>
• Tags must not cross paragraph boundaries.
• Ignore misspellings, generic terms ("virus", "bactérie", etc.), and pronouns.
• Do **not** generate any tag that does not exist in the input.
• Use valid XML syntax. Tags must be correctly opened/closed and perfectly nested.
• Overlapping tags are allowed **only** for discontinuous spans (as shown above).

═════════ LABEL GLOSSARY ═════════
✔ = tag it ✘ = don’t tag it

→ **Document-level metadata**
• DOC_AUTHOR ✔ "Jean Dupont" (byline only) ✘ in body
• DOC_SOURCE ✔ "AFP", "Reuters" ✘ "la presse"

→ **Diseases & Pathogens**
• INF_DISEASE ✔ grippe, rougeole ✘ "maladie", "infection"
• NON_INF_DISEASE ✔ cancer, diabète ✘ syndromes mixtes
• PATHOGEN ✔ Escherichia coli, virus Ebola ✘ "virus" (generic)
• DIS_REF_TO_PATH ✔ paludisme in “parasites tels que le paludisme” ✘ paludisme as disease
• PATH_REF_TO_DIS ✔ VIH in “cas de VIH” ✘ virus VIH

→ **Toxins, Chemicals, Explosives**
• RADIOISOTOPE ✔ uranium 238, césium-137
• TOXIC_C_AGENT ✔ sarin, chlore gazeux
• EXPLOSIVE ✔ TNT, RDX
• BIO_TOXIN ✔ ricine, toxine botulique

→ **Locations & Organizations**
• LOCATION ✔ Paris, Rhône, Alpes ✘ pronouns, "le pays"
• ORGANIZATION ✔ OMS, hôpital Georges-Pompidou
• LOC_REF_TO_ORG ✔ Paris (dans “Paris annonce…”)
• ORG_REF_TO_LOC ✔ centrale nucléaire de Tchernobyl

→ **Dates & Time References**
• ABS_DATE ✔ 8 janvier 2025, 01/08/2025
• REL_DATE ✔ hier, lundi dernier, 8 janvier (sans année)
• DOC_DATE ✔ date en tête d’article
• ABS_PERIOD ✔ mars 2024, du 1er au 3 mai 2024
• REL_PERIOD ✔ la semaine dernière, du 10 au 20 mai
• FUZZY_PERIOD ✔ ces dernières années, depuis plusieurs semaines

═════════ CONSTRAINTS ═════════
1. Output must contain **valid XML** with correct nesting.
2. A token may belong to multiple tags **only** when discontinuity requires it.
3. Never output tags for absent entities or unsupported labels.

═════════ EXAMPLES ═════════
"""
)

# ========== LOAD DATA ==========


In [None]:
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Test set: the raw texts that will be annotated.
test_data = _read_json('20250516_NP_test_evalLLM.json')
test_texts = [record['text'] for record in test_data]

# Training set: pool of few-shot candidates, each with its raw text and its
# XML-tagged counterpart.
train_data = _read_json('20250428_NP_train-evalLLM_XML.json')
train_texts = [record['text'] for record in train_data]
train_tagged = [record['tagged_text'] for record in train_data]


# ========== COMPUTE EMBEDDINGS ==========


In [None]:
# Encode both corpora with the same sentence-embedding model so that test and
# training vectors live in one space and can be compared for retrieval.
embedder = SentenceTransformer(EMBED_MODEL)
test_embeddings, train_embeddings = (
    embedder.encode(corpus, convert_to_numpy=True)
    for corpus in (test_texts, train_texts)
)




# ========== SELECT FEW-SHOT EXAMPLES BY SIMILARITY ==========


In [None]:
def top_k_similar(test_emb, train_embs, k=10):
    """Return the indices of the ``k`` training embeddings closest to ``test_emb``.

    Closeness is cosine similarity between ``test_emb`` and every row of
    ``train_embs``.

    Args:
        test_emb: 1-D embedding of the query text.
        train_embs: 2-D array-like with one candidate embedding per row.
        k: Number of neighbours to return. If it exceeds the number of rows,
            all row indices are returned. Values <= 0 yield an empty result.

    Returns:
        A NumPy integer array of row indices into ``train_embs``, ordered from
        most to least similar.
    """
    if k <= 0:
        # `sims[-0:]` is the whole array, so without this guard k=0 would
        # return every candidate instead of none.
        return np.array([], dtype=np.intp)
    # cosine_similarity expects 2-D inputs; wrap the query and unwrap the single row.
    sims = cosine_similarity([test_emb], train_embs)[0]
    # argsort is ascending: keep the last k entries and reverse for best-first order.
    return np.argsort(sims)[-k:][::-1]



# ========== BUILD BATCH TASKS FOR OPENAI CHAT COMPLETION ==========


In [None]:
# Build one Batch-API request per test text. Each request carries the system
# prompt extended with the few-shot examples retrieved for that specific text.
tasks = []
for i, (text, emb) in enumerate(zip(test_texts, test_embeddings)):
    # Honour the configured K instead of a second hard-coded 10, so changing the
    # number of few-shot examples only requires editing the configuration cell.
    neighbors = top_k_similar(emb, train_embeddings, k=K)

    # Format few-shot examples (most similar first) as INPUT/OUTPUT pairs.
    few_shot = [
        f"INPUT: {train_texts[nb]}\nOUTPUT: {train_tagged[nb]}"
        for nb in neighbors
    ]
    prompt_with_few_shot = SYSTEM_PROMPT + "\n\n" + "\n\n".join(few_shot)
    # The user turn mirrors the example layout and leaves OUTPUT open for the model.
    user_content = f"INPUT: {text}\nOUTPUT:"

    # Build the OpenAI API request. custom_id encodes the test index so that
    # results can be mapped back to their source text afterwards.
    tasks.append({
        "custom_id": f"task-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": CHAT_MODEL,
            "temperature": 0,
            "messages": [
                {"role": "system", "content": prompt_with_few_shot},
                {"role": "user",   "content": user_content}
            ]
        }
    })

# ========== SAVE BATCH TASKS TO JSONL ==========


In [None]:
file_name = "batch_evalLLM.jsonl"

# JSONL layout: one serialized request object per line.
with open(file_name, 'w') as out:
    out.writelines(json.dumps(task) + '\n' for task in tasks)

# ========== SUBMIT BATCH TO OPENAI API ==========


In [None]:
# Upload batch file for processing.
# The handle is opened in a `with` block so it is closed as soon as the upload
# call returns instead of staying open for the lifetime of the kernel.
with open(file_name, "rb") as batch_input:
    batch_file = client.files.create(
        file=batch_input,
        purpose="batch",
    )

In [None]:
# Start the batch job on the uploaded request file; every line of that file
# targets the chat-completions endpoint.
batch_job = client.batches.create(
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id=batch_file.id,
)

In [None]:
# Optional status check: re-fetch the job by id, dump the full object, and
# leave the status as the cell's displayed value.
job_id = batch_job.id
batch_job = client.batches.retrieve(job_id)
print(batch_job)
batch_job.status

# ========== RETRIEVE BATCH OUTPUT ==========


In [None]:
# Refresh the job first: the `batch_job` object held in the kernel may predate
# completion, in which case its output_file_id is still unset.
batch_job = client.batches.retrieve(batch_job.id)
result_file_id = batch_job.output_file_id
if result_file_id is None:
    # Fail with an actionable message instead of passing None to the Files API.
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status}); "
        "re-run this cell once the job has completed."
    )
# Raw bytes of the JSONL output file.
result = client.files.content(result_file_id).content

In [None]:
result_file_name = "batch_job_results_evalLLM.jsonl"

# Keep a local copy of the downloaded output, written as raw bytes.
with open(result_file_name, 'wb') as sink:
    sink.write(result)

# ========== PARSE RESULTS AND BUILD FINAL OUTPUT ==========


In [None]:
# Parse the downloaded JSONL: one JSON object per non-empty line.
# The file is decoded explicitly as UTF-8 so that parsing does not depend on the
# platform's default text encoding, and blank lines (e.g. a trailing newline)
# are skipped rather than handed to json.loads.
with open(result_file_name, 'r', encoding='utf-8') as file:
    results = [json.loads(line) for line in file if line.strip()]

In [None]:
# Map every batch response back to its test item through custom_id ("task-<index>").
indexed_predictions = []
failed_ids = []
for res in results:
    cid = res["custom_id"]
    idx = int(cid.split("-")[1])
    try:
        prediction = res['response']['body']['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        # The request carries no completion (e.g. it errored): record it and
        # keep going instead of aborting the whole post-processing step.
        failed_ids.append(cid)
        continue
    indexed_predictions.append((idx, prediction))

# Batch output lines are not guaranteed to come back in input order; sort by the
# test index so the saved predictions follow the order of the test set.
indexed_predictions.sort(key=lambda pair: pair[0])

l = [
    {
        "text": test_data[idx]['text'],
        "prediction":  prediction,
    }
    for idx, prediction in indexed_predictions
]

if failed_ids:
    print(f"Warning: {len(failed_ids)} request(s) returned no completion and were skipped: {failed_ids}")

# ========== SAVE FINAL PREDICTIONS TO FILE ==========


In [None]:
# Write the predictions as indented JSON; ensure_ascii=False keeps accented
# characters as-is in the UTF-8 output file.
with open('YOUR-OUTPUT-FILE', 'w', encoding='utf-8') as f:
    f.write(json.dumps(l, ensure_ascii=False, indent=4))