# ========== INSTALL DEPENDENCIES ==========


In [None]:
!pip install sentence_transformers

In [None]:
!pip install openai --upgrade

# ========== IMPORT LIBRARIES ==========


In [None]:
import os
import json
import numpy as np
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import ast, pprint




# ========== CONFIGURATION ==========



In [None]:
# The API key is read from the OPENAI_API_KEY environment variable so the secret
# never has to be typed into (or committed with) the notebook. The original
# placeholder string is kept as a fallback for backward compatibility.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "YOUR-API-KEY")
)
EMBED_MODEL = 'paraphrase-multilingual-MiniLM-L12-v2'  # sentence-transformers model used for retrieval
CHAT_MODEL  = 'gpt-4.1'                                # chat model named in every batch request
K           = 10                                       # max number of few-shot examples per test document


# ========== SYSTEM PROMPT FOR EVENT EXTRACTION ==========


In [None]:
# System prompt sent with every request. It ends with an "EXAMPLES" header because
# the task-building loop appends the retrieved few-shot examples right after it.
# The text is part of the program's behavior: any edit changes what the model sees.
# NOTE(review): the schema example below contains a trailing comma and asks for an
# array "named events" while showing a bare array — confirm this is intended.
SYSTEM_PROMPT =("""You are an epidemiology analyst. Your job is to extract structured events from French articles.

═════════ TASK ═════════
INPUT:
• A French article.
• A list of extracted named entities (ID, text span, and type).

OUTPUT:
Return a JSON array named events following this schema:

[
  [
    {"attribute":"evt:central_element", "occurrences":["ID_c1", "ID_c2", ...]},
    {"attribute":"evt:associated_element", "occurrences":["ID_a1", "ID_a2", ...]}
  ],
]

═════════ RULES ═════════
 1. CENTRAL ELEMENT — REQUIRED (1 per event)
- Must be exactly one of: INF_DISEASE, NON_INF_DISEASE, PATHOGEN, DIS_REF_TO_PATH, PATH_REF_TO_DIS, RADIOISOTOPE, TOXIC_C_AGENT, EXPLOSIVE, BIO_TOXIN
- Each event has exactly one central element (but it may have several synonymous IDs see Rule 4).

 2. ASSOCIATED ELEMENTS — REQUIRED (at least one location + at least one date/periode)
Add all entity IDs relevant to:
- Locations: LOCATION, LOC_REF_TO_ORG, ORG_REF_TO_LOC
- Dates: ABS_DATE, REL_DATE, ABS_PERIOD, REL_PERIOD, FUZZY_PERIOD, DOC_DATE
- Use DOC_DATE only if no other date is found.
- Prefer absolute over relative dates if both exist.

3. WINDOW OF RELEVANCE
- Start from the sentence containing the central element.
- If no associated location/date is there, check the adjacent sentences.

4. SYNONYMS
If several entity IDs refer to the same real‑world object (e.g. three mentions of “uranium 238”, or “Paris” vs “Ville‑Lumière”, or different surface forms of the same date),  include all those IDs together in the same occurrences list.

5. EVENT LIMIT
- Max 10 events.
- If more are present, keep the 10 most relevant to public health risk.

6. VALIDITY
- Each entity ID appears in only one event.
- Output must be valid JSON and contain nothing else.

═════════ TIPS ═════════
 For event splitting, use this rule:
– Same central + coherent dates/places → merge into one event.
– Distant in time/space or different causes → separate events.

When in doubt between including or skipping an associated element: include it if it helps answer: Where? When? What agent?

══════════ EXAMPLES ══════════
""")

# ========== FILE PATHS ==========


In [None]:
# Placeholder paths: replace both strings with real file paths before running.
# TRAIN_JSON is loaded as a JSON list of documents read via their "text",
# "entities" and "events" keys; it supplies the few-shot examples.
TRAIN_JSON = "the train file for few shot examples"
# TEST_JSON is loaded as a JSON list of documents read via their "text" and
# "entities" keys; these are the documents events are predicted for.
TEST_JSON  = "Your input file containing already extracted entities into the challenge format"

# ========== LOAD AND FILTER TRAIN DATA ==========


In [None]:
# Read the annotated training corpus (a JSON list of documents).
with open(TRAIN_JSON, encoding="utf-8") as train_fh:
    train_docs = json.load(train_fh)

# Split the documents into three parallel lists: text, entities, events.
train_texts, train_entities, train_events = [], [], []
for train_doc in train_docs:
    train_texts.append(train_doc["text"])
    train_entities.append(train_doc["entities"])
    train_events.append(train_doc["events"])

# Keep only the training documents whose events annotation is non-empty;
# `valid_train_idxs` records their positions in the original lists.
valid_train_idxs = []
valid_train_texts = []
valid_train_entities = []
valid_train_events = []
for idx, doc_events in enumerate(train_events):
    if not doc_events:
        continue
    valid_train_idxs.append(idx)
    valid_train_texts.append(train_texts[idx])
    valid_train_entities.append(train_entities[idx])
    valid_train_events.append(doc_events)

# ========== COMPUTE TRAIN EMBEDDINGS ==========
# Load the sentence-embedding model once; the same instance later encodes the test texts.
embedder = SentenceTransformer(EMBED_MODEL)

# Embed the texts of the training documents that carry event annotations.
train_embeddings = embedder.encode(
    valid_train_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# ========== LOAD TEST DATA ==========
# Read the documents to annotate (a JSON list of documents).
with open(TEST_JSON, encoding="utf-8") as test_fh:
    test_docs = json.load(test_fh)

# Parallel lists of article text and pre-extracted entities, one entry per test document.
test_texts, test_entities = [], []
for test_doc in test_docs:
    test_texts.append(test_doc["text"])
    test_entities.append(test_doc["entities"])


# ========== BUILD OPENAI BATCH TASKS ==========


In [None]:
# Build one Batch API request per test document. Each request carries the system
# prompt followed by up to K few-shot examples: the training documents whose
# embeddings are most cosine-similar to the test document.
tasks = []

# Retrieval only makes sense when there is something to retrieve from and K is
# positive. Without this guard an empty training set makes cosine_similarity raise,
# and K == 0 makes the `[-k:]` slice below select *every* training document.
can_retrieve = len(train_embeddings) > 0 and K > 0

for i, (text, entities) in enumerate(zip(test_texts, test_entities)):
    few_shot_blocks = []

    if can_retrieve:
        # Encode test text
        test_embedding = embedder.encode([text], convert_to_numpy=True)[0]

        # Compute similarity with train embeddings
        sims = cosine_similarity([test_embedding], train_embeddings)[0]

        # Get top-k similar train documents: argpartition isolates the k largest
        # similarities, the argsort then orders those k from most to least similar.
        k = min(K, len(sims))
        best_pos = np.argpartition(sims, -k)[-k:]
        best_pos = best_pos[np.argsort(sims[best_pos])[::-1]]

        # Build few-shot prompt blocks, in the same INPUT/ENTITIES/OUTPUT layout
        # as the user message so the model sees a consistent pattern.
        few_shot_blocks = [
            "INPUT: {}\nENTITIES: {}\nOUTPUT: {}\n".format(
                valid_train_texts[pos],
                json.dumps(valid_train_entities[pos], ensure_ascii=False),
                json.dumps(valid_train_events[pos],   ensure_ascii=False)
            )
            for pos in best_pos if sims[pos] > -1
        ]

    # Fall back to a zero-shot prompt when no example could be retrieved.
    prompt_with_few_shot = SYSTEM_PROMPT
    if few_shot_blocks:
        prompt_with_few_shot += "\n\n" + "\n\n".join(few_shot_blocks)

    user_content = f"INPUT: {text}\nENTITIES: {json.dumps(entities, ensure_ascii=False)}\nOUTPUT: "

    # Create a task entry for the batch; `custom_id` encodes the test-document
    # index so each result can be matched back to its document.
    tasks.append({
        "custom_id": f"task-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": CHAT_MODEL,
            "temperature": 0,
            "messages": [
                {"role": "system", "content": prompt_with_few_shot},
                {"role": "user",   "content": user_content}
            ]
        }
    })

print(f"Built {len(tasks)} tasks using top-{K} valid train docs per test entry.")

# ========== SAVE BATCH TASKS TO JSONL ==========


In [None]:
file_name = "batch_evalLLM.jsonl"

# Write one JSON object per line (JSONL). The encoding is pinned to UTF-8 because
# the objects are dumped with ensure_ascii=False and contain non-ASCII text; with
# the platform-default encoding the write can fail or produce a non-UTF-8 file.
with open(file_name, 'w', encoding='utf-8') as jsonl_fh:
    for obj in tasks:
        jsonl_fh.write(json.dumps(obj, ensure_ascii=False) + '\n')


# ========== SUBMIT BATCH TO OPENAI API ==========


In [None]:
# Upload the JSONL task file for batch processing. The file is opened in a
# `with` block so the handle is closed once the upload call returns, instead of
# being left open for the rest of the kernel session.
with open(file_name, "rb") as upload_fh:
    batch_file = client.files.create(
        file=upload_fh,
        purpose="batch"
    )

In [None]:
# Create the batch job that runs every request of the uploaded file against the
# chat-completions endpoint, with a 24h completion window.
batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

In [None]:
# Re-fetch the job by id so `batch_job` holds its latest state, then show it.
# This cell can be re-run to poll the job.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

In [None]:
# Display the status of the most recently fetched batch job.
batch_job.status

# ========== RETRIEVE BATCH OUTPUT ==========


In [None]:
# Download the batch output file; `result` holds its raw content.
result_file_id = batch_job.output_file_id
result_response = client.files.content(result_file_id)
result = result_response.content

In [None]:

result_file_name = "batch_job_results_evalLLM.jsonl"

# Persist the downloaded batch output unchanged (binary mode, no re-encoding).
with open(result_file_name, mode='wb') as results_out:
    results_out.write(result)

# ========== PARSE RESULTS AND BUILD FINAL OUTPUT ==========


In [None]:
# Parse the batch output file: one JSON object per line.
results = []
# The file was written as raw bytes from the API, so decode it explicitly as UTF-8
# rather than relying on the platform-default encoding.
with open(result_file_name, 'r', encoding='utf-8') as results_in:
    for line in results_in:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
            continue
        results.append(json.loads(line))

In [None]:
# Attach each model completion to its test document and save the combined file.
#
# * Results are keyed by the index encoded in `custom_id` ("task-<idx>") and written
#   in test-document order, because batch output lines are not guaranteed to come
#   back in input order.
# * A result without a usable completion (failed request, no choices, empty content)
#   gets an empty string, which the cleaning step turns into an empty events list.
# * The completion is stored in `completion_text`, not `result`, so the raw bytes
#   downloaded earlier are not overwritten and the file-writing cell stays re-runnable.
completions_by_idx = {}
for res in results:
    idx = int(res["custom_id"].split("-")[1])
    body = (res.get("response") or {}).get("body") or {}
    choices = body.get("choices") or []
    completion_text = ""
    if choices:
        completion_text = (choices[0].get("message") or {}).get("content") or ""
    if not completion_text:
        print(f"⚠️  no completion for {res['custom_id']} → events left empty")
    completions_by_idx[idx] = completion_text

missing = len(test_docs) - len(completions_by_idx)
if missing > 0:
    print(f"⚠️  {missing} test document(s) have no entry in the batch output")

final_docs = []
for idx in sorted(completions_by_idx):
    item = test_docs[idx]
    final_docs.append({
        "text": item['text'],
        "entities": item['entities'],
        "events":  completions_by_idx[idx]
    })

l = final_docs  # backward-compatible alias for the original variable name

with open('OUTPUT-FILE', 'w', encoding='utf-8') as f:
    json.dump(final_docs, f, ensure_ascii=False, indent=4)



# ========== CLEANING & STRUCTURING OUTPUT EVENTS ==========


In [None]:
# Input of the cleaning step: the file whose "events" fields are still raw model output.
RAW_JSON   = "OUTPUT-FILE"
# Output of the cleaning step: same documents, with "events" converted to lists.
CLEAN_JSON = "OUTPUT-FILE-cleaned.json"

def _strip_code_fence(text):
    """Remove a surrounding Markdown code fence (``` or ```json) from ``text``, if present."""
    if not text.startswith("```"):
        return text
    # Drop the opening fence line, including an optional language tag such as "json".
    newline_pos = text.find("\n")
    text = text[newline_pos + 1:] if newline_pos != -1 else text[3:]
    text = text.rstrip()
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _as_event_list(obj):
    """Unwrap an optional {"events": ...} envelope and return a list ([] for anything else)."""
    if isinstance(obj, dict):
        obj = obj.get("events")
    return obj if isinstance(obj, list) else []


def parse_events_field(events_field):
    """Convert an 'events' field to a valid list, handling strings, dicts, and malformed cases.

    Args:
        events_field: A list (returned as-is), a dict with an "events" key, or a
            string holding JSON or a Python literal, optionally wrapped in a
            Markdown code fence as chat models often emit.

    Returns:
        A list of events. Anything that cannot be interpreted as a list (empty or
        unparsable string, dict without "events", ``None``, scalar) yields ``[]``,
        so the result is always a list.
    """
    if not isinstance(events_field, str):
        return _as_event_list(events_field)

    text = _strip_code_fence(events_field.strip())
    if not text:
        return []

    # First try strict JSON, the format the prompt asks for.
    try:
        return _as_event_list(json.loads(text))
    except json.JSONDecodeError:
        pass

    # Fall back to a Python literal (e.g. single-quoted strings). literal_eval only
    # evaluates literals, so this does not execute code from the model output.
    try:
        return _as_event_list(ast.literal_eval(text))
    except Exception:
        # Deliberate best effort: a malformed answer must not abort the whole conversion.
        print("⚠️  could not parse one events string → left empty")
        return []

# ---------------- load, convert, save -------------------------------
# Read the raw predictions.
with open(RAW_JSON, encoding="utf-8") as raw_fh:
    docs = json.load(raw_fh)

# Normalize every document's "events" field in place; a missing field is treated as [].
for doc in docs:
    raw_events = doc.get("events", [])
    doc["events"] = parse_events_field(raw_events)

# Write the cleaned documents.
with open(CLEAN_JSON, "w", encoding="utf-8") as clean_fh:
    json.dump(docs, clean_fh, ensure_ascii=False, indent=2)

print("Converted strings → lists with NO structural changes.")
print("Clean file saved to:", CLEAN_JSON)


