# LLM-based event detection

In [1]:
# pip install openai

In [2]:
# pip install fastapi-poe

In [3]:
import os
from dotenv import load_dotenv
import openai
import pandas as pd

from src.vars import OUTPUT_DIR, ROOT_DIR
import json
import re
import time

In [4]:
# Run-mode switches read by the cells below.
EXECUTE_LLM = True  # set to False to skip the LLM calls and avoid wasting API resources by mistake
EXECUTE_PRELIMINARY_PHASE = False  # True: run every prompt/model combination on the saved sample
EXTRACT_PRELIMINARY_SAMPLE = False  # True: draw a new evaluation sample and save its case ids

In [5]:
# Input table; later cells read its `case_id` and `complementary_info` columns.
df = pd.read_csv(f'{OUTPUT_DIR}/df_complementary_info_filter_dates_keywords_acronims.csv')

In [6]:
# Resume from the results saved by a previous run when the file is present;
# otherwise start from an empty frame with the two expected columns.
if os.path.exists(f'{OUTPUT_DIR}/df_llm_output.csv'):
    df_llm_output = pd.read_csv(f'{OUTPUT_DIR}/df_llm_output.csv')
else:
    df_llm_output = pd.DataFrame(columns=['case_id', 'output'])

# The preliminary phase never resumes: it always starts from an empty frame.
if EXECUTE_PRELIMINARY_PHASE:
    df_llm_output = pd.DataFrame(columns=['case_id', 'output'])

df_llm_output

Unnamed: 0,case_id,output
0,219-630869,"['[{""event"": ""DDG n. 36"", ""date"": ""31/05/2022""..."
1,123-350412,['[]']
2,153-437453,"['[{""event"": ""DGC n. 567"", ""date"": ""07/07/2022..."
3,243-697432,"['[{""event"": ""DD n. 73/2022"", ""date"": ""28/10/2..."
4,035-089020,['[]']
...,...,...
596,102-284909,['[]']
597,114-321551,['[]']
598,248-722796,['[]']
599,164-467269,"['[{""event"": ""DD n. 141"", ""date"": ""17/12/2021""..."


In [7]:
def extract_json_objects(text):
    """Extract every valid JSON array embedded in ``text``.

    The text is scanned for bracketed spans ``[...]`` containing at most one
    level of nested brackets. Each candidate is parsed with ``json.loads``;
    candidates that fail to parse are ignored. The ones that parse are
    returned re-serialised with ``json.dumps``, i.e. as normalised JSON text
    rather than the exact substring found.

    Args:
        text: The string to scan.

    Returns:
        A list of JSON strings, one per valid array, in order of appearance.
        The list is empty when no candidate parses.

    Raises:
        AssertionError: If ``text`` is not a string.
    """
    assert isinstance(text, str), "text must be a str"

    # A JSON-like array that may contain one level of nested arrays.
    # Group 1 is the whole (outer) array; group 2 is only a repetition helper.
    array_pattern = r'(\[[^\[\]]*(\[[^\[\]]*\][^\[\]]*)*\])'

    normalised = []
    for match in re.finditer(array_pattern, text):
        candidate = match.group(1)
        try:
            # Parse once and reuse the parsed value for the re-serialisation.
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            # Bracketed text that is not valid JSON (e.g. "[see note]").
            continue
        normalised.append(json.dumps(parsed))

    return normalised

In [8]:
# Load the system prompt; run_prompt sends it as the "system" message of every request.
sys_prompt = ''
with open(f'{ROOT_DIR}/res/prompts/System Prompt.txt', encoding='utf-8') as file:
    sys_prompt = file.read()

In [9]:
# Sizes and seed used to pick the rows handed to the LLM.
PRELIMINARY_SAMPLE_SIZE = 10   # rows drawn for the preliminary evaluation
PRELIMINARY_SAMPLE_SEED = 42   # fixed seed: re-extracting yields the same sample
MAX_INSTANCES = 600            # rows processed in the full run

if EXTRACT_PRELIMINARY_SAMPLE:
    # Draw a reproducible sample and persist its case ids for later runs.
    # The target directory is created first because to_csv does not create it.
    os.makedirs(f'{OUTPUT_DIR}/preliminary_evaluation', exist_ok=True)
    idx_sample = df.sample(n=PRELIMINARY_SAMPLE_SIZE, random_state=PRELIMINARY_SAMPLE_SEED).case_id
    idx_sample.to_csv(f'{OUTPUT_DIR}/preliminary_evaluation/idx_sample.csv', index=False)
    df = df[df.case_id.isin(idx_sample)]
elif EXECUTE_PRELIMINARY_PHASE:
    # Reuse the previously saved sample so every prompt/model sees the same cases.
    idx_sample = pd.read_csv(f'{OUTPUT_DIR}/preliminary_evaluation/idx_sample.csv').case_id
    df = df[df.case_id.isin(idx_sample)]
else:
    # Full run: restrict the input to the first MAX_INSTANCES rows.
    df = df.head(MAX_INSTANCES)

In [10]:
# Show the rows selected by the previous cell (the ones that will be sent to the LLM).
df

Unnamed: 0,case_id,complementary_info
0,219-630869,Atto esito di gara: Provvedimento DG n. 36 del...
1,123-350412,RUP: dott.ssa Maria Lomboni. Delibera di aggiu...
2,153-437453,"1) il bando di gara, il fac-simile modulo di o..."
3,243-697432,Determinazione Agg.ne AD di Marche Multiserviz...
4,035-089020,1) La procedura è gestita con il Sistema telem...
...,...,...
595,118-333509,Determinazioni a contrarre e di approvazione d...
596,102-284909,La ricezione delle richieste di partecipazione...
597,114-321551,La procedura di scelta del contraente si svolg...
598,248-722796,La presente procedura viene gestita interament...


In [11]:
def run_prompt(user_message: str, model: str, delay_s: float = 3) -> dict:
    """Send one user message to ``model`` and collect the streamed answer.

    The request is issued through the module-level ``client`` and always uses
    the module-level ``sys_prompt`` as the system message.

    Args:
        user_message: Content of the "user" message.
        model: Model name passed to the chat-completions endpoint.
        delay_s: Seconds to sleep before issuing the request. Defaults to 3,
            the pause the function has always applied (presumably to stay
            within the provider's rate limits -- confirm).

    Returns:
        A one-entry dict ``{model: answer_text}`` where ``answer_text`` is the
        concatenation of all streamed content fragments ('' if none arrived).
    """
    time.sleep(delay_s)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_message}],
        stream=True
    )

    fragments = []
    for chunk in response:
        # A stream chunk may carry no choices at all (e.g. a usage-only
        # chunk); indexing choices[0] on it would raise IndexError.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content:
            fragments.append(content)

    return {model: ''.join(fragments)}

def analyse_instances(df, model, idx):
    """Run prompt ``idx`` with ``model`` on the rows of ``df`` and store the JSON found.

    For every row whose ``case_id`` is not already in the module-level
    ``df_llm_output``, the row's ``complementary_info`` is substituted into
    ``prompts[idx]`` (a ``%``-style template), the message is sent through
    ``run_prompt`` and the answer is scanned with ``extract_json_objects``.
    If no JSON array is found the request is repeated once; if the second
    answer has none either, the row is reported and skipped.

    Each successful row is appended to ``df_llm_output`` as
    ``{"case_id": ..., "output": <list of JSON strings>}``. The frame is
    written to CSV whenever the row label is a multiple of 5 and once more
    when the loop ends or is interrupted, so rows processed after the last
    checkpoint are not lost.

    Args:
        df: Frame with ``case_id`` and ``complementary_info`` columns and an
            integer index.
        model: Model name forwarded to ``run_prompt``.
        idx: Position of the prompt template in the module-level ``prompts``.

    Returns:
        None. Results are accumulated in ``df_llm_output`` and on disk.
    """
    # The destination depends only on the run mode, so it is resolved once.
    if EXECUTE_PRELIMINARY_PHASE:
        output_path = f'{OUTPUT_DIR}/preliminary_evaluation/prompt_{idx}_{model}.csv'
    else:
        output_path = f'{OUTPUT_DIR}/df_llm_output.csv'

    prompt = prompts[idx]
    # Set for O(1) membership tests; also updated below so a case_id that
    # appears more than once in df is only sent to the LLM once.
    executed_caseid = set(df_llm_output['case_id'].tolist())
    max_attempts = 2  # first request plus one retry
    unsaved_rows = 0

    try:
        for index, row in df.iterrows():
            if row.case_id in executed_caseid:
                continue
            print(f"Row id: {index}")

            # One-element tuple keeps %-formatting unambiguous.
            user_message = prompt % (row['complementary_info'],)

            json_objs = []
            for _attempt in range(max_attempts):
                result = run_prompt(user_message, model)
                json_objs = extract_json_objects(result[model])
                if json_objs:
                    break
            if not json_objs:
                print(f"Error: no json objects found again in instance {index} -> '{row}'")
                continue

            # The whole list is stored, even when more than one array was found.
            df_llm_output.loc[len(df_llm_output)] = {"case_id": row.case_id, "output": json_objs}
            executed_caseid.add(row.case_id)
            unsaved_rows += 1

            if index % 5 == 0:
                print(f"TED number {index} / {len(df.index)}...")
                df_llm_output.to_csv(output_path, index=False)
                unsaved_rows = 0
    finally:
        # Persist whatever was collected since the last checkpoint, both on
        # normal completion and when an exception interrupts the loop.
        if unsaved_rows:
            df_llm_output.to_csv(output_path, index=False)

## Prompt validation, model selection, execution

In [12]:
# `prompts` holds one user-message template per prompting strategy; slot 0 is the zero-shot prompt.
prompts = ["", "", ""]
with open(f'{ROOT_DIR}/res/prompts/prompt1 - zero shot.txt', encoding='utf-8') as file:
    prompts[0] = file.read()

In [13]:
# Slot 1: the few-shot prompt template.
prompts[1] = ""
with open(f'{ROOT_DIR}/res/prompts/prompt2 - few shot.txt', encoding='utf-8') as file:
    prompts[1] = file.read()

In [14]:
# Slot 2: the chain-of-thoughts prompt template.
prompts[2] = ""
with open(f'{ROOT_DIR}/res/prompts/prompt3 - chain of thoughts.txt', encoding='utf-8') as file:
    prompts[2] = file.read()

In [15]:
if EXECUTE_LLM:
    # The API key is read from the environment (.env file), never hard-coded.
    load_dotenv()
    client = openai.OpenAI(
        api_key=os.getenv('POE_API_KEY'),
        base_url="https://api.poe.com/v1",
    )

    if EXECUTE_PRELIMINARY_PHASE:
        # Preliminary phase: evaluate every prompt with every candidate model.
        models = ["Llama-3.1-8B", "GPT-4o", "Claude-Sonnet-4"]

        for idx in range(len(prompts)):
            for model in models:
                print(f"{model} - prompt {idx}")
                # Start every combination from an empty frame. analyse_instances
                # skips case ids already present in df_llm_output, so sharing one
                # frame would make every combination after the first a no-op.
                df_llm_output = pd.DataFrame(columns=['case_id', 'output'])
                analyse_instances(df, model, idx)
    else:
        # Full run with the combination chosen after the preliminary phase.
        model = "Claude-Sonnet-4"
        idx = 2
        print(f"{model} - prompt {idx}")
        analyse_instances(df, model, idx)

Claude-Sonnet-4 - prompt 2


In [16]:
# Inspect the accumulated LLM results (one row per case_id).
df_llm_output

Unnamed: 0,case_id,output
0,219-630869,"['[{""event"": ""DDG n. 36"", ""date"": ""31/05/2022""..."
1,123-350412,['[]']
2,153-437453,"['[{""event"": ""DGC n. 567"", ""date"": ""07/07/2022..."
3,243-697432,"['[{""event"": ""DD n. 73/2022"", ""date"": ""28/10/2..."
4,035-089020,['[]']
...,...,...
596,102-284909,['[]']
597,114-321551,['[]']
598,248-722796,['[]']
599,164-467269,"['[{""event"": ""DD n. 141"", ""date"": ""17/12/2021""..."


In [17]:
# Manually annotated results of the preliminary phase:
# number of correct instances (out of 10) per prompt index and model.
preliminary_scores = {
    0: {"Claude-Sonnet-4": 8, "GPT-4o": 7, "Llama-3.1-8B": 2},
    1: {"Claude-Sonnet-4": 7, "GPT-4o": 7, "Llama-3.1-8B": 6},
    2: {"Claude-Sonnet-4": 9, "GPT-4o": 8, "Llama-3.1-8B": 4},
}

print("Correct instances (manual annotation) after the Preliminary Phase:")

for scored_prompt, scores_by_model in preliminary_scores.items():
    # Blank line before each prompt group.
    print()
    for scored_model, n_correct in scores_by_model.items():
        print(f"Prompt {scored_prompt} - {scored_model}: {n_correct}/10")

print("Chosen combination: Prompt 2 - Claude-Sonnet-4")

Correct instances (manual annotation) after the Preliminary Phase:

Prompt 0 - Claude-Sonnet-4: 8/10
Prompt 0 - GPT-4o: 7/10
Prompt 0 - Llama-3.1-8B: 2/10

Prompt 1 - Claude-Sonnet-4: 7/10
Prompt 1 - GPT-4o: 7/10
Prompt 1 - Llama-3.1-8B: 6/10

Prompt 2 - Claude-Sonnet-4: 9/10
Prompt 2 - GPT-4o: 8/10
Prompt 2 - Llama-3.1-8B: 4/10
Chosen combination: Prompt 2 - Claude-Sonnet-4
