# 1. --- IMPORTS AND CONFIGURATION ---

In [1]:
import os
import json
from pathlib import Path
from dotenv import load_dotenv
import openai
from pydantic import BaseModel

# Importing the custom preprocessing function for BASSE dataset and preparation tasks
from preprocessing.basse_preprocessing import process_basse_summaries
from preparation.basse_preparation import prepare_basse_tasks

# Importing the custom preprocessing functions for FLARES dataset and preparation tasks
from preprocessing.flares_preprocessing import load_and_merge_datasets, process_and_flatten_data
from preparation.flares_preparation import prepare_flares_tasks

# Import the Pydantic model used to validate the structured output
from pydantic_models.basse_pydantic_models import DetailedEvaluation

# Load environment variables (will look for .env file)
load_dotenv()

# Paths to datasets and prompts are built from the current working directory.
PROJECT_ROOT = Path.cwd()

# Initialize the OpenAI client.
# It will automatically read the OPENAI_API_KEY environment variable
try:
    client = openai.OpenAI()
except openai.OpenAIError as e:
    print("Error initializing the OpenAI client. Please ensure the OPENAI_API_KEY environment variable is configured.")
    print(e)
    # Abort with a non-zero status so callers can detect the failure.
    # `exit()` is a `site` convenience for interactive sessions: it is not
    # guaranteed to be defined and it reports success (status 0).
    raise SystemExit(1) from None

# 2. --- LOAD DATASETS ---

In [2]:
def load_basse_dataset(basse_path: str):
    """
    Runs the BASSE preprocessing step on the dataset file at ``basse_path``.

    Returns whatever ``process_basse_summaries`` produces. If that call raises
    ``FileNotFoundError``, an error message is printed and an empty list is
    returned instead.
    """
    try:
        processed_summaries = process_basse_summaries(basse_path)
    except FileNotFoundError:
        print(f"Error: The BASSE dataset file was not found at path '{basse_path}'")
        return []
    else:
        return processed_summaries


def load_flares_dataset(flares_path: list):
    """
    Loads and processes the FLARES dataset from one or more JSON files.

    Args:
        flares_path: Paths of the FLARES JSON files; passed unchanged to
            ``load_and_merge_datasets``.

    Returns:
        The result of ``process_and_flatten_data`` applied to the merged
        datasets, or an empty list if a ``FileNotFoundError`` is raised while
        loading or processing.
    """
    try:
        # Load and merge the datasets
        flares_datasets_merged = load_and_merge_datasets(flares_path)
        # Process and flatten the data
        return process_and_flatten_data(flares_datasets_merged)
    except FileNotFoundError:
        # Report every configured path. Indexing [0] and [1] raised IndexError
        # inside this handler when fewer than two paths were given and hid any
        # additional ones; for exactly two paths the message is unchanged.
        listed_paths = " and ".join(str(path) for path in flares_path)
        print(
            f"Error: The FLARES dataset files were not found at the specified paths: {listed_paths}")
        return []

# 3. --- FUNCTION TO LOAD THE PROMPT ---

In [3]:
def load_prompt_from_file(filepath: str) -> str:
    """Returns the full UTF-8 text of ``filepath``, or "" if the file does not exist."""
    try:
        prompt_file = open(filepath, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: The prompt file was not found at path '{filepath}'")
        # An empty string signals to callers that no prompt could be loaded
        return ""

    with prompt_file:
        return prompt_file.read()

# 3. --- EVALUATION FUNCTION WITH STRUCTURED OUTPUT ---

In [4]:
def evaluate_extraction(document: str, extraction: str, prompt_template: str, pydantic_model: "type[BaseModel]") -> tuple:
    """
    Asks the OpenAI chat model to evaluate an extraction and returns a validated result.

    The prompt template is filled with the document and the extraction, and the
    model is forced to answer through a single function tool
    ("save_evaluation") whose parameters are the JSON schema of
    ``pydantic_model``. The tool-call arguments are then validated against
    that model.

    Args:
        document: Original text, substituted for ``{original_document}``.
        extraction: Text to evaluate, substituted for ``{extraction_to_evaluate}``.
        prompt_template: ``str.format`` template using the two placeholders above.
        pydantic_model: Pydantic model class (not an instance) describing the
            expected structured output.

    Returns:
        ``(evaluation_object, token_usage)`` on success, where
        ``evaluation_object`` is an instance of ``pydantic_model`` and
        ``token_usage`` is the ``usage`` attribute of the API response.
        ``(None, None)`` if the template is empty or if the API call, the
        tool-call extraction or the validation fails.

    Raises:
        KeyError, IndexError, ValueError: may propagate from ``str.format`` if
            the template contains other placeholders or unbalanced braces;
            formatting happens outside the error-handling block.
    """
    if not prompt_template:
        print("Cannot proceed without a prompt template.")
        return None, None

    filled_prompt = prompt_template.format(
        original_document=document,
        extraction_to_evaluate=extraction
    )

    print("--- STARTING STRUCTURED EVALUATION ---")

    try:
        response = client.chat.completions.create(
            # Falls back to gpt-4.1-nano when OPENAI_MODEL is not set.
            model=os.environ.get("OPENAI_MODEL", "gpt-4.1-nano"),
            messages=[{"role": "user", "content": filled_prompt}],
            # 1. We define the "tool" that the model should use.
            #    We use .model_json_schema() to automatically generate the schema.
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "save_evaluation",
                        "description": "Saves the structured evaluation result of an extraction.",
                        "parameters": pydantic_model.model_json_schema()
                    }
                }
            ],
            # 2. We force the model to call our tool.
            tool_choice={"type": "function", "function": {"name": "save_evaluation"}}
        )

        # 3. We extract the result from the tool call arguments.
        #    The arguments are a JSON string, not an object.
        tool_calls = response.choices[0].message.tool_calls
        if not tool_calls:
            # Fail with an explicit reason instead of the opaque
            # "'NoneType' object is not subscriptable" from indexing None.
            raise ValueError("the model response contains no tool call")
        json_arguments = tool_calls[0].function.arguments

        # We capture tokens' usage
        token_usage = response.usage

        # 4. We parse and validate the JSON against our Pydantic model.
        evaluation_object = pydantic_model.model_validate_json(json_arguments)

        return evaluation_object, token_usage

    except Exception as e:
        # Deliberate best-effort boundary: one failed evaluation is reported
        # and skipped so that the surrounding batch can continue.
        print(f"An error occurred calling the API or processing the response: {e}")
        return None, None

# 4. --- MAIN ITERATION AND STORAGE LOGIC ---

In [5]:
def process_dataset(
        dataset: list,
        prepare_tasks_func: callable,
        prompt_path: str,
        pydantic_model: BaseModel
):
    """
    Evaluates every task produced from a dataset and collects the results.

    The prompt template is read from ``prompt_path``; if it comes back empty
    nothing is processed and ``([], 0)`` is returned. When the ENVIRONMENT
    variable is unset or equal to "development", only the first five dataset
    entries are processed.

    ``prepare_tasks_func`` is called once per entry and must yield 4-tuples of
    (document id, original text, text to evaluate, model name). Tasks whose
    evaluation yields no result or no usage data are skipped.

    Returns a tuple ``(records, total_tokens)``: one dictionary per successful
    evaluation, and the sum of their ``total_tokens``.
    """
    template = load_prompt_from_file(prompt_path)
    if not template:
        return [], 0

    # Development runs are capped to a small sample of the dataset
    is_development = os.environ.get("ENVIRONMENT", "development") == "development"
    selected_entries = dataset[:5] if is_development else dataset

    print(f"Processing {len(selected_entries)} documents...")

    collected = []
    tokens_spent = 0

    for item in selected_entries:
        # The preparer function hides the structural differences between datasets
        for doc_id, source_text, candidate_text, model_name in prepare_tasks_func(item):
            print(f"\n---> Evaluating '{model_name}' for doc: {doc_id}")

            evaluation, usage = evaluate_extraction(
                source_text,
                candidate_text,
                template,
                pydantic_model
            )

            # Nothing to store when the evaluation failed
            if not (evaluation and usage):
                continue

            tokens_spent += usage.total_tokens
            collected.append({
                "document_idx": doc_id,
                "model_evaluated": model_name,
                "evaluation_data": evaluation.model_dump(),
                "token_usage": {
                    "prompt_tokens": usage.prompt_tokens,
                    "completion_tokens": usage.completion_tokens,
                    "total_tokens": usage.total_tokens
                },
            })
            print(f"  Result stored. Used tokens: {usage.total_tokens}")

    return collected, tokens_spent

# 5. --- EXECUTION BLOCK ---

In [7]:
if __name__ == "__main__":
    # Script entry point: picks the dataset to evaluate, runs the evaluation
    # loop and exports the collected results as JSON.

    # --- Configure what you want to execute here ---
    # Options: "BASSE" or "FLARES"
    EVALUATION_TARGET = "FLARES"

    # --- EVALUATIONS PARAMETERS ---
    BASSE_DATASET_FILEPATH = str(PROJECT_ROOT / 'data' / 'basse' / 'BASSE.jsonl')
    FLARES_DATASET_FILEPATHS = [
        str(PROJECT_ROOT / 'data' / 'flares' / '5w1h_subtarea_1_train.json'),
        str(PROJECT_ROOT / 'data' / 'flares' / '5w1h_subtask_1_trial.json')
    ]
    PROMPT_FILE = str(PROJECT_ROOT / 'prompts' / 'evaluation_prompt_v3.txt')
    # Make sure the Pydantic model to use is defined or imported
    PYDANTIC_MODEL = DetailedEvaluation

    dataset_to_run = None
    task_preparer = None
    output_filename = None

    print(f"Starting evaluation for target: {EVALUATION_TARGET}")

    if EVALUATION_TARGET == "BASSE":
        dataset_to_run = load_basse_dataset(BASSE_DATASET_FILEPATH)
        task_preparer = prepare_basse_tasks
        output_filename = "results/evaluation_results_basse.json"
    elif EVALUATION_TARGET == "FLARES":
        dataset_to_run = load_flares_dataset(FLARES_DATASET_FILEPATHS)
        task_preparer = prepare_flares_tasks
        output_filename = "results/evaluation_results_flares.json"

    if dataset_to_run and task_preparer:
        # Create the output directory *before* the API calls are made:
        # otherwise a missing "results/" folder only surfaces as a
        # FileNotFoundError at export time, after the tokens have been spent,
        # and the results are lost.
        Path(output_filename).parent.mkdir(parents=True, exist_ok=True)

        final_results, total_tokens = process_dataset(
            dataset=dataset_to_run,
            prepare_tasks_func=task_preparer,
            prompt_path=PROMPT_FILE,
            pydantic_model=PYDANTIC_MODEL
        )

        print("\n\n####################################################")
        print(f"####### {EVALUATION_TARGET} EVALUATION FINISHED #######")
        print(f"         Total used tokens: {total_tokens}          ")
        print("####################################################")

        # ensure_ascii=False keeps non-ASCII characters readable in the export
        export_data = {"total_tokens": total_tokens, "results": final_results}
        with open(output_filename, "w", encoding="utf-8") as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)
        print(f"\nResults exported to: {output_filename}")
    else:
        # Reached for an unknown EVALUATION_TARGET or an empty/failed dataset load
        print("Evaluation target not found or dataset could not be loaded.")


Starting evaluation for target: FLARES
Processed and merged 1753 objects from 2 file(s).
Example of the first object in 'merged_dataset':
{
  "Id": 732,
  "Text": "Dos días, exactamente han pasado dos días desde que Sánchez compareciera en rueda de prensa en la Moncloa afirmando que a España llegarían, entre abril y septiembre, un total de 87 millones de vacunas para darnos cuenta de que las mentiras de Sánchez hacen bueno ese refrán que dice que “la mentira tiene las patas muy cortas”.",
  "Processed_Tags": [
    {
      "5W1H_Label": "WHO",
      "Enumerated_Tag_Id": "WHO_1",
      "Reliability_Label": "confiable",
      "Tag_Text": "Sánchez",
      "Tag_Start": 52
    },
    {
      "5W1H_Label": "WHERE",
      "Enumerated_Tag_Id": "WHERE_1",
      "Reliability_Label": "confiable",
      "Tag_Text": "en rueda de prensa en la Moncloa",
      "Tag_Start": 73
    },
    {
      "5W1H_Label": "WHO",
      "Enumerated_Tag_Id": "WHO_2",
      "Reliability_Label": "confiable",
      "Tag_T