# 1. --- IMPORTS AND CONFIGURATION ---

In [None]:
import os
import json
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
import openai
from pydantic import BaseModel

# Importing the custom preprocessing function for BASSE dataset and preparation tasks
from preprocessing.basse_preprocessing import process_basse_extractions
from preparation.basse_preparation import prepare_basse_tasks

# Importing the custom preprocessing functions for FLARES dataset and preparation tasks
from preprocessing.flares_preprocessing import load_and_merge_datasets, process_and_flatten_data
from preparation.flares_preparation import prepare_flares_tasks

# Define the Pydantic models for structured output
from pydantic_models.output_pydantic_models import DetailedEvaluation

# Import the helper function to create the expert review task structure
from validation.create_expert_review_task import create_expert_review_task


# Root directory used to resolve data and prompt paths (the current working directory).
PROJECT_ROOT = Path.cwd()

# Load environment variables (will look for .env file)
load_dotenv()

# --- Dynamic AI Provider Configuration ---

# Read environment variables
ai_provider = os.getenv("MODEL_PROVIDER", "openai").lower()  # Default to 'openai'
model_name = os.getenv("MODEL", "gpt-5-mini")  # Default to 'gpt-5-mini'

api_key = None
base_url = None

print(f"Using provider: {ai_provider.capitalize()}")

# Every provider is reached through the OpenAI client; only the API key
# variable and the base URL differ.
if ai_provider == "openai":
    api_key = os.getenv("OPENAI_API_KEY")
    # base_url is not needed; the client defaults to OpenAI's endpoint.
elif ai_provider == "gemini":
    api_key = os.getenv("GEMINI_API_KEY")
    base_url = "https://generativelanguage.googleapis.com/v1beta/openai/"
elif ai_provider == "anthropic":
    api_key = os.getenv("ANTHROPIC_API_KEY")
    base_url = "https://api.anthropic.com/v1/"
elif ai_provider == "openrouter":
    api_key = os.getenv("OPENROUTER_API_KEY")
    base_url = "https://openrouter.ai/api/v1"
else:
    print(
        f"Error: AI provider '{ai_provider}' is not supported. "
        "Options: openai, gemini, anthropic, openrouter."
    )
    # SystemExit(1) instead of the interactive-only exit(): works without the
    # `site` module and reports failure through a non-zero exit status.
    raise SystemExit(1)

# --- Client Initialization ---

# Validate that the API key and model are defined
if not api_key:
    print(f"Error: API key environment variable for '{ai_provider}' is not set.")
    raise SystemExit(1)
if not model_name:
    print("Error: 'MODEL' environment variable is not set.")
    raise SystemExit(1)

# Initialize the OpenAI client with the dynamic configuration
try:
    client = openai.OpenAI(api_key=api_key, base_url=base_url)
except openai.OpenAIError as e:
    print(f"Error initializing the client for {ai_provider.capitalize()}.")
    print(e)
    raise SystemExit(1)

# Now you can use the client and model name in your API calls
print(f"✅ Client initialized successfully. Model to use: {model_name}")

# 2. --- LOAD DATASETS ---

In [None]:
def load_basse_dataset(basse_path: str):
    """
    Load the BASSE dataset by delegating to ``process_basse_extractions``.

    Args:
        basse_path: Path of the BASSE JSONL file to process.

    Returns:
        Whatever ``process_basse_extractions`` returns, or an empty list
        (after printing an error) when it raises ``FileNotFoundError``.
    """
    try:
        records = process_basse_extractions(basse_path)
    except FileNotFoundError:
        print(f"Error: The BASSE dataset file was not found at path '{basse_path}'")
        records = []
    return records


def load_flares_dataset(flares_path: list):
    """
    Loads and processes the FLARES dataset from one or more JSON files.

    Args:
        flares_path: List of file paths handed to ``load_and_merge_datasets``.

    Returns:
        The result of ``process_and_flatten_data`` applied to the merged
        datasets, or an empty list (after printing an error) when a
        ``FileNotFoundError`` is raised while loading or processing.
    """
    try:
        # Load and merge the datasets
        flares_datasets_merged = load_and_merge_datasets(flares_path)
        # Process and flatten the data
        return process_and_flatten_data(flares_datasets_merged)
    except FileNotFoundError:
        # Join every path instead of indexing [0] and [1], so that reporting
        # the error cannot itself fail when fewer than two paths were given.
        joined_paths = " and ".join(str(path) for path in flares_path)
        print(f"Error: The FLARES dataset files were not found at the specified paths: {joined_paths}")
        return []

# 3. --- FUNCTION TO LOAD THE PROMPT ---

In [None]:
def load_prompts_from_files(system_prompt_filepath: str, user_prompt_template_filepath: str) -> tuple:
    """
    Reads the system prompt and the user prompt template from two text files.

    Args:
        system_prompt_filepath: Path of the UTF-8 file holding the system prompt.
        user_prompt_template_filepath: Path of the UTF-8 file holding the user
            prompt template.

    Returns:
        A ``(system_prompt, user_prompt)`` tuple of strings. If either file is
        missing, an error is printed and ``("", "")`` is returned, so callers
        can always unpack two values.
    """
    try:
        with open(system_prompt_filepath, 'r', encoding='utf-8') as sf:
            system_prompt = sf.read()
        with open(user_prompt_template_filepath, 'r', encoding='utf-8') as uf:
            user_prompt = uf.read()
        return system_prompt, user_prompt
    except FileNotFoundError as e:
        # Name the missing file so the failure can be diagnosed.
        print(f"Error: The prompt file was not found: {e.filename}")
        # Return a pair (not a bare string) to match the success-path shape.
        return "", ""

# 4. --- EVALUATION FUNCTION WITH STRUCTURED OUTPUT ---

In [None]:
def evaluate_extraction(document: str, extraction: str, system_prompt: str, user_prompt_template: str,
                        pydantic_model: "type[BaseModel]") -> tuple:
    """
    Evaluates one extraction against its source document via a forced tool call.

    The chat completion is sent through the module-level ``client`` using the
    module-level ``model_name``. The model is forced to call a single tool,
    "save_evaluation", whose parameter schema is generated from
    ``pydantic_model``; the tool-call arguments are then validated against that
    same model.

    Args:
        document: Original document text, substituted for ``{original_document}``.
        extraction: Extraction text, substituted for ``{extraction_to_evaluate}``.
        system_prompt: Content of the system message.
        user_prompt_template: ``str.format`` template for the user message.
        pydantic_model: Pydantic model class used for the tool schema and for
            validating the returned JSON.

    Returns:
        ``(evaluation_object, token_usage)`` on success, or ``(None, None)``
        when either prompt is empty or when the API call or the response
        processing raises.
    """
    # Both prompts are required. Written as an explicit `or`, because
    # `not a and b` parses as `(not a) and b` and would let an empty user
    # template (or two empty prompts) through.
    if not system_prompt or not user_prompt_template:
        print("Cannot proceed without a prompt template.")
        return None, None

    user_prompt_filled = user_prompt_template.format(
        original_document=document,
        extraction_to_evaluate=extraction
    )

    print("--- STARTING STRUCTURED EVALUATION ---")

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt_filled}
            ],
            # 1. We define the "tool" that the model should use.
            #    We use .model_json_schema() to automatically generate the schema.
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "save_evaluation",
                        "description": "Saves the structured evaluation result of an extraction.",
                        "parameters": pydantic_model.model_json_schema()
                    }
                }
            ],
            # 2. We force the model to call our tool.
            tool_choice={"type": "function", "function": {"name": "save_evaluation"}}
        )

        # 3. We extract the result from the tool call arguments.
        #    The arguments are a JSON string, not an object.
        tool_call = response.choices[0].message.tool_calls[0]
        json_arguments = tool_call.function.arguments

        # We capture tokens' usage
        token_usage = response.usage

        # 4. We parse and validate the JSON against our Pydantic model.
        evaluation_object = pydantic_model.model_validate_json(json_arguments)

        return evaluation_object, token_usage

    except Exception as e:
        # Best-effort boundary: any API, parsing, or validation failure is
        # reported and turned into (None, None) so the caller can skip the task.
        print(f"An error occurred calling the API or processing the response: {e}")
        return None, None

# 5. --- MAIN ITERATION AND STORAGE LOGIC ---

In [None]:
def process_dataset(
        dataset: list,
        prepare_tasks_func: callable,
        system_prompt_path: str,
        user_prompt_template_path: str,
        pydantic_model: "type[BaseModel]",
        env: str = "development",
):
    """
    A generic main loop to process any dataset using a specific task preparer.

    Args:
        dataset: List of dataset entries.
        prepare_tasks_func: Callable that, given one entry, yields tasks of the
            form ``(doc_id, original_text, extraction_to_evaluate, model_name)``.
        system_prompt_path: Path of the system prompt file.
        user_prompt_template_path: Path of the user prompt template file.
        pydantic_model: Pydantic model class passed on to ``evaluate_extraction``.
        env: When equal to ``"development"`` only the first dataset entry is
            processed; any other value processes the whole dataset.

    Returns:
        ``(all_evaluations, expert_review_tasks, total_tokens)``. When the
        prompts cannot be loaded the result is ``([], [], 0)``, so callers can
        always unpack three values.
    """
    loaded_prompts = load_prompts_from_files(system_prompt_path, user_prompt_template_path)
    # The loader may hand back an empty value when a prompt file is missing;
    # check before unpacking so that case does not raise.
    if not loaded_prompts:
        return [], [], 0
    system_prompt, user_prompt_template = loaded_prompts
    # Both prompts are required (`not a and b` would parse as `(not a) and b`).
    if not system_prompt or not user_prompt_template:
        return [], [], 0

    all_evaluations = []
    expert_review_tasks = []
    total_tokens = 0
    docs_to_process = dataset[:1] if env == "development" else dataset

    print(f"Processing {len(docs_to_process)} documents...")

    for entry in docs_to_process:
        for task in prepare_tasks_func(entry):
            # `evaluated_model` is the model whose extraction is being judged;
            # named distinctly from the module-level `model_name` (the judge).
            doc_id, original_text, extraction_to_evaluate, evaluated_model = task

            print(f"\n---> Evaluating '{evaluated_model}' for doc: {doc_id}")

            evaluation_object, usage_data = evaluate_extraction(
                original_text,
                extraction_to_evaluate,
                system_prompt,
                user_prompt_template,
                pydantic_model
            )

            # A failed evaluation yields (None, None); skip it and keep going.
            if not (evaluation_object and usage_data):
                continue

            total_tokens += usage_data.total_tokens
            result_record = {
                "document_idx": doc_id,
                "model_evaluated": evaluated_model,
                "evaluation_data": evaluation_object.model_dump(),
                "token_usage": {
                    "prompt_tokens": usage_data.prompt_tokens,
                    "completion_tokens": usage_data.completion_tokens,
                    "total_tokens": usage_data.total_tokens
                },
            }
            all_evaluations.append(result_record)

            # --- Create expert review structure using helper function ---
            review_task = create_expert_review_task(
                doc_id,
                evaluated_model,
                original_text,
                extraction_to_evaluate,
                evaluation_object
            )
            expert_review_tasks.append(review_task)

            print(f"  Result stored. Used tokens: {usage_data.total_tokens}")

    return all_evaluations, expert_review_tasks, total_tokens

# 6. --- EXECUTION BLOCK ---

In [None]:
if __name__ == "__main__":
    # --- Configure what you want to execute here ---
    # Options: "BASSE" or "FLARES" (case-insensitive; defaults to "BASSE")
    evaluation_dataset = os.environ.get("EVALUATION_DATASET", "BASSE").upper()

    # Get environment value ("development" limits the run to one dataset entry)
    environment = os.getenv("ENVIRONMENT", "development")

    # Reached when the variable is set to an empty string.
    if not evaluation_dataset:
        print("Error: 'EVALUATION_DATASET' environment variable is not set.")
        raise SystemExit(1)
    if evaluation_dataset not in ("BASSE", "FLARES"):
        print("Error: 'EVALUATION_DATASET' must be either 'BASSE' or 'FLARES'.")
        raise SystemExit(1)

    # --- EVALUATIONS PARAMETERS ---
    BASSE_DATASET_FILEPATH = str(PROJECT_ROOT / 'data' / 'basse' / 'BASSE.jsonl')
    FLARES_DATASET_FILEPATHS = [
        str(PROJECT_ROOT / 'data' / 'flares' / '5w1h_subtarea_1_train.json'),
        str(PROJECT_ROOT / 'data' / 'flares' / '5w1h_subtask_1_trial.json')
    ]
    SYSTEM_PROMPT_FILE = str(PROJECT_ROOT / 'prompts' / 'system_evaluation_prompt.txt')
    USER_PROMPT_FILE = str(PROJECT_ROOT / 'prompts' / 'user_evaluation_prompt.txt')
    # Make sure the Pydantic model to use is defined or imported
    PYDANTIC_MODEL = DetailedEvaluation

    dataset_to_run = None
    task_preparer = None

    print(f"Starting evaluation for target: {evaluation_dataset}")

    if evaluation_dataset == "BASSE":
        dataset_to_run = load_basse_dataset(BASSE_DATASET_FILEPATH)
        task_preparer = prepare_basse_tasks
    elif evaluation_dataset == "FLARES":
        dataset_to_run = load_flares_dataset(FLARES_DATASET_FILEPATHS)
        task_preparer = prepare_flares_tasks

    if dataset_to_run and task_preparer:
        final_results, review_tasks, total_tokens = process_dataset(
            dataset=dataset_to_run,
            prepare_tasks_func=task_preparer,
            system_prompt_path=SYSTEM_PROMPT_FILE,
            user_prompt_template_path=USER_PROMPT_FILE,
            pydantic_model=PYDANTIC_MODEL,
            env=environment
        )

        # --- Create the dynamic filename ---

        # 1. Get the current date and time
        current_date = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        # 2. Sanitize model name to be filename-friendly (replaces slashes)
        safe_model_name = model_name.replace('/', '_')

        # 3. Assemble the final filenames
        output_filename = f"results/{current_date}_{environment}_{evaluation_dataset}_{ai_provider}_{safe_model_name}.json"
        output_review_task_filename = f"results/{current_date}_{environment}_{evaluation_dataset}_{ai_provider}_{safe_model_name}_review.json"

        # --- Export the data to the results and review files ---

        # open(..., "w") does not create missing directories, so make sure the
        # output folder exists before writing.
        os.makedirs("results", exist_ok=True)

        export_data = {"total_tokens": total_tokens, "results": final_results}

        with open(output_filename, "w", encoding="utf-8") as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        print(f"\nResults exported to: {output_filename}")

        with open(output_review_task_filename, "w", encoding="utf-8") as f:
            json.dump(review_tasks, f, indent=2, ensure_ascii=False)

        # Report the review file's own path (not the results file's).
        print(f"\nReview data exported to: {output_review_task_filename}")
    else:
        print("Evaluation target not found or dataset could not be loaded.")
