# 1. --- IMPORTS AND CONFIGURATION ---

In [1]:
import os
import json
from pathlib import Path
from dotenv import load_dotenv
import openai
from pydantic import BaseModel, Field
from preprocessing.basse_preprocessing import process_basse_summaries

# Load environment variables (will look for .env file)
load_dotenv()

# Every dataset path below is resolved against the current working directory.
PROJECT_ROOT = Path.cwd()

BASSE_DATASET_FILEPATH = str(PROJECT_ROOT / 'data' / 'basse' / 'BASSE.jsonl')
# NOTE(review): the train file is spelled "subtarea" while the trial file is
# spelled "subtask" - confirm that both names match the files on disk.
FLARES_TRAIN_DATASET_FILEPATH = str(PROJECT_ROOT / 'data' / 'flares' / '5w1h_subtarea_1_train.json')
# NOTE(review): "TRAIL" in this constant's name looks like a typo for "TRIAL";
# the name is left unchanged because code outside this view may reference it.
FLARES_TRAIL_DATASET_FILEPATH = str(PROJECT_ROOT / 'data' / 'flares' / '5w1h_subtask_1_trial.json')

# Initialize the OpenAI client.
# It will automatically read the OPENAI_API_KEY environment variable
try:
    client = openai.OpenAI()
except openai.OpenAIError as e:
    print("Error initializing the OpenAI client. Please ensure the OPENAI_API_KEY environment variable is configured.")
    print(e)
    # Abort with a failure status. SystemExit is raised explicitly instead of
    # calling the interactive `exit()` helper, which is intended for the
    # interpreter shell rather than for programs and reports success (status 0).
    raise SystemExit(1) from e

# 2. --- LOAD DATASETS ---

In [2]:
# Parse the BASSE file with the project-local preprocessing helper.
# The evaluation loop further down slices this result and reads the "idx",
# "original_document" and "*_summ" keys from each entry.
basse_dataset = process_basse_summaries(BASSE_DATASET_FILEPATH)

First object from the list of summaries:
{
  "idx": "http://elpais.com/deportes/2019/08/17/actualidad/1566005143_044557.html",
  "round": 1,
  "original_document": "El jet lag ante Argentina , que quedó maquillado por el arrebato febril de Ricky ( 15 puntos en los últimos cuatro minutos ) , se consolidó 24 horas después ante Rusia . A tres días del comienzo del Mundial , de nuevo sin Marc Gasol en la rotación , el conjunto de Scariolo firmó su segunda derrota de la preparación y , por números y sensaciones , abrazó las dudas justo antes del estreno oficial . En el marcador , un contundente 55-74 ; en la estadística , unos pobres porcentajes de tiro ( 15 de 37 de dos , 3 de 18 en triples y 16 de 19 en tiros libres ) , 18 pérdidas , 12 robos del rival… Los 26 puntos y 11 rebotes de Willy Hernangómez fueron lo único lustroso de una prueba para olvidar , o para tentarse la ropa . “ Esperemos que esta no sea la referencia . Nos faltan muchos jugadores y la cabeza estará también en una dimen

# 2b. --- DEFINITION OF STRUCTURED OUTPUT MODEL ---

In [3]:
# Inclusive bounds shared by every score: ge = greater than or equal to,
# le = less than or equal to.
_SCORE_BOUNDS = {"ge": 1, "le": 5}


class Scores(BaseModel):
    """Contains numerical scores for each evaluation criterion."""

    # NOTE(review): the docstring above and every `description` below end up in
    # the schema produced by `model_json_schema()`, which this notebook sends to
    # the model as tool parameters - treat them as runtime text, not comments.

    # All six criteria are required integers constrained to the range 1..5.
    factual_accuracy: int = Field(
        ...,
        description="Score (1–5) for: Is the extracted information correct and does it faithfully reflect the facts presented in the source text?",
        **_SCORE_BOUNDS,
    )
    completeness: int = Field(
        ...,
        description="Score (1–5) for: Does the extraction capture all essential information from the source text that answers the specific 5W1H question?",
        **_SCORE_BOUNDS,
    )
    relevance_and_conciseness: int = Field(
        ...,
        description="Score (1–5) for: Does the extraction focus only on the answer, avoiding superfluous information or content that would belong to another 5W1H element?",
        **_SCORE_BOUNDS,
    )
    clarity_and_readability: int = Field(
        ...,
        description="Score (1–5) for: Is the extracted segment grammatically correct, coherent, and easy to understand on its own?",
        **_SCORE_BOUNDS,
    )
    source_faithfulness: int = Field(
        ...,
        description="Score (1–5) for: Is the extraction strictly based on the source text information, without adding interpretations or hallucinations?",
        **_SCORE_BOUNDS,
    )
    overall_coherence: int = Field(
        ...,
        description="Score (1–5) for: When considering all extractions together, do they form a logically connected and coherent set?",
        **_SCORE_BOUNDS,
    )


class Justifications(BaseModel):
    """Contains textual justifications for each assigned score."""

    # One required free-text field per criterion, using the same attribute names
    # as the numeric `Scores` model so a score and its rationale pair up by key.
    # NOTE(review): the docstring and descriptions are emitted in the generated
    # JSON schema, so they are kept verbatim.
    factual_accuracy: str = Field(..., description="Brief justification for the Factual Accuracy score.")
    completeness: str = Field(..., description="Brief justification for the Completeness score.")
    relevance_and_conciseness: str = Field(..., description="Brief justification for the Relevance and Conciseness score.")
    clarity_and_readability: str = Field(..., description="Brief justification for the Clarity and Readability score.")
    source_faithfulness: str = Field(..., description="Brief justification for the Source Faithfulness score.")
    overall_coherence: str = Field(..., description="Brief justification for the Overall Coherence score.")


class DetailedEvaluation(BaseModel):
    """
    Root model for a structured and detailed evaluation of a 5W1H summary,
    based on a set of research-driven metrics.
    """

    # NOTE(review): the docstring above is kept verbatim because it is emitted
    # in the schema generated from this model.

    # Numeric part of the evaluation: one bounded integer per criterion.
    scores: Scores = Field(
        ...,
        description="The set of all numerical scores for the evaluation.",
    )

    # Textual part of the evaluation: one rationale per criterion.
    justifications: Justifications = Field(
        ...,
        description="The set of all textual justifications supporting the scores.",
    )

# 3. --- FUNCTION TO LOAD THE PROMPT ---

In [4]:
def load_prompt_from_file(filepath: str) -> str:
    """Return the whole UTF-8 text content of the file at `filepath`.

    A missing file is not fatal: the problem is reported on stdout and an
    empty string is returned, which callers treat as "no prompt available".
    """
    try:
        prompt_file = open(filepath, mode='r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: The prompt file was not found at path '{filepath}'")
        return ""  # Sentinel for "prompt could not be loaded"

    # The handle is closed as soon as the content has been read.
    with prompt_file:
        return prompt_file.read()

# 4. --- EVALUATION FUNCTION WITH STRUCTURED OUTPUT ---

In [5]:
def evaluate_summary_structured(document: str, summary: str, prompt_template: str):
    """
    Ask the OpenAI chat model to grade `summary` against `document`.

    The prompt is produced by filling the `{original_document}` and
    `{summary_to_evaluate}` placeholders of `prompt_template`. The model is
    forced to call a single function tool, `save_evaluation`, whose parameter
    schema is generated from `DetailedEvaluation`; the arguments of that tool
    call are then parsed and validated against the same model.

    Args:
        document: Source text the summary was produced from.
        summary: Summary text to be evaluated.
        prompt_template: `str.format` template containing the two placeholders
            named above.

    Returns:
        A 2-tuple `(evaluation, usage)`. On success `evaluation` is a validated
        `DetailedEvaluation` and `usage` is the `usage` attribute of the API
        response. On failure (empty template, API error, missing tool call or
        validation error) a message is printed and `(None, None)` is returned,
        so callers can always unpack the result into two names.

    Raises:
        KeyError / IndexError / ValueError: if `prompt_template` contains
            placeholders or braces that `str.format` cannot resolve; template
            formatting happens outside the guarded section.
    """
    if not prompt_template:
        print("Cannot proceed without a prompt template.")
        return None, None

    filled_prompt = prompt_template.format(
        original_document=document,
        summary_to_evaluate=summary
    )

    print("--- STARTING STRUCTURED EVALUATION ---")

    try:
        response = client.chat.completions.create(
            # Model comes from OPENAI_MODEL; fall back to gpt-4.1-nano when that variable is not set.
            model=os.environ.get("OPENAI_MODEL", "gpt-4.1-nano-2025-04-14"),
            messages=[{"role": "user", "content": filled_prompt}],
            # 1. We define the "tool" that the model should use.
            #    We use .model_json_schema() to automatically generate the schema.
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "save_evaluation",
                        "description": "Saves the structured evaluation result of a summary.",
                        "parameters": DetailedEvaluation.model_json_schema()
                    }
                }
            ],
            # 2. We force the model to call our tool.
            tool_choice={"type": "function", "function": {"name": "save_evaluation"}}
        )

        # 3. We extract the result from the tool call arguments.
        #    The response is a JSON string, not an object.
        tool_call = response.choices[0].message.tool_calls[0]
        json_arguments = tool_call.function.arguments

        # We capture tokens' usage
        token_usage = response.usage

        # 4. We parse and validate the JSON against our Pydantic model.
        evaluation_object = DetailedEvaluation.model_validate_json(json_arguments)

        return evaluation_object, token_usage

    except Exception as e:
        # Best-effort by design: one failed evaluation must not abort the whole
        # run, so the error is reported and an "empty" pair is returned.
        print(f"An error occurred calling the API or processing the response: {e}")
        return None, None

# 5. --- MAIN ITERATION AND STORAGE LOGIC ---

In [6]:
def main_evaluation_loop():
    """
    Evaluate every candidate summary of the selected dataset documents.

    Loads the prompt template from "evaluation_prompt_v3.txt", then, for each
    selected entry of `basse_dataset`, sends every value stored under a key
    ending in "_summ" to `evaluate_summary_structured` together with the
    entry's original document.

    When the ENVIRONMENT variable is "development" (the default when it is
    unset) only the first dataset entry is processed; otherwise all of them.

    Returns:
        A 2-tuple `(all_evaluations, total_tokens)`. `all_evaluations` is a
        list with one dict per successfully evaluated summary (keys:
        "document_idx", "model_evaluated", "evaluation_data", "token_usage");
        `total_tokens` is the sum of `total_tokens` over those evaluations.
        If the prompt cannot be loaded the result is `([], 0)`.
    """
    prompt_template = load_prompt_from_file("evaluation_prompt_v3.txt")
    if not prompt_template:
        # Stop if the prompt could not be loaded, keeping the same
        # (results, tokens) shape as the normal return so callers can unpack it.
        return [], 0

    all_evaluations = []
    total_tokens = 0

    # Get environment from env vars
    env = os.environ.get("ENVIRONMENT", "development")

    # Determine number of documents to process based on environment
    docs_to_process = basse_dataset[:1] if env == "development" else basse_dataset

    for document_data in docs_to_process:
        doc_id = document_data["idx"]
        original_text = document_data["original_document"]

        print(f"\n================ Processing: {doc_id} ================")

        summary_keys = [key for key in document_data if key.endswith('_summ')]

        for key in summary_keys:
            # NOTE(review): only the '-5w1h_summ' suffix is stripped; a key that
            # ends in '_summ' without that exact suffix keeps its full name.
            model_name = key.replace('-5w1h_summ', '')
            summary_text = document_data[key]

            print(f"\n---> Evaluating summary from: [{model_name.upper()}]")

            outcome = evaluate_summary_structured(original_text, summary_text, prompt_template)
            if not outcome:
                # The evaluator reported a failure with a bare None; it has
                # already printed the reason, so move on to the next summary.
                continue

            evaluation_object, usage_data = outcome
            if not (evaluation_object and usage_data):
                # Failure reported as an empty pair, or no usage data available.
                continue

            total_tokens += usage_data.total_tokens

            result_record = {
                "document_idx": doc_id,
                "model_evaluated": model_name,
                # We use .model_dump() to convert the Pydantic object to a Python dict
                "evaluation_data": evaluation_object.model_dump(),
                "token_usage": {
                    "prompt_tokens": usage_data.prompt_tokens,
                    "completion_tokens": usage_data.completion_tokens,
                    "total_tokens": usage_data.total_tokens
                },
            }
            all_evaluations.append(result_record)
            print(f"Result for [{model_name.upper()}] stored. Used tokens: {usage_data.total_tokens}")

    return all_evaluations, total_tokens


In [7]:
if __name__ == "__main__":
    # Run the evaluation pass: per-summary records plus the overall token count.
    final_results, total_tokens = main_evaluation_loop()

    # Three-line completion banner, written with a single newline-separated call.
    print(
        "\n\n####################################################",
        "####### EVALUATION PROCESS FINISHED #######",
        "####################################################",
        sep="\n",
    )

    print(f"Total used tokens: {total_tokens}")
    print("\nEvaluation results collected:")
    # Pretty-printed JSON; ensure_ascii=False leaves non-ASCII characters unescaped.
    print(json.dumps(final_results, indent=2, ensure_ascii=False))

    # Persist the records together with the token total in one JSON document.
    output_path = "evaluation_results.json"
    export_data = {"total_tokens": total_tokens, "results": final_results}
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(export_data, indent=2, ensure_ascii=False))
    print(f"\nResults exported to: {output_path}")




---> Evaluating summary from: [CLAUDE]
--- STARTING STRUCTURED EVALUATION ---
Result for [CLAUDE] stored. Used tokens: 2729

---> Evaluating summary from: [COMMANDR]
--- STARTING STRUCTURED EVALUATION ---
Result for [COMMANDR] stored. Used tokens: 2736

---> Evaluating summary from: [GPT4O]
--- STARTING STRUCTURED EVALUATION ---
Result for [GPT4O] stored. Used tokens: 2694

---> Evaluating summary from: [REKA]
--- STARTING STRUCTURED EVALUATION ---
Result for [REKA] stored. Used tokens: 2734

---> Evaluating summary from: [LLAMA3]
--- STARTING STRUCTURED EVALUATION ---
Result for [LLAMA3] stored. Used tokens: 2702


---> Evaluating summary from: [CLAUDE]
--- STARTING STRUCTURED EVALUATION ---
Result for [CLAUDE] stored. Used tokens: 2757

---> Evaluating summary from: [COMMANDR]
--- STARTING STRUCTURED EVALUATION ---
Result for [COMMANDR] stored. Used tokens: 2875

---> Evaluating summary from: [GPT4O]
--- STARTING STRUCTURED EVALUATION ---
Result for [GPT4O] stored. Used tokens: 284