# 1. --- IMPORTS AND CONFIGURATION ---

In [1]:
import os
import json
from pathlib import Path
from dotenv import load_dotenv
import openai
from pydantic import BaseModel, Field
from preprocessing.basse_preprocessing import process_basse_summaries

# Load environment variables (python-dotenv looks for a .env file).
load_dotenv()

# All data paths are resolved against the current working directory.
PROJECT_ROOT = Path.cwd()

BASSE_DATASET_FILEPATH = str(PROJECT_ROOT / 'data' / 'basse' / 'BASSE.jsonl')
# NOTE(review): the train file name says 'subtarea' while the trial one says 'subtask' -
# confirm both match the files on disk.
FLARES_TRAIN_DATASET_FILEPATH = str(PROJECT_ROOT / 'data' / 'flares' / '5w1h_subtarea_1_train.json')
FLARES_TRIAL_DATASET_FILEPATH = str(PROJECT_ROOT / 'data' / 'flares' / '5w1h_subtask_1_trial.json')
# Backward-compatible alias for the original (misspelled) constant name.
FLARES_TRAIL_DATASET_FILEPATH = FLARES_TRIAL_DATASET_FILEPATH

# Initialize the OpenAI client.
# It reads the OPENAI_API_KEY environment variable automatically.
try:
    client = openai.OpenAI()
except openai.OpenAIError as e:
    print("Error initializing the OpenAI client. Please ensure the OPENAI_API_KEY environment variable is configured.")
    print(e)
    # Re-raise instead of calling exit(): the failure then stops execution with a
    # traceback rather than terminating the kernel/interpreter without one, and
    # later cells cannot run with `client` undefined.
    raise

# 2. --- LOAD DATASETS ---

In [2]:
# Load the BASSE summaries from the JSONL file at BASSE_DATASET_FILEPATH. The evaluation
# loop later slices this object and reads the "idx", "original_document" and "*_summ"
# keys from each record.
basse_dataset = process_basse_summaries(BASSE_DATASET_FILEPATH)

First object from the list of summaries:
{
  "idx": "http://elpais.com/deportes/2019/08/17/actualidad/1566005143_044557.html",
  "round": 1,
  "original_document": "El jet lag ante Argentina , que quedó maquillado por el arrebato febril de Ricky ( 15 puntos en los últimos cuatro minutos ) , se consolidó 24 horas después ante Rusia . A tres días del comienzo del Mundial , de nuevo sin Marc Gasol en la rotación , el conjunto de Scariolo firmó su segunda derrota de la preparación y , por números y sensaciones , abrazó las dudas justo antes del estreno oficial . En el marcador , un contundente 55-74 ; en la estadística , unos pobres porcentajes de tiro ( 15 de 37 de dos , 3 de 18 en triples y 16 de 19 en tiros libres ) , 18 pérdidas , 12 robos del rival… Los 26 puntos y 11 rebotes de Willy Hernangómez fueron lo único lustroso de una prueba para olvidar , o para tentarse la ropa . “ Esperemos que esta no sea la referencia . Nos faltan muchos jugadores y la cabeza estará también en una dimen

# 2. --- DEFINITION OF STRUCTURED OUTPUT MODEL ---

In [3]:
# These Pydantic models describe the JSON structure expected from the evaluator.
# The JSON schema generated from them is sent to OpenAI as the tool's parameters, and
# the arguments the model returns are validated against them.

class EvaluationScores(BaseModel):
    # One required integer rating per criterion. The ge/le bounds enforce the 1-5 scale
    # promised in the descriptions: they appear as minimum/maximum in the generated JSON
    # schema and make validation reject out-of-range scores instead of storing them.
    # (Comments are used rather than a docstring so the generated schema gains no
    # extra "description" entry.)
    fidelity: int = Field(..., ge=1, le=5, description="Score from 1 to 5 for faithfulness to the original text.")
    completeness: int = Field(..., ge=1, le=5, description="Score from 1 to 5 for the completeness of the 5W1H elements.")
    correctness: int = Field(..., ge=1, le=5, description="Score from 1 to 5 for the correct assignment of each element.")
    atomicity: int = Field(..., ge=1, le=5, description="Score from 1 to 5 for the non-mixing of information between elements.")

class EvaluationJustification(BaseModel):
    # One short free-text rationale per criterion, using the same four field names as
    # the numeric scores. No default is declared, so every field is required.
    fidelity: str = Field(
        description="Brief justification for the fidelity score.",
    )
    completeness: str = Field(
        description="Brief justification for the completeness score.",
    )
    correctness: str = Field(
        description="Brief justification for the correctness score.",
    )
    atomicity: str = Field(
        description="Brief justification for the atomicity score.",
    )

class StructuredEvaluation(BaseModel):
    """The root model for the structured evaluation of a 5W1H summary."""
    # Numeric ratings, one per evaluation criterion (required).
    scores: EvaluationScores = Field(
        description="The set of numerical scores for each criterion.",
    )
    # Textual rationale paired with each rating (required).
    justification: EvaluationJustification = Field(
        description="The set of textual justifications for each score.",
    )
    # Short overall verdict on the summary (required).
    general_summary: str = Field(
        description="A very brief overall evaluation of the summary.",
    )

# 3. --- FUNCTION TO LOAD THE PROMPT ---

In [4]:
def load_prompt_from_file(filepath: str) -> str:
    """
    Return the full text of the UTF-8 file at ``filepath``.

    If the file does not exist, an error message is printed and an empty
    string is returned instead of raising.
    """
    try:
        with open(filepath, mode='r', encoding='utf-8') as prompt_file:
            contents = prompt_file.read()
    except FileNotFoundError:
        print(f"Error: The prompt file was not found at path '{filepath}'")
        contents = ""  # Empty string signals the failure to the caller
    return contents

# 4. --- EVALUATION FUNCTION WITH STRUCTURED OUTPUT ---

In [5]:
def evaluate_summary_structured(document: str, summary: str, prompt_template: str):
    """
    Evaluates a summary against its source document through a forced OpenAI tool call.

    The template is filled with the document and the summary and sent as a single user
    message. The model is forced to call the ``save_evaluation`` tool, whose parameter
    schema is generated from ``StructuredEvaluation``; the returned tool-call arguments
    are then parsed and validated with that same Pydantic model.

    Args:
        document: Text substituted for ``{original_document}`` in the template.
        summary: Text substituted for ``{summary_to_evaluate}`` in the template.
        prompt_template: ``str.format`` template containing both placeholders.

    Returns:
        The validated ``StructuredEvaluation`` instance, or ``None`` when the template
        is empty or when the API call / response processing fails (the error is printed).

    Raises:
        KeyError, IndexError, ValueError: from ``str.format`` if the template contains
            other placeholders or unbalanced braces (formatting happens outside the
            ``try`` block).
    """
    if not prompt_template:
        print("Cannot proceed without a prompt template.")
        return None

    filled_prompt = prompt_template.format(
        original_document=document,
        summary_to_evaluate=summary
    )

    print("--- STARTING STRUCTURED EVALUATION ---")

    try:
        response = client.chat.completions.create(
            # The model name comes from OPENAI_MODEL; fall back to gpt-4.1-nano when unset.
            model=os.environ.get("OPENAI_MODEL", "gpt-4.1-nano-2025-04-14"),
            messages=[{"role": "user", "content": filled_prompt}],
            # 1. Define the "tool" the model should use; its parameter schema is
            #    generated automatically with .model_json_schema().
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "save_evaluation",
                        "description": "Saves the structured evaluation result of a summary.",
                        "parameters": StructuredEvaluation.model_json_schema()
                    }
                }
            ],
            # 2. Force the model to call that tool.
            tool_choice={"type": "function", "function": {"name": "save_evaluation"}}
        )

        # 3. Extract the tool-call arguments (a JSON string, not an object).
        tool_calls = response.choices[0].message.tool_calls
        if not tool_calls:
            # Fail with a readable message instead of an opaque indexing error.
            raise ValueError("the model response did not contain a tool call")
        json_arguments = tool_calls[0].function.arguments

        # 4. Parse and validate the JSON against the Pydantic model.
        evaluation_object = StructuredEvaluation.model_validate_json(json_arguments)

    except Exception as e:
        # Best effort: report the failure and let the caller skip this summary.
        print(f"An error occurred calling the API or processing the response: {e}")
        return None

    # Success path. This message used to sit after the try/except, where both branches
    # had already returned, so it was never printed.
    print("--- EVALUATION RECEIVED ---")
    return evaluation_object

In [None]:
# 5. --- MAIN ITERATION AND STORAGE LOGIC ---

In [6]:
def main_evaluation_loop(max_documents=2, prompt_filepath: str = "evaluation_prompt.txt") -> list:
    """
    Iterates over the dataset, extracts summaries, and sends them for evaluation.

    Args:
        max_documents: How many leading documents of ``basse_dataset`` to process.
            Defaults to 2 (a small test run); pass ``None`` to process every document.
        prompt_filepath: Path of the prompt template file handed to
            ``load_prompt_from_file``.

    Returns:
        A list with one dict per successfully evaluated summary, holding
        ``document_idx``, ``model_evaluated`` and ``evaluation_data`` (the evaluation
        dumped to a plain dict). The list is empty if the prompt could not be loaded.
    """
    prompt_template = load_prompt_from_file(prompt_filepath)
    if not prompt_template:
        return []  # Stop execution if the prompt could not be loaded

    all_evaluations = []

    # Slicing with None keeps the whole dataset; an integer keeps the first N documents.
    for document_data in basse_dataset[:max_documents]:
        doc_id = document_data["idx"]
        original_text = document_data["original_document"]

        print(f"\n================ Processing: {doc_id} ================")

        # NOTE(review): keys are selected by the '_summ' suffix, but the model name is
        # derived by stripping '-5w1h_summ' - confirm every '*_summ' key has that form.
        summary_keys = [key for key in document_data if key.endswith('_summ')]

        for key in summary_keys:
            model_name = key.replace('-5w1h_summ', '')
            summary_text = document_data[key]

            print(f"\n---> Evaluating summary from: [{model_name.upper()}]")

            evaluation_object = evaluate_summary_structured(original_text, summary_text, prompt_template)

            # None means the evaluation failed (already reported); skip that summary.
            if evaluation_object is not None:
                result_record = {
                    "document_idx": doc_id,
                    "model_evaluated": model_name,
                    # Use .model_dump() to convert the Pydantic object to a Python dict
                    "evaluation_data": evaluation_object.model_dump()
                }
                all_evaluations.append(result_record)
                print(f"Result for [{model_name.upper()}] stored.")

    return all_evaluations

In [7]:
if __name__ == "__main__":
    # Run the evaluation and keep the collected records at module level.
    final_results = main_evaluation_loop()

    # Closing banner followed by the results header, emitted one item per line.
    print(
        "\n\n####################################################",
        "####### EVALUATION PROCESS FINISHED #######",
        "####################################################",
        "\nEvaluation results collected:",
        sep="\n",
    )
    # Pretty-print the records as JSON, keeping non-ASCII characters readable.
    print(json.dumps(final_results, ensure_ascii=False, indent=2))



---> Evaluating summary from: [CLAUDE]
--- STARTING STRUCTURED EVALUATION ---
Result for [CLAUDE] stored.

---> Evaluating summary from: [COMMANDR]
--- STARTING STRUCTURED EVALUATION ---
Result for [COMMANDR] stored.

---> Evaluating summary from: [GPT4O]
--- STARTING STRUCTURED EVALUATION ---
Result for [GPT4O] stored.

---> Evaluating summary from: [REKA]
--- STARTING STRUCTURED EVALUATION ---
Result for [REKA] stored.

---> Evaluating summary from: [LLAMA3]
--- STARTING STRUCTURED EVALUATION ---
Result for [LLAMA3] stored.


---> Evaluating summary from: [CLAUDE]
--- STARTING STRUCTURED EVALUATION ---
Result for [CLAUDE] stored.

---> Evaluating summary from: [COMMANDR]
--- STARTING STRUCTURED EVALUATION ---
Result for [COMMANDR] stored.

---> Evaluating summary from: [GPT4O]
--- STARTING STRUCTURED EVALUATION ---
Result for [GPT4O] stored.

---> Evaluating summary from: [REKA]
--- STARTING STRUCTURED EVALUATION ---
Result for [REKA] stored.

---> Evaluating summary from: [LLAMA3]