# Libraries & Config

In [1]:
import os
import yaml
import json
import logging
import numpy as np
import pandas as pd

from dotenv import load_dotenv
from dataclasses import dataclass, asdict
from typing import Dict, Tuple, List, Any, Optional

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser

# Emit timestamped, INFO-level log records for everything that follows.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load variables from a local .env file (if any) before reading the API key.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Decode the YAML explicitly as UTF-8: without `encoding`, open() falls back to
# the platform/locale default, which can garble non-ASCII text in the config.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

  from .autonotebook import tqdm as notebook_tqdm


# Data Classes Definitions

In [2]:
@dataclass
class AgentResult:
    """Outcome produced by one worker agent for one data row.

    Attributes:
        anomaly_detected: Whether the agent flagged an anomaly.
        score: Risk score computed by the agent.
        reasons: Human-readable reasons, generated by the rules engine.
        summary: Short summary text, generated by the LLM.
    """

    anomaly_detected: bool
    score: float
    reasons: List[str]
    summary: str

@dataclass
class SupervisorDecision:
    """Final verdict issued by the supervisor agent.

    Attributes:
        risk_level: One of "LOW", "MODERATE" or "HIGH".
        global_score: Aggregated score across the worker agents.
        recommendations: Recommendation lines produced for this decision.
    """

    risk_level: str
    global_score: float
    recommendations: List[str]

@dataclass
class FinalReport:
    """Complete structured output for a single row, shaped like a ground-truth entry.

    Attributes:
        row_index: Index of the processed row.
        description: Short description of the outcome for the row.
        agent_outputs: Worker results keyed by agent name (a dict keeps
            per-agent lookup simple).
        supervisor_decision: The supervisor's final decision for the row.
    """

    row_index: int
    description: str
    agent_outputs: Dict[str, AgentResult]
    supervisor_decision: SupervisorDecision

@dataclass
class EvaluationMetrics:
    """Metrics obtained by comparing a FinalReport with its ground truth.

    Attributes:
        anomaly_detection_accuracy: Anomaly detection accuracy for all agents
            combined.
        risk_level_accuracy: Accuracy of the supervisor's final risk level
            classification.
        reason_similarity_score: Average semantic similarity of the agents'
            reasons.
        recommendation_similarity_score: Semantic similarity of the
            supervisor's recommendations.
        global_score_error: Error margin of the calculated global score versus
            the expected one.
    """

    anomaly_detection_accuracy: float
    risk_level_accuracy: float
    reason_similarity_score: float
    recommendation_similarity_score: float
    global_score_error: float

# Agent & Supervisor Definitions

In [3]:
# 1. It only defines what the LLM will return: a summary.
# WorkerAgent.__init__ hands this model to JsonOutputParser(pydantic_object=...),
# and WorkerAgent.analyze reads the 'summary' key from the parsed output.
# NOTE(review): pydantic likely surfaces the class docstring and the Field
# description in the generated JSON schema, so treat both strings as runtime
# text -- confirm before rewording them.
class AgentLLMOutput(BaseModel):
    """Defines the expected JSON output structure from the worker agent's LLM call."""
    summary: str = Field(description="A brief, one-sentence summary of the detected anomalies.")

# 2. The new WorkerAgent class contains all the logic for rules, scoring, and summarization.
class WorkerAgent:
    """A configurable worker agent that performs analysis based on rules and scoring logic.

    Rule evaluation and scoring are deterministic and driven entirely by the
    agent's configuration section; the LLM is only invoked to phrase a summary
    once at least one rule has fired.
    """

    def __init__(self, agent_name: str, config: Dict, llm: ChatOpenAI):
        """Build the agent from its configuration section.

        Args:
            agent_name: Identifier of the agent.
            config: Mapping providing the keys ``features``, ``rules``,
                ``scoring`` and ``prompt``.
            llm: Chat model used to write the anomaly summary.

        Raises:
            KeyError: If a required configuration key is missing.
        """
        self.name = agent_name
        self.features = config['features']
        self.rules = config['rules']
        self.scoring_config = config['scoring']
        self.llm = llm
        self.prompt_template = ChatPromptTemplate.from_template(config['prompt'])
        self.parser = JsonOutputParser(pydantic_object=AgentLLMOutput)

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """Return True when ``value`` is ``None`` or a NaN-like missing marker.

        Rows converted with pandas carry NaN (not ``None``) for empty cells.
        NaN is the only common value that is unequal to itself, which lets it
        be detected without knowing the concrete float type.
        """
        if value is None:
            return True
        try:
            return bool(value != value)
        except (TypeError, ValueError):
            # Values whose comparison has no plain truth value (e.g. arrays)
            # are not treated as missing.
            return False

    def _apply_rules(self, data_input: Dict) -> Tuple[bool, List[str]]:
        """Applies the deterministic rules from the config to detect anomalies.

        Args:
            data_input: Feature name -> value mapping for one row.

        Returns:
            ``(anomaly_found, reasons)`` where ``reasons`` holds one formatted
            message per triggered rule, in rule order.
        """
        reasons: List[str] = []
        for rule in self.rules:
            value = data_input.get(rule['feature'])
            if self._is_missing(value):
                # Nothing to compare: absent and NaN features never trigger.
                continue

            threshold = rule['threshold']
            condition = rule['condition']
            if condition == 'greater_than':
                triggered = value > threshold
            elif condition == 'less_than':
                triggered = value < threshold
            elif condition == 'outside_range':
                # threshold is a (low, high) pair for this condition.
                triggered = value < threshold[0] or value > threshold[1]
            elif condition == 'equals':
                triggered = value == threshold
            else:
                # Unknown conditions never trigger.
                triggered = False

            if triggered:
                reasons.append(rule['reason'].format(value=value, threshold=threshold))
        # Every triggered rule contributes exactly one reason.
        return bool(reasons), reasons

    def _calculate_score(self, data_input: Dict) -> float:
        """Calculates a 0-1 risk score based on the scoring configuration.

        Supported scoring types are ``direct`` (value used as is),
        ``categorical`` (value looked up in ``mapping``, default 0.0) and
        ``normalize`` / ``inverse_normalize`` (value clamped to ``range`` and
        scaled). Missing or NaN features are skipped so that an empty cell can
        never be scored as maximum risk by ``inverse_normalize``.

        Args:
            data_input: Feature name -> value mapping for one row.

        Returns:
            The highest per-feature score, or 0.0 when no feature is scored.
        """
        scores = [0.0]  # Start with 0.0 to handle cases with no matching features
        for spec in self.scoring_config:
            value = data_input.get(spec['feature'])
            if self._is_missing(value):
                continue

            kind = spec['type']
            score = 0.0
            if kind == 'direct':
                score = float(value)
            elif kind == 'categorical':
                score = float(spec['mapping'].get(value, 0.0))
            elif kind in ('normalize', 'inverse_normalize'):
                min_val, max_val = spec['range']
                if max_val == min_val:
                    # Avoid division by zero if range is a single point
                    normalized = 0.0 if value == min_val else 1.0
                else:
                    # Clamp the value into the range for stable normalization
                    clamped_value = max(min_val, min(value, max_val))
                    normalized = (clamped_value - min_val) / (max_val - min_val)
                score = 1.0 - normalized if kind == 'inverse_normalize' else normalized
            scores.append(score)

        # The agent's final score is the highest score among its features
        return max(scores)

    def analyze(self, data_row: Dict) -> AgentResult:
        """Runs the full analysis process: rules, scoring, and optional LLM summary.

        Args:
            data_row: Full row as a mapping; only this agent's configured
                features are taken from it.

        Returns:
            An AgentResult with the rule outcome, the score, the triggered
            reasons and a summary (LLM-written only when an anomaly is found).
        """
        data_input = {k: data_row.get(k) for k in self.features if k in data_row}

        # Step A: Apply deterministic rules
        anomaly_detected, reasons = self._apply_rules(data_input)

        # Step B: Calculate a deterministic score
        score = self._calculate_score(data_input)

        # Step C: Get LLM summary ONLY if an anomaly is detected
        summary = "No anomalies detected."
        if anomaly_detected:
            chain = self.prompt_template | self.llm | self.parser
            llm_output = chain.invoke({"reasons_list": "\n- ".join(reasons)})
            summary = llm_output.get('summary', "Summary could not be generated.")

        # Step D: Return the final, structured result
        return AgentResult(
            anomaly_detected=anomaly_detected,
            score=score,
            reasons=reasons,
            summary=summary
        )

In [4]:
# This function will handle all the supervisor's tasks.
def run_supervisor(
    agent_outputs: Dict[str, AgentResult],
    supervisor_llm: ChatOpenAI,
    config: Dict
) -> SupervisorDecision:
    """
    Combine the worker results into one supervisor decision.

    The global score is the weighted sum of the agent scores, the risk level
    follows from the configured thresholds, and the recommendations are the
    non-empty lines of the supervisor LLM's reply.
    """
    settings = config['supervisor']

    # Weighted sum of the agent scores; agents without a weight contribute 0.
    global_score = 0.0
    for name, outcome in agent_outputs.items():
        global_score += outcome.score * settings['agent_weights'].get(name, 0)

    # Map the score onto a risk level, checking the highest threshold first.
    limits = settings['risk_thresholds']
    if global_score >= limits['HIGH']:
        risk_level = "HIGH"
    elif global_score >= limits['MODERATE']:
        risk_level = "MODERATE"
    else:
        risk_level = "LOW"

    # One line per agent that reported an anomaly, in agent order.
    report_summary = "".join(
        f"Agent '{name}' detected: {outcome.summary} (Score: {outcome.score:.2f})\n"
        for name, outcome in agent_outputs.items()
        if outcome.anomaly_detected
    )
    if not report_summary:
        report_summary = "No anomalies were detected by any agent."

    # Ask the supervisor LLM for recommendations as plain text.
    chain = ChatPromptTemplate.from_template(settings['prompt']) | supervisor_llm | StrOutputParser()
    llm_response = chain.invoke({
        "risk_level": risk_level,
        "global_score": global_score,
        "report_summary": report_summary
    })

    # Keep each non-blank line of the reply as one recommendation.
    recommendations = []
    for line in llm_response.strip().split('\n'):
        cleaned = line.strip()
        if cleaned:
            recommendations.append(cleaned)

    return SupervisorDecision(
        risk_level=risk_level,
        global_score=global_score,
        recommendations=recommendations
    )

# Initialization

In [5]:
# Chat models: one shared by every worker agent, one for the supervisor.
llm_worker = ChatOpenAI(model=config["worker_model"])
llm_supervisor = ChatOpenAI(model=config["supervisor_model"])

# Every top-level config section whose key starts with "agent_" defines a worker.
agents = {
    section_name: WorkerAgent(
        agent_name=section_name,
        config=section,
        llm=llm_worker
    )
    for section_name, section in config.items()
    if section_name.startswith("agent_")
}
logger.info(f"Initialized {len(agents)} worker agents.")

2025-07-22 11:40:00,858 - INFO - Initialized 6 worker agents.


# Workflow 

In [6]:
# Decode explicitly as UTF-8 instead of relying on the platform/locale default
# encoding, which can garble non-ASCII text in the ground-truth file.
with open(r"D:\VeriFlow\data\ground_truth.json", "r", encoding="utf-8") as f:
    ground_truth_data = json.load(f)

# Index the ground-truth entries by their row_index for direct lookup.
ground_truth_map = {item['row_index']: item for item in ground_truth_data}

# Sentence-embedding model used by the evaluation code for similarity scores.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

2025-07-22 11:40:00,928 - INFO - Use pytorch device_name: cuda
2025-07-22 11:40:00,929 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [7]:
# 1. Main Workflow 
def process_row(
    row_index: int,
    data_row: Dict,
    agents: Dict[str, WorkerAgent],
    supervisor_llm: ChatOpenAI,
    config: Dict
) -> FinalReport:
    """
    Run one data row through every worker agent and then the supervisor.

    The report's description is the supervisor's first recommendation, or a
    fixed fallback sentence when there are no recommendations.
    """
    # Worker agents run one after another, in the order of the `agents` dict.
    results = {}
    for name, worker in agents.items():
        logger.info(f"Running agent: {name} for row {row_index}...")
        results[name] = worker.analyze(data_row)

    # The supervisor aggregates the worker results into a final decision.
    logger.info(f"Running supervisor for row {row_index}...")
    decision = run_supervisor(results, supervisor_llm, config)

    if decision.recommendations:
        description = decision.recommendations[0]
    else:
        description = "No significant issues to report."

    return FinalReport(
        row_index=row_index,
        description=description,
        agent_outputs=results,
        supervisor_decision=decision
    )

# 2. Evaluation Function
def evaluate_report(
    final_report: FinalReport,
    ground_truth: Dict
) -> EvaluationMetrics:
    """
    Score one FinalReport against its ground-truth entry.

    Covers the supervisor (risk level match, global score error, recommendation
    similarity) and the agents (anomaly flag match, summary similarity when
    both sides report an anomaly).
    """
    def _text_similarity(predicted_text, expected_text):
        # Cosine similarity between the sentence embeddings of the two texts.
        predicted_vec = sentence_model.encode([predicted_text])
        expected_vec = sentence_model.encode([expected_text])
        return cosine_similarity(predicted_vec, expected_vec)[0][0]

    # --- Supervisor ---
    decision = final_report.supervisor_decision
    expected_decision = ground_truth['supervisor_decision']

    if decision.risk_level == expected_decision['risk_level']:
        risk_level_accuracy = 1.0
    else:
        risk_level_accuracy = 0.0

    global_score_error = abs(decision.global_score - expected_decision['global_score'])

    # Recommendations are compared as one space-joined text per side.
    recommendation_similarity = _text_similarity(
        " ".join(decision.recommendations),
        " ".join(expected_decision['recommendations']),
    )

    # --- Agents ---
    detection_hits = []
    summary_similarities = []
    predicted_agents = final_report.agent_outputs
    expected_agents = ground_truth['agent_outputs']

    for name, predicted in predicted_agents.items():
        expected = expected_agents.get(name)
        if not expected:
            # Agents without a ground-truth entry are left out of the metrics.
            continue

        flags_match = predicted.anomaly_detected == expected['anomaly_detected']
        detection_hits.append(1.0 if flags_match else 0.0)

        # Summaries are only compared when both sides report an anomaly.
        if predicted.anomaly_detected and expected['anomaly_detected']:
            summary_similarities.append(
                _text_similarity(predicted.summary, expected['summary'])
            )

    # Empty lists default to 1.0 for the corresponding metric.
    return EvaluationMetrics(
        anomaly_detection_accuracy=np.mean(detection_hits) if detection_hits else 1.0,
        risk_level_accuracy=risk_level_accuracy,
        reason_similarity_score=np.mean(summary_similarities) if summary_similarities else 1.0,
        recommendation_similarity_score=float(recommendation_similarity),
        global_score_error=global_score_error
    )

# Execution Loop

In [8]:
def main(max_rows: int = 51) -> List[FinalReport]:
    """
    Main function to initialize the system, process data, and evaluate results.

    Args:
        max_rows: Number of leading CSV rows to consider. Defaults to 51, the
            limit that used to be hard-coded.

    Returns:
        The FinalReport of every processed row. An empty list is returned
        (instead of None) when no row has a matching ground-truth entry, so
        callers can always iterate over the result.
    """
    # 1. SETUP
    logger.info("Starting the main process...")

    data_df = pd.read_csv(r"D:\VeriFlow\data\test_data.csv")

    # Initialize LLMs
    llm_worker = ChatOpenAI(model=config['worker_model'], temperature=0.1)
    llm_supervisor = ChatOpenAI(model=config['supervisor_model'], temperature=0.3)

    # Initialize Worker Agents from config: every "agent_*" section is one agent
    agents = {}
    for agent_name, agent_config in config.items():
        if agent_name.startswith("agent_"):
            agents[agent_name] = WorkerAgent(
                agent_name=agent_name,
                config=agent_config,
                llm=llm_worker
            )
    logger.info(f"Initialized {len(agents)} worker agents.")

    # 2. PROCESSING & EVALUATION LOOP
    all_reports = []
    all_metrics = []

    # Loop through the leading rows of the dataframe
    for index, row in data_df.head(max_rows).iterrows():
        # Only process rows that have a corresponding ground truth entry;
        # the DataFrame index label is matched against the ground-truth row_index.
        if index not in ground_truth_map:
            continue

        logger.info(f"----- Processing Row {index} -----")

        # A. Process the row to get the final report
        final_report = process_row(index, row.to_dict(), agents, llm_supervisor, config)
        all_reports.append(final_report)

        # B. Evaluate the report against the ground truth
        ground_truth_entry = ground_truth_map[index]
        metrics = evaluate_report(final_report, ground_truth_entry)
        all_metrics.append(metrics)

        logger.info(f"Finished processing and evaluating row {index}. Risk Level: {final_report.supervisor_decision.risk_level}")

    # 3. FINAL SUMMARY REPORT
    if not all_metrics:
        logger.warning("No data was processed, as no matching ground truth entries were found.")
        # Reports and metrics are appended together, so this list is empty too.
        return all_reports

    # Create a DataFrame from our evaluation metrics for easy analysis
    metrics_df = pd.DataFrame([asdict(m) for m in all_metrics])
    average_metrics = metrics_df.mean()

    print("\n" + "="*60)
    print("           OVERALL SYSTEM EVALUATION SUMMARY")
    print("="*60)
    print(f"Total Rows Processed: {len(all_metrics)}")
    print("---")
    print("Average Performance Metrics:")
    print(f"  - Risk Level Accuracy:           {average_metrics['risk_level_accuracy']:.2%}")
    print(f"  - Anomaly Detection Accuracy:    {average_metrics['anomaly_detection_accuracy']:.2%}")
    print(f"  - Recommendation Similarity:     {average_metrics['recommendation_similarity_score']:.3f}")
    print(f"  - Agent Reason Similarity:       {average_metrics['reason_similarity_score']:.3f}")
    print(f"  - Global Score Error (MAE):      {average_metrics['global_score_error']:.3f}")
    print("="*60)

    print("\nSample Output for First Processed Row:\n")
    print(json.dumps(asdict(all_reports[0]), indent=2))
    print("="*60)

    return all_reports

# Entry point: run the whole pipeline and keep the generated reports in a
# module-level variable; display_comparison() looks reports up from it.
if __name__ == "__main__":
    all_generated_reports = main()

2025-07-22 11:40:06,691 - INFO - Starting the main process...
2025-07-22 11:40:08,653 - INFO - Initialized 6 worker agents.
2025-07-22 11:40:08,654 - INFO - ----- Processing Row 1 -----
2025-07-22 11:40:08,655 - INFO - Running agent: agent_1_geolocation for row 1...
2025-07-22 11:40:11,896 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-22 11:40:11,915 - INFO - Running agent: agent_2_fuel for row 1...
2025-07-22 11:40:11,916 - INFO - Running agent: agent_3_logistics for row 1...
2025-07-22 11:40:11,916 - INFO - Running agent: agent_4_supplier for row 1...
2025-07-22 11:40:13,443 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-22 11:40:13,461 - INFO - Running agent: agent_5_cargo for row 1...
2025-07-22 11:40:14,593 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-22 11:40:14,598 - INFO - Running agent: agent_6_risk for row 1...
2025-07-22 11:


           OVERALL SYSTEM EVALUATION SUMMARY
Total Rows Processed: 50
---
Average Performance Metrics:
  - Risk Level Accuracy:           100.00%
  - Anomaly Detection Accuracy:    100.00%
  - Recommendation Similarity:     0.781
  - Agent Reason Similarity:       0.804
  - Global Score Error (MAE):      0.000

Sample Output for First Processed Row:

{
  "row_index": 1,
  "description": "- IMMEDIATE: Avoid travel through the area with severe route risk (Score: 0.96).",
  "agent_outputs": {
    "agent_1_geolocation": {
      "anomaly_detected": true,
      "score": 0.9611988304357689,
      "reasons": [
        "Severe route risk level detected (9.61198830435769 on a 0-10 scale)."
      ],
      "summary": "A severe route risk has been identified, indicating significant safety concerns for travel in the area."
    },
    "agent_2_fuel": {
      "anomaly_detected": false,
      "score": 0.006767435376858918,
      "reasons": [],
      "summary": "No anomalies detected."
    },
    "agen

In [9]:
from IPython.display import display, HTML
import html

def display_comparison(row_index: int):
    """
    Render the generated report and the ground truth for one row side by side.

    Both are shown as pretty-printed, HTML-escaped JSON in a two-column layout.
    If either the report or the ground-truth entry is missing for
    ``row_index``, an error message is printed and nothing is rendered.
    """
    # Look up the first generated report carrying the requested index.
    report = None
    for candidate in all_generated_reports:
        if candidate.row_index == row_index:
            report = candidate
            break

    truth = ground_truth_map.get(row_index)

    if not report:
        print(f"Error: No generated report found for row_index {row_index}. It might not have been processed.")
        return

    if not truth:
        print(f"Error: No ground truth entry found for row_index {row_index}.")
        return

    # Serialize to indented JSON, then escape so the text is safe inside <pre>.
    escaped_generated = html.escape(json.dumps(asdict(report), indent=2))
    escaped_ground_truth = html.escape(json.dumps(truth, indent=2))

    # Two equally wide columns: generated result on the left, ground truth on the right.
    markup = f"""
    <h2 style="text-align: center; color: #FFFFFF;">Comparison for Row Index: {row_index}</h2>
    <div style="display: flex; flex-direction: row; justify-content: space-between; width: 100%;">
        <div style="width: 49%; border: 1px solid #ccc; border-radius: 8px; padding: 10px; 
                    box-sizing: border-box; background-color: #000000; color: #ffa500;">
            <h3 style="text-align: center; color: #FFFFFF;">Generated Result</h3>
            <pre style="white-space: pre-wrap; word-wrap: break-word; font-family: monospace; font-size: 13px;">{escaped_generated}</pre>
        </div>
        <div style="width: 49%; border: 1px solid #ccc; border-radius: 8px; padding: 10px;
                    box-sizing: border-box; background-color: #000000; color: #ffa500;">
            <h3 style="text-align: center; color: #FFFFFF;">Ground Truth</h3>
            <pre style="white-space: pre-wrap; word-wrap: break-word; font-family: monospace; font-size: 13px;">{escaped_ground_truth}</pre>
        </div>
    </div>
    """

    display(HTML(markup))

# Show the generated-vs-ground-truth comparison for row index 10.
display_comparison(10)