In [None]:
import json
import requests
import pandas as pd
import time
from typing import Dict, List, Any

# --- CONFIGURATION ---
# Base URL of the running FastAPI application under test
API_BASE_URL = "http://127.0.0.1:8000"
# Chat route that every simulated conversation posts to
CHAT_ENDPOINT = API_BASE_URL + "/chat"

# JSON file holding the ground-truth flight records
TEST_DATA_FILE = "flight_test_data.json"

# Never truncate long cell values when pandas renders a frame
pd.set_option("display.max_colwidth", None)

In [None]:
# Load the ground-truth records that the simulated conversations are scored against.
try:
    # Be explicit about the encoding so non-ASCII text in the records loads the
    # same way regardless of the platform's default encoding.
    with open(TEST_DATA_FILE, 'r', encoding='utf-8') as f:
        ground_truth_data = json.load(f)
    print(f"Successfully loaded {len(ground_truth_data)} records from '{TEST_DATA_FILE}'.")

    # Display the first record to verify it's correct. Guarded so that an empty
    # file or a non-list payload produces a readable warning instead of an
    # IndexError/KeyError from the [0] lookup.
    if isinstance(ground_truth_data, list) and ground_truth_data:
        print("\nFirst Record Example")
        print(json.dumps(ground_truth_data[0], indent=2))
    else:
        print(f"WARNING: '{TEST_DATA_FILE}' does not contain a non-empty JSON list of records.")

except FileNotFoundError:
    print(f"ERROR: The file '{TEST_DATA_FILE}' was not found.")
    print("Please make sure the test data file is in the same directory as this notebook.")
except json.JSONDecodeError:
    print(f"ERROR: Could not parse '{TEST_DATA_FILE}'. Please ensure it is a valid JSON file.")

Successfully loaded 100 records from 'flight_test_data.json'.

--- First Record Example ---
{
  "departure_city": "Amsterdam",
  "arrival_city": "Hong Kong",
  "departure_date": "2025-11-01",
  "return_date": "2025-11-21",
  "adult_passengers": 1,
  "round_trip": true,
  "child_passengers": 0,
  "infant_passengers": 0,
  "cabin_class": "Economy",
  "budget": 1719,
  "flexible_dates": false,
  "routing": "one_stop",
  "points_booking": false,
  "refundable": false
}


In [10]:
def create_data_dumper_conversation(record: Dict[str, Any]) -> List[str]:
    """Generates a single, detailed message (user dumps all info at once).

    Args:
        record: Ground-truth flight record. Must contain 'departure_city',
            'arrival_city', 'adult_passengers', 'departure_date', 'round_trip'
            and 'cabin_class'; 'return_date' is read only for round trips.
            'child_passengers' and 'infant_passengers' are optional and count
            as 0 when missing or null.

    Returns:
        A one-element list holding the complete user message.
    """
    # Optional passenger counts: a missing key or a null value means "none",
    # matching the defaults the batch evaluation applies to the same fields.
    child_count = record.get('child_passengers') or 0
    infant_count = record.get('infant_passengers') or 0

    # Build a descriptive sentence
    message = f"I'd like to book a flight from {record['departure_city']} to {record['arrival_city']}."
    message += f" It's for {record['adult_passengers']} adults"
    if child_count > 0:
        message += f" and {child_count} children"
    if infant_count > 0:
        message += f" and {infant_count} infants"

    message += f" departing on {record['departure_date']}."

    if record['round_trip']:
        message += f" We'll be returning on {record['return_date']}."
    else:
        message += " It is a one-way trip."

    message += f" We'd like to fly in {record['cabin_class']}."

    return [message]

def create_turn_by_turn_conversation(record: Dict[str, Any]) -> List[str]:
    """Generates a sequence of short messages (natural conversation).

    Args:
        record: Ground-truth flight record. Must contain 'departure_city',
            'arrival_city', 'departure_date', 'round_trip', 'adult_passengers'
            and 'cabin_class'; 'return_date' is read only for round trips.
            'child_passengers' and 'infant_passengers' are optional and count
            as 0 when missing or null.

    Returns:
        The user messages in the order they should be sent, one fact per message.
    """
    # Optional passenger counts: a missing key or a null value means "none",
    # matching the defaults the batch evaluation applies to the same fields.
    child_count = record.get('child_passengers') or 0
    infant_count = record.get('infant_passengers') or 0

    convo = []
    convo.append(f"I need a flight from {record['departure_city']} to {record['arrival_city']}.")
    convo.append(f"We are departing on {record['departure_date']}.")
    if record['round_trip']:
        convo.append("It's a round trip.")
        convo.append(f"The return date is {record['return_date']}.")
    else:
        convo.append("It's just a one-way flight.")

    convo.append(f"There will be {record['adult_passengers']} adults.")

    # Only mention children/infants if they exist in the record
    if child_count > 0:
        convo.append(f"Also {child_count} children.")
    if infant_count > 0:
        convo.append(f"And {infant_count} infants.")

    convo.append(f"We would prefer to fly in {record['cabin_class']}.")

    return convo

In [11]:
def run_single_test_case(record: Dict[str, Any], conversation_script: List[str]) -> Dict[str, Any]:
    """
    Simulates a full scripted conversation for a single test case and returns the final extracted data.

    The function opens a chat session, sends every scripted message in order and
    then answers "yes, that is correct" up to three times until the bot reports
    the conversation as complete.

    Args:
        record: Ground-truth record for the test case. Not read by this
            function; kept so the signature stays stable for existing callers.
        conversation_script: User messages to send, in order.

    Returns:
        The "flight_info" value of the bot's last reply ({} when a reply did not
        carry one), or {"error": <description>} when a request fails or a reply
        is not valid JSON / lacks an expected field.
    """
    confirmation_message = "yes, that is correct"
    max_confirmation_attempts = 3
    final_flight_info = {}
    # Transport errors, non-JSON bodies (ValueError on older requests versions)
    # and replies missing a required key are all reported through the error dict.
    chat_failures = (requests.RequestException, KeyError, ValueError)

    # The context manager releases the session's pooled connections on every exit path.
    with requests.Session() as session:

        def send(content, chat_session_id):
            """POST one message to the chat endpoint and return the decoded JSON reply."""
            response = session.post(
                CHAT_ENDPOINT,
                json={"content": content, "session_id": chat_session_id},
                timeout=30,
            )
            response.raise_for_status()
            return response.json()

        # 1. Start a new chat session (empty message, no session id yet)
        try:
            data = send("", None)
            session_id = data["session_id"]
            print(f"BOT: {data['response']}")
        except chat_failures as e:
            print(f"ERROR starting session: {e}")
            return {"error": "Failed to start session"}

        # 2. Go through the generated conversation script
        for user_message in conversation_script:
            print(f"USER: {user_message}")
            try:
                data = send(user_message, session_id)
                final_flight_info = data.get("flight_info", {})
                print(f"BOT: {data['response']}")
            except chat_failures as e:
                print(f"ERROR during conversation: {e}")
                return {"error": "Request failed during script"}
            if data.get("is_complete"):
                print("--- Conversation ended prematurely ---")
                return final_flight_info

        # 3. Handle the confirmation loop
        # The bot will now likely ask for confirmation. We need to say "yes" to finish.
        for _ in range(max_confirmation_attempts):
            print(f"USER: {confirmation_message}")
            try:
                data = send(confirmation_message, session_id)
                final_flight_info = data.get("flight_info", {})
                print(f"BOT: {data['response']}")
            except chat_failures as e:
                print(f"ERROR during confirmation: {e}")
                return {"error": "Request failed during confirmation"}
            if data.get("is_complete"):
                print("--- Conversation successfully completed ---")
                return final_flight_info

    print("--- Conversation did not complete after confirmation attempts ---")
    return final_flight_info

# Run for single test case

In [None]:
import os
import json
import openai
import requests
from dotenv import load_dotenv
from typing import Dict, List, Any

# CONFIGURATION FOR THE LLM USER SIMULATOR

# Load environment variables from the .env file
load_dotenv()

# OpenRouter credential; None when the variable is not set at all
api_key = os.environ.get("OPENROUTER_API_KEY")
if api_key is None:
    print("ERROR: OPENROUTER_API_KEY not found in .env file or environment variables.")
    print("Please ensure your .env file is in the same directory and contains the key.")
else:
    print("Successfully loaded OPENROUTER_API_KEY.")


# Client that plays the "USER" side of the conversation; None without a usable key
user_llm_client = (
    openai.OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)
    if api_key
    else None
)
if api_key:
    print("User LLM client configured for OpenRouter.")

# The model that will simulate the user
USER_LLM_MODEL = "google/gemini-2.5-flash-preview-05-20"

# --- 1. The LLM User Simulator Function ---

def get_llm_user_response(ground_truth: Dict[str, Any], conversation_history: List[Dict[str, str]]) -> str:
    """
    Uses an LLM to generate a natural user response to the bot's latest message.

    Only the most recent entry of ``conversation_history`` is placed in the
    prompt; earlier turns are not sent to the model.

    Args:
        ground_truth: Flight record the simulated user must answer from.
        conversation_history: Chat messages so far (dicts with a 'content' key);
            the last one is treated as the bot's latest message. An empty
            history is treated as an empty bot message.

    Returns:
        The model's reply with surrounding whitespace removed, or "yes" when the
        reply is missing or blank. Returns "Error: User LLM client not
        initialized." when no client is configured and "An error occurred."
        when the LLM call raises.
    """
    if not user_llm_client:
        return "Error: User LLM client not initialized."

    # Extract the last thing the bot said to make the user's task explicit.
    last_bot_message = conversation_history[-1]['content'] if conversation_history else ""

    # This new system prompt is much more direct and less ambiguous.
    system_prompt = f"""
    You are a user simulator testing a flight booking bot. Your task is to answer the bot's questions based on a predefined set of data.

    **Your Required Flight Information:**
    {json.dumps(ground_truth, indent=2)}

    **The bot's most recent message to you was:**
    "{last_bot_message}"

    **Your Instructions:**
    1.  Look at the bot's last message and your required flight information.
    2.  Formulate a concise, natural response that directly answers the bot's question.
    3.  If the bot asks for information you have (e.g., 'how many passengers?'), provide it directly from your data (e.g., '1 adult').
    4.  If the bot asks for confirmation of details and they are correct, respond with a simple 'yes' or 'that looks correct'.
    5.  Do NOT make up information. Only use the data provided above.
    6.  Your response MUST NOT be empty.
    """

    messages_for_llm = [{"role": "system", "content": system_prompt}]

    try:
        response = user_llm_client.chat.completions.create(
            model=USER_LLM_MODEL,
            messages=messages_for_llm,
            temperature=0.2,
            max_tokens=100,
        )
        # Strip first, then fall back: a whitespace-only completion must not be
        # forwarded to the bot as an empty message.
        user_response = (response.choices[0].message.content or "").strip()
        return user_response or "yes"
    except Exception as e:
        print(f"ERROR calling User LLM: {e}")
        return "An error occurred."


def run_llm_driven_test(record: Dict[str, Any], max_turns: int = 15) -> Dict[str, Any]:
    """
    Orchestrates a conversation between the LLM User and the Flight Bot.

    Args:
        record: Ground-truth flight record handed to the LLM user simulator.
        max_turns: Maximum number of user turns before giving up. Higher values
            allow longer conversations at the cost of more LLM calls.

    Returns:
        The "flight_info" value of the bot's last reply ({} when none was
        received), or {"error": <description>} when a request fails or a reply
        is not valid JSON / lacks an expected field.
    """
    # Exact failure strings returned by get_llm_user_response. Matching them
    # exactly (rather than searching for "error") keeps a legitimate user reply
    # that merely contains the word "error" from aborting the test.
    user_llm_failures = {
        "Error: User LLM client not initialized.",
        "An error occurred.",
    }
    # Transport errors, non-JSON bodies (ValueError on older requests versions)
    # and replies missing a required key are all reported through the error dict.
    chat_failures = (requests.RequestException, KeyError, ValueError)
    final_flight_info = {}
    conversation_history = []

    # The context manager releases the session's pooled connections on every exit path.
    with requests.Session() as session:
        # Open the chat: an empty message without a session id
        try:
            response = session.post(CHAT_ENDPOINT, json={"content": "", "session_id": None}, timeout=30)
            response.raise_for_status()
            data = response.json()
            session_id = data["session_id"]
            bot_response = data['response']
        except chat_failures as e:
            return {"error": f"Failed to start session: {e}"}
        print(f"BOT: {bot_response}")
        conversation_history.append({"role": "assistant", "content": bot_response})

        for _ in range(max_turns):
            user_response = get_llm_user_response(record, conversation_history)
            print(f"USER (LLM): {user_response}")
            # If the user LLM fails, we can't continue.
            if user_response in user_llm_failures:
                break
            conversation_history.append({"role": "user", "content": user_response})

            try:
                response = session.post(CHAT_ENDPOINT, json={"content": user_response, "session_id": session_id}, timeout=30)
                response.raise_for_status()
                data = response.json()
                bot_response = data['response']
            except chat_failures as e:
                return {"error": f"Request failed during conversation: {e}"}

            final_flight_info = data.get("flight_info", {})
            print(f"BOT: {bot_response}")
            conversation_history.append({"role": "assistant", "content": bot_response})

            if data.get("is_complete"):
                print("\nConversation successfully completed by bot ")
                return final_flight_info

    print("\nConversation ended due to reaching max turns or an error")
    return final_flight_info


# 3. Run a Single LLM-Driven Test 

# Position of the ground-truth record to replay in this single test
TEST_INDEX = 10

if not user_llm_client:
    print("Cannot run test: User LLM client is not configured due to missing API key.")
elif 'ground_truth_data' not in globals() or not ground_truth_data:
    print("Cannot run test: Ground truth data is not loaded.")
elif not -len(ground_truth_data) <= TEST_INDEX < len(ground_truth_data):
    # Report an unusable index instead of failing with an IndexError
    print(f"Cannot run test: TEST_INDEX {TEST_INDEX} is out of range for {len(ground_truth_data)} records.")
else:
    record = ground_truth_data[TEST_INDEX]

    print(f"RUNNING LLM-DRIVEN TEST (Index: {TEST_INDEX}) \n")
    print("Ground Truth for this Test")
    print(json.dumps(record, indent=2))
    print("\nStarting Conversation\n")

    # A null result would otherwise crash the field lookups below
    extracted_data = run_llm_driven_test(record) or {}

    print("\nFINAL TEST ANALYSIS")

    # The bot's model also carries a redress number, which the ground truth never sets
    expected_data = dict(record)
    expected_data.setdefault("redress_number", None)

    is_match = expected_data == extracted_data

    print("\nExpected Data (Ground Truth)")
    print(json.dumps(expected_data, indent=2))

    print("\nExtracted Data (from bot conversation) ")
    print(json.dumps(extracted_data, indent=2))

    print(f"\n VERDICT: {'PASS' if is_match else 'FAIL'} ")

    if not is_match:
        print("\n Mismatch Details ")
        # Walk the union of keys so that every FAIL verdict is explained: a key
        # present on only one side also makes the dict comparison fail.
        missing_marker = "<missing>"
        all_keys = list(expected_data) + [key for key in extracted_data if key not in expected_data]
        for key in all_keys:
            expected_value = expected_data.get(key, missing_marker)
            extracted_value = extracted_data.get(key, missing_marker)
            if expected_value != extracted_value:
                print(f"  - Field: '{key}' | Expected: {expected_value} | Extracted: {extracted_value}")

Successfully loaded OPENROUTER_API_KEY.
User LLM client configured for OpenRouter.

--- Ground Truth for this Test ---
{
  "departure_city": "Berlin",
  "arrival_city": "Paris",
  "departure_date": "2026-03-20",
  "return_date": "2026-04-01",
  "adult_passengers": 1,
  "round_trip": true,
  "child_passengers": 0,
  "infant_passengers": 1,
  "cabin_class": "Economy",
  "budget": 1276,
  "flexible_dates": false,
  "routing": "any",
  "points_booking": false,
  "refundable": true
}

--- Starting Conversation ---

BOT: Hello! I'm your flight booking assistant. Where would you like to fly from?
USER (LLM): I'd like to fly from Berlin.


KeyboardInterrupt: 

# Run for entire batch of 100

In [None]:
import os
import json
import openai
import requests
import pandas as pd
import time
from dotenv import load_dotenv
from typing import Dict, List, Any

# CONFIGURATION FOR THE LLM USER SIMULATOR AND TEST RUN

# Load environment variables from the .env file
load_dotenv()

# Show full cell contents and use a wide console when rendering frames
for option_name, option_value in (('display.max_colwidth', None), ('display.width', 200)):
    pd.set_option(option_name, option_value)

# OpenRouter credential; None when the variable is not set at all
api_key = os.environ.get("OPENROUTER_API_KEY")
if api_key is None:
    print("ERROR: OPENROUTER_API_KEY not found in .env file or environment variables.")
else:
    print("Successfully loaded OPENROUTER_API_KEY.")

# Client that plays the "USER" side of the conversation; None without a usable key
if not api_key:
    user_llm_client = None
else:
    user_llm_client = openai.OpenAI(
        api_key=api_key,
        base_url="https://openrouter.ai/api/v1",
    )
    print("User LLM client configured for OpenRouter.")

# Configuration for the test
CHAT_ENDPOINT = "http://127.0.0.1:8000/chat"
TEST_DATA_FILE = "flight_test_data.json"
USER_LLM_MODEL = "google/gemini-flash-1.5-8b"

# The LLM User Simulator Function

def get_llm_user_response(ground_truth: Dict[str, Any], conversation_history: List[Dict[str, str]]) -> str:
    """Uses an LLM to generate a natural user response to the bot's latest message.

    Only the most recent entry of ``conversation_history`` is placed in the
    prompt; earlier turns are not sent to the model.

    Args:
        ground_truth: Flight record the simulated user must answer from.
        conversation_history: Chat messages so far (dicts with a 'content' key);
            the last one is treated as the bot's latest message. An empty
            history is treated as an empty bot message.

    Returns:
        The model's reply with surrounding whitespace removed, or "yes" when the
        reply is missing or blank. Returns "Error: User LLM client not
        initialized." when no client is configured and "An error occurred."
        when the LLM call raises.
    """
    if not user_llm_client:
        return "Error: User LLM client not initialized."
    last_bot_message = conversation_history[-1]['content'] if conversation_history else ""
    system_prompt = f"""
    You are a user simulator testing a flight booking bot. Your task is to answer the bot's questions based on a predefined set of data.
    **Your Required Flight Information:**
    {json.dumps(ground_truth, indent=2)}
    **The bot's most recent message to you was:**
    "{last_bot_message}"
    **Your Instructions:**
    1. Look at the bot's last message and your required flight information.
    2. Formulate a concise, natural response that directly answers the bot's question.
    3. If the bot asks for information you have (e.g., 'how many passengers?'), provide it directly from your data (e.g., '1 adult').
    4. If the bot asks for confirmation of details and they are correct, respond with a simple 'yes' or 'that looks correct'.
    5. Do NOT make up information. Only use the data provided above.
    6. Your response MUST NOT be empty.
    """
    messages_for_llm = [{"role": "system", "content": system_prompt}]
    try:
        response = user_llm_client.chat.completions.create(
            model=USER_LLM_MODEL,
            messages=messages_for_llm,
            temperature=0.2,
            max_tokens=100,
        )
        # Strip first, then fall back: a whitespace-only completion must not be
        # forwarded to the bot as an empty message.
        user_response = (response.choices[0].message.content or "").strip()
        return user_response or "yes"
    except Exception as e:
        print(f"ERROR calling User LLM: {e}")
        return "An error occurred."

# The Test Orchestration Engine

def run_llm_driven_test(record: Dict[str, Any], max_turns: int = 18) -> Dict[str, Any]:
    """Orchestrates a conversation between the LLM User and the Flight Bot.

    Args:
        record: Ground-truth flight record handed to the LLM user simulator.
        max_turns: Maximum number of user turns before giving up.

    Returns:
        The "flight_info" value of the bot's last reply ({} when none was
        received), or {"error": <description>} when a request fails or a reply
        is not valid JSON / lacks an expected field.
    """
    # Exact failure strings returned by get_llm_user_response. Matching them
    # exactly (rather than searching for "error") keeps a legitimate user reply
    # that merely contains the word "error" from aborting the test.
    user_llm_failures = {
        "Error: User LLM client not initialized.",
        "An error occurred.",
    }
    # Transport errors, non-JSON bodies (ValueError on older requests versions)
    # and replies missing a required key are all reported through the error dict.
    chat_failures = (requests.RequestException, KeyError, ValueError)
    final_flight_info = {}
    conversation_history = []

    # The context manager releases the session's pooled connections on every exit path.
    with requests.Session() as session:
        # Open the chat: an empty message without a session id
        try:
            response = session.post(CHAT_ENDPOINT, json={"content": "", "session_id": None}, timeout=60)
            response.raise_for_status()
            data = response.json()
            session_id = data["session_id"]
            bot_response = data['response']
        except chat_failures as e:
            return {"error": f"Failed to start session: {e}"}
        print(f"BOT: {bot_response}")
        conversation_history.append({"role": "assistant", "content": bot_response})

        for _ in range(max_turns):
            user_response = get_llm_user_response(record, conversation_history)
            print(f"USER (LLM): {user_response}")
            # Without a usable user reply the conversation cannot continue
            if user_response in user_llm_failures:
                break
            conversation_history.append({"role": "user", "content": user_response})

            try:
                response = session.post(CHAT_ENDPOINT, json={"content": user_response, "session_id": session_id}, timeout=60)
                response.raise_for_status()
                data = response.json()
                bot_response = data['response']
            except chat_failures as e:
                return {"error": f"Request failed during conversation: {e}"}

            final_flight_info = data.get("flight_info", {})
            print(f"BOT: {bot_response}")
            conversation_history.append({"role": "assistant", "content": bot_response})

            if data.get("is_complete"):
                print("\n--- Conversation successfully completed by bot ---")
                return final_flight_info

    print("\n--- Conversation ended due to reaching max turns or an error ---")
    return final_flight_info


# Full Batch Test Runner 

def _values_match(expected: Any, extracted: Any) -> bool:
    """Return True when an extracted field value agrees with the expected one."""
    both_numeric = all(
        isinstance(value, (int, float)) and not isinstance(value, bool)
        for value in (expected, extracted)
    )
    if both_numeric:
        # 1719 and 1719.0 are the same amount even though their string forms differ
        return expected == extracted
    # Everything else keeps the lenient string comparison (e.g. 1 vs "1")
    return str(expected) == str(extracted)


def run_full_evaluation(delay_seconds: float = 5):
    """Runs one LLM-driven conversation per ground-truth record and scores each one.

    Args:
        delay_seconds: Pause between consecutive tests to avoid rate limiting.
            No pause is made after the final test.

    Returns:
        A DataFrame with one row per test and the columns 'test_id',
        'accuracy_%' (share of matching fields) and 'mismatches' (a list of
        field/expected/extracted dicts, or the string "None"). Returns None
        when the user LLM client is not configured or the test data cannot be
        loaded.
    """
    if not user_llm_client:
        print("Cannot run evaluation: User LLM client is not configured.")
        return
    try:
        # Explicit encoding so the records load the same way on every platform
        with open(TEST_DATA_FILE, 'r', encoding='utf-8') as f:
            ground_truth_data = json.load(f)
        print(f"Successfully loaded {len(ground_truth_data)} records for evaluation.")
    except (OSError, ValueError, TypeError) as e:
        print(f"ERROR loading test data file: {e}")
        return

    test_results = []
    total_tests = len(ground_truth_data)

    for i, record in enumerate(ground_truth_data):
        print(f"\n{'='*30} RUNNING TEST {i+1}/{total_tests} {'='*30}\n")

        # A null result is scored as "nothing extracted" instead of crashing
        extracted_data = run_llm_driven_test(record) or {}

        # Prepare expected data, ensuring it has the same structure as the extracted model
        raw_budget = record.get("budget")
        expected_data = {
            "departure_city": record.get("departure_city"),
            "arrival_city": record.get("arrival_city"),
            "departure_date": record.get("departure_date"),
            "return_date": record.get("return_date"),
            "adult_passengers": record.get("adult_passengers"),
            "round_trip": record.get("round_trip"),
            "child_passengers": record.get("child_passengers", 0),
            "infant_passengers": record.get("infant_passengers", 0),
            "cabin_class": record.get("cabin_class"),
            "budget": float(raw_budget) if raw_budget is not None else None,
            "flexible_dates": record.get("flexible_dates", False),
            "routing": record.get("routing"),
            "points_booking": record.get("points_booking", False),
            "refundable": record.get("refundable", False),
            "redress_number": None,
        }

        # Calculate field-level accuracy
        mismatched_fields = [
            {'field': key, 'expected': expected_value, 'extracted': extracted_data.get(key)}
            for key, expected_value in expected_data.items()
            if not _values_match(expected_value, extracted_data.get(key))
        ]
        total_fields = len(expected_data)
        matched_fields = total_fields - len(mismatched_fields)

        accuracy = (matched_fields / total_fields) * 100 if total_fields > 0 else 0

        test_results.append({
            "test_id": i + 1,
            "accuracy_%": round(accuracy, 2),
            "mismatches": mismatched_fields if mismatched_fields else "None",
        })
        print(f"\n--- TEST {i+1} ACCURACY: {accuracy:.2f}% ---")
        # Add a delay to avoid rate limiting issues; pointless after the last test
        if delay_seconds > 0 and i + 1 < total_tests:
            time.sleep(delay_seconds)

    print(f"\n\n{'='*30} OVERALL EVALUATION COMPLETE {'='*30}\n")
    # Fixed columns keep the frame usable by the report even when no test ran
    return pd.DataFrame(test_results, columns=["test_id", "accuracy_%", "mismatches"])

# RUN THE EVALUATION
results_df = run_full_evaluation()

# DISPLAY THE FINAL REPORT (skipped when the evaluation returned nothing)
if results_df is not None:
    overall_accuracy = results_df['accuracy_%'].mean()

    print(
        "--- FINAL ACCURACY REPORT ---",
        f"\nOverall System Accuracy: {overall_accuracy:.2f}%\n",
        "-----------------------------------",
        sep="\n",
    )

    print("\n DETAILED TEST RESULTS \n")
    # Display the full DataFrame
    display(results_df)

    # Display a summary of the worst-performing tests for easier debugging
    print("\nTESTS WITH THE MOST ERRORS \n")
    imperfect_tests = results_df.loc[results_df['accuracy_%'] < 100]
    display(imperfect_tests.sort_values(by='accuracy_%').head(15))

Successfully loaded OPENROUTER_API_KEY.
User LLM client configured for OpenRouter.
Successfully loaded 100 records for evaluation.


BOT: Hello! I'm your flight booking assistant. Where would you like to fly from?
USER (LLM): Amsterdam
BOT: Okay, and where would you like to fly to?

USER (LLM): Hong Kong
BOT: How many adults are traveling?

USER (LLM): 1 adult
BOT: When would you like to depart?

USER (LLM): 2025-11-01
BOT: What about a return date?

USER (LLM): 2025-11-21
BOT: What is your preferred cabin class?  (e.g., Economy, Business, First)

USER (LLM): Economy
BOT: Great.  Are there any specific airports you'd prefer to fly from in Amsterdam or to in Hong Kong?

USER (LLM): No specific airports.
BOT: Do you have a budget in mind for this trip?

USER (LLM): My budget is €1719.
BOT: Perfect.  Are you flexible with your travel dates?

USER (LLM): No, I am not flexible with my travel dates.
BOT: Understood.  Is there a preferred routing or airline you'd like to consider?

USER (LLM)

KeyboardInterrupt: 