# Eval

In [1]:
# run_evaluation.py

import os
import pandas as pd
import requests
import time
from datetime import datetime
from google import genai
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import Literal, List
import json

# Load environment variables via python-dotenv before the API key is read below.
load_dotenv()
# Shared client used by the LLM judge (see get_llm_judge_evaluation).
# os.getenv returns None when GOOGLE_API_KEY is not set.
google_client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

class JudgeEvaluation(BaseModel):
    """Schema for the quality evaluation of the RAG answer."""
    # This model is passed as `response_schema` to the judge call, and the
    # parsed response is read back as an instance of it.
    # `rating` is restricted to the three labels described in the judge's
    # system instruction.
    rating: Literal["Excellent", "Good", "Poor"]

def get_llm_judge_evaluation(question: str, answer: str, sources: List[str]) -> str:
    """
    Uses an LLM to judge the quality of an answer based on the question and sources.

    Args:
        question: The user question that produced the answer.
        answer: The answer text to be rated.
        sources: Source passages the answer should be grounded in. ``None`` or
            an empty list is reported to the judge as "no sources provided".

    Returns:
        str: 'Excellent', 'Good', 'Poor', or 'JUDGE_ERROR' when the judge call
        fails or returns nothing that can be parsed into a rating.
    """
    # Format the sources into a single block for the prompt. Checking
    # truthiness first means a missing (None) source list is handled the same
    # way as an empty one instead of raising before the try block is reached.
    if sources:
        sources_text = "\n\n".join(
            f"--- Source {i+1} ---\n{src}" for i, src in enumerate(sources)
        )
    else:
        sources_text = "No sources were provided."

    # Define the role and criteria for the judge LLM
    system_instruction = """
You are an impartial AI evaluator. Your task is to assess the quality of a generated answer based on a user's question and the provided source material.
You must return your rating in the specified JSON format.

Evaluation Criteria:
- **Excellent**: The answer directly and completely addresses the user's question. It is 100% based on the provided sources, contains no fabricated information (hallucinations), and is clear and concise.
- **Good**: The answer addresses the main point of the question but might be slightly incomplete, indirect, or contain minor, non-critical information not found in the sources. It is mostly faithful to the sources.
- **Poor**: The answer is incorrect, irrelevant, fabricated, or fails to use the provided sources to answer the question.
"""

    prompt = f"""
Please evaluate the following `ANSWER` based on the `QUESTION` and `SOURCES`.

**QUESTION:**
{question}

**SOURCES:**
{sources_text}

**ANSWER:**
{answer}
"""
    try:
        response = google_client.models.generate_content(
            model='gemini-2.0-flash',
            contents=[prompt],
            config={
                'system_instruction': system_instruction,
                'response_mime_type': 'application/json',
                'response_schema': JudgeEvaluation,
                # Low temperature and a fixed seed are requested so repeated
                # runs are as comparable as possible.
                'temperature': 0.1,
                'seed': 42,
            },
        )
    except Exception as e:
        # Any failure of the judge call is recorded as a sentinel so that one
        # bad call does not abort the whole evaluation run.
        print(f"  [ERROR] LLM Judge call failed: {e}")
        return "JUDGE_ERROR"

    verdict = response.parsed
    # Report a missing or unexpected parse result explicitly rather than
    # through an opaque attribute error.
    rating = getattr(verdict, "rating", None)
    if rating is None:
        print("  [ERROR] LLM Judge call failed: response contained no parsable rating")
        return "JUDGE_ERROR"

    return rating

In [4]:
# --- CONFIGURATION ---
# Endpoint of the running FastAPI chat service (full URL)
API_URL = "http://127.0.0.1:8000/api/v1/chat"

# CSV file the evaluation questions are read from
INPUT_CSV_PATH = "bma_questions.csv"

# Header of the CSV column that holds the questions
QUESTION_COLUMN_NAME = "Question"

# Each run writes into its own directory, named after the start time
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
OUTPUT_PARENT_DIR = "evaluation_results_" + TIMESTAMP

def create_output_directories():
    """Build the output folder tree for this run and publish the derived paths.

    Sets the module globals MARKDOWN_DIR (folder for the per-question
    markdown files) and OUTPUT_CSV_PATH (location of the summary CSV).
    Existing directories are reused rather than treated as an error.
    """
    global MARKDOWN_DIR, OUTPUT_CSV_PATH

    run_root = OUTPUT_PARENT_DIR

    # Top-level folder for everything this run produces
    os.makedirs(run_root, exist_ok=True)
    print(f"Created main output directory: '{run_root}'")

    # Per-question markdown reports live in their own subfolder
    MARKDOWN_DIR = os.path.join(run_root, "markdown_files")
    os.makedirs(MARKDOWN_DIR, exist_ok=True)
    print(f"Created markdown subdirectory: '{MARKDOWN_DIR}'")

    # The summary CSV sits directly in the run folder; only its path is set here
    OUTPUT_CSV_PATH = os.path.join(run_root, "summary_results.csv")

def call_chat_api(question: str) -> dict:
    """
    Calls the chat API with a single question and returns the parsed JSON response.
    Handles non-streaming requests.

    Args:
        question: The question text, sent as a single user message.

    Returns:
        dict: The decoded JSON object returned by the endpoint. If the request
        fails, the status code is 4xx/5xx, or the body is not a JSON object, a
        fallback dict is returned with "response" set to "API_ERROR: ...",
        "sources" set to an empty list and "intent" set to "ERROR".
    """
    payload = {
        "messages": [
            {"role": "user", "message": question}
        ],
        "streaming": False  # We want the complete JSON response
    }

    try:
        response = requests.post(API_URL, json=payload, timeout=120)  # 120-second timeout
        # Raise an exception for bad status codes (4xx or 5xx)
        response.raise_for_status()
        data = response.json()
        # Callers use dict access on the result, so a JSON list/string/number
        # body is treated as a failed call instead of crashing them later.
        if not isinstance(data, dict):
            raise ValueError(
                f"expected a JSON object in the response, got {type(data).__name__}"
            )
        return data
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError also covers JSON decoding failures on requests versions
        # whose decode error does not derive from RequestException.
        print(f"  [ERROR] API call failed: {e}")
        # Return a dictionary with an error message to be logged in the CSV
        return {
            "response": f"API_ERROR: {e}",
            "sources": [],
            "intent": "ERROR"
        }

def format_sources(sources: list) -> str:
    """Render the sources as a Markdown bullet list, one stripped entry per line.

    An empty (or otherwise falsy) collection yields a fixed placeholder text.
    """
    if sources:
        bullets = ["* " + entry.strip() for entry in sources]
        return "\n".join(bullets)

    return "No sources provided."

def save_result_as_markdown(index: int, question: str, result: dict, judge_score: str):
    """Saves the result of a single question as a .md file.

    The file is named ``result_<index>.md`` (index zero-padded to three digits)
    and written into MARKDOWN_DIR, a module global that
    create_output_directories() sets. It contains the judge score, the
    question and the answer.

    Args:
        index: Number of the question, used in the title and the filename.
        question: The question text.
        result: API result dict; only its "response" entry is read here.
        judge_score: Rating produced by the LLM judge.

    Write failures are printed and not re-raised, so one unwritable file does
    not stop the run.
    """
    # Fall back to a placeholder when the API result carries no answer
    answer = result.get("response", "No response found.")

    # The filename uses only the index, so characters in the question can
    # never produce an invalid path.
    filename = f"result_{index:03d}.md"
    filepath = os.path.join(MARKDOWN_DIR, filename)

    # Format the content for the markdown file
    md_content = f"""
# Evaluation Result #{index}

## ⚖️ Judge Score: {judge_score}

## ❓ Question
{question}

---

## 🤖 Answer
{answer}
"""

    try:
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(md_content.strip())
        print(f"  [SUCCESS] Saved markdown to '{filepath}'")
    except IOError as e:
        print(f"  [ERROR] Could not write markdown file '{filepath}': {e}")


def main():
    """Main function to run the evaluation process.

    Steps:
        1. Validate and load the input CSV. If the file is missing, a sample
           file is written and the run stops.
        2. Create the output directories. This happens only after the input
           is known to be usable, so failed starts leave no empty folders.
        3. For every question: call the chat API, have the LLM judge rate the
           answer, and write a markdown report.
        4. Write all collected rows to the summary CSV.
    """
    print("--- Starting Evaluation Script ---")

    # 1. Check for and load the input CSV
    if not os.path.exists(INPUT_CSV_PATH):
        print(f"\n[CRITICAL] Input file not found at '{INPUT_CSV_PATH}'")
        print(f"Please create it with a column named '{QUESTION_COLUMN_NAME}' and add your questions.")
        # Create a sample file for the user
        sample_df = pd.DataFrame({QUESTION_COLUMN_NAME: ["What is FastAPI?", "How do I stream data?"]})
        sample_df.to_csv(INPUT_CSV_PATH, index=False)
        print(f"A sample file has been created for you at '{INPUT_CSV_PATH}'.")
        return

    try:
        df = pd.read_csv(INPUT_CSV_PATH)
    except Exception as e:
        print(f"[CRITICAL] Failed to read or parse CSV file: {e}")
        return

    if QUESTION_COLUMN_NAME not in df.columns:
        print(f"[CRITICAL] The input CSV must have a column named '{QUESTION_COLUMN_NAME}'.")
        return

    # 2. Setup output locations
    create_output_directories()

    # 3. Process each question
    results_list = []
    total_questions = len(df)
    print(f"\nFound {total_questions} questions to process...")

    # Number rows by position rather than by index label so the numbering is
    # always 1..N regardless of how the DataFrame is indexed.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        question_text = row[QUESTION_COLUMN_NAME]
        print(f"\n--- Processing question {position}/{total_questions} ---")

        # Blank CSV cells are read as NaN; there is nothing to ask for them.
        if pd.isna(question_text) or not str(question_text).strip():
            print("  [SKIP] Question cell is empty.")
            continue
        question_text = str(question_text)
        print(f"  Question: {question_text}")

        # Call API and get result
        api_result = call_chat_api(question_text)

        # Normalise once: a missing or null source list becomes [], and every
        # entry is turned into text so it can be joined for the CSV cell.
        answer_text = api_result.get("response", "")
        sources_list = [str(src) for src in (api_result.get("sources") or [])]

        # LLM Judge
        judge_score = get_llm_judge_evaluation(question_text, answer_text, sources_list)

        # Save individual markdown file
        save_result_as_markdown(position, question_text, api_result, judge_score)

        # Prepare data for the final summary CSV
        results_list.append({
            "No": str(position),
            "Question": question_text,
            "Answer": answer_text,
            "LLM_Judge_Score": judge_score,
            "Intent": api_result.get("intent", ""),
            # Join list of sources into a single string for the CSV cell
            "Sources": "\n".join(sources_list),
        })

        # Small delay to avoid overwhelming the server
        time.sleep(1)

    # 4. Save all results to a single CSV file
    if results_list:
        summary_df = pd.DataFrame(results_list)
        summary_df.to_csv(OUTPUT_CSV_PATH, index=False, encoding="utf-8")
        print("\n--- Evaluation Complete ---")
        print(f"✅ All results compiled into '{OUTPUT_CSV_PATH}'")
    else:
        print("\n--- Evaluation Finished ---")
        print("No questions were processed.")

# Start the evaluation only when this module is run as the main program,
# not when it is imported.
if __name__ == "__main__":
    main()

--- Starting Evaluation Script ---
Created main output directory: 'evaluation_results_2025-07-30_22-34-43'
Created markdown subdirectory: 'evaluation_results_2025-07-30_22-34-43/markdown_files'

Found 57 questions to process...

--- Processing question 1/57 ---
  Question: ขั้นตอนการติดตั้งเครื่อง edc 
  [SUCCESS] Saved markdown to 'evaluation_results_2025-07-30_22-34-43/markdown_files/result_001.md'

--- Processing question 2/57 ---
  Question: โหลดไดร์เวอร์ได้ที่ไหน 
  [SUCCESS] Saved markdown to 'evaluation_results_2025-07-30_22-34-43/markdown_files/result_002.md'

--- Processing question 3/57 ---
  Question: วิธีการตรวจสอบเวอร์ชันโปรแกรม 
  [SUCCESS] Saved markdown to 'evaluation_results_2025-07-30_22-34-43/markdown_files/result_003.md'

--- Processing question 4/57 ---
  Question: ดูไดร์เวอร์เครื่อง edc และ โปรแกรมเชื่อมต่อกับเครื่องรูดบัตร ว่าเป็นไดร์เวอร์ ไหนยังไง 
  [SUCCESS] Saved markdown to 'evaluation_results_2025-07-30_22-34-43/markdown_files/result_004.md'

--- Processing