In [None]:
import json
import re
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import List, Dict, Any
from dataclasses import dataclass

import pandas as pd
from tqdm.notebook import tqdm
from openai import OpenAI

# Module-level locks shared by the worker threads:
#   print_lock   - serialises progress-bar updates and console output
#   results_lock - guards appends to the shared results list
print_lock, results_lock = Lock(), Lock()

@dataclass
class EvaluationConfig:
    """Configuration for model evaluation.

    Raises:
        ValueError: If ``num_threads`` is smaller than 1.
    """
    # Path of the JSON file holding the questions (read under its "eval_data" key).
    json_file: str
    # Model identifier passed to the chat-completions endpoint.
    model: str = "openai@gpt-4o"
    # Number of worker threads (and API clients) used for the evaluation.
    num_threads: int = 1
    # System message sent with every question.
    system_prompt: str = "You are a helpful assistant."
    # Directory under which a timestamped results folder is created.
    output_dir: str = "evaluation_results"

    def __post_init__(self) -> None:
        # Fail early with a clear message instead of an obscure error raised
        # later when the thread pool is created with zero workers.
        if self.num_threads < 1:
            raise ValueError(
                f"num_threads must be at least 1, got {self.num_threads}"
            )

def create_client() -> OpenAI:
    """Create an OpenAI client pointed at the local proxy.

    Connection settings can be overridden through environment variables so
    that credentials do not have to live in the source:

    * ``EVAL_PROXY_API_KEY``  - value passed as ``api_key``
    * ``EVAL_PROXY_BASE_URL`` - value passed as ``base_url``
    * ``EVAL_PROXY_KEY``      - value of the ``x-proxy-key`` header

    When a variable is unset, the previous hard-coded value is used, so
    existing setups keep working unchanged.

    Returns:
        A configured ``OpenAI`` client instance.
    """
    import os  # local import keeps this block self-contained

    # NOTE(review): the fallback values below are credentials committed to
    # source control; prefer setting the environment variables and rotating
    # the proxy key.
    api_key = os.environ.get("EVAL_PROXY_API_KEY", "abcd")
    base_url = os.environ.get(
        "EVAL_PROXY_BASE_URL", "http://localhost:3060/auto/v1/"
    )
    proxy_key = os.environ.get(
        "EVAL_PROXY_KEY", "3870443e-1f3e-4e5d-a1fe-4e945828cdc6"
    )

    return OpenAI(
        api_key=api_key,
        base_url=base_url,
        default_headers={
            "x-proxy-key": proxy_key
        }
    )

def extract_answer(response: str) -> str:
    """Extract the final answer letter (A-F) from the model's response.

    Two strategies are tried in order:

    1. The *last* ``Final Answer: X`` marker in the text. The letter may be
       wrapped in brackets, parentheses, quotes or markdown emphasis
       (``Final Answer: [B]``, ``Final Answer: **B**``). The letter must
       stand alone, so ``Final Answer: Because ...`` is not read as ``B``
       and an echoed template ``Final Answer: [A-F]`` is not read as ``A``.
    2. A stand-alone letter at the very end of the response, optionally
       followed by closing punctuation or whitespace. Requiring the letter
       to stand alone prevents the last character of an ordinary word
       (e.g. "the end") from being taken as the answer.

    Args:
        response: Full text returned by the model. ``None`` or an empty
            string is treated as having no answer.

    Returns:
        The upper-case answer letter, or ``"INVALID"`` when none is found.
    """
    if not response:
        return "INVALID"

    # Explicit marker: skip optional opening decoration, then demand that the
    # letter is not the start of a longer word or of a range such as "A-F".
    marked = re.findall(
        r"Final Answer:\s*[\[\(\*\"']*([A-F])(?![A-Za-z0-9-])",
        response,
        re.IGNORECASE,
    )
    if marked:
        # The conclusion comes last; earlier markers may be quoted or revised.
        return marked[-1].upper()

    # Fallback: an isolated letter closing the response.
    if match := re.search(
        r"(?<![A-Za-z0-9-])([A-F])[\]\)\*\"'.\s]*$",
        response,
        re.IGNORECASE,
    ):
        return match.group(1).upper()
    return "INVALID"

def process_question(
    client: OpenAI,
    question: Dict[str, Any],
    model: str,
    system_prompt: str,
    pbar: tqdm
) -> Dict[str, Any]:
    """Ask the model one question and grade its reply.

    Args:
        client: Client used to call the chat-completions endpoint.
        question: Mapping read for its "prompt", "answer" and
            "question_id" entries.
        model: Model identifier forwarded to the API.
        system_prompt: System message sent ahead of the question prompt.
        pbar: Progress bar advanced by one step per processed question.

    Returns:
        A dict with the keys "question_id", "correct_answer",
        "model_answer", "correct", "full_response" and "prompt". If any
        exception occurs, "model_answer" is "ERROR", "correct" is False and
        "full_response" holds the exception text.
    """
    try:
        # Query the model deterministically (temperature 0).
        send = client.chat.completions.create
        completion = send(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question["prompt"]},
            ],
            temperature=0.0,
        )

        reply_text = completion.choices[0].message.content
        predicted = extract_answer(reply_text)
        is_correct = predicted == question["answer"]

        outcome = dict(
            question_id=question["question_id"],
            correct_answer=question["answer"],
            model_answer=predicted,
            correct=is_correct,
            full_response=reply_text,
            prompt=question["prompt"],
        )

        # Hold the lock so output from concurrent workers is not interleaved.
        with print_lock:
            pbar.update(1)
            report_lines = (
                f"\nQuestion {question['question_id']}:",
                f"Model Response:\n{reply_text}",
                f"Expected: {question['answer']}, Got: {predicted}",
                f"Correct: {'✓' if is_correct else '✗'}",
                "=" * 80,
            )
            for line in report_lines:
                print(line)

        return outcome

    except Exception as exc:
        # Record the failure as an incorrect answer so the run continues.
        with print_lock:
            pbar.update(1)
            print(f"\nError processing question {question['question_id']}: {str(exc)}")

        failure = dict(
            question_id=question["question_id"],
            correct_answer=question["answer"],
            model_answer="ERROR",
            correct=False,
            full_response=str(exc),
            prompt=question["prompt"],
        )
        return failure

def save_results(results: List[Dict], config: EvaluationConfig) -> None:
    """Save evaluation results in multiple formats.

    A folder named after the current local time (``YYYYmmdd_HHMMSS``) is
    created under ``config.output_dir`` and filled with:

    * ``detailed_results.json`` - model, timestamp, system prompt and all
      per-question results
    * ``results.csv`` - the per-question results as a table
    * ``chain_of_thought/question_<id>.txt`` - one readable report per
      question

    All text files are written as UTF-8 explicitly. Model responses
    routinely contain non-ASCII characters, and relying on the platform's
    default encoding can raise ``UnicodeEncodeError`` on non-UTF-8 locales.

    Args:
        results: Per-question result dicts with the keys "question_id",
            "prompt", "full_response", "correct_answer", "model_answer"
            and "correct".
        config: Evaluation settings; ``output_dir``, ``model`` and
            ``system_prompt`` are read.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_path = Path(config.output_dir) / timestamp
    base_path.mkdir(parents=True, exist_ok=True)

    # Save detailed JSON results
    detailed_results = {
        "model": config.model,
        "timestamp": timestamp,
        "system_prompt": config.system_prompt,
        "results": results
    }
    with open(base_path / "detailed_results.json", 'w', encoding="utf-8") as f:
        json.dump(detailed_results, f, indent=2)

    # Save CSV results
    pd.DataFrame(results).to_csv(
        base_path / "results.csv", index=False, encoding="utf-8"
    )

    # Save individual question results
    cot_dir = base_path / "chain_of_thought"
    cot_dir.mkdir(exist_ok=True)

    for result in results:
        report_path = cot_dir / f"question_{result['question_id']}.txt"
        with open(report_path, 'w', encoding="utf-8") as f:
            f.write(f"Question ID: {result['question_id']}\n")
            f.write(f"Prompt:\n{result['prompt']}\n\n")
            f.write(f"Model Response:\n{result['full_response']}\n\n")
            f.write(f"Expected Answer: {result['correct_answer']}\n")
            f.write(f"Model Answer: {result['model_answer']}\n")
            f.write(f"Correct: {result['correct']}\n")

def evaluate_model(config: EvaluationConfig) -> pd.DataFrame:
    """
    Evaluate the model on questions using multiple threads.

    The questions are read from the "eval_data" entry of ``config.json_file``,
    dispatched to a pool of ``config.num_threads`` workers (one API client
    per worker slot, assigned round-robin by question index), sorted by
    "question_id", written to disk via ``save_results`` and summarised on
    stdout.

    Args:
        config: EvaluationConfig object containing all necessary parameters

    Returns:
        DataFrame containing evaluation results (empty when the input file
        contains no questions)
    """
    # Load questions; read as UTF-8 explicitly so the result does not depend
    # on the platform's default encoding.
    with open(config.json_file, 'r', encoding="utf-8") as f:
        questions = json.load(f)["eval_data"]

    total_questions = len(questions)

    # Create clients for each thread
    clients = [create_client() for _ in range(config.num_threads)]

    # The context manager closes the progress bar even if a worker raises.
    with tqdm(total=total_questions, desc=f"Evaluating {config.model}") as pbar:
        # Process questions in parallel
        with ThreadPoolExecutor(max_workers=config.num_threads) as executor:
            futures = [
                executor.submit(
                    process_question,
                    clients[i % config.num_threads],
                    question,
                    config.model,
                    config.system_prompt,
                    pbar
                )
                for i, question in enumerate(questions)
            ]

            # Collect results as they complete. Only this thread touches the
            # list, so no additional locking is needed here.
            results = [future.result() for future in as_completed(futures)]

    # Sort results by question_id
    results.sort(key=lambda x: x["question_id"])

    # Save results
    save_results(results, config)

    # Calculate and display metrics. With no questions the frame has no
    # 'correct' column and the accuracy would divide by zero, so report zeros.
    df = pd.DataFrame(results)
    correct_answers = int(df['correct'].sum()) if total_questions else 0
    accuracy = correct_answers / total_questions if total_questions else 0.0

    print(f"\nFinal Results for {config.model}")
    print(f"Total Questions: {total_questions}")
    print(f"Correct Answers: {correct_answers}")
    print(f"Accuracy: {accuracy:.2%}")

    return df

# Evaluation settings for this run: which question file to load, which model
# to query through the proxy, how many worker threads to use, and the system
# prompt sent with every question.
config = EvaluationConfig(
    json_file="simple_bench_public.json",
    model="gemini@gemini-2.0-flash-exp",
    # Alternative model (disabled):
    # model="anthropic@claude-3-5-sonnet-latest",
    num_threads=2,
    # The prompt asks for a long multi-philosopher debate that must end with
    # "Final Answer: [A-F]", the marker extract_answer() looks for first.
    system_prompt="""You are to analyze multiple choice questions through an intense philosophical debate between ancient Greek philosophers. Structure your response as a detailed dialogue with at least 50 turns between:

1. SOCRATES (The Moderator): Guides the discussion through probing questions, challenges assumptions, and helps reach truth through his famous method. Starts and ends the debate.

2. PLATO (The Idealist): Focuses on perfect forms, abstract reasoning, and seeks absolute truth. Considers how the question relates to higher principles and ideal forms.

3. DIOGENES (The Cynic): Provocative and unconventional, questions everything, makes seemingly absurd but potentially insightful observations. May interrupt with shocking statements or challenge basic assumptions. Often uses humor and sarcasm. Challenges every single assumption (explicit or implicit) that the others make, and tries to make them see the absurdity of their assumptions.

4. ARISTOTLE (The Empiricist): Provides practical observations, categorizes information, and analyzes based on observable evidence and logical deduction.

5. EPICURUS (The Pragmatist): Considers practical outcomes, weighs pleasure against pain, and focuses on what leads to the best results with minimal suffering.

Format the debate as:
SOCRATES: [Introduces the question and begins moderating]
[At least 50 turns of detailed philosophical dialogue, with each philosopher contributing their unique perspective]
SOCRATES: [Summarizes key points and guides to conclusion]
DIOGENES: [Makes one final provocative observation]
SOCRATES: "After our lengthy discourse, we must now render judgment. Based on our philosophical examination, I conclude that...
Final Answer: [A-F]"

Ensure the debate:
- Maintains each philosopher's distinct voice and worldview
- Challenge every assumption, however large or small, multiple times and thoroughly
- Includes frequent interruptions and challenges from Diogenes
- Examines multiple angles of the question
- Features proper philosophical terminology and references
- Eventually reaches a clear conclusion, but only at the very end and after considering ALL perspectives
- Always ends with "Final Answer: [A-F]"
- Do not abbreviate the debate at all - it might be very, very long, and go in circles, or even hit dead ends, but this is exactly what we want.
- Diogenes is the most important character in the debate, challenging every single assumption (explicit or implicit) that the others make, and tries to make them see the absurdity of their assumptions.
"""

# Alternative, much shorter system prompt (disabled):
#     system_prompt="""
# You are a thoughtful and intelligent system who thinks through your responses thoroughly before responding. After you think through your answer, write it out in the format "Final Answer: [A-F]".
# """.strip(),
)
    
# Run the evaluation; results are saved to disk and returned as a DataFrame.
results_df = evaluate_model(config)


Evaluating gemini@gemini-2.0-flash-exp:   0%|          | 0/10 [00:00<?, ?it/s]


Question 1:
Model Response:
Okay, let's analyze this problem step-by-step, considering the philosophical perspectives as we go.

SOCRATES: Let us begin with this puzzle of ice cubes and frying pans. The question asks, "If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?" It seems a simple numerical problem, but let us examine it with care.

PLATO: Indeed, Socrates. Before we delve into the particular numbers, we must consider the ideal form of "average". It represents a perfect mean, a balance of quantities that transcends the messy reality of individual minutes. The average number of ice cubes per minute being five suggests a higher, more perfect arrangement of number, a manifestation of an ideal.

DIOGENES: Hah! Ideal forms of ice cubes, you say, Plato? Are these ideal forms also melting in your abstract frying pan? What use is an "ideal average" wh