In [None]:
import json
import time
from openai import OpenAI
import re
from typing import List, Dict
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

# Module-level locks shared by the evaluation worker threads.
# print_lock is held by evaluate_single_question while it advances the
# progress bar and prints a question's output, so that output from
# different threads is not interleaved.
print_lock = Lock()
# results_lock is held by evaluate_model while it appends a finished result
# to its results list.
results_lock = Lock()

def create_client():
    """Build an OpenAI client that talks to the local proxy endpoint.

    Connection settings can be overridden through environment variables so
    that credentials do not have to be edited into the source:

    * ``EVAL_OPENAI_API_KEY``  - value passed as ``api_key``
    * ``EVAL_OPENAI_BASE_URL`` - value passed as ``base_url``
    * ``EVAL_PROXY_KEY``       - value sent in the ``x-proxy-key`` header

    When a variable is unset the previous hard-coded value is used, so
    existing setups keep working unchanged.

    Returns:
        A new ``OpenAI`` client instance.
    """
    # Local import keeps this block self-contained within the notebook cell.
    import os

    # NOTE(review): the fallback values below are the ones that were
    # previously hard-coded; the proxy key looks like a credential and
    # should be moved out of the source and rotated if it is real.
    api_key = os.environ.get("EVAL_OPENAI_API_KEY", "abcd")
    base_url = os.environ.get(
        "EVAL_OPENAI_BASE_URL", "http://localhost:3060/auto/v1/"
    )
    proxy_key = os.environ.get(
        "EVAL_PROXY_KEY", "3870443e-1f3e-4e5d-a1fe-4e945828cdc6"
    )

    return OpenAI(
        api_key=api_key,
        base_url=base_url,
        default_headers={"x-proxy-key": proxy_key},
    )

def extract_answer(response: str) -> str:
    """Extract the final answer (A-F) from the model's response.

    Two patterns are tried in order, both case-insensitively:

    1. An explicit ``Final Answer: X`` marker. Markdown emphasis, brackets
       or whitespace between the colon and the letter are tolerated
       (``**Final Answer:** B``, ``Final Answer: **B**``, ``Final Answer: (B)``).
    2. A stand-alone letter at the very end of the response.

    In both cases the letter must be a stand-alone token: it may not be the
    first letter of a longer word ("Final Answer: Apple") nor the last
    letter of one ("... is blue", "I'd").

    Args:
        response: Raw text returned by the model. ``None`` or an empty
            string is tolerated.

    Returns:
        The upper-cased answer letter, or ``"INVALID"`` when no answer could
        be found.
    """
    # The API may return no content at all; treat that as "no answer"
    # instead of letting re.search raise TypeError.
    if not response:
        return "INVALID"

    # Explicit marker; the negative lookahead rejects words such as "Apple".
    match = re.search(
        r"Final Answer:[\s*_(\[]*([A-F])(?![A-Za-z])", response, re.IGNORECASE
    )
    if match:
        return match.group(1).upper()

    # Fallback: a trailing letter that is not the tail of a word or of a
    # contraction (the lookbehind rejects "blue" and "I'd").
    match = re.search(r"(?<![A-Za-z'])([A-F])\s*$", response, re.IGNORECASE)
    if match:
        return match.group(1).upper()

    return "INVALID"

def evaluate_single_question(args):
    """Thread-pool worker: ask the model one question and grade the reply.

    Args:
        args: Tuple ``(client, question, model, pbar, total_questions)``.
            ``question`` is a dict read for its ``"prompt"``, ``"answer"``
            and ``"question_id"`` keys; ``pbar`` is advanced by one whether
            the request succeeds or fails. ``total_questions`` is unpacked
            but not used.

    Returns:
        A result dict with the keys ``question_id``, ``correct_answer``,
        ``model_answer``, ``correct``, ``full_response`` and ``prompt``.
        On any exception ``model_answer`` is ``"ERROR"``, ``correct`` is
        ``False`` and ``full_response`` holds the exception text.
    """
    client, question, model, pbar, total_questions = args

    def build_record(model_answer, correct, full_response):
        # One place for the record layout so the success and error paths
        # always produce the same keys in the same order.
        return {
            "question_id": question["question_id"],
            "correct_answer": question["answer"],
            "model_answer": model_answer,
            "correct": correct,
            "full_response": full_response,
            "prompt": question["prompt"],
        }

    try:
        chat_messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": question["prompt"]},
        ]
        completion = client.chat.completions.create(
            model=model,
            messages=chat_messages,
        )

        reply_text = completion.choices[0].message.content
        predicted = extract_answer(reply_text)
        is_correct = predicted == question["answer"]
        record = build_record(predicted, is_correct, reply_text)

        # Hold the lock for the whole report so that output from different
        # worker threads does not interleave.
        divider = "-" * 50
        with print_lock:
            pbar.update(1)
            print(f"\nQuestion {question['question_id']}:")
            print("\nChain of Thought:")
            print(divider)
            print(reply_text)
            print(divider)
            print(f"Expected: {question['answer']}")
            print(f"Got: {predicted}")
            print(f"Correct: {'✓' if is_correct else '✗'}")
            print("=" * 80)

        return record

    except Exception as e:
        # Worker boundary: report the failure and hand back an ERROR record
        # rather than letting one bad request abort the whole run.
        with print_lock:
            pbar.update(1)
            print(f"\nError processing question {question['question_id']}: {str(e)}")

        return build_record("ERROR", False, str(e))

def save_results(results: List[Dict], model: str, output_dir: str = "evaluation_results"):
    """Save results in multiple formats under a timestamped directory.

    The following files are written to ``<output_dir>/<YYYYmmdd_HHMMSS>/``:

    * ``detailed_results.json`` - model name, timestamp and all results
    * ``results.csv`` - one row per result
    * ``chain_of_thought/question_<id>.txt`` - one transcript per result

    All text files are written as UTF-8 so that model responses containing
    non-ASCII characters do not fail on platforms whose default encoding is
    not UTF-8.

    Args:
        results: Result dicts; each must contain ``question_id``,
            ``prompt``, ``full_response``, ``correct_answer``,
            ``model_answer`` and ``correct``.
        model: Model identifier recorded in the JSON file.
        output_dir: Parent directory for the timestamped run directory.

    Returns:
        The ``Path`` of the run directory that was written.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_path = Path(output_dir) / timestamp
    base_path.mkdir(parents=True, exist_ok=True)

    detailed_results = {
        "model": model,
        "timestamp": timestamp,
        "results": results,
    }
    with open(base_path / "detailed_results.json", 'w', encoding="utf-8") as f:
        json.dump(detailed_results, f, indent=2)

    pd.DataFrame(results).to_csv(base_path / "results.csv", index=False)

    cot_dir = base_path / "chain_of_thought"
    cot_dir.mkdir(exist_ok=True)

    for result in results:
        transcript = (
            f"Question ID: {result['question_id']}\n"
            f"Prompt:\n{result['prompt']}\n\n"
            f"Model Response:\n{result['full_response']}\n\n"
            f"Expected Answer: {result['correct_answer']}\n"
            f"Model Answer: {result['model_answer']}\n"
            f"Correct: {result['correct']}\n"
        )
        transcript_path = cot_dir / f"question_{result['question_id']}.txt"
        transcript_path.write_text(transcript, encoding="utf-8")

    return base_path

def evaluate_model(json_file: str, model: str = "openai@gpt-4o", num_threads: int = 10) -> pd.DataFrame:
    """Evaluate the model on all questions in the JSON file using multiple threads.

    Questions are read from the ``"eval_data"`` list of ``json_file``, sent
    to ``evaluate_single_question`` through a thread pool, sorted by
    ``question_id``, written to disk with ``save_results`` and summarised
    on stdout.

    Args:
        json_file: Path to a JSON file with an ``"eval_data"`` list.
        model: Model identifier passed to the chat completion call.
        num_threads: Number of worker threads (and clients); must be >= 1.

    Returns:
        A DataFrame with one row per question result.

    Raises:
        ValueError: If ``num_threads`` is smaller than 1.
    """
    if num_threads < 1:
        raise ValueError("num_threads must be at least 1")

    # Load questions
    with open(json_file, 'r', encoding="utf-8") as f:
        data = json.load(f)

    questions = data["eval_data"]
    total_questions = len(questions)
    results = []

    # One client per worker; questions are assigned round-robin.
    clients = [create_client() for _ in range(num_threads)]

    pbar = tqdm(total=total_questions, desc=f"Evaluating {model}")
    try:
        thread_args = [
            (clients[i % num_threads], question, model, pbar, total_questions)
            for i, question in enumerate(questions)
        ]

        # Process questions in parallel
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [
                executor.submit(evaluate_single_question, args)
                for args in thread_args
            ]

            # Collect results as they complete
            for future in as_completed(futures):
                result = future.result()
                with results_lock:
                    results.append(result)
    finally:
        # Always release the progress bar, even if a worker raised.
        pbar.close()

    # Sort results by question_id to maintain order
    results.sort(key=lambda x: x["question_id"])

    # Save results
    save_results(results, model)

    # Calculate and display final metrics. An empty question list yields a
    # DataFrame without a 'correct' column, so guard against that case
    # (and against dividing by zero).
    df = pd.DataFrame(results)
    if total_questions:
        correct_answers = df['correct'].sum()
        accuracy = correct_answers / total_questions
    else:
        correct_answers = 0
        accuracy = 0.0

    print(f"\nFinal Results for {model}")
    print(f"Total Questions: {total_questions}")
    print(f"Correct Answers: {correct_answers}")
    print(f"Accuracy: {accuracy:.2%}")

    return df

# Example usage
if __name__ == "__main__":
    # Other model routes that can be swapped in for `model`:
    #   "openai@gpt-4o-mini", "openai@o1-mini", "gemini@gemini-2.0-flash-exp"
    # Raise `num_threads` (e.g. 2 or 10) to send requests in parallel.
    results_df = evaluate_model(
        "simple_bench_public.json",
        model="anthropic@claude-3-5-sonnet-latest",
        num_threads=1,
    )



Question 4:

Chain of Thought:
--------------------------------------------------
Let me solve this step by step:

1) We know one sister always speaks mistruths and the other always lies, but we don't know which is which.

2) Let's evaluate each option:
- Option E and F are irrelevant to finding the correct path
- Option B won't help find the path
- Option D is not logically helpful

3) Let's analyze Option C "What path leads to the treasure?"
- If you ask the truth-teller, she'll tell the truth
- If you ask the liar, she'll lie
- Since we don't know which is which, this won't help

4) Let's analyze Option A "What would your sister say if I asked her which path leads to the treasure?"
- If you ask the truth-teller: She would truthfully tell you what her lying sister would say (which would be the wrong path)
- If you ask the liar: She would lie about what her truth-telling sister would say (so she would also indicate the wrong path)

5) Key insight: No matter which sister you ask Optio

In [None]:
from dataclasses import dataclass
from typing import Any, List, Dict, Optional, Set
from enum import Enum
import networkx as nx
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import json
import pandas as pd
import re

class StepStatus(Enum):
    """Lifecycle states of a ModelStep while a chain is being executed."""
    # Default state of a newly created step.
    PENDING = "pending"
    # Set by ModelChain.execute just before the step's model call.
    RUNNING = "running"
    # Set once the step's response has been stored.
    COMPLETED = "completed"
    # Set when an exception is raised while running the step.
    FAILED = "failed"

import logging
import time
from datetime import datetime
from pathlib import Path

class ChainLogger:
    """Handles logging and metrics for model chain execution.

    Each instance creates ``<log_dir>/<run_id>/`` (``run_id`` is a
    second-resolution timestamp) and writes into it:

    * ``detailed.log`` - the log records (also echoed to the console)
    * ``responses_<question_id>.jsonl`` - one JSON line per logged response
    * ``metrics.json`` - written by :meth:`save_metrics`

    The metric counters are updated under an internal lock because a single
    logger is shared by the worker threads that execute the chain.
    """
    def __init__(self, log_dir: str = "chain_logs"):
        """Create the run directory, attach log handlers and reset metrics.

        Args:
            log_dir: Parent directory for run directories; created (with
                any missing parents) if it does not exist.
        """
        # Local import: the cell defining this class does not import threading.
        import threading

        self.log_dir = Path(log_dir)

        # Create run-specific directory (parents=True also creates log_dir)
        self.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.run_dir = self.log_dir / self.run_id
        self.run_dir.mkdir(parents=True, exist_ok=True)

        # Setup logging
        self.logger = logging.getLogger(f"chain_run_{self.run_id}")
        self.logger.setLevel(logging.INFO)

        # logging.getLogger returns the same object for the same name, so a
        # second ChainLogger created within the same second would otherwise
        # attach a second pair of handlers and duplicate every record.
        if not self.logger.handlers:
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

            # File handler for detailed logs
            fh = logging.FileHandler(self.run_dir / "detailed.log", encoding="utf-8")
            fh.setLevel(logging.INFO)
            fh.setFormatter(formatter)

            # Console handler for important updates
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            ch.setFormatter(formatter)

            self.logger.addHandler(fh)
            self.logger.addHandler(ch)

        # Guards the read-modify-write updates of self.metrics.
        self._lock = threading.Lock()

        # Metrics tracking
        self.metrics = {
            "step_times": {},
            "success_rates": {},
            "total_questions": 0,
            "completed_questions": 0,
            "errors": [],
        }

    def log_step_start(self, step_name: str, question_id: str):
        """Log the start of a step execution.

        Returns:
            The wall-clock start time (``time.time()``) to pass to
            :meth:`log_step_end`.
        """
        self.logger.info(f"Starting step {step_name} for question {question_id}")
        with self._lock:
            self.metrics["step_times"].setdefault(step_name, [])
        return time.time()

    def log_step_end(self, step_name: str, question_id: str, start_time: float, success: bool):
        """Log the completion of a step and record its duration and outcome.

        Args:
            step_name: Name of the step that finished.
            question_id: Identifier of the question being processed.
            start_time: Value returned by :meth:`log_step_start`.
            success: Whether the step completed without error.
        """
        duration = time.time() - start_time

        with self._lock:
            # setdefault keeps this safe even if no start was logged.
            self.metrics["step_times"].setdefault(step_name, []).append(duration)

            counters = self.metrics["success_rates"].setdefault(
                step_name, {"success": 0, "total": 0}
            )
            counters["total"] += 1
            if success:
                counters["success"] += 1

        self.logger.info(
            f"Completed step {step_name} for question {question_id} "
            f"(duration: {duration:.2f}s, success: {success})"
        )

    def log_error(self, step_name: str, question_id: str, error: Exception):
        """Log an error during execution and add it to the metrics."""
        self.logger.error(f"Error in step {step_name} for question {question_id}: {str(error)}")
        entry = {
            "step": step_name,
            "question": question_id,
            "error": str(error),
            "timestamp": datetime.now().isoformat()
        }
        with self._lock:
            self.metrics["errors"].append(entry)

    def log_response(self, step_name: str, question_id: str, response: Dict):
        """Append a model response as one JSON line to the question's file."""
        response_file = self.run_dir / f"responses_{question_id}.jsonl"
        with open(response_file, "a", encoding="utf-8") as f:
            f.write(json.dumps({
                "step": step_name,
                "question": question_id,
                "response": response,
                "timestamp": datetime.now().isoformat()
            }) + "\n")

    def save_metrics(self):
        """Write ``metrics.json`` and return the summary.

        Returns:
            A dict with ``average_step_times``, ``success_rates``,
            ``total_questions``, ``completed_questions`` and ``error_count``.
        """
        metrics_file = self.run_dir / "metrics.json"

        # Hold the lock so the summary and the raw dump describe the same state.
        with self._lock:
            # Calculate averages and success rates
            summary = {
                "average_step_times": {
                    step: sum(times)/len(times) if times else 0
                    for step, times in self.metrics["step_times"].items()
                },
                "success_rates": {
                    step: (data["success"] / data["total"] if data["total"] > 0 else 0)
                    for step, data in self.metrics["success_rates"].items()
                },
                "total_questions": self.metrics["total_questions"],
                "completed_questions": self.metrics["completed_questions"],
                "error_count": len(self.metrics["errors"])
            }

            with open(metrics_file, "w", encoding="utf-8") as f:
                json.dump({
                    "raw_metrics": self.metrics,
                    "summary": summary
                }, f, indent=2)

        return summary

    def close(self):
        """Detach and close the log handlers, releasing ``detailed.log``."""
        for handler in list(self.logger.handlers):
            self.logger.removeHandler(handler)
            handler.close()

@dataclass
class ModelStep:
    """Combined model step and result tracking.

    Describes one model call in a chain (which model, which system prompt,
    which other steps it depends on) and carries the status and response
    recorded for it during execution.
    """
    # Unique step name; used as the key in ModelChain.steps and in depends_on.
    name: str
    # Model identifier passed to the chat completion call.
    model_name: str
    # System message sent with every request of this step.
    system_prompt: str = "You are a helpful assistant."
    # Names of the steps whose responses this step needs; None means "none".
    depends_on: Optional[Set[str]] = None
    # Execution state, updated by ModelChain.execute.
    status: StepStatus = StepStatus.PENDING
    # Response dict stored for this step once it has completed.
    response: Optional[Dict] = None

    def __post_init__(self):
        # A mutable default cannot be declared directly on a dataclass
        # field, so a missing (or empty) value is replaced with a fresh set.
        if not self.depends_on:
            self.depends_on = set()

class ModelChain:
    """Model chain with integrated scheduling and logging.

    Steps are ordered topologically by their ``depends_on`` sets. A single
    chain may be executed for many questions concurrently, so the state of
    one execution (step statuses and responses) is kept local to each
    :meth:`execute` call instead of on the shared ``ModelStep`` objects.
    """
    def __init__(self, steps: List[ModelStep], max_concurrent: int = 5):
        """Index the steps, compute their execution order and create a logger.

        Args:
            steps: Steps of the chain; names must be unique.
            max_concurrent: Stored on the instance; not used by this class.

        Raises:
            ValueError: On an unknown dependency or a dependency cycle.
        """
        self.steps = {step.name: step for step in steps}
        self.max_concurrent = max_concurrent
        self.execution_order = self._compute_execution_order()
        self.logger = ChainLogger()

    def execute(self, client: Any, question: Dict, pbar: Optional[tqdm] = None) -> List[Dict]:
        """Execute the model chain for one question with logging.

        Args:
            client: Chat client passed through to ``evaluate_single_step``.
            question: Dict with at least ``"question_id"`` and ``"prompt"``.
            pbar: Optional progress bar; advanced once per step, whether the
                step completed, failed or was skipped, so that a bar sized
                ``steps * questions`` can reach its total.

        Returns:
            The response dicts of the steps that completed, in execution
            order. Steps that failed or were skipped contribute nothing.
        """
        question_id = question["question_id"]

        # Per-call state. Keeping it local prevents concurrent executions
        # for different questions from seeing each other's statuses and
        # responses.
        statuses: Dict[str, StepStatus] = {
            name: StepStatus.PENDING for name in self.execution_order
        }
        responses: Dict[str, Dict] = {}
        results = []

        for step_name in self.execution_order:
            step = self.steps[step_name]

            if not self._can_execute(step_name, statuses):
                self.logger.logger.warning(
                    f"Skipping step {step_name} for question {question_id}: "
                    f"a dependency did not complete"
                )
                if pbar is not None:
                    pbar.update(1)
                continue

            start_time = self.logger.log_step_start(step_name, question_id)

            try:
                # step.status / step.response are still written for
                # backward compatibility with code that inspects them, but
                # scheduling relies only on the local dicts above.
                statuses[step_name] = step.status = StepStatus.RUNNING
                dep_responses = self._get_dependency_responses(step_name, responses)

                response = evaluate_single_step(
                    client,
                    step,
                    question["prompt"],
                    dep_responses
                )

                responses[step_name] = step.response = response
                statuses[step_name] = step.status = StepStatus.COMPLETED
                results.append(response)

                self.logger.log_step_end(step_name, question_id, start_time, True)
                self.logger.log_response(step_name, question_id, response)

            except Exception as e:
                # Best effort: record the failure and continue so that
                # independent steps can still run.
                statuses[step_name] = step.status = StepStatus.FAILED
                self.logger.log_error(step_name, question_id, e)
                self.logger.log_step_end(step_name, question_id, start_time, False)

            finally:
                # "is not None" rather than truthiness: a tqdm bar with a
                # total of 0 evaluates as False.
                if pbar is not None:
                    pbar.update(1)

        return results

    def _compute_execution_order(self) -> List[str]:
        """Compute a topological sort of the steps.

        Raises:
            ValueError: If a step depends on an unknown step or the
                dependencies contain a cycle.
        """
        graph = nx.DiGraph()
        for step in self.steps.values():
            graph.add_node(step.name)
            for dep in step.depends_on:
                if dep not in self.steps:
                    raise ValueError(f"Unknown dependency: {dep}")
                graph.add_edge(dep, step.name)

        if not nx.is_directed_acyclic_graph(graph):
            raise ValueError("Dependency cycle detected")

        return list(nx.topological_sort(graph))

    def _can_execute(self, step_name: str,
                     statuses: Optional[Dict[str, StepStatus]] = None) -> bool:
        """Check whether all dependencies of a step are completed.

        Args:
            step_name: Name of the step to check.
            statuses: Per-execution status map. When omitted, the statuses
                stored on the ``ModelStep`` objects are used.
        """
        step = self.steps[step_name]
        if statuses is None:
            return all(
                self.steps[dep].status == StepStatus.COMPLETED
                for dep in step.depends_on
            )
        return all(
            statuses.get(dep) == StepStatus.COMPLETED
            for dep in step.depends_on
        )

    def _get_dependency_responses(self, step_name: str,
                                  responses: Optional[Dict[str, Dict]] = None) -> List[Dict]:
        """Get the responses of a step's completed dependencies.

        The responses are returned in execution order. Iterating the
        ``depends_on`` set directly would make the order, and therefore the
        prompt built from it, vary between runs.

        Args:
            step_name: Name of the step whose dependencies are looked up.
            responses: Per-execution response map. When omitted, the
                responses stored on the ``ModelStep`` objects are used.
        """
        wanted = self.steps[step_name].depends_on
        ordered_deps = [name for name in self.execution_order if name in wanted]
        if responses is None:
            candidates = [self.steps[dep].response for dep in ordered_deps]
        else:
            candidates = [responses.get(dep) for dep in ordered_deps]
        return [resp for resp in candidates if resp]
    
def evaluate_model_chain(json_file: str, model_chain: ModelChain, num_threads: int = 10) -> pd.DataFrame:
    """Evaluate using parallel execution of questions with comprehensive logging.

    Every question in the ``"eval_data"`` list of ``json_file`` is run
    through ``model_chain.execute`` on a thread pool. The answer extracted
    from the last returned step response is compared with the question's
    expected answer.

    Args:
        json_file: Path to a JSON file with an ``"eval_data"`` list.
        model_chain: Chain to execute for every question.
        num_threads: Number of worker threads (and clients); must be >= 1.

    Returns:
        A DataFrame with one row per question.

    Raises:
        ValueError: If ``num_threads`` is smaller than 1.
    """
    if num_threads < 1:
        raise ValueError("num_threads must be at least 1")

    with open(json_file, 'r', encoding="utf-8") as f:
        questions = json.load(f)["eval_data"]

    model_chain.logger.metrics["total_questions"] = len(questions)
    total_steps = len(model_chain.steps) * len(questions)

    results = []
    clients = [create_client() for _ in range(num_threads)]

    pbar = tqdm(total=total_steps, desc="Evaluating")
    try:
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [
                executor.submit(model_chain.execute, clients[i % num_threads], q, pbar)
                for i, q in enumerate(questions)
            ]

            # Futures are consumed in submission order so each one can be
            # paired with its question.
            for future, question in zip(futures, questions):
                try:
                    step_responses = future.result()
                    # NOTE(review): this is the last step that *completed*;
                    # if the terminal step failed or was skipped, the answer
                    # comes from an earlier step. An empty list raises
                    # IndexError and is recorded as ERROR below.
                    final_answer = step_responses[-1]["extracted_answer"]
                    correct = final_answer == question["answer"]

                    result = {
                        "question_id": question["question_id"],
                        "correct_answer": question["answer"],
                        "model_answer": final_answer,
                        "correct": correct,
                        "step_responses": step_responses,
                        "prompt": question["prompt"]
                    }

                    results.append(result)
                    model_chain.logger.metrics["completed_questions"] += 1

                except Exception as e:
                    model_chain.logger.log_error("evaluation", question["question_id"], e)
                    results.append({
                        "question_id": question["question_id"],
                        "correct_answer": question["answer"],
                        "model_answer": "ERROR",
                        "correct": False,
                        "step_responses": [],
                        "prompt": question["prompt"]
                    })
    finally:
        # Always release the progress bar, even if the pool raised.
        pbar.close()

    df = pd.DataFrame(results)

    # With no questions the DataFrame has no 'correct' column.
    accuracy = df['correct'].mean() if not df.empty else 0.0

    # Save and display final metrics
    summary = model_chain.logger.save_metrics()
    print("\nEvaluation Summary:")
    print(f"Total Questions: {summary['total_questions']}")
    print(f"Completed Questions: {summary['completed_questions']}")
    print(f"Accuracy: {accuracy:.2%}")
    print("\nAverage Step Times:")
    # The loop variable is not called "time" so the time module is not shadowed.
    for step, avg_seconds in summary['average_step_times'].items():
        print(f"  {step}: {avg_seconds:.2f}s")
    print("\nSuccess Rates:")
    for step, rate in summary['success_rates'].items():
        print(f"  {step}: {rate:.2%}")

    return df


def evaluate_single_step(client: Any, step: ModelStep, prompt: str, 
                        previous_responses: Optional[List[Dict]] = None) -> Dict:
    """Run one chain step: send the prompt to the step's model and parse the reply.

    Args:
        client: Object exposing ``chat.completions.create``.
        step: Step supplying ``model_name`` and ``system_prompt``.
        prompt: Question text for the user message.
        previous_responses: Response dicts of earlier steps (keys ``model``,
            ``response``, ``extracted_answer``). When non-empty they are
            appended to the prompt as numbered context.

    Returns:
        A dict with ``model``, ``response`` (raw reply text) and
        ``extracted_answer`` (result of ``extract_answer`` on that text).
    """
    if previous_responses:
        # Collect the context pieces and join once instead of growing a
        # string inside the loop.
        pieces = ["\n\nPrevious model responses:\n"]
        for idx, resp in enumerate(previous_responses, 1):
            pieces.append(f"\nModel {idx} ({resp['model']}):\n{resp['response']}\n")
            pieces.append(f"Model {idx}'s answer: {resp['extracted_answer']}\n")
        prompt = prompt + "".join(pieces)

    chat_messages = [
        {"role": "system", "content": step.system_prompt},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=step.model_name,
        messages=chat_messages,
    )

    reply_text = completion.choices[0].message.content
    return {
        "model": step.model_name,
        "response": reply_text,
        "extracted_answer": extract_answer(reply_text),
    }


# Base solver steps: each answers the question independently and uses the
# same system prompt. The list order is kept as gpt4, gpt4mini, claude.
SOLVER_SYSTEM_PROMPT = "You are an expert at solving multiple choice questions."

base_models = [
    ModelStep(name=step_name, model_name=model_route, system_prompt=SOLVER_SYSTEM_PROMPT)
    for step_name, model_route in (
        ("gpt4", "openai@gpt-4o"),
        ("gpt4mini", "openai@gpt-4o-mini"),
        ("claude", "anthropic@claude-3-5-sonnet-latest"),
    )
]

# The reviewer only sees the gpt4 and claude answers.
reviewer = ModelStep(
    name="reviewer",
    model_name="openai@gpt-4o",
    system_prompt="Review the responses from GPT-4 and Claude and provide your analysis.",
    depends_on={"gpt4", "claude"},
)

# The aggregator sees every solver answer plus the review and gives the
# final answer of the chain.
aggregator = ModelStep(
    name="aggregator",
    model_name="openai@gpt-4o",
    system_prompt="Analyze all previous responses and provide the final answer.",
    depends_on={"gpt4", "gpt4mini", "claude", "reviewer"},
)

# Assemble the chain (solvers first, then reviewer and aggregator) and run it
# on the benchmark file.
chain_steps = base_models + [reviewer, aggregator]
model_chain = ModelChain(chain_steps)
results_df = evaluate_model_chain("simple_bench_public.json", model_chain)

