In [21]:
import os
from typing import List, Dict, Tuple
import random
import asyncio
import re
import numpy as np
from openai import AsyncOpenAI
from anthropic import AsyncAnthropic
import google.generativeai as genai
from dotenv import load_dotenv

In [15]:
# Load key/value pairs from a local .env file into the process environment.
load_dotenv()

# Module-level async clients shared by every ModelDiscussionSystem instance.
# Each provider's key is looked up under its own environment variable name;
# a missing variable yields None and is passed through unchanged.
openai_client = AsyncOpenAI(
    api_key=os.environ.get("SELF_RAG_OPENAI_API_KEY"),
)
anthropic_client = AsyncAnthropic(
    api_key=os.environ.get("NEZ_CLAUDE_API_KEY"),
)
genai.configure(
    api_key=os.environ.get("GEMINI_API"),
)

In [None]:
class ModelDiscussionSystem:
    """Run a blind peer review between OpenAI, Claude and Gemini.

    Workflow:
      1. ``get_initial_responses`` asks all three models the same prompt and
         hides their identity behind shuffled labels (R1/R2/R3).
      2. ``facilitate_discussion`` asks every model to score the labelled
         answers, then ``_analyze_discussion`` parses those evaluations and
         picks a winner.
      3. ``get_final_response`` asks the model behind the winning label for a
         refined answer.

    The API calls go through the module-level ``openai_client``,
    ``anthropic_client`` and ``genai`` objects.
    """

    # Regex fragment capturing an integer or decimal score.
    _NUMBER = r"(\d+(?:\.\d+)?)"

    def __init__(self):
        # Label -> provider. This is only the default; get_initial_responses
        # overwrites it with the assignment produced by its shuffle.
        self.model_map = {
            "R1": "OpenAI",
            "R2": "Claude",
            "R3": "Gemini"
        }
        self.reverse_map = {v: k for k, v in self.model_map.items()}

        # Scoring criteria and weights
        self.scoring_criteria = {
            "factual_accuracy": 0.35,
            "explanation_quality": 0.25,
            "practical_examples": 0.20,
            "technical_depth": 0.20
        }

        # Minimum score difference required for a clear winner
        self.min_score_difference = 5.0

        # Categories for detailed evaluation
        self.evaluation_categories = [
            "factual_accuracy",
            "explanation_quality",
            "practical_examples",
            "technical_depth",
            "clarity",
            "completeness"
        ]

    async def get_initial_responses(self, prompt: str) -> Dict[str, str]:
        """Query all three models concurrently and label the answers at random.

        Args:
            prompt: The user prompt sent unchanged to every model.

        Returns:
            Mapping of label ("R1"/"R2"/"R3") to that model's answer text.

        Side effects:
            Rewrites ``self.model_map`` and ``self.reverse_map`` so that they
            describe the shuffled assignment. Without this, a label would be
            resolved to a different model than the one that wrote the answer.
        """
        # Provider order must match the order of the coroutines below.
        providers = ["OpenAI", "Claude", "Gemini"]
        responses = await asyncio.gather(
            self._get_openai_response(prompt),
            self._get_claude_response(prompt),
            self._get_gemini_response(prompt),
        )

        # Randomly assign response labels
        labels = list(self.model_map)
        random.shuffle(labels)

        # Remember which model is hidden behind which label.
        self.model_map = dict(zip(labels, providers))
        self.reverse_map = {model: label for label, model in self.model_map.items()}

        return dict(zip(labels, responses))

    async def _get_openai_response(self, prompt: str) -> str:
        """Get response from OpenAI's GPT model."""
        response = await openai_client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

    async def _get_claude_response(self, prompt: str) -> str:
        """Get response from Anthropic's Claude model."""
        response = await anthropic_client.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=1000,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content[0].text

    async def _get_gemini_response(self, prompt: str) -> str:
        """Get response from Google's Gemini model."""
        model = genai.GenerativeModel('gemini-pro')
        response = await model.generate_content_async(prompt)
        return response.text

    def _create_discussion_prompt(self, responses: Dict[str, str]) -> str:
        """Build the evaluation prompt sent to every model.

        Args:
            responses: Mapping of label to answer text, as returned by
                ``get_initial_responses``.

        Returns:
            The prompt text: the labelled answers followed by the required
            answer format. The weight list is rendered from
            ``self.scoring_criteria`` so that the prompt and the local
            weighting in ``_analyze_discussion`` cannot drift apart.
        """
        formatted_responses = "\n\n".join([
            f"=== Response {label} ===\n{response}\n"
            for label, response in responses.items()
        ])

        criteria_list = "\n".join([f"- {c.replace('_', ' ').title()}" for c in self.evaluation_categories])

        # Joined with the template's own indentation so the rendered text is
        # identical to the previously hard-coded list for the default weights.
        weight_lines = "\n        ".join(
            f"- {name.replace('_', ' ').title()}: {weight:.0%}"
            for name, weight in self.scoring_criteria.items()
        )

        return f"""Analyze these three responses to the same prompt and provide a detailed evaluation. 
        Don't reveal which model you are, and refer to responses only by their labels (R1, R2, R3).

        {formatted_responses}

        Please provide your analysis in exactly this format:

        === DETAILED SCORING ===
        Score each response in every category (1-100):

        R1:
        {criteria_list}
        Overall Score: [Calculate weighted average]

        R2:
        {criteria_list}
        Overall Score: [Calculate weighted average]

        R3:
        {criteria_list}
        Overall Score: [Calculate weighted average]

        === WEIGHTED ANALYSIS ===
        Consider these weights in your final evaluation:
        {weight_lines}

        === CONSENSUS EVALUATION ===
        TOP RESPONSE: [State which response scored highest]
        Score Difference from Runner-up: [Calculate difference]
        Confidence Level: [High/Medium/Low based on score difference]

        === QUALITATIVE ANALYSIS ===
        STRENGTHS of winning response:
        1. [First strength]
        2. [Second strength]
        3. [Third strength]

        IMPROVEMENTS NEEDED:
        1. [First improvement]
        2. [Second improvement]
        3. [Third improvement]

        === COMPARATIVE INSIGHTS ===
        Briefly explain why the winning response outperformed others in specific areas."""

    @staticmethod
    def _split_label_sections(text: str, labels: List[str]) -> Dict[str, str]:
        """Split an evaluation into the per-label blocks that follow "R1:" style header lines.

        A header is a line holding only a label (optionally prefixed with
        "Response" and surrounded by punctuation/markdown). A block runs until
        the next header or the next "===" marker. Blocks belonging to the same
        label are concatenated in order of appearance.
        """
        canonical = {label.lower(): label for label in labels}
        alternatives = "|".join(re.escape(label) for label in labels)
        header = re.compile(
            rf"^[^\w\n]*(?:response\s+)?({alternatives})\b[^\w\n]*$",
            re.IGNORECASE | re.MULTILINE,
        )
        boundary = re.compile(r"^[^\w\n]*===", re.MULTILINE)

        headers = list(header.finditer(text))
        sections: Dict[str, str] = {}
        for index, match in enumerate(headers):
            end = headers[index + 1].start() if index + 1 < len(headers) else len(text)
            marker = boundary.search(text, match.end(), end)
            if marker:
                end = marker.start()
            label = canonical[match.group(1).lower()]
            sections[label] = sections.get(label, "") + text[match.end():end] + "\n"
        return sections

    def _find_score(self, text: str, sections: Dict[str, str], label: str, field: str):
        """Return the first number following ``field`` for ``label``, or None.

        The label's own block (see ``_split_label_sections``) is searched
        first, which handles the multi-line layout requested by the prompt.
        If that fails, a single-line form such as "R1 factual accuracy: 80"
        is accepted anywhere in ``text``.
        """
        field_pattern = r"\s+".join(re.escape(word) for word in field.split())
        value_pattern = rf"{field_pattern}[^\d\n]*{self._NUMBER}"
        attempts = [
            (value_pattern, sections.get(label, "")),
            (rf"\b{re.escape(label)}\b[^\n]*?{value_pattern}", text),
        ]
        for pattern, haystack in attempts:
            match = re.search(pattern, haystack, re.IGNORECASE)
            if match:
                return float(match.group(1))
        return None

    @staticmethod
    def _extract_items(text: str, heading: str, stop: str) -> List[str]:
        """Return the list items that follow a heading such as "STRENGTHS ...:".

        Args:
            text: One model's evaluation, in its original casing.
            heading: Regex for the heading keyword(s).
            stop: Regex alternatives that end the list.

        Returns:
            The non-empty lines after the heading, with leading bullet or
            numbering markers removed. Lines without any letter or digit
            (e.g. stray markdown) are dropped.
        """
        flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
        match = None
        # Prefer a real heading line; fall back to the first mention anywhere.
        for prefix in (r"^[^\w\n]*", r""):
            match = re.search(rf"{prefix}{heading}\b[^:\n]*:?(.*?)(?={stop}|\Z)", text, flags)
            if match:
                break
        if not match:
            return []

        items = []
        for line in match.group(1).splitlines():
            item = re.sub(r"^(?:[-*\u2022]|\d+[.)])\s+", "", line.strip()).strip()
            if re.search(r"[^\W_]", item):
                items.append(item)
        return items

    def _analyze_discussion(self, discussion_responses: List[str]) -> Tuple[str, str, Dict]:
        """Parse the models' evaluations and pick the winning response.

        Args:
            discussion_responses: Raw evaluation texts, one per evaluator.

        Returns:
            ``(winner, refinements, analysis)`` where ``winner`` is the
            winning label, ``refinements`` is the newline-joined list of
            suggested improvements and ``analysis`` is a dict with the keys
            ``winner``, ``winner_model``, ``confidence_level``,
            ``score_difference``, ``weighted_scores``, ``category_scores``,
            ``overall_scores``, ``strengths`` and ``refinements``.

        Notes:
            * Scores are averaged over all evaluators that reported them.
            * The weighted score is normalised over the criteria that were
              actually parsed, so a missing criterion does not count as 0.
              If no criterion was parsed for a label, its average reported
              overall score is used instead.
            * With nothing parseable every score is 0 and the first label
              wins with "Low" confidence.
        """
        labels = sorted(self.model_map)

        # Initialize data structures
        category_scores = {label: {cat: [] for cat in self.evaluation_categories}
                           for label in labels}
        overall_scores = {label: [] for label in labels}
        refinements = []
        strengths = []

        for response in discussion_responses:
            if not response:
                continue
            sections = self._split_label_sections(response, labels)

            for label in labels:
                # Extract category scores
                for category in self.evaluation_categories:
                    score = self._find_score(response, sections, label, category.replace('_', ' '))
                    if score is not None:
                        category_scores[label][category].append(score)

                # Extract overall scores
                overall = self._find_score(response, sections, label, "overall score")
                if overall is not None:
                    overall_scores[label].append(overall)

            # Extract improvements and strengths from the original-case text
            strengths.extend(
                self._extract_items(response, r"strengths", r"^[^\w\n]*improvements|===")
            )
            refinements.extend(
                self._extract_items(response, r"improvements\s+needed", r"===")
            )

        # Calculate average scores
        avg_category_scores = {
            label: {
                cat: sum(scores) / len(scores) if scores else 0
                for cat, scores in categories.items()
            }
            for label, categories in category_scores.items()
        }

        avg_overall_scores = {
            label: sum(scores) / len(scores) if scores else 0
            for label, scores in overall_scores.items()
        }

        # Calculate weighted scores
        weighted_scores = {}
        for label in labels:
            parsed = [cat for cat in self.scoring_criteria if category_scores[label][cat]]
            total_weight = sum(self.scoring_criteria[cat] for cat in parsed)
            if total_weight > 0:
                weighted_scores[label] = sum(
                    avg_category_scores[label][cat] * self.scoring_criteria[cat]
                    for cat in parsed
                ) / total_weight
            else:
                weighted_scores[label] = avg_overall_scores[label]

        # Determine winner (stable sort: ties keep label order)
        sorted_scores = sorted(weighted_scores.items(), key=lambda x: x[1], reverse=True)
        winner = sorted_scores[0][0]
        runner_up = sorted_scores[1][0]
        score_difference = weighted_scores[winner] - weighted_scores[runner_up]

        # Determine confidence level
        if score_difference >= self.min_score_difference * 2:
            confidence = "High"
        elif score_difference >= self.min_score_difference:
            confidence = "Medium"
        else:
            confidence = "Low"

        # De-duplicate while keeping first-seen order (a set would shuffle it).
        unique_strengths = list(dict.fromkeys(strengths))
        unique_refinements = list(dict.fromkeys(refinements))

        # Prepare analysis result
        analysis_result = {
            "winner": winner,
            "winner_model": self.model_map[winner],
            "confidence_level": confidence,
            "score_difference": score_difference,
            "weighted_scores": weighted_scores,
            "category_scores": avg_category_scores,
            "overall_scores": avg_overall_scores,
            "strengths": unique_strengths,
            "refinements": unique_refinements
        }

        return winner, "\n".join(unique_refinements), analysis_result

    async def get_final_response(self, prompt: str, chosen_label: str, refinements: str) -> str:
        """Ask the model behind ``chosen_label`` for a refined answer.

        Args:
            prompt: The original user prompt.
            chosen_label: Winning label; resolved through ``self.model_map``.
            refinements: Suggested improvements to incorporate.

        Returns:
            The refined answer text.

        Raises:
            KeyError: If ``chosen_label`` is not a key of ``self.model_map``.
        """
        final_prompt = self._create_final_prompt(prompt, refinements)

        # Map the response label back to the actual model
        chosen_model = self.model_map[chosen_label]

        if chosen_model == "OpenAI":
            return await self._get_openai_response(final_prompt)
        elif chosen_model == "Claude":
            return await self._get_claude_response(final_prompt)
        else:  # Gemini
            return await self._get_gemini_response(final_prompt)

    def _create_final_prompt(self, original_prompt: str, refinements: str) -> str:
        """Create prompt for final refined response."""
        return f"""Original prompt: {original_prompt}

        Please provide a refined response incorporating these suggested improvements:
        {refinements}

        Give your best, most comprehensive answer incorporating these refinements."""

    async def facilitate_discussion(self, initial_responses: Dict[str, str]) -> Tuple[str, str, Dict]:
        """Have all three models evaluate the labelled answers and analyze the result.

        Args:
            initial_responses: Mapping of label to answer text.

        Returns:
            The ``(winner, refinements, analysis)`` tuple produced by
            ``_analyze_discussion``.
        """
        discussion_prompt = self._create_discussion_prompt(initial_responses)

        tasks = [
            self._get_openai_response(discussion_prompt),
            self._get_claude_response(discussion_prompt),
            self._get_gemini_response(discussion_prompt)
        ]
        discussion_responses = await asyncio.gather(*tasks)

        return self._analyze_discussion(discussion_responses)

In [17]:
async def get_model_discussion(prompt: str):
    """Run the full discussion pipeline for one prompt and print a report.

    Steps: collect the three initial answers, let the models evaluate them,
    print the analysis, then request and print a refined answer from the
    model behind the winning label.

    Args:
        prompt: The question sent to every model.

    Returns:
        Dict with the keys ``initial_responses`` (label -> answer text),
        ``analysis`` (the dict produced by the discussion analysis) and
        ``final_response`` (the refined answer text).
    """
    system = ModelDiscussionSystem()

    print("Getting initial responses...")
    initial_responses = await system.get_initial_responses(prompt)
    print("\nInitial Responses:")
    for label, response in initial_responses.items():
        model_name = system.model_map[label]
        print(f"\n{label} ({model_name}):\n{response}\n{'-'*50}")

    print("\nFacilitating discussion...")
    chosen_label, refinements, analysis = await system.facilitate_discussion(initial_responses)

    print("\n" + "="*50)
    print("ANALYSIS RESULTS")
    print("="*50)

    # Winner Information
    # The analysis dict is not guaranteed to carry a 'winner_model' entry, so
    # resolve the model name from the winning label when it is absent instead
    # of failing with a KeyError.
    winning_model = analysis.get('winner_model') or system.model_map[chosen_label]
    print(f"\n🏆 WINNING RESPONSE:")
    print(f"Model: {winning_model} (Response {chosen_label})")
    print(f"Confidence: {analysis['confidence_level']}")
    print(f"Margin of Victory: {analysis['score_difference']:.2f} points")

    # Scores
    print("\n📊 FINAL SCORES:")
    for label, score in analysis['overall_scores'].items():
        model_name = system.model_map[label]
        print(f"{label} ({model_name}): {score:.2f}")

    # Category Breakdown
    print(f"\n📋 CATEGORY BREAKDOWN FOR {winning_model}:")
    winner_scores = analysis['category_scores'][chosen_label]
    for category, score in winner_scores.items():
        print(f"{category.replace('_', ' ').title()}: {score:.2f}")

    # Strengths
    print("\n💪 STRENGTHS:")
    for i, strength in enumerate(analysis['strengths'], 1):
        print(f"{i}. {strength}")

    # Improvements
    print("\n📈 SUGGESTED IMPROVEMENTS:")
    for i, refinement in enumerate(analysis['refinements'], 1):
        print(f"{i}. {refinement}")

    # Final Response
    print(f"\n{'='*50}")
    print(f"FINAL REFINED RESPONSE FROM {winning_model.upper()}")
    print(f"{'='*50}")
    final_response = await system.get_final_response(prompt, chosen_label, refinements)
    print(f"\n{final_response}")

    # Return the complete analysis
    return {
        'initial_responses': initial_responses,
        'analysis': analysis,
        'final_response': final_response
    }

In [18]:
# Demo run of the whole pipeline for a single prompt.
# NOTE: the bare top-level `await` relies on the notebook kernel; a plain
# Python script would have to wrap this call (e.g. with asyncio.run).
prompt = "Explain the concept of seasons"
result = await get_model_discussion(prompt)
# Display the returned dict (initial responses, analysis, final response) as cell output.
result

Getting initial responses...

Initial Responses:

R3:
Seasons are periods of the year that are distinguished by special climate conditions. These seasons—spring, summer, autumn, and winter—occur because of the Earth's changing position relative to the sun. The Earth's axis is tilted at an angle of 23.5 degrees, which means the Earth always points to one side as it goes around the Sun. So, as the Earth orbits the Sun, different parts of the world receive different amounts of direct sunlight.

When it's winter in one hemisphere, it's summer in the other. This is because during winter, the sun's rays hit that part of Earth at a shallow angle, causing the sunlight to spread out over a larger area and the temperatures to be cooler. During summer, the sun's rays hit that part of Earth at a steeper angle, focusing the rays on a smaller area and increasing temperatures. 

In between, spring and autumn mark the transitions where days and nights are of roughly equal length. The climate and the e

{'initial_responses': {'R3': "Seasons are periods of the year that are distinguished by special climate conditions. These seasons—spring, summer, autumn, and winter—occur because of the Earth's changing position relative to the sun. The Earth's axis is tilted at an angle of 23.5 degrees, which means the Earth always points to one side as it goes around the Sun. So, as the Earth orbits the Sun, different parts of the world receive different amounts of direct sunlight.\n\nWhen it's winter in one hemisphere, it's summer in the other. This is because during winter, the sun's rays hit that part of Earth at a shallow angle, causing the sunlight to spread out over a larger area and the temperatures to be cooler. During summer, the sun's rays hit that part of Earth at a steeper angle, focusing the rays on a smaller area and increasing temperatures. \n\nIn between, spring and autumn mark the transitions where days and nights are of roughly equal length. The climate and the ecosystems respond to

In [19]:

# initial_responses = result['initial_responses']
# final_response = result['final_response']

# print("\nInitial Responses:", initial_responses)
# print("final response:", final_response)