In [21]:
import os
from typing import List, Dict, Tuple
import random
import asyncio
import re
import numpy as np
from openai import AsyncOpenAI
from anthropic import AsyncAnthropic
import google.generativeai as genai
from dotenv import load_dotenv

In [22]:
load_dotenv()

# Initialize API clients
openai_client = AsyncOpenAI(api_key=os.getenv("SELF_RAG_OPENAI_API_KEY"))
anthropic_client = AsyncAnthropic(api_key=os.getenv("NEZ_CLAUDE_API_KEY"))
genai.configure(api_key=os.getenv("GEMINI_API"))

In [23]:
class ModelDiscussionSystem:
    def __init__(self):
        self.model_map = {
            "R1": "OpenAI",
            "R2": "Claude",
            "R3": "Gemini"
        }
        self.reverse_map = {v: k for k, v in self.model_map.items()}
        
        # Scoring criteria and weights
        self.scoring_criteria = {
            "factual_accuracy": 0.35,
            "explanation_quality": 0.25,
            "practical_examples": 0.20,
            "technical_depth": 0.20
        }
        
        # Minimum score difference required for a clear winner
        self.min_score_difference = 5.0
        
        # Categories for detailed evaluation
        self.evaluation_categories = [
            "factual_accuracy",
            "explanation_quality",
            "practical_examples",
            "technical_depth",
            "clarity",
            "completeness"
        ]
        
        # Score extraction patterns
        self.score_patterns = [
            r'(\d+)\s*(?:points|\/100|%)?',
            r'score:?\s*(\d+)',
            r'rated:?\s*(\d+)',
            r'(\d+)\s*out of\s*100'
        ]
    
    async def get_initial_responses(self, prompt: str) -> Dict[str, str]:
        """Get initial responses from all three models."""
        tasks = [
            self._get_openai_response(prompt),
            self._get_claude_response(prompt),
            self._get_gemini_response(prompt)
        ]
        responses = await asyncio.gather(*tasks)
        
        # Randomly assign response labels
        labels = list(self.reverse_map.values())
        random.shuffle(labels)
        
        return {labels[i]: resp for i, resp in enumerate(responses)}

    async def _get_openai_response(self, prompt: str) -> str:
        """Get response from OpenAI's GPT model."""
        response = await openai_client.chat.completions.create(
            model="chatgpt-4o-latest",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

    async def _get_claude_response(self, prompt: str) -> str:
        """Get response from Anthropic's Claude model."""
        response = await anthropic_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1000,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content[0].text

    async def _get_gemini_response(self, prompt: str) -> str:
        """Get response from Google's Gemini model."""
        model = genai.GenerativeModel('gemini-2.0-pro-exp-02-05')
        response = await model.generate_content_async(prompt)
        return response.text

    def _create_discussion_prompt(self, responses: Dict[str, str]) -> str:
        """Create enhanced prompt for models to discuss the best response."""
        formatted_responses = "\n\n".join([
            f"=== Response {label} ===\n{response}\n"
            for label, response in responses.items()
        ])
        
        criteria_list = "\n".join([
            f"- {c.replace('_', ' ').title()} (Score 1-100)" 
            for c in self.evaluation_categories
        ])
        
        weights_list = "\n".join([
            f"- {c.replace('_', ' ').title()}: {w*100}%"
            for c, w in self.scoring_criteria.items()
        ])
        
        return f"""Analyze these three responses to the same prompt and provide a detailed evaluation. 
        Don't reveal which model you are, and refer to responses only by their labels (R1, R2, R3).

        {formatted_responses}

        Please provide your analysis following EXACTLY this format:

        === DETAILED SCORING ===
        Score each response in every category (1-100):

        R1:
        {criteria_list}
        Overall Score: [Calculate weighted average]

        R2:
        {criteria_list}
        Overall Score: [Calculate weighted average]

        R3:
        {criteria_list}
        Overall Score: [Calculate weighted average]

        === WEIGHTED ANALYSIS ===
        Consider these weights in your evaluation:
        {weights_list}

        === CONSENSUS EVALUATION ===
        TOP RESPONSE: [Write exactly "I choose RX" where X is 1, 2, or 3]
        Score Difference from Runner-up: [Calculate difference]
        Confidence Level: [High/Medium/Low based on score difference]

        === QUALITATIVE ANALYSIS ===
        STRENGTHS of winning response:
        1. [First major strength]
        2. [Second major strength]
        3. [Third major strength]

        IMPROVEMENTS NEEDED:
        1. [First specific improvement]
        2. [Second specific improvement]
        3. [Third specific improvement]

        === COMPARATIVE INSIGHTS ===
        [Explain why the winning response outperformed others]"""

    def _extract_scores(self, text: str, label: str, category: str) -> List[int]:
        """Extract scores using multiple patterns."""
        scores = []
        for pattern in self.score_patterns:
            # Create category-specific pattern
            category_pattern = rf"{label.lower()}.*?{category.replace('_', ' ')}.*?{pattern}"
            matches = re.findall(category_pattern, text.lower())
            for match in matches:
                try:
                    score = int(match)
                    if 0 <= score <= 100:  # Validate score range
                        scores.append(score)
                except ValueError:
                    continue
        return scores

    def _clean_text(self, text: str) -> str:
        """Clean text by removing markdown and extra whitespace."""
        cleaned = text.strip()
        cleaned = re.sub(r'[*#\-_]', '', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned)
        return cleaned.strip()

    def _extract_points(self, text: str, section_type: str) -> List[str]:
        """Extract and clean points from a section."""
        points = []
        if section_type.lower() in text.lower():
            section = text.lower().split(section_type.lower())[1]
            section = section.split("===")[0] if "===" in section else section
            
            # Split by numbers or bullet points
            raw_points = re.split(r'\d+\.|•|-|\*', section)
            
            for point in raw_points:
                cleaned = self._clean_text(point)
                if (len(cleaned) > 10 and  # Minimum length
                    not any(x in cleaned.lower() for x in ["r1", "r2", "r3", "response"]) and
                    not cleaned.startswith("[")):  # Remove template placeholders
                    points.append(cleaned)
        
        return list(set(points))  # Remove duplicates

    def _analyze_discussion(self, discussion_responses: List[str]) -> Tuple[str, str, Dict]:
        """Enhanced analysis of discussion responses with multiple evaluation methods."""
        # Initialize data structures
        category_scores = {label: {cat: [] for cat in self.evaluation_categories} 
                         for label in ["R1", "R2", "R3"]}
        all_strengths = []
        all_refinements = []
        
        # Process each response
        for response in discussion_responses:
            # Extract scores for each label and category
            for label in ["R1", "R2", "R3"]:
                for category in self.evaluation_categories:
                    scores = self._extract_scores(response, label, category)
                    category_scores[label][category].extend(scores)
            
            # Extract strengths and refinements
            all_strengths.extend(self._extract_points(response, "STRENGTHS"))
            all_refinements.extend(self._extract_points(response, "IMPROVEMENTS"))
        
        # Calculate normalized scores with outlier removal
        normalized_scores = {}
        for label in ["R1", "R2", "R3"]:
            normalized_scores[label] = {}
            for category in self.evaluation_categories:
                scores = category_scores[label][category]
                if scores:
                    # Remove outliers using IQR method
                    q1 = np.percentile(scores, 25)
                    q3 = np.percentile(scores, 75)
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr
                    filtered_scores = [s for s in scores if lower_bound <= s <= upper_bound]
                    
                    if filtered_scores:
                        normalized_scores[label][category] = np.mean(filtered_scores)
                    else:
                        normalized_scores[label][category] = 70  # Default score
                else:
                    normalized_scores[label][category] = 70  # Default score
        
        # Calculate weighted scores
        weighted_scores = {}
        for label in ["R1", "R2", "R3"]:
            score = 0
            total_weight = 0
            for category, weight in self.scoring_criteria.items():
                if category in normalized_scores[label]:
                    score += normalized_scores[label][category] * weight
                    total_weight += weight
            weighted_scores[label] = score / total_weight if total_weight > 0 else 70
        
        # Determine winner and confidence
        sorted_scores = sorted(weighted_scores.items(), key=lambda x: x[1], reverse=True)
        winner = sorted_scores[0][0]
        runner_up = sorted_scores[1][0]
        score_difference = weighted_scores[winner] - weighted_scores[runner_up]
        
        # Determine confidence level with more granularity
        if score_difference >= 15:
            confidence = "Very High"
        elif score_difference >= 10:
            confidence = "High"
        elif score_difference >= 5:
            confidence = "Medium"
        else:
            confidence = "Low"
        
        analysis_result = {
            "winner": winner,
            "winner_model": self.model_map[winner],
            "confidence_level": confidence,
            "score_difference": score_difference,
            "weighted_scores": weighted_scores,
            "category_scores": normalized_scores,
            "overall_scores": {
                label: round(weighted_scores[label], 2) 
                for label in ["R1", "R2", "R3"]
            },
            "strengths": all_strengths[:5],  # Top 5 strengths
            "refinements": all_refinements[:5]  # Top 5 refinements
        }
        
        return winner, "\n".join(all_refinements), analysis_result

    def _create_final_prompt(self, original_prompt: str, refinements: str) -> str:
        """Create prompt for final refined response."""
        return f"""Original prompt: {original_prompt}

        Please provide a refined response incorporating ALL these improvements:
        {refinements}

        Ensure your response is:
        1. Comprehensive and detailed
        2. Well-structured and clear
        3. Incorporates all the suggested improvements
        4. Maintains technical accuracy while being accessible

        Give your best, most refined answer:"""

    async def facilitate_discussion(self, initial_responses: Dict[str, str]) -> Tuple[str, str, Dict]:
        """Facilitate enhanced discussion between models."""
        discussion_prompt = self._create_discussion_prompt(initial_responses)
        
        tasks = [
            self._get_openai_response(discussion_prompt),
            self._get_claude_response(discussion_prompt),
            self._get_gemini_response(discussion_prompt)
        ]
        discussion_responses = await asyncio.gather(*tasks)
        
        return self._analyze_discussion(discussion_responses)

    async def get_final_response(self, prompt: str, chosen_label: str, refinements: str) -> str:
        """Get final refined response from the chosen model."""
        final_prompt = self._create_final_prompt(prompt, refinements)
        chosen_model = self.model_map[chosen_label]
        
        if chosen_model == "OpenAI":
            return await self._get_openai_response(final_prompt)
        elif chosen_model == "Claude":
            return await self._get_claude_response(final_prompt)
        else:  # Gemini
            return await self._get_gemini_response(final_prompt)

In [24]:
async def get_model_discussion(prompt: str):
    """Enhanced utility function to run the discussion process."""
    system = ModelDiscussionSystem()
    
    print("Getting initial responses...")
    initial_responses = await system.get_initial_responses(prompt)
    print("\nInitial Responses:")
    for label, response in initial_responses.items():
        model_name = system.model_map[label]
        print(f"\n{label} ({model_name}):\n{response}\n{'-'*50}")
    
    print("\nFacilitating discussion...")
    chosen_label, refinements, analysis = await system.facilitate_discussion(initial_responses)
    
    print("\n" + "="*50)
    print("ANALYSIS RESULTS")
    print("="*50)
    
    # Winner Information
    winning_model = analysis['winner_model']
    print(f"\n🏆 WINNING RESPONSE:")
    print(f"Model: {winning_model} (Response {chosen_label})")
    print(f"Confidence: {analysis['confidence_level']}")
    print(f"Margin of Victory: {analysis['score_difference']:.2f} points")
    
    # Scores
    print("\n📊 FINAL SCORES:")
    for label, score in analysis['overall_scores'].items():
        model_name = system.model_map[label]
        print(f"{label} ({model_name}): {score:.2f}")
    
    # Category Breakdown
    print(f"\n📋 CATEGORY BREAKDOWN FOR {winning_model}:")
    winner_scores = analysis['category_scores'][chosen_label]
    for category, score in winner_scores.items():
        print(f"{category.replace('_', ' ').title()}: {score:.2f}")
    
    # Strengths
    print("\n💪 STRENGTHS:")
    for i, strength in enumerate(analysis['strengths'], 1):
        print(f"{i}. {strength}")
    
    # Improvements
    print("\n📈 SUGGESTED IMPROVEMENTS:")
    for i, refinement in enumerate(analysis['refinements'], 1):
        print(f"{i}. {refinement}")
    
    # Final Response
    print(f"\n{'='*50}")
    print(f"FINAL REFINED RESPONSE FROM {winning_model.upper()}")
    print(f"{'='*50}")
    final_response = await system.get_final_response(prompt, chosen_label, refinements)
    print(f"\n{final_response}")
    
    # Return the complete analysis
    return {
        'initial_responses': initial_responses,
        'analysis': analysis,
        'final_response': final_response
    }

In [25]:
prompt = "Explain the concept of seasons"
result = await get_model_discussion(prompt)
result

Getting initial responses...

Initial Responses:

R3 (Gemini):
The concept of **seasons** refers to the four distinct periods of the year—**spring, summer, autumn (fall), and winter**—that are characterized by changes in temperature, weather patterns, and daylight hours. Seasons occur due to the **tilt of Earth's axis (approximately 23.5 degrees) and its orbit around the Sun**.

### **Key Factors Behind Seasons:**
1. **Earth's Tilt**: The Earth's axis is tilted relative to its orbit around the Sun. This means that different parts of the planet receive varying amounts of sunlight throughout the year.
2. **Revolution Around the Sun**: As Earth moves around the Sun in a year-long orbit, the Sun appears to shift in the sky, causing changes in temperatures and daylight hours.
3. **Sunlight Intensity & Daylight Duration**: When a hemisphere is tilted toward the Sun, it experiences longer days and more direct sunlight, resulting in **warmer temperatures (summer)**. When it is tilted away, day

{'initial_responses': {'R3': "The concept of **seasons** refers to the four distinct periods of the year—**spring, summer, autumn (fall), and winter**—that are characterized by changes in temperature, weather patterns, and daylight hours. Seasons occur due to the **tilt of Earth's axis (approximately 23.5 degrees) and its orbit around the Sun**.\n\n### **Key Factors Behind Seasons:**\n1. **Earth's Tilt**: The Earth's axis is tilted relative to its orbit around the Sun. This means that different parts of the planet receive varying amounts of sunlight throughout the year.\n2. **Revolution Around the Sun**: As Earth moves around the Sun in a year-long orbit, the Sun appears to shift in the sky, causing changes in temperatures and daylight hours.\n3. **Sunlight Intensity & Daylight Duration**: When a hemisphere is tilted toward the Sun, it experiences longer days and more direct sunlight, resulting in **warmer temperatures (summer)**. When it is tilted away, days are shorter, and the sunli

In [19]:

# initial_responses = result['initial_responses']
# final_response = result['final_response']

# print("\nInitial Responses:", initial_responses)
# print("final response:", final_response)