In [1]:
import os
from typing import List, Dict, Tuple
import random
import asyncio
import re
import numpy as np
from openai import AsyncOpenAI
from anthropic import AsyncAnthropic
import google.generativeai as genai
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Load variables from a .env file (python-dotenv) before any key is read.
load_dotenv()

# One shared client/configuration per provider. Each key is looked up under
# its own environment variable; a missing variable is passed through as None.
genai.configure(api_key=os.environ.get("GEMINI_API"))
anthropic_client = AsyncAnthropic(api_key=os.environ.get("NEZ_CLAUDE_API_KEY"))
openai_client = AsyncOpenAI(api_key=os.environ.get("SELF_RAG_OPENAI_API_KEY"))

In [35]:
class ModelDiscussionSystem:
    """Ask three models the same question, let them grade each other, then refine.

    Workflow (each step is a separate public coroutine):

    1. ``get_initial_responses`` - query OpenAI, Claude and Gemini and hand the
       answers back under the anonymous labels ``R1``/``R2``/``R3``.
    2. ``facilitate_discussion`` - send all three answers to all three models
       with a scoring rubric and parse the replies into a winner, a list of
       suggested refinements and a detailed analysis dict.
    3. ``get_final_response`` - ask the winning model for a refined answer.

    The methods rely on the module-level ``openai_client``, ``anthropic_client``
    and ``genai`` objects being configured before they are awaited.
    """

    # Score used for a category when no evaluator produced a parsable value.
    DEFAULT_CATEGORY_SCORE = 70

    # Tried in order against the text that follows a category name on a line.
    # Only the first in-range hit is used, so the "/100" denominator or a
    # second pattern matching the same number is never counted as a score.
    SCORE_PATTERNS = (
        r'(?:score:?\s*|rated:?\s*|:\s*)(\d{1,3})(?:\s*\/100|\s*points|\s*%)?',
        r'(\d{1,3})(?:\s*\/100|\s*points|\s*%)',
        r'(\d{1,3})\s*$',
    )

    def __init__(self):
        # label -> provider name. Re-drawn by get_initial_responses() so that
        # evaluators cannot infer the provider from the label.
        self.model_map = {
            "R1": "OpenAI",
            "R2": "Claude",
            "R3": "Gemini"
        }
        self.reverse_map = {v: k for k, v in self.model_map.items()}

        # Scoring criteria and weights (used for the weighted overall score)
        self.scoring_criteria = {
            "factual_accuracy": 0.35,
            "explanation_quality": 0.25,
            "practical_examples": 0.20,
            "technical_depth": 0.20
        }

        # Minimum score difference required for a clear winner; below this
        # margin the confidence level is reported as "Low".
        self.min_score_difference = 5.0

        # Categories for detailed evaluation (superset of the weighted ones)
        self.evaluation_categories = [
            "factual_accuracy",
            "explanation_quality",
            "practical_examples",
            "technical_depth",
            "clarity",
            "completeness"
        ]

    def _responders(self):
        """Map provider name -> coroutine function that queries that provider.

        Built on every call (rather than cached) so the lookup always reflects
        the methods currently bound on the instance.
        """
        return {
            "OpenAI": self._get_openai_response,
            "Claude": self._get_claude_response,
            "Gemini": self._get_gemini_response,
        }

    async def get_initial_responses(self, prompt: str) -> Dict[str, str]:
        """Get initial responses from all three models under anonymous labels.

        The labels are randomly assigned on every call and ``self.model_map`` /
        ``self.reverse_map`` are updated to match, so that later look-ups
        (printing, ``get_final_response``) resolve each label to the provider
        that actually wrote the answer.

        Args:
            prompt: The user question sent unchanged to every provider.

        Returns:
            ``{label: answer_text}`` ordered by label (R1, R2, R3).
        """
        responders = self._responders()
        model_names = list(responders)
        answers = await asyncio.gather(
            *(responders[name](prompt) for name in model_names)
        )

        # Randomly assign response labels and remember the assignment.
        labels = sorted(self.model_map)
        shuffled = labels[:]
        random.shuffle(shuffled)
        assignment = dict(zip(shuffled, model_names))
        self.model_map = {label: assignment[label] for label in labels}
        self.reverse_map = {model: label for label, model in self.model_map.items()}

        answer_by_model = dict(zip(model_names, answers))
        # Iterating model_map yields labels in sorted order, so the returned
        # dict order does not reveal which provider was queried first.
        return {label: answer_by_model[model] for label, model in self.model_map.items()}

    async def _get_openai_response(self, prompt: str) -> str:
        """Get response from OpenAI's GPT model."""
        response = await openai_client.chat.completions.create(
            model="chatgpt-4o-latest",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

    async def _get_claude_response(self, prompt: str) -> str:
        """Get response from Anthropic's Claude model."""
        # NOTE(review): 1000 output tokens may truncate long evaluation
        # replies (scores for 3 responses + analysis) - confirm it is enough.
        response = await anthropic_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1000,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content[0].text

    async def _get_gemini_response(self, prompt: str) -> str:
        """Get response from Google's Gemini model."""
        model = genai.GenerativeModel('gemini-2.0-pro-exp-02-05')
        response = await model.generate_content_async(prompt)
        return response.text

    def _create_discussion_prompt(self, responses: Dict[str, str]) -> str:
        """Create the evaluation prompt sent to every model.

        Args:
            responses: ``{label: answer_text}`` for the answers to compare.

        Returns:
            A prompt that lists the answers in label order, a per-category
            scoring template for R1/R2/R3, the category weights and the
            qualitative sections that ``_analyze_discussion`` parses.
        """
        # Sort by label so the listing matches the R1/R2/R3 order of the
        # scoring template regardless of the caller's dict order.
        formatted_responses = "\n\n".join([
            f"=== Response {label} ===\n{response}\n"
            for label, response in sorted(responses.items())
        ])

        scoring_template = "\n".join([
            f"- {cat.replace('_', ' ').title()}: [Enter score 0-100]"
            for cat in self.evaluation_categories
        ])

        # ":g" prints 35% instead of 35.0% and hides float noise such as
        # 7.000000000000001 for weights that are not exactly representable.
        weights_desc = "\n".join([
            f"- {cat.replace('_', ' ').title()}: {weight * 100:g}%"
            for cat, weight in self.scoring_criteria.items()
        ])

        return f"""Analyze these three responses and provide detailed numerical scores.

{formatted_responses}

=== DETAILED SCORING ===
You MUST score each response from 0-100 in every category. Be precise and objective.

R1 Evaluation:
{scoring_template}
Overall Score: [Calculate weighted average]

R2 Evaluation:
{scoring_template}
Overall Score: [Calculate weighted average]

R3 Evaluation:
{scoring_template}
Overall Score: [Calculate weighted average]

=== WEIGHTED ANALYSIS ===
Weights for calculation:
{weights_desc}

Calculate and explicitly state:
1. Final weighted score for each response (show calculation)
2. Which response scored highest
3. Score difference between top two responses

=== QUALITATIVE ANALYSIS ===
STRENGTHS of winning response:
1. [First major strength]
2. [Second major strength]
3. [Third major strength]

IMPROVEMENTS NEEDED:
1. [First specific improvement]
2. [Second specific improvement]
3. [Third specific improvement]

=== COMPARATIVE INSIGHTS ===
Explain why the winning response outperformed others in specific areas."""

    @staticmethod
    def _label_regex(labels: List[str]):
        """Compile a regex matching any of ``labels`` (lower-cased) as a whole word."""
        alternatives = "|".join(re.escape(label.lower()) for label in labels)
        return re.compile(r'\b(' + alternatives + r')\b')

    def _first_score(self, text: str):
        """Return the first 0-100 integer found in ``text`` by SCORE_PATTERNS, else None."""
        for pattern in self.SCORE_PATTERNS:
            for raw in re.findall(pattern, text):
                value = int(raw)
                if 0 <= value <= 100:
                    return value
        return None

    def _extract_category_scores(self, review: str, labels: List[str]) -> Dict[Tuple[str, str], int]:
        """Parse one evaluator reply into ``{(label, category): score}``.

        Only '==='-delimited sections mentioning "detailed scoring" or
        "evaluation" are scanned. Inside such a section the text is read line
        by line: a line naming exactly one label (before any category name)
        switches the "current" response, and each later line containing a
        category name contributes the first score that follows that name.
        Each evaluator therefore casts at most one vote per label/category.
        """
        label_re = self._label_regex(labels)
        by_lower = {label.lower(): label for label in labels}
        category_names = {cat: cat.replace('_', ' ').lower() for cat in self.evaluation_categories}

        votes: Dict[Tuple[str, str], int] = {}
        for section in review.lower().split('==='):
            if 'detailed scoring' not in section and 'evaluation' not in section:
                continue

            current = None
            for line in section.split('\n'):
                mentioned = {match.group(1) for match in label_re.finditer(line)}
                if len(mentioned) == 1:
                    label_at = label_re.search(line).start()
                    category_starts = [
                        line.find(name) for name in category_names.values() if name in line
                    ]
                    # A label that only appears after a category name is a
                    # passing remark ("... weaker than R2"), not a heading.
                    if not category_starts or label_at < min(category_starts):
                        current = by_lower[next(iter(mentioned))]

                if current is None:
                    continue

                for category, name in category_names.items():
                    _, hit, tail = line.partition(name)
                    if not hit:
                        continue
                    score = self._first_score(tail)
                    if score is not None:
                        # First occurrence wins for this evaluator.
                        votes.setdefault((current, category), score)
        return votes

    def _extract_bullets(self, review: str, start_marker: str,
                         end_markers: Tuple[str, ...], labels: List[str]) -> List[str]:
        """Collect list items between ``start_marker`` and the nearest end marker.

        Matching is case-insensitive but the returned text keeps the
        evaluator's original casing. The rest of the heading line is ignored
        unless it carries inline content after a colon. Bullets, numbering and
        ``**`` emphasis are stripped; items of 10 characters or fewer and
        items that mention a response label are dropped.
        """
        start = re.search(re.escape(start_marker), review, re.IGNORECASE)
        if start is None:
            return []

        body = review[start.end():]
        cut_points = [
            found.start()
            for found in (re.search(re.escape(marker), body, re.IGNORECASE) for marker in end_markers)
            if found is not None
        ]
        if cut_points:
            body = body[:min(cut_points)]

        heading_rest, _, remainder = body.partition('\n')
        _, colon, inline = heading_rest.partition(':')
        candidate_lines = ([inline] if colon else []) + remainder.split('\n')

        label_re = self._label_regex(labels)
        items = []
        for line in candidate_lines:
            cleaned = re.sub(r'^\s*(?:[-*•#>]+\s*|\d+[.)]\s+)+', '', line)
            cleaned = cleaned.replace('**', '').strip().strip('*-').strip()
            if len(cleaned) > 10 and not label_re.search(cleaned.lower()):
                items.append(cleaned)
        return items

    def _analyze_discussion(self, discussion_responses: List[str]) -> Tuple[str, str, Dict]:
        """Turn the evaluators' replies into a winner, refinements and an analysis.

        Args:
            discussion_responses: Raw reply text from each evaluator. Empty or
                non-string entries are ignored.

        Returns:
            ``(winner_label, refinements_text, analysis)`` where
            ``refinements_text`` is the unique suggested improvements joined by
            newlines and ``analysis`` holds the keys ``winner``,
            ``winner_model``, ``confidence_level``, ``score_difference``,
            ``weighted_scores``, ``category_scores``, ``overall_scores``,
            ``strengths`` and ``refinements``.
        """
        labels = list(self.model_map)
        reviews = [
            text for text in discussion_responses
            if isinstance(text, str) and text.strip()
        ]

        # Gather one vote per evaluator for every (label, category) pair.
        category_scores = {
            label: {cat: [] for cat in self.evaluation_categories}
            for label in labels
        }
        for review in reviews:
            for (label, category), score in self._extract_category_scores(review, labels).items():
                category_scores[label][category].append(score)

        # Average the votes; with three or more votes drop the highest and
        # lowest first. Categories nobody scored get the default value.
        avg_category_scores = {}
        for label in labels:
            avg_category_scores[label] = {}
            for category in self.evaluation_categories:
                votes = sorted(category_scores[label][category])
                if not votes:
                    avg_category_scores[label][category] = self.DEFAULT_CATEGORY_SCORE
                    continue
                if len(votes) > 2:
                    votes = votes[1:-1]
                avg_category_scores[label][category] = sum(votes) / len(votes)

        # Calculate weighted scores
        weighted_scores = {}
        for label in labels:
            total = 0
            total_weight = 0
            for category, weight in self.scoring_criteria.items():
                total += avg_category_scores[label].get(category, self.DEFAULT_CATEGORY_SCORE) * weight
                total_weight += weight
            weighted_scores[label] = (
                total / total_weight if total_weight > 0 else self.DEFAULT_CATEGORY_SCORE
            )

        # Determine winner and margin; sorted() is stable, so ties resolve to
        # the earliest label.
        ranking = sorted(weighted_scores.items(), key=lambda item: item[1], reverse=True)
        winner = ranking[0][0]
        runner_up = ranking[1][0]
        score_difference = weighted_scores[winner] - weighted_scores[runner_up]

        if score_difference >= 15:
            confidence = "Very High"
        elif score_difference >= 10:
            confidence = "High"
        elif score_difference >= self.min_score_difference:
            confidence = "Medium"
        else:
            confidence = "Low"

        # Extract strengths and improvements (original casing preserved)
        strengths = []
        refinements = []
        for review in reviews:
            strengths.extend(
                self._extract_bullets(review, 'strengths', ('improvements', '==='), labels)
            )
            refinements.extend(
                self._extract_bullets(review, 'improvements', ('===', 'comparative insights'), labels)
            )

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # "top 5" is deterministic from run to run.
        unique_strengths = list(dict.fromkeys(strengths))
        unique_refinements = list(dict.fromkeys(refinements))

        analysis_result = {
            "winner": winner,
            "winner_model": self.model_map[winner],
            "confidence_level": confidence,
            "score_difference": score_difference,
            "weighted_scores": weighted_scores,
            "category_scores": avg_category_scores,
            "overall_scores": {
                label: round(weighted_scores[label], 2)
                for label in labels
            },
            "strengths": unique_strengths[:5],  # Top 5 unique strengths
            "refinements": unique_refinements[:5]  # Top 5 unique refinements
        }

        return winner, "\n".join(unique_refinements), analysis_result

    def _create_final_prompt(self, original_prompt: str, refinements: str) -> str:
        """Create prompt for final refined response.

        Args:
            original_prompt: The user's original question.
            refinements: Newline-separated improvement suggestions.
        """
        return f"""Original prompt: {original_prompt}

        Please provide a refined response incorporating these suggested improvements:
        {refinements}

        Ensure your response is:
        1. Comprehensive and detailed
        2. Well-structured and clear
        3. Incorporates all suggested improvements
        4. Balances technical accuracy with accessibility

        Give your best, most refined answer."""

    async def facilitate_discussion(self, initial_responses: Dict[str, str]) -> Tuple[str, str, Dict]:
        """Have every model evaluate the labelled answers and analyse the replies.

        Args:
            initial_responses: ``{label: answer_text}`` as returned by
                ``get_initial_responses``.

        Returns:
            The ``(winner_label, refinements_text, analysis)`` tuple produced
            by ``_analyze_discussion``.
        """
        discussion_prompt = self._create_discussion_prompt(initial_responses)
        discussion_responses = await asyncio.gather(
            *(ask(discussion_prompt) for ask in self._responders().values())
        )
        return self._analyze_discussion(list(discussion_responses))

    async def get_final_response(self, prompt: str, chosen_label: str, refinements: str) -> str:
        """Get final refined response from the chosen model.

        Args:
            prompt: The user's original question.
            chosen_label: Label of the winning answer (a key of ``model_map``).
            refinements: Newline-separated improvement suggestions.

        Raises:
            KeyError: If ``chosen_label`` is not a known label.
        """
        final_prompt = self._create_final_prompt(prompt, refinements)
        chosen_model = self.model_map[chosen_label]

        responders = self._responders()
        # Any provider name other than OpenAI/Claude is routed to Gemini,
        # matching the previous if/elif/else behaviour.
        ask = responders.get(chosen_model, responders["Gemini"])
        return await ask(final_prompt)

In [37]:
async def get_model_discussion(prompt: str):
    """Run the full discussion pipeline for ``prompt`` and print a report.

    Steps: collect the three initial answers, have the models evaluate them,
    print the winner / scores / strengths / suggested improvements, then ask
    the winning model for a refined answer.

    Args:
        prompt: The question to put to every model.

    Returns:
        A dict with the keys ``initial_responses`` (label -> answer text),
        ``analysis`` (the dict produced by the discussion analysis) and
        ``final_response`` (the refined answer text).
    """
    system = ModelDiscussionSystem()

    print("Getting initial responses...")
    initial_responses = await system.get_initial_responses(prompt)
    print("\nInitial Responses:")
    for label, response in initial_responses.items():
        model_name = system.model_map[label]
        print(f"\n{label} ({model_name}):\n{response}\n{'-'*50}")

    print("\nFacilitating discussion...")
    chosen_label, refinements, analysis = await system.facilitate_discussion(initial_responses)

    print("\n" + "="*50)
    print("ANALYSIS RESULTS")
    print("="*50)

    # Winner Information
    # 'winner' holds the response label (e.g. "R1"); the provider name lives
    # under 'winner_model', which is what the report headings should show.
    winning_model = analysis['winner_model']
    print(f"\n🏆 WINNING RESPONSE:")
    print(f"Model: {winning_model} (Response {chosen_label})")
    print(f"Confidence: {analysis['confidence_level']}")
    print(f"Margin of Victory: {analysis['score_difference']:.2f} points")

    # Scores
    print("\n📊 FINAL SCORES:")
    for label, score in analysis['overall_scores'].items():
        model_name = system.model_map[label]
        print(f"{label} ({model_name}): {score:.2f}")

    # Category Breakdown
    print(f"\n📋 CATEGORY BREAKDOWN FOR {winning_model}:")
    winner_scores = analysis['category_scores'][chosen_label]
    for category, score in winner_scores.items():
        print(f"{category.replace('_', ' ').title()}: {score:.2f}")

    # Strengths
    print("\n💪 STRENGTHS:")
    for i, strength in enumerate(analysis['strengths'], 1):
        print(f"{i}. {strength}")

    # Improvements
    print("\n📈 SUGGESTED IMPROVEMENTS:")
    for i, refinement in enumerate(analysis['refinements'], 1):
        print(f"{i}. {refinement}")

    # Final Response
    print(f"\n{'='*50}")
    print(f"FINAL REFINED RESPONSE FROM {winning_model.upper()}")
    print(f"{'='*50}")
    final_response = await system.get_final_response(prompt, chosen_label, refinements)
    print(f"\n{final_response}")

    # Return the complete analysis
    return {
        'initial_responses': initial_responses,
        'analysis': analysis,
        'final_response': final_response
    }

In [38]:
prompt = "Explain the concept of seasons"
await get_model_discussion(prompt)


Getting initial responses...

Initial Responses:

R1 (OpenAI):
Seasons are divisions of the year based on changes in weather, daylight, and ecological conditions. They occur due to the Earth's axial tilt and its orbit around the Sun.

### **Causes of Seasons**
1. **Earth's Tilt** - The Earth's axis is tilted at approximately 23.5 degrees relative to its orbit around the Sun. This tilt is the primary reason for seasonal changes.
2. **Revolution Around the Sun** - As the Earth orbits the Sun, different parts of the planet receive varying amounts of sunlight at different times of the year.

### **Four Seasons (in most temperate regions)**
1. **Spring** (March – May in the Northern Hemisphere, September – November in the Southern Hemisphere)
   - Days begin to get longer.
   - Temperatures gradually rise.
   - Plants start blooming, and animals become more active.

2. **Summer** (June – August in the Northern Hemisphere, December – February in the Southern Hemisphere)
   - Warmest season w

{'initial_responses': {'R1': "Seasons are divisions of the year based on changes in weather, daylight, and ecological conditions. They occur due to the Earth's axial tilt and its orbit around the Sun.\n\n### **Causes of Seasons**\n1. **Earth's Tilt** - The Earth's axis is tilted at approximately 23.5 degrees relative to its orbit around the Sun. This tilt is the primary reason for seasonal changes.\n2. **Revolution Around the Sun** - As the Earth orbits the Sun, different parts of the planet receive varying amounts of sunlight at different times of the year.\n\n### **Four Seasons (in most temperate regions)**\n1. **Spring** (March – May in the Northern Hemisphere, September – November in the Southern Hemisphere)\n   - Days begin to get longer.\n   - Temperatures gradually rise.\n   - Plants start blooming, and animals become more active.\n\n2. **Summer** (June – August in the Northern Hemisphere, December – February in the Southern Hemisphere)\n   - Warmest season with the longest dayl

In [None]:

# initial_responses = result['initial_responses']
# final_response = result['final_response']

# print("\nInitial Responses:", initial_responses)
# print("final response:", final_response)