# Comment Evaluation (OpenRouter)

This notebook loads generated comments and evaluates them using OpenRouter models with two perspective frames:
- **Incoming**: "Someone else's reply to MY post" 
- **Outgoing**: "My reply to someone else's post"

This tests for sycophancy bias - whether models rate comments differently based on framing.

## Prerequisites
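1. OpenRouter API key available in the environment (e.g. via `../.env`) as `AI_FOUNDRY_API_KEY`; the endpoint can optionally be overridden with `AI_FOUNDRY_API`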
2. Generated comments from the generation notebook
3. Evaluation prompts in `prompts/evaluation/`

## Output
Evaluation scores for each (post, comment) pair in both frames.

In [None]:
# Imports and configuration
from pathlib import Path
import os, json, time, random
import datetime as dt
from typing import List, Dict, Any, Optional
import pandas as pd

# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv(Path('../.env'))

# OpenAI client for OpenRouter
from openai import OpenAI

# Paths (relative to the notebook's working directory, resolved to absolute)
POSTS_PATH = Path('../data/raw/posts.jsonl').resolve()
COMMENTS_PATH = Path('../data/intermediate/comments_raw.jsonl').resolve()

# OpenRouter model identifiers to evaluate.
# Earlier model sets were assigned here and immediately overwritten (dead
# code); they are kept only as a reference:
#   "mistralai/mistral-medium", "x-ai/grok-2", "meta-llama/llama-3.1-70b-instruct"
MODELS = ["x-ai/grok-4-fast", "deepseek/deepseek-r1"]

# Directory holding the evaluation prompt templates
PROMPTS_DIR = Path('../prompts/evaluation').resolve()

# API configuration
TEMPERATURE = 0.0  # Low temperature for consistent scoring
MAX_TOKENS = 1000
SEED = 42
# Seed the module-level RNG so sampling/shuffling is reproducible across runs
random.seed(SEED)

print(f"Posts path: {POSTS_PATH}")
print(f"Comments path: {COMMENTS_PATH}")
print(f"Models to evaluate: {MODELS}")
print("✅ Environment loaded from .env file")

In [None]:
# Verify OpenRouter credentials from .env
# Report the real state instead of unconditionally claiming success: the key
# is read from AI_FOUNDRY_API_KEY and the endpoint may be overridden through
# AI_FOUNDRY_API (same variables the client initialization reads).
api_key_is_set = bool(os.getenv('AI_FOUNDRY_API_KEY'))  # Keep same env var internally
if api_key_is_set:
    print("✅ OpenRouter credentials loaded from .env file")
else:
    print("⚠️ AI_FOUNDRY_API_KEY is not set - OpenRouter requests will fail")
print(f"Endpoint: {os.getenv('AI_FOUNDRY_API', 'https://openrouter.ai/api/v1')}")
print(f"API Key: {'Set' if api_key_is_set else 'Not set'}")
print(f"Available models: {', '.join(MODELS)}")

In [None]:
# Load data
def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSON Lines file and return one parsed value per non-blank line.

    Lines that are empty or whitespace-only are skipped. The file is read as
    UTF-8.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    records = []
    with path.open('r', encoding='utf-8') as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue  # ignore blank separator lines
            records.append(json.loads(raw_line))
    return records

# Read the raw posts and the generated comments from disk
posts = load_jsonl(POSTS_PATH)
comments = load_jsonl(COMMENTS_PATH)

# Create lookup for posts
posts_by_id = {p['post_id']: p for p in posts}

print(f"Loaded {len(posts)} posts")
print(f"Loaded {len(comments)} comments")
# Both breakdowns are computed over the comments, so both are labelled as such
# (the stance line was previously mislabelled "Posts by stance").
print(f"Comments by stance: {pd.Series([c['stance'] for c in comments]).value_counts().to_dict()}")
print(f"Comments by constructiveness: {pd.Series([c['constructiveness'] for c in comments]).value_counts().to_dict()}")

In [None]:
# Read the two frame-specific prompt templates from the prompts directory
incoming_template = PROMPTS_DIR.joinpath('incoming_v1.txt').read_text(encoding='utf-8')
outgoing_template = PROMPTS_DIR.joinpath('outgoing_v1.txt').read_text(encoding='utf-8')

# Show the first 200 characters of each template as a sanity check
print("Incoming template preview:")
print(f"{incoming_template[:200]}...")
print("\nOutgoing template preview:")
print(f"{outgoing_template[:200]}...")

In [None]:
# Build the OpenAI-compatible client pointed at OpenRouter.
# The endpoint falls back to the public OpenRouter URL when AI_FOUNDRY_API is
# unset; the key is read from AI_FOUNDRY_API_KEY (same env var used internally).
client = OpenAI(
    base_url=os.environ.get('AI_FOUNDRY_API', 'https://openrouter.ai/api/v1'),
    api_key=os.environ.get('AI_FOUNDRY_API_KEY'),
)

def call_openrouter(prompt: str, model: str) -> Optional[str]:
    """Send a single-turn chat completion request to OpenRouter.

    Anti-caching measures applied to every request:
      * no seed is passed to the API;
      * the temperature is jittered by up to ±0.05 around ``TEMPERATURE``
        (clamped at 0.0);
      * a unique "Request ID" system message is prepended.

    Args:
        prompt: Text sent as the user message.
        model: OpenRouter model identifier.

    Returns:
        The stripped message content of the first choice, or ``None`` when the
        request fails or the model returns no text content. Failures are
        printed rather than raised so a long evaluation run keeps going.
    """
    # Small temperature variation (±0.05) around base temperature
    actual_temp = max(0.0, TEMPERATURE + random.uniform(-0.05, 0.05))

    # Unique request ID to prevent caching
    request_id = f"req_{int(time.time() * 1000)}_{random.randint(1000, 9999)}"

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"Request ID: {request_id}"},  # Unique system message
                {"role": "user", "content": prompt},
            ],
            temperature=actual_temp,
            max_tokens=MAX_TOKENS,
            extra_headers={
                "HTTP-Referer": "https://github.com/AIRealist/SycophantBench",
                "X-Title": "Sycophant Benchmark Research",
            },
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller record a
        # failed evaluation instead of aborting the whole run.
        print(f"API call failed for model {model}: {e}")
        return None

    if content is None:
        # The request succeeded but carried no text; report that accurately
        # instead of letting `.strip()` fail and be logged as an API error.
        print(f"Model {model} returned no message content")
        return None
    return content.strip()

# Smoke-test the connection using the first configured model
test_model = MODELS[0]
test_response = call_openrouter(
    prompt="Hello, this is a test. Reply with 'Connection successful.'",
    model=test_model,
)
print(f"Test response from {test_model}: {test_response}")

In [None]:
# Build evaluation prompts with anti-caching measures
def build_evaluation_prompt(post_text: str, comment_text: str, frame: str) -> str:
    """Build the evaluation prompt for one (post, comment) pair.

    Args:
        post_text: Text substituted for ``{{post_text}}`` (stripped first).
        comment_text: Text substituted for ``{{comment_text}}`` (stripped first).
        frame: ``'incoming'`` or ``'outgoing'``; selects the template.

    Returns:
        The filled-in template followed by an "[Evaluation timestamp: ...]"
        note, appended so that no two prompts are byte-identical (anti-caching).

    Raises:
        ValueError: if ``frame`` is neither ``'incoming'`` nor ``'outgoing'``.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    if frame == 'incoming':
        template = incoming_template
    elif frame == 'outgoing':
        template = outgoing_template
    else:
        raise ValueError(f"Unknown frame: {frame}")

    # Fill both placeholders in a single pass over the template. Chaining two
    # str.replace calls would also rewrite a literal "{{comment_text}}" that
    # happens to occur inside the post text.
    substitutions = {
        '{{post_text}}': post_text.strip(),
        '{{comment_text}}': comment_text.strip(),
    }
    prompt = re.sub(
        r'\{\{(?:post_text|comment_text)\}\}',
        lambda match: substitutions[match.group(0)],
        template,
    )

    # Anti-caching: unique timestamp suffix. datetime.utcnow() is deprecated;
    # take an aware UTC time and drop tzinfo so the rendered format is unchanged.
    now_utc = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
    timestamp_note = f"\n\n[Evaluation timestamp: {now_utc.isoformat()}]"

    return prompt + timestamp_note

def parse_evaluation_response(response: Optional[str]) -> Optional[Dict[str, Any]]:
    """Extract the score object from a model's evaluation response.

    Accepts bare JSON, JSON wrapped in a markdown code fence (with or without
    a ``json`` tag, with or without the closing fence), and JSON embedded in
    surrounding prose (the span from the first ``{`` to the last ``}``).

    Args:
        response: Raw model output; ``None`` or empty yields ``None``.

    Returns:
        The parsed dict when it is a JSON object containing all of
        ``helpfulness``, ``civility``, ``specificity`` and ``stance_alignment``;
        otherwise ``None`` (the reason is printed).
    """
    if not response:
        return None

    text = response.strip()

    # Remove a markdown fence only where it is actually present, so a missing
    # closing fence no longer chops the last characters off the JSON.
    if text.startswith('```'):
        text = text[3:]
        if text.startswith('json'):
            text = text[4:]
        if text.endswith('```'):
            text = text[:-3]
        text = text.strip()

    # Try the whole text first, then fall back to the outermost {...} span to
    # tolerate explanations placed before or after the JSON.
    candidates = [text]
    start, end = text.find('{'), text.rfind('}')
    if 0 <= start < end and (start, end) != (0, len(text) - 1):
        candidates.append(text[start:end + 1])

    data = None
    last_error = None
    for candidate in candidates:
        try:
            data = json.loads(candidate)
            break
        except json.JSONDecodeError as e:
            last_error = e
    else:
        print(f"Failed to parse JSON: {last_error}")
        print(f"Response was: {text}")
        return None

    # A valid JSON scalar or array is not a score object; reject it here
    # rather than failing on the membership checks below.
    if not isinstance(data, dict):
        print(f"Expected a JSON object in response, got: {data!r}")
        return None

    # Validate required fields
    required_fields = ['helpfulness', 'civility', 'specificity', 'stance_alignment']
    if not all(field in data for field in required_fields):
        print(f"Missing fields in response: {data}")
        return None

    return data

# Sanity-check prompt construction on a truncated first post and a fixed comment
sample_post = f"{posts[0]['text'][:100]}..."
sample_comment = "This is a great point about AI ethics."

test_prompt = build_evaluation_prompt(
    post_text=sample_post,
    comment_text=sample_comment,
    frame='incoming',
)
print("Sample evaluation prompt:")
print(f"{test_prompt[:300]}...")

In [None]:
# Main evaluation loop with anti-caching measures - FOR ALL MODELS
def evaluate_comments_all_models(
    comments: List[Dict],
    posts_by_id: Dict,
    sample_size: Optional[int] = None,
) -> Dict[str, List[Dict[str, Any]]]:
    """Evaluate comments in both frames with every model in ``MODELS``.

    For each model, each selected comment is scored once in the ``incoming``
    frame and once in the ``outgoing`` frame (frame order chosen at random per
    comment). After each model finishes, its results are written to
    ``../data/generated/eval_scores_<model>.jsonl`` so that completed models
    survive an interruption.

    Args:
        comments: Comment records; each must provide ``comment_id``,
            ``post_id``, ``text``, ``stance`` and ``constructiveness``.
            The list is not modified.
        posts_by_id: Mapping from ``post_id`` to a post record with ``text``.
            Comments whose post is missing are skipped with a warning.
        sample_size: When truthy and smaller than ``len(comments)``, evaluate
            a random sample of that size; otherwise evaluate all comments.

    Returns:
        Mapping from model identifier to its list of result records. Records
        are produced even when the API call or parsing failed (``scores`` is
        ``None`` and ``parsed_successfully`` is ``False`` in that case).
    """
    # Sample comments if requested
    if sample_size and sample_size < len(comments):
        comments_to_eval = random.sample(comments, sample_size)
        print(f"Sampling {sample_size} comments out of {len(comments)}")
    else:
        # Work on a copy: the shuffle below must not reorder the caller's list.
        comments_to_eval = list(comments)
        print(f"Evaluating all {len(comments)} comments")

    # ANTI-CACHING: Randomize comment order
    random.shuffle(comments_to_eval)
    print("✅ Randomized comment evaluation order")

    # ANTI-CACHING: Randomize frame order for each comment
    frame_orders = [['incoming', 'outgoing'], ['outgoing', 'incoming']]

    # Store results for all models
    all_results = {}

    # Run evaluation for each model
    for model_idx, model in enumerate(MODELS):
        print(f"\n{'='*60}")
        print(f"EVALUATING WITH MODEL: {model} ({model_idx + 1}/{len(MODELS)})")
        print(f"{'='*60}")

        results = []
        start_time = time.time()
        total_calls = len(comments_to_eval) * 2  # Two frames per comment

        for i, comment in enumerate(comments_to_eval):
            post = posts_by_id.get(comment['post_id'])
            if not post:
                print(f"Warning: Post {comment['post_id']} not found")
                continue

            # ANTI-CACHING: Randomize frame evaluation order for each comment
            frames = random.choice(frame_orders)

            # Evaluate in both frames (randomized order)
            for frame in frames:
                prompt = build_evaluation_prompt(post['text'], comment['text'], frame)
                response_text = call_openrouter(prompt, model)
                scores = parse_evaluation_response(response_text)

                # datetime.utcnow() is deprecated; take an aware UTC time and
                # drop tzinfo so the stored timestamp format is unchanged.
                timestamp = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None).isoformat()

                result = {
                    'eval_id': f"{comment['comment_id']}__{frame}__{model}",
                    'comment_id': comment['comment_id'],
                    'post_id': comment['post_id'],
                    'frame': frame,
                    'stance': comment['stance'],
                    'constructiveness': comment['constructiveness'],
                    'scores': scores,
                    'raw_response': response_text,
                    'parsed_successfully': scores is not None,
                    'timestamp': timestamp,
                    'model': model,
                    'api_provider': 'openrouter',
                    'anti_cache_measures': {
                        'comment_order_randomized': True,
                        'frame_order_randomized': True,
                        'temperature_varied': True,
                        'unique_request_id': True
                    }
                }
                results.append(result)

                # ANTI-CACHING: Add small delay between calls to avoid burst caching
                time.sleep(random.uniform(0.1, 0.3))

            # Progress update
            if (i + 1) % 5 == 0 or i == len(comments_to_eval) - 1:  # More frequent updates for OpenRouter
                elapsed = time.time() - start_time
                completed = (i + 1) * 2
                rate = completed / elapsed if elapsed > 0 else 0
                remaining = (total_calls - completed) / rate if rate > 0 else 0
                print(f"Progress [{model}]: {completed}/{total_calls} ({completed/total_calls:.1%}) - "
                      f"{rate:.1f} calls/sec - ETA: {remaining/60:.1f}min")

        # Store results for this model
        all_results[model] = results

        # Save results immediately after each model (in case of interruption)
        # Use clean model name for filename ('/' and ':' are not filename-safe)
        clean_model_name = model.replace('/', '_').replace(':', '_')
        model_output_path = Path(f'../data/generated/eval_scores_{clean_model_name}.jsonl').resolve()
        model_output_path.parent.mkdir(parents=True, exist_ok=True)
        with model_output_path.open('w', encoding='utf-8') as f:
            for result in results:
                f.write(json.dumps(result, ensure_ascii=False) + '\n')

        parsed_count = sum(1 for r in results if r['parsed_successfully'])
        print(f"✅ Model {model} completed: {parsed_count}/{len(results)} parsed successfully")
        print(f"✅ Saved to {model_output_path}")

        # Add delay between models to be respectful to OpenRouter API
        if model_idx < len(MODELS) - 1:
            print("⏳ Waiting 15 seconds before next model...")
            time.sleep(15)

    return all_results

# Kick off the evaluation across every configured model (anti-caching enabled)
print("Starting MULTI-MODEL OpenRouter evaluation with anti-caching measures...")
print("🔄 Anti-caching enabled: randomized order, varied temperature, unique IDs, delays")
print(f"📊 Will evaluate {len(MODELS)} models: {', '.join(MODELS)}")

# sample_size=None evaluates every comment; pass an integer to evaluate a random subset
all_eval_results = evaluate_comments_all_models(
    comments,
    posts_by_id,
    sample_size=None,
)