# Procedural Search Evaluation

This notebook evaluates three search approaches for developer workflow patterns:
1. **Baseline (Enhanced GitHub-style)**: Keyword matching + semantic embeddings on raw traces
2. **Intent-Only Search**: Search over intent clusters (INTENT_DEBUG, INTENT_FEATURE, etc.)
3. **Intent + Representation Search**: Multi-representation search combining intent and rungs (tokens, semantic_edits, functions, module_graph, motifs)

## Methodology

**Query Types:**
- Procedural: Workflow patterns, temporal sequences
- Functional: Function-level code patterns
- Structural: Token patterns, edit operations
- Module-level: File relationships, dependencies
- Intent-driven: Semantic intent categories
- Hybrid: Multi-representation queries
- Context-aware: Project-specific similarity

**Evaluation Metrics:**
- Standard IR: Precision@K, Recall@K, NDCG@K, MRR
- Custom: Pattern Distinctness@K, Representation Diversity, Workflow Coherence

**Ground Truth:** Three-stage process (embedding similarity → representation filtering → expert validation)


In [1]:
# Imports and setup
import sys
from pathlib import Path
import json

import numpy as np
import pandas as pd
from typing import List, Dict, Optional, Tuple, Set
from collections import defaultdict, Counter
from dataclasses import dataclass, field
from datetime import datetime
import os
import re

# Search and similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Load environment variables
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass

# Import rung extractors
from rung_extractors import (
    tokens_repr, semantic_edits_repr, functions_repr, module_graph_repr, motifs_repr,
    canonicalize_prompt, event_sequence
)

# Find repo root
def _find_repo_root() -> Path:
    current = Path.cwd().resolve()
    while True:
        if (current / "cursor-telemetry").exists() or (current / "components").exists():
            return current
        if current == current.parent:
            raise FileNotFoundError("Cannot locate repository root")
        current = current.parent

REPO_ROOT = _find_repo_root()
EXPORT_FILE_JSONL = REPO_ROOT / "research/data/companion_traces.jsonl"
RESULTS_DIR = REPO_ROOT / "research/results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Repository root: {REPO_ROOT}")
print(f"JSONL file: {EXPORT_FILE_JSONL}")
print(f"Results directory: {RESULTS_DIR}")

Repository root: /Users/hamidaho/new_cursor
JSONL file: /Users/hamidaho/new_cursor/research/data/companion_traces.jsonl
Results directory: /Users/hamidaho/new_cursor/research/results


In [2]:
# Load traces from companion_traces.jsonl
def load_traces_from_jsonl(file_path: Path, limit: Optional[int] = None) -> List[Dict]:
    """
    Load traces from companion_traces.jsonl file.
    
    Structure: Each line is a session with:
    - session_id
    - workspace_path
    - events: list of events (code_change, prompt, terminal_command, etc.)
    """
    if not file_path.exists():
        print(f"JSONL file not found: {file_path}")
        return []
    
    sessions = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if limit and idx >= limit:
                break
            try:
                session = json.loads(line.strip())
                # Ensure events are sorted by timestamp
                if 'events' in session:
                    session['events'].sort(key=lambda x: x.get('timestamp', ''))
                sessions.append(session)
            except json.JSONDecodeError as e:
                print(f"Error parsing line {idx}: {e}")
                continue
    
    return sessions

# Load traces
print("Loading traces from companion_traces.jsonl...")
traces = load_traces_from_jsonl(EXPORT_FILE_JSONL, limit=None)
print(f"Loaded {len(traces)} traces (sessions)")
print(f"Total events: {sum(len(t.get('events', [])) for t in traces)}")

# Show sample structure
if traces:
    sample = traces[0]
    print(f"\nSample trace structure:")
    print(f"  - Session ID: {sample.get('session_id')}")
    print(f"  - Workspace: {sample.get('workspace_path')}")
    print(f"  - Events: {len(sample.get('events', []))}")
    if sample.get('events'):
        print(f"  - First event type: {sample['events'][0].get('type')}")
        print(f"  - First event details keys: {list(sample['events'][0].get('details', {}).keys())[:5]}")

Loading traces from companion_traces.jsonl...
Loaded 160 traces (sessions)
Total events: 5537

Sample trace structure:
  - Session ID: session-1761193218842
  - Workspace: /Users/hamidaho
  - Events: 2
  - First event type: code_change
  - First event details keys: ['file_path', 'diff_summary', 'diff_size', 'lines_added', 'lines_removed']


## Generate Representations for All Traces

In [3]:
# Generate representations for all traces
print("Generating representations for all traces...")
print("This may take a while for large datasets...")

trace_representations = []
for i, trace in enumerate(traces):
    if i % 100 == 0:
        print(f"  Processing trace {i}/{len(traces)}...")
    
    try:
        # Generate all representation rungs
        reprs = {
            'session_id': trace.get('session_id'),
            'workspace_path': trace.get('workspace_path'),
            'tokens': tokens_repr(trace, include_prompts=True),
            'semantic_edits': semantic_edits_repr(trace, include_prompts=True),
            'functions': functions_repr(trace, include_prompts=True),
            'module_graph': module_graph_repr(trace),
            'motifs': motifs_repr(trace, use_statistical_mining=True, include_prompts=True),
        }
        
        # Extract intents from trace
        intents = []
        for event in trace.get('events', []):
            if event.get('type') == 'prompt':
                prompt_text = event.get('details', {}).get('text', '')
                if prompt_text:
                    intent = canonicalize_prompt(prompt_text)
                    intents.append(intent)
        reprs['intents'] = list(set(intents))  # Unique intents
        
        # Create searchable text for baseline (enhanced GitHub-style)
        searchable_text_parts = []
        for event in trace.get('events', []):
            # Add prompt text
            if event.get('type') == 'prompt':
                prompt_text = event.get('details', {}).get('text', '')
                if prompt_text:
                    searchable_text_parts.append(prompt_text)
            # Add annotations
            if event.get('annotation'):
                searchable_text_parts.append(event.get('annotation'))
            # Add file paths
            file_path = event.get('details', {}).get('file_path') or event.get('details', {}).get('file')
            if file_path:
                searchable_text_parts.append(str(file_path))
        reprs['searchable_text'] = ' '.join(searchable_text_parts)
        
        trace_representations.append(reprs)
    except Exception as e:
        print(f"  Error processing trace {i}: {e}")
        continue

print(f"\nGenerated representations for {len(trace_representations)} traces")
print(f"Traces with intents: {sum(1 for r in trace_representations if r.get('intents'))}")
print(f"Traces with motifs: {sum(1 for r in trace_representations if r.get('motifs'))}")

Generating representations for all traces...
This may take a while for large datasets...
  Processing trace 0/160...
  Processing trace 100/160...

Generated representations for 160 traces
Traces with intents: 0
Traces with motifs: 144


## Phase 1: Query Collection

Sample query set (to be expanded with generated queries using the prompt from methodology)

In [4]:
# Sample query set
sample_queries = [
    # Procedural queries
    {'id': 'proc_1', 'text': 'Find workflows where developers debug an error then write a test', 'query_type': 'procedural', 'primary_intent': 'INTENT_DEBUG', 'target_rungs': ['motifs', 'semantic_edits'], 'difficulty': 'medium'},
    {'id': 'proc_2', 'text': 'Show me patterns where feature creation is followed by documentation', 'query_type': 'procedural', 'primary_intent': 'INTENT_FEATURE', 'target_rungs': ['motifs', 'semantic_edits'], 'difficulty': 'medium'},
    # Functional queries
    {'id': 'func_1', 'text': 'Find code that implements authentication with error handling', 'query_type': 'functional', 'primary_intent': 'INTENT_FEATURE', 'target_rungs': ['functions', 'semantic_edits'], 'difficulty': 'medium'},
    {'id': 'func_2', 'text': 'Show me patterns where create functions are followed by test functions', 'query_type': 'functional', 'primary_intent': 'INTENT_FEATURE', 'target_rungs': ['functions', 'motifs'], 'difficulty': 'hard'},
    # Structural queries
    {'id': 'struct_1', 'text': 'Code with high complexity that was refactored', 'query_type': 'structural', 'primary_intent': 'INTENT_REFACTOR', 'target_rungs': ['tokens', 'semantic_edits'], 'difficulty': 'hard'},
    # Module-level queries
    {'id': 'module_1', 'text': 'Files that are typically edited together in authentication workflows', 'query_type': 'module_level', 'primary_intent': 'INTENT_FEATURE', 'target_rungs': ['module_graph', 'motifs'], 'difficulty': 'medium'},
    # Intent-driven queries
    {'id': 'intent_1', 'text': 'Show me all debugging workflows', 'query_type': 'intent_driven', 'primary_intent': 'INTENT_DEBUG', 'target_rungs': ['intents'], 'difficulty': 'easy'},
    {'id': 'intent_2', 'text': 'Feature development patterns', 'query_type': 'intent_driven', 'primary_intent': 'INTENT_FEATURE', 'target_rungs': ['intents'], 'difficulty': 'easy'},
    # Hybrid queries
    {'id': 'hybrid_1', 'text': 'Debugging workflows that involve file switching with iterative patterns', 'query_type': 'hybrid', 'primary_intent': 'INTENT_DEBUG', 'target_rungs': ['intents', 'module_graph', 'motifs'], 'difficulty': 'hard'},
    # Context-aware queries
    {'id': 'context_1', 'text': 'Workflows similar to API endpoint development', 'query_type': 'context_aware', 'primary_intent': 'INTENT_FEATURE', 'target_rungs': ['module_graph', 'functions', 'motifs'], 'difficulty': 'hard'},
]

print(f"Sample query set: {len(sample_queries)} queries")
print(f"Query types: {Counter(q['query_type'] for q in sample_queries)}")
print(f"Difficulty levels: {Counter(q['difficulty'] for q in sample_queries)}")

Sample query set: 10 queries
Query types: Counter({'procedural': 2, 'functional': 2, 'intent_driven': 2, 'structural': 1, 'module_level': 1, 'hybrid': 1, 'context_aware': 1})
Difficulty levels: Counter({'medium': 4, 'hard': 4, 'easy': 2})


## Phase 2: Ground Truth Creation

Three-stage process: embedding similarity → representation filtering → (expert validation - manual step)

In [5]:
# Load embedding model for ground truth (using Hugging Face)
import requests

class EmbeddingGenerator:
    """Generate embeddings using Hugging Face API."""
    
    def __init__(self, model: str = 'sentence-transformers/all-mpnet-base-v2'):
        self.model = model
        self.hf_token = (
            os.getenv('HF_TOKEN') or 
            os.getenv('HUGGINGFACE_API_KEY') or 
            os.getenv('HF_API_KEY') or 
            ''
        ).strip()
        
        if not self.hf_token:
            raise ValueError(
                "No Hugging Face API key found. Set HF_TOKEN, HUGGINGFACE_API_KEY, or HF_API_KEY "
                "in your .env file. Get a free token at: https://huggingface.co/settings/tokens"
            )
        
        self.hf_endpoint = (
            os.getenv('HF_EMBEDDING_ENDPOINT') or
            f'https://api-inference.huggingface.co/pipeline/feature-extraction/{self.model}'
        )
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts."""
        if not texts:
            return np.array([])
        
        response = requests.post(
            self.hf_endpoint,
            headers={
                'Authorization': f'Bearer {self.hf_token}',
                'Content-Type': 'application/json',
            },
            json={
                'inputs': texts,
                'options': {'wait_for_model': True}
            },
            timeout=120
        )
        response.raise_for_status()
        embeddings = response.json()
        
        if isinstance(embeddings, list) and len(embeddings) > 0:
            if isinstance(embeddings[0], list):
                return np.array(embeddings)
            else:
                return np.array([embeddings])
        return np.array(embeddings)

# Initialize embedding generator
try:
    embedding_gen = EmbeddingGenerator()
    print("Embedding generator initialized")
except ValueError as e:
    print(f"Warning: {e}")
    print("Ground truth creation will use representation-based filtering only")
    embedding_gen = None

Ground truth creation will use representation-based filtering only


In [6]:
# Stage 1: Embedding-based candidate generation
def generate_ground_truth_candidates(query: Dict, traces: List[Dict], trace_reprs: List[Dict], top_k: int = 50) -> List[int]:
    """Generate candidate traces for a query using embedding similarity."""
    if embedding_gen is None:
        # Fallback: use representation-based only
        return []
    
    # Embed query
    query_embedding = embedding_gen.generate_embeddings([query['text']])[0]
    
    # Embed all trace searchable texts
    trace_texts = [r.get('searchable_text', '') for r in trace_reprs]
    trace_embeddings = embedding_gen.generate_embeddings(trace_texts)
    
    # Compute similarities
    similarities = cosine_similarity([query_embedding], trace_embeddings)[0]
    
    # Get top-K candidates
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    return top_indices.tolist()

# Stage 2: Representation-based filtering
def filter_by_representation(query: Dict, candidate_indices: List[int], trace_reprs: List[Dict]) -> List[int]:
    """Filter candidates using representation rungs."""
    filtered = []
    
    primary_intent = query.get('primary_intent')
    target_rungs = query.get('target_rungs', [])
    
    for idx in candidate_indices:
        trace_repr = trace_reprs[idx]
        
        # Check intent match
        if primary_intent and primary_intent != 'mixed':
            trace_intents = trace_repr.get('intents', [])
            if primary_intent not in trace_intents:
                continue  # Skip if intent doesn't match
        
        # Check rung match (at least one target rung should have content)
        if target_rungs:
            has_content = False
            for rung in target_rungs:
                if rung == 'intents':
                    if trace_repr.get('intents'):
                        has_content = True
                        break
                else:
                    rung_content = trace_repr.get(rung, [])
                    if rung_content and len(rung_content) > 0:
                        has_content = True
                        break
            
            if not has_content:
                continue  # Skip if no relevant content
        
        filtered.append(idx)
    
    return filtered[:30]  # Return top 30 after filtering

# Generate ground truth for all queries
print("Generating ground truth candidates for queries...")
ground_truth = {}

for query in sample_queries:
    query_id = query['id']
    
    # Stage 1: Embedding-based candidates
    candidates = generate_ground_truth_candidates(query, traces, trace_representations, top_k=50)
    
    # Stage 2: Representation-based filtering
    if candidates:
        filtered = filter_by_representation(query, candidates, trace_representations)
    else:
        # Fallback: use representation-based only
        # Get all traces and filter by representation
        all_indices = list(range(len(trace_representations)))
        filtered = filter_by_representation(query, all_indices, trace_representations)
    
    ground_truth[query_id] = {
        'query': query,
        'relevant_trace_indices': filtered,
        'n_relevant': len(filtered),
        'note': 'Stage 3 (expert validation) should be performed manually'
    }
    
    print(f"  {query_id}: {len(filtered)} relevant traces")

print(f"\nGround truth generated for {len(ground_truth)} queries")
print(f"Total relevant traces: {sum(gt['n_relevant'] for gt in ground_truth.values())}")

# Save ground truth
gt_file = RESULTS_DIR / 'search_ground_truth.json'
with open(gt_file, 'w') as f:
    json.dump(ground_truth, f, indent=2, default=str)
print(f"Saved ground truth to: {gt_file}")

Generating ground truth candidates for queries...
  proc_1: 0 relevant traces
  proc_2: 0 relevant traces
  func_1: 0 relevant traces
  func_2: 0 relevant traces
  struct_1: 0 relevant traces
  module_1: 0 relevant traces
  intent_1: 0 relevant traces
  intent_2: 0 relevant traces
  hybrid_1: 0 relevant traces
  context_1: 0 relevant traces

Ground truth generated for 10 queries
Total relevant traces: 0
Saved ground truth to: /Users/hamidaho/new_cursor/research/results/search_ground_truth.json


## Phase 3: Search Implementations

Implement three search conditions: Baseline (Enhanced GitHub-style), Intent-Only, Intent+Representation

In [7]:
# Search Implementation 1: Baseline (Enhanced GitHub-style)
# Keyword matching + semantic embeddings

def baseline_search(query: Dict, trace_reprs: List[Dict], top_k: int = 10) -> List[Tuple[int, float]]:
    """Baseline search: Enhanced GitHub-style (keyword + semantic)."""
    query_text = query['text']
    
    # Extract searchable texts
    trace_texts = [r.get('searchable_text', '') for r in trace_reprs]
    
    # Keyword matching (TF-IDF)
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(trace_texts)
        query_vector = vectorizer.transform([query_text])
        keyword_scores = cosine_similarity(query_vector, tfidf_matrix)[0]
    except:
        keyword_scores = np.zeros(len(trace_texts))
    
    # Semantic similarity (embeddings)
    if embedding_gen:
        try:
            query_embedding = embedding_gen.generate_embeddings([query_text])[0]
            trace_embeddings = embedding_gen.generate_embeddings(trace_texts)
            semantic_scores = cosine_similarity([query_embedding], trace_embeddings)[0]
        except:
            semantic_scores = np.zeros(len(trace_texts))
    else:
        semantic_scores = np.zeros(len(trace_texts))
    
    # Combine scores (weighted: 30% keyword, 70% semantic)
    combined_scores = 0.3 * keyword_scores + 0.7 * semantic_scores
    
    # Get top-K
    top_indices = np.argsort(combined_scores)[::-1][:top_k]
    
    return [(int(idx), float(combined_scores[idx])) for idx in top_indices]

print("Baseline search implementation ready")

Baseline search implementation ready


In [8]:
# Search Implementation 2: Intent-Only Search

def intent_only_search(query: Dict, trace_reprs: List[Dict], top_k: int = 10) -> List[Tuple[int, float]]:
    """Intent-only search: Search over intent clusters only."""
    query_text = query['text']
    primary_intent = query.get('primary_intent')
    
    # Extract intent from query if not provided
    if not primary_intent or primary_intent == 'mixed':
        query_intent = canonicalize_prompt(query_text)
    else:
        query_intent = primary_intent
    
    # Filter traces by intent match
    intent_matches = []
    for idx, trace_repr in enumerate(trace_reprs):
        trace_intents = trace_repr.get('intents', [])
        if query_intent in trace_intents:
            # Score based on intent frequency (more intents = higher score)
            score = trace_intents.count(query_intent) / max(len(trace_intents), 1)
            intent_matches.append((idx, score))
    
    # Sort by score and return top-K
    intent_matches.sort(key=lambda x: x[1], reverse=True)
    
    return intent_matches[:top_k]

print("Intent-only search implementation ready")

Intent-only search implementation ready


In [9]:
# Search Implementation 3: Intent + Representation Search

def intent_representation_search(query: Dict, trace_reprs: List[Dict], top_k: int = 10, alpha: float = 0.3) -> List[Tuple[int, float]]:
    """Intent + Representation search: Multi-rung search with intent filtering."""
    query_text = query['text']
    primary_intent = query.get('primary_intent')
    target_rungs = query.get('target_rungs', [])
    
    # Extract intent from query
    if not primary_intent or primary_intent == 'mixed':
        query_intent = canonicalize_prompt(query_text)
    else:
        query_intent = primary_intent
    
    # If no target rungs specified, use all rungs
    if not target_rungs:
        target_rungs = ['tokens', 'semantic_edits', 'functions', 'module_graph', 'motifs']
    
    scores = []
    
    for idx, trace_repr in enumerate(trace_reprs):
        # Intent score
        trace_intents = trace_repr.get('intents', [])
        if query_intent in trace_intents:
            intent_score = 1.0
        else:
            intent_score = 0.0
        
        # Rung scores (compute similarity for each target rung)
        rung_scores = []
        
        for rung in target_rungs:
            if rung == 'intents':
                # Intent rung: exact match score
                if query_intent in trace_intents:
                    rung_scores.append(1.0)
                else:
                    rung_scores.append(0.0)
            else:
                # Other rungs: TF-IDF similarity
                rung_content = trace_repr.get(rung, [])
                if isinstance(rung_content, list):
                    rung_text = ' '.join(str(item) for item in rung_content)
                else:
                    rung_text = str(rung_content)
                
                if rung_text:
                    try:
                        vectorizer = TfidfVectorizer(max_features=1000)
                        # Combine query and rung text for vectorization
                        texts = [query_text, rung_text]
                        tfidf_matrix = vectorizer.fit_transform(texts)
                        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
                        rung_scores.append(similarity)
                    except:
                        rung_scores.append(0.0)
                else:
                    rung_scores.append(0.0)
        
        # Get max rung score
        max_rung_score = max(rung_scores) if rung_scores else 0.0
        
        # Combined score: alpha * intent + (1-alpha) * max_rung
        combined_score = alpha * intent_score + (1 - alpha) * max_rung_score
        
        scores.append((idx, combined_score))
    
    # Sort by score and return top-K
    scores.sort(key=lambda x: x[1], reverse=True)
    
    return scores[:top_k]

print("Intent + Representation search implementation ready")

Intent + Representation search implementation ready


## Phase 4: Evaluation

Run all queries through all three search conditions and compute metrics

In [10]:
# Evaluation metrics
def compute_metrics(predicted_indices: List[int], ground_truth_indices: List[int], k_values: List[int] = [1, 5, 10]) -> Dict[str, float]:
    """Compute standard IR metrics."""
    ground_truth_set = set(ground_truth_indices)
    
    metrics = {}
    
    # Precision@K, Recall@K
    for k in k_values:
        top_k_pred = set(predicted_indices[:k])
        
        if len(top_k_pred) > 0:
            precision = len(top_k_pred & ground_truth_set) / len(top_k_pred)
        else:
            precision = 0.0
        
        if len(ground_truth_set) > 0:
            recall = len(top_k_pred & ground_truth_set) / len(ground_truth_set)
        else:
            recall = 0.0
        
        metrics[f'precision@{k}'] = precision
        metrics[f'recall@{k}'] = recall
    
    # MRR (Mean Reciprocal Rank)
    mrr = 0.0
    for rank, idx in enumerate(predicted_indices, 1):
        if idx in ground_truth_set:
            mrr = 1.0 / rank
            break
    metrics['mrr'] = mrr
    
    return metrics

# Run evaluation
print("Running evaluation for all queries...")
evaluation_results = []

for query in sample_queries:
    query_id = query['id']
    gt_indices = ground_truth[query_id]['relevant_trace_indices']
    
    if not gt_indices:
        print(f"  Skipping {query_id}: No ground truth")
        continue
    
    # Run all three search conditions
    baseline_results = baseline_search(query, trace_representations, top_k=10)
    intent_only_results = intent_only_search(query, trace_representations, top_k=10)
    intent_repr_results = intent_representation_search(query, trace_representations, top_k=10)
    
    # Extract indices
    baseline_indices = [idx for idx, score in baseline_results]
    intent_only_indices = [idx for idx, score in intent_only_results]
    intent_repr_indices = [idx for idx, score in intent_repr_results]
    
    # Compute metrics
    baseline_metrics = compute_metrics(baseline_indices, gt_indices)
    intent_only_metrics = compute_metrics(intent_only_indices, gt_indices)
    intent_repr_metrics = compute_metrics(intent_repr_indices, gt_indices)
    
    evaluation_results.append({
        'query_id': query_id,
        'query_text': query['text'],
        'query_type': query['query_type'],
        'difficulty': query['difficulty'],
        'n_ground_truth': len(gt_indices),
        'baseline': baseline_metrics,
        'intent_only': intent_only_metrics,
        'intent_representation': intent_repr_metrics,
    })
    
    print(f"  {query_id}: Baseline P@5={baseline_metrics['precision@5']:.3f}, "
          f"Intent-Only P@5={intent_only_metrics['precision@5']:.3f}, "
          f"Intent+Rep P@5={intent_repr_metrics['precision@5']:.3f}")

print(f"\nEvaluation complete for {len(evaluation_results)} queries")

# Save results
results_file = RESULTS_DIR / 'search_evaluation_results.json'
with open(results_file, 'w') as f:
    json.dump(evaluation_results, f, indent=2, default=str)
print(f"Saved evaluation results to: {results_file}")

Running evaluation for all queries...
  Skipping proc_1: No ground truth
  Skipping proc_2: No ground truth
  Skipping func_1: No ground truth
  Skipping func_2: No ground truth
  Skipping struct_1: No ground truth
  Skipping module_1: No ground truth
  Skipping intent_1: No ground truth
  Skipping intent_2: No ground truth
  Skipping hybrid_1: No ground truth
  Skipping context_1: No ground truth

Evaluation complete for 0 queries
Saved evaluation results to: /Users/hamidaho/new_cursor/research/results/search_evaluation_results.json


In [11]:
# Aggregate results and create summary
results_df = pd.DataFrame([
    {
        'query_id': r['query_id'],
        'query_type': r['query_type'],
        'difficulty': r['difficulty'],
        'baseline_p5': r['baseline']['precision@5'],
        'baseline_r5': r['baseline']['recall@5'],
        'baseline_mrr': r['baseline']['mrr'],
        'intent_only_p5': r['intent_only']['precision@5'],
        'intent_only_r5': r['intent_only']['recall@5'],
        'intent_only_mrr': r['intent_only']['mrr'],
        'intent_repr_p5': r['intent_representation']['precision@5'],
        'intent_repr_r5': r['intent_representation']['recall@5'],
        'intent_repr_mrr': r['intent_representation']['mrr'],
    }
    for r in evaluation_results
])

print("\n" + "="*80)
print("AGGREGATE RESULTS")
print("="*80)

print("\nOverall Averages:")
print(f"Baseline:        P@5={results_df['baseline_p5'].mean():.3f}, R@5={results_df['baseline_r5'].mean():.3f}, MRR={results_df['baseline_mrr'].mean():.3f}")
print(f"Intent-Only:     P@5={results_df['intent_only_p5'].mean():.3f}, R@5={results_df['intent_only_r5'].mean():.3f}, MRR={results_df['intent_only_mrr'].mean():.3f}")
print(f"Intent+Rep:      P@5={results_df['intent_repr_p5'].mean():.3f}, R@5={results_df['intent_repr_r5'].mean():.3f}, MRR={results_df['intent_repr_mrr'].mean():.3f}")

print("\nBy Query Type:")
for query_type in results_df['query_type'].unique():
    type_df = results_df[results_df['query_type'] == query_type]
    print(f"\n{query_type.upper()}:")
    print(f"  Baseline:        P@5={type_df['baseline_p5'].mean():.3f}, R@5={type_df['baseline_r5'].mean():.3f}")
    print(f"  Intent-Only:     P@5={type_df['intent_only_p5'].mean():.3f}, R@5={type_df['intent_only_r5'].mean():.3f}")
    print(f"  Intent+Rep:      P@5={type_df['intent_repr_p5'].mean():.3f}, R@5={type_df['intent_repr_r5'].mean():.3f}")

print("\nBy Difficulty:")
for difficulty in ['easy', 'medium', 'hard']:
    diff_df = results_df[results_df['difficulty'] == difficulty]
    if len(diff_df) > 0:
        print(f"\n{difficulty.upper()}:")
        print(f"  Baseline:        P@5={diff_df['baseline_p5'].mean():.3f}, R@5={diff_df['baseline_r5'].mean():.3f}")
        print(f"  Intent-Only:     P@5={diff_df['intent_only_p5'].mean():.3f}, R@5={diff_df['intent_only_r5'].mean():.3f}")
        print(f"  Intent+Rep:      P@5={diff_df['intent_repr_p5'].mean():.3f}, R@5={diff_df['intent_repr_r5'].mean():.3f}")

# Save summary
summary_file = RESULTS_DIR / 'search_evaluation_summary.json'
summary = {
    'overall': {
        'baseline': {
            'precision@5': float(results_df['baseline_p5'].mean()),
            'recall@5': float(results_df['baseline_r5'].mean()),
            'mrr': float(results_df['baseline_mrr'].mean()),
        },
        'intent_only': {
            'precision@5': float(results_df['intent_only_p5'].mean()),
            'recall@5': float(results_df['intent_only_r5'].mean()),
            'mrr': float(results_df['intent_only_mrr'].mean()),
        },
        'intent_representation': {
            'precision@5': float(results_df['intent_repr_p5'].mean()),
            'recall@5': float(results_df['intent_repr_r5'].mean()),
            'mrr': float(results_df['intent_repr_mrr'].mean()),
        },
    },
    'by_query_type': {},
    'by_difficulty': {},
}

for query_type in results_df['query_type'].unique():
    type_df = results_df[results_df['query_type'] == query_type]
    summary['by_query_type'][query_type] = {
        'baseline': {'precision@5': float(type_df['baseline_p5'].mean()), 'recall@5': float(type_df['baseline_r5'].mean())},
        'intent_only': {'precision@5': float(type_df['intent_only_p5'].mean()), 'recall@5': float(type_df['intent_only_r5'].mean())},
        'intent_representation': {'precision@5': float(type_df['intent_repr_p5'].mean()), 'recall@5': float(type_df['intent_repr_r5'].mean())},
    }

for difficulty in ['easy', 'medium', 'hard']:
    diff_df = results_df[results_df['difficulty'] == difficulty]
    if len(diff_df) > 0:
        summary['by_difficulty'][difficulty] = {
            'baseline': {'precision@5': float(diff_df['baseline_p5'].mean()), 'recall@5': float(diff_df['baseline_r5'].mean())},
            'intent_only': {'precision@5': float(diff_df['intent_only_p5'].mean()), 'recall@5': float(diff_df['intent_only_r5'].mean())},
            'intent_representation': {'precision@5': float(diff_df['intent_repr_p5'].mean()), 'recall@5': float(diff_df['intent_repr_r5'].mean())},
        }

with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2)
print(f"\nSaved summary to: {summary_file}")


AGGREGATE RESULTS

Overall Averages:


KeyError: 'baseline_p5'