# Retrieval Dataset Analysis

This notebook analyzes the `retrieval_dataset_verified.yaml` generated by the validation pipeline. It visualizes the empirical relevance of contexts mined from benchmarks vs. vector search.

In [None]:
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot styling shared by every figure in this notebook:
# apply the seaborn theme first, then set the default figure size.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Load Data
# Read the verified retrieval dataset and expose its entries as `cases`.
# `cases` is bound up front so the cells below still run on a fresh kernel
# (Restart & Run All) when the file is missing or contains no cases.
dataset_path = Path("../retrieval_dataset_verified.yaml")
cases = []
if not dataset_path.exists():
    print(f"File not found: {dataset_path}")
else:
    with open(dataset_path, 'r', encoding='utf-8') as f:
        # An empty YAML document parses to None; fall back to an empty mapping.
        data = yaml.safe_load(f) or {}

    # A bare `cases:` key parses to None; normalise it to an empty list.
    cases = data.get('cases') or []
    print(f"Loaded {len(cases)} cases.")

In [None]:
# Flatten Candidates into a DataFrame
# One row per candidate context (positive and negative) of every case.

# Explicit schema so `df` has the expected columns even when there are no
# candidates; otherwise the cells below fail with KeyError on an empty frame.
CANDIDATE_COLUMNS = [
    'case_id',
    'query',
    'fqn',
    'source_type',
    'empirical_relevance',
    'delta_p',
    'p_in',
    'p_out',
]

records = []
for case in cases:
    # A key that is present but null yields None, so normalise with `or`.
    pos = case.get('positive_ctxs') or []
    neg = case.get('negative_ctxs') or []

    for ctx in pos + neg:
        # `metadata: null` would otherwise make the .get() calls below fail.
        meta = ctx.get('metadata') or {}
        records.append({
            'case_id': case['id'],
            'query': case['query'],
            'fqn': ctx['fqn'],
            'source_type': ctx['type'], # gold, retrieved, negative
            'empirical_relevance': ctx.get('empirical_relevance', 'UNKNOWN'),
            # Missing scores default to 0.0.
            'delta_p': meta.get('delta_p', 0.0),
            'p_in': meta.get('p_in', 0.0),
            'p_out': meta.get('p_out', 0.0)
        })

df = pd.DataFrame(records, columns=CANDIDATE_COLUMNS)
print(f"Total Candidates Analyzed: {len(df)}")
df.head()

## 1. Relevance Distribution by Source
How often is a "Gold" context actually relevant? How often does Vector Search find relevant items?

In [None]:
# Count candidates per (source_type, empirical_relevance) pair and draw the
# counts as one stacked bar per source type.
if df.empty:
    # With no candidates, the groupby/bar plot below would raise instead of
    # drawing an empty chart, so skip it.
    print("No candidates to plot.")
else:
    relevance_counts = (
        df.groupby(['source_type', 'empirical_relevance'])
        .size()
        .unstack(fill_value=0)  # pairs that never occur count as 0
    )
    print(relevance_counts)

    fig, ax = plt.subplots()
    relevance_counts.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
    ax.set_title('Empirical Relevance by Source Type')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)
    plt.show()

## 2. Impact Score (Delta P) Distribution
Distribution of the causal impact score Delta P (`p_in - p_out`) across all candidate contexts, stacked by empirical relevance label. High positive values indicate strong relevance.

In [None]:
# Stacked histogram of delta_p, coloured by empirical relevance label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df,
    x='delta_p',
    hue='empirical_relevance',
    bins=20,
    multiple="stack",
    ax=ax,
)
ax.set_title('Distribution of Impact Scores (Delta P)')
ax.set_xlabel('Delta P (P_in - P_out)')
plt.show()

## 3. High Impact Contexts
The 20 contexts labeled `YES` for empirical relevance with the highest Delta P, i.e. the contexts that most improved model performance.

In [None]:
# Keep only contexts labeled 'YES', rank them by delta_p (largest first),
# and display the 20 strongest with their identifying columns.
is_relevant = df['empirical_relevance'] == 'YES'
top_impact = (
    df.loc[is_relevant]
    .sort_values(by='delta_p', ascending=False)
    .head(20)
)
top_impact[['fqn', 'delta_p', 'source_type', 'case_id']]