# Evaluating Self-RAG Performance

Comprehensive evaluation of retrieval and generation quality.

In [None]:
import sys
# Add the parent directory (relative to the kernel's working directory) to the
# module search path.
sys.path.append('..')

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Select seaborn's 'whitegrid' plotting style.
sns.set_style('whitegrid')

## 1. Evaluate Retrieval Performance

In [None]:
%%bash
# Run retrieval evaluation.
#
# Abort on the first failing command: without this, the final `echo` would run
# (and the cell would exit 0) even if the evaluation crashed, hiding the failure
# and letting later cells read stale or missing results.
set -euo pipefail

# Make sure the directory for the --output file exists before the run starts.
mkdir -p ../results

uv run python -m src.evaluation.retrieval_eval \
    --config ../configs/retrieval_config.yaml \
    --index-dir ../data/embeddings \
    --test-data ../data/samples/sample_test_queries.json \
    --output ../results/retrieval_results.json

echo "✅ Retrieval evaluation complete!"

In [None]:
# Load the retrieval metrics from the results JSON file.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default text encoding.
with open('../results/retrieval_results.json', 'r', encoding='utf-8') as f:
    ret_results = json.load(f)

# Pretty-print the full metrics dictionary for a quick sanity check.
print("Retrieval Metrics:")
print(json.dumps(ret_results, indent=2))

In [None]:
# Visualize Precision@k and Recall@k side by side (one panel per metric).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Precision@k
# Keys loaded from JSON are strings, so each k becomes one bar on a categorical
# x-axis. The dict views are materialized as lists before plotting, matching how
# the generation-quality chart passes its data to matplotlib.
prec = ret_results['precision@k']
ax1.bar(list(prec.keys()), list(prec.values()), color='skyblue')
ax1.set_xlabel('k')
ax1.set_ylabel('Precision@k')
ax1.set_title('Retrieval Precision')

# Recall@k (same layout as the precision panel)
rec = ret_results['recall@k']
ax2.bar(list(rec.keys()), list(rec.values()), color='lightgreen')
ax2.set_xlabel('k')
ax2.set_ylabel('Recall@k')
ax2.set_title('Retrieval Recall')

plt.tight_layout()
plt.show()

## 2. Evaluate Generation Quality

In [None]:
%%bash
# Run generation evaluation.
#
# Abort on the first failing command: without this, the final `echo` would run
# (and the cell would exit 0) even if the evaluation crashed, hiding the failure
# and letting later cells read stale or missing results.
set -euo pipefail

# Make sure the directory for the --output file exists before the run starts.
mkdir -p ../results

uv run python -m src.evaluation.generation_eval \
    --retrieval-config ../configs/retrieval_config.yaml \
    --generator-config ../configs/generator_config.yaml \
    --index-dir ../data/embeddings \
    --generator-weights ../models/generator_lora/final \
    --test-data ../data/samples/sample_qa_data.json \
    --output ../results/generation_results.json

echo "✅ Generation evaluation complete!"

In [None]:
# Load the generation metrics from the results JSON file.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default text encoding.
with open('../results/generation_results.json', 'r', encoding='utf-8') as f:
    gen_results = json.load(f)

# Pretty-print the full metrics dictionary for a quick sanity check.
print("Generation Metrics:")
print(json.dumps(gen_results, indent=2))

In [None]:
# Horizontal bar chart of the headline generation metrics.
# Display label -> value pulled from the loaded generation results.
metrics = {
    'Hallucination Rate': gen_results['hallucination_rate'],
    'FactScore': gen_results['avg_factscore'],
    'Utility Score': gen_results['avg_utility_score'],
    'Completeness': gen_results['avg_completeness'],
}

metric_labels = list(metrics)
metric_values = [metrics[label] for label in metric_labels]

# The hallucination bar is drawn in red; every other metric is drawn in green.
colors = ['green' if 'Hallucination' not in label else 'red' for label in metric_labels]

fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(metric_labels, metric_values, color=colors, alpha=0.7)
ax.set(xlabel='Score', title='Self-RAG Generation Quality', xlim=(0, 1))
plt.tight_layout()
plt.show()

## 3. Compare with Baselines (Optional)

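Compare Self-RAG with vanilla RAG and no-RAG baselines. The numbers in the cell below are illustrative placeholders; replace them with measured results before drawing any conclusions.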

In [None]:
# Placeholder comparison table (replace with actual results).
# One row per model: (name, hallucination rate, FactScore, utility).
comparison = pd.DataFrame(
    [
        ('No RAG', 0.45, 0.55, 0.60),
        ('Vanilla RAG', 0.30, 0.65, 0.70),
        ('Self-RAG', 0.20, 0.75, 0.80),
    ],
    columns=['Model', 'Hallucination Rate', 'FactScore', 'Utility'],
)

# Grouped bars: one group per model, one bar per metric column.
comparison_ax = comparison.set_index('Model').plot.bar(figsize=(10, 5))
comparison_ax.set_title('Model Comparison')
comparison_ax.set_ylabel('Score')
comparison_ax.legend(title='Metric')
# Keep the model names horizontal.
plt.setp(comparison_ax.get_xticklabels(), rotation=0)
plt.tight_layout()
plt.show()

## 4. Analyze Reflection Tokens

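Analyze how reflection tokens correlate with answer quality. The table in the cell below is example data; replace it with the actual reflection-token statistics from your evaluation run.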

In [None]:
# Placeholder reflection-token table (replace with actual data).
# One row per support level: (label, number of answers, average quality).
token_analysis = pd.DataFrame(
    [
        ('Fully Supported', 7, 0.85),
        ('Partially Supported', 2, 0.65),
        ('No Support', 1, 0.40),
    ],
    columns=['Support Level', 'Count', 'Avg Quality'],
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

support_levels = token_analysis['Support Level']

# One spec per panel: (axes, column to plot, bar color, title, y-axis label).
panel_specs = (
    (ax1, 'Count', 'steelblue', 'Distribution of Support Levels', 'Count'),
    (ax2, 'Avg Quality', 'coral', 'Quality by Support Level', 'Average Quality Score'),
)

for panel, column, bar_color, panel_title, y_label in panel_specs:
    panel.bar(support_levels, token_analysis[column], color=bar_color)
    panel.set_title(panel_title)
    panel.set_ylabel(y_label)
    # Slant the category labels and right-align them under their bars.
    plt.setp(panel.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

## Summary

Evaluation complete!
- ✅ Retrieval metrics computed
- ✅ Generation quality measured
- ✅ Hallucination rate assessed
- ✅ Visualizations created

**Next:** Use `05_demo.ipynb` for an interactive demonstration.