# RAG Experimentation Notebook

A comprehensive analysis of RAG configurations for a PDF Q&A system.

## Setup

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot defaults applied to every figure created later in the notebook:
# seaborn's white-grid style and a 12x6 inch default figure size.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Imports complete")

## Run Experiments

Execute the experimentation harness with the desired configurations.

In [None]:
# Run experiments (uncomment to execute)
# !python experiment.py --embeddings onnx nomic-embed-text sentence-transformers \
#                        --llms phi3 llama3.2 llama3.1 \
#                        --top-k 3 5 10 \
#                        --output experiment_results.json

## Load Results

In [None]:
# Load experiment results into a DataFrame (one row per result record).
results_path = Path('experiment_results.json')

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which can mis-decode non-ASCII text in questions/answers.
with results_path.open('r', encoding='utf-8') as f:
    results = json.load(f)

df = pd.DataFrame(results)

# Fail early with a clear message; otherwise the column lookups below would
# raise an opaque KeyError on an empty frame.
if df.empty:
    raise ValueError(f"No experiment results found in {results_path}")

print(f"Loaded {len(df)} experiment results")
print(f"Questions: {df['question_id'].unique()}")
print(f"Embeddings: {df['embedding_model'].unique()}")
print(f"LLMs: {df['llm_model'].unique()}")
print(f"Top-K values: {sorted(df['top_k'].unique())}")

df.head()

## Summary Statistics

In [None]:
# One row per (embedding, LLM, top_k) configuration.
# Named aggregation produces the final column names directly:
#   exact_match   - number of exact matches
#   token_f1      - mean token-level F1
#   latency_sec   - mean latency
#   llm_size_gb   - first value seen in the group
#   num_questions - number of result rows in the group
CONFIG_KEYS = ['embedding_model', 'llm_model', 'top_k']

config_summary = (
    df.groupby(CONFIG_KEYS)
    .agg(
        exact_match=('exact_match', 'sum'),
        token_f1=('token_f1', 'mean'),
        latency_sec=('latency_sec', 'mean'),
        llm_size_gb=('llm_size_gb', 'first'),
        num_questions=('question_id', 'count'),
    )
    .assign(exact_match_rate=lambda s: s['exact_match'] / s['num_questions'])
    .sort_values('token_f1', ascending=False)  # best mean F1 first
)

print("\nTop 10 Configurations by F1 Score:")
config_summary.head(10)

## Embedding Model Comparison

Are different embeddings producing different results?

In [None]:
# Mean F1 and latency for every (embedding, LLM, top_k) combination,
# with the grouping keys as ordinary columns.
embedding_comparison = (
    df.groupby(['embedding_model', 'llm_model', 'top_k'])[['token_f1', 'latency_sec']]
    .mean()
    .reset_index()
)

# Wide layout: one row per (LLM, top_k), one F1 column per embedding model,
# so the embeddings can be read side by side.
embedding_pivot = embedding_comparison.pivot_table(
    values='token_f1',
    index=['llm_model', 'top_k'],
    columns='embedding_model',
)

print("\nF1 Scores by Embedding Model:")
print(embedding_pivot)

# A correlation matrix only makes sense with at least two embedding models.
# Values close to 1.0 mean the embeddings' F1 scores move together across
# (LLM, top_k) settings.
has_multiple_embeddings = embedding_pivot.shape[1] > 1
if has_multiple_embeddings:
    correlations = embedding_pivot.corr()
    print("\nCorrelation between embedding models:")
    print(correlations)

In [None]:
# Side-by-side bar charts: mean F1 (left) and mean latency (right) per embedding.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
f1_ax, latency_ax = axes

per_embedding = df.groupby('embedding_model')

# Left panel: highest F1 first
embedding_f1 = per_embedding['token_f1'].mean().sort_values(ascending=False)
embedding_f1.plot(kind='bar', ax=f1_ax, color='steelblue')
f1_ax.set(
    title='Average F1 Score by Embedding Model',
    xlabel='Embedding Model',
    ylabel='Avg F1 Score',
)

# Right panel: fastest first
embedding_latency = per_embedding['latency_sec'].mean().sort_values()
embedding_latency.plot(kind='bar', ax=latency_ax, color='coral')
latency_ax.set(
    title='Average Latency by Embedding Model',
    xlabel='Embedding Model',
    ylabel='Avg Latency (seconds)',
)

# Tilt the model names so long labels do not overlap
for panel in (f1_ax, latency_ax):
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## LLM Model Comparison

In [None]:
# Per-LLM summary across all embeddings and top_k values:
# mean F1, mean latency, and the first recorded model size; best F1 first.
llm_comparison = (
    df.groupby('llm_model')
    .agg(
        token_f1=('token_f1', 'mean'),
        latency_sec=('latency_sec', 'mean'),
        llm_size_gb=('llm_size_gb', 'first'),
    )
    .sort_values('token_f1', ascending=False)
)

print("\nLLM Performance Summary:")
llm_comparison

In [None]:
# Visualize LLM comparison on a 2x2 grid:
#   [0, 0] mean F1 per LLM          [0, 1] mean latency per LLM
#   [1, 0] F1 vs latency (bubbles)  [1, 1] efficiency = F1 / latency
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# F1 by LLM
llm_f1 = df.groupby('llm_model')['token_f1'].mean().sort_values(ascending=False)
llm_f1.plot(kind='barh', ax=axes[0, 0], color='steelblue')
axes[0, 0].set_title('F1 Score by LLM')
axes[0, 0].set_xlabel('Avg F1 Score')

# Latency by LLM
llm_latency = df.groupby('llm_model')['latency_sec'].mean().sort_values()
llm_latency.plot(kind='barh', ax=axes[0, 1], color='coral')
axes[0, 1].set_title('Latency by LLM')
axes[0, 1].set_xlabel('Avg Latency (seconds)')

# F1 vs Latency scatter: one point per LLM, llm_model kept as a regular
# column so it can be used for the annotations below.
scatter_data = df.groupby('llm_model').agg({
    'token_f1': 'mean',
    'latency_sec': 'mean',
    'llm_size_gb': 'first'
}).reset_index()

# Marker area scales with model size (GB * 50)
axes[1, 0].scatter(scatter_data['latency_sec'], scatter_data['token_f1'],
                   s=scatter_data['llm_size_gb'] * 50, alpha=0.6)
for _, row in scatter_data.iterrows():
    axes[1, 0].annotate(row['llm_model'],
                        (row['latency_sec'], row['token_f1']),
                        fontsize=8)
axes[1, 0].set_xlabel('Latency (seconds)')
axes[1, 0].set_ylabel('F1 Score')
axes[1, 0].set_title('F1 vs Latency (bubble size = model size)')
axes[1, 0].grid(True, alpha=0.3)

# Efficiency (F1 / latency)
scatter_data['efficiency'] = scatter_data['token_f1'] / scatter_data['latency_sec']
# Index the series by model name before plotting: after reset_index() the
# frame has a positional integer index, which would label the bars
# 0, 1, 2, ... instead of with the LLM names.
efficiency = (
    scatter_data.set_index('llm_model')['efficiency']
    .sort_values(ascending=False)
)
efficiency.plot(kind='barh', ax=axes[1, 1], color='green')
axes[1, 1].set_title('Efficiency (F1 / Latency)')
axes[1, 1].set_xlabel('Efficiency Score')

plt.tight_layout()
plt.show()

## Top-K Analysis

In [None]:
# How retrieval depth (top_k) affects quality and speed, per LLM.
topk_analysis = (
    df.groupby(['top_k', 'llm_model'])[['token_f1', 'latency_sec']]
    .mean()
    .reset_index()
)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (axis, metric column, y-axis label, panel title) for each of the two panels
panels = [
    (axes[0], 'token_f1', 'F1 Score', 'F1 Score vs Top-K by LLM'),
    (axes[1], 'latency_sec', 'Latency (seconds)', 'Latency vs Top-K by LLM'),
]

for ax, metric, y_label, title in panels:
    # One line per LLM; sort=False keeps the models in order of first appearance
    for llm, llm_rows in topk_analysis.groupby('llm_model', sort=False):
        ax.plot(llm_rows['top_k'], llm_rows[metric], marker='o', label=llm)
    ax.set_xlabel('Top-K')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Per-Question Analysis

In [None]:
# Analyze each question separately: print the question, its expected answer,
# the five best-scoring configurations, and the best configuration's answer.

def _truncate(text, limit):
    """Return `text` cut to `limit` characters, adding '...' only if it was cut."""
    return text[:limit] + "..." if len(text) > limit else text


SEPARATOR = '=' * 80

# groupby(sort=False) visits questions in order of first appearance and skips
# rows with a missing question_id, which would otherwise yield an empty slice.
for qid, q_data in df.groupby('question_id', sort=False):
    print(f"\n{SEPARATOR}")
    print(f"Question: {qid}")
    print(SEPARATOR)

    # Show question and expected answer (taken from the group's first row)
    first_row = q_data.iloc[0]
    print(f"\nQ: {first_row['question']}")
    # assumes 'expected' and 'answer' are strings — TODO confirm against the harness output
    print(f"\nExpected: {_truncate(first_row['expected'], 200)}")

    # Best performers for this question
    best = q_data.nlargest(5, 'token_f1')[[
        'embedding_model', 'llm_model', 'top_k', 'token_f1', 'latency_sec', 'llm_size_gb'
    ]]

    print("\nTop 5 Configurations:")
    print(best.to_string())

    # Without any F1 score there is no "best" row to look up
    if q_data['token_f1'].isna().all():
        print("\nBest Answer: n/a (no F1 scores recorded for this question)")
        continue

    # Show actual answer from best config (row lookup done once)
    best_row = q_data.loc[q_data['token_f1'].idxmax()]
    print(f"\nBest Answer ({best_row['config']}):")
    print(_truncate(best_row['answer'], 300))

## Diagnostic: Low F1 Scores

Investigate why F1 scores are low by inspecting a random sample of results, comparing the expected and actual answers side by side.

In [None]:
# Print up to three randomly chosen results in full so expected and actual
# answers can be compared by eye. The sample is unseeded, so it changes on
# every run.
HEAVY_RULE = "=" * 80
LIGHT_RULE = "-" * 80

print("Sample Results for Inspection:")
print(HEAVY_RULE)

sample_size = min(3, len(df))  # never request more rows than exist
for _, row in df.sample(sample_size).iterrows():
    print(f"\nConfig: {row['config']}")
    print(f"Question: {row['question']}")
    print(f"\nExpected:\n{row['expected']}")
    print(f"\nActual:\n{row['answer']}")
    print(f"\nF1: {row['token_f1']:.3f}")
    print(LIGHT_RULE)

## Recommendations

In [None]:
# Print three recommended configurations from `config_summary`:
# best mean F1, fastest among the high-F1 configurations, and best F1/latency.

def _print_config(title, row, metric_lines):
    """Print a heading, the configuration keys of `row`, and its metric lines.

    `row` is one row of `config_summary`; its `.name` is the
    (embedding_model, llm_model, top_k) index tuple.
    """
    embedding, llm, top_k = row.name
    print(f"\n{title}")
    print(f"   Embedding: {embedding}")
    print(f"   LLM: {llm}")
    print(f"   Top-K: {top_k}")
    for line in metric_lines:
        print(f"   {line}")


print("RECOMMENDATIONS")
print("=" * 80)

# Best overall: sort here rather than relying on the order left by an
# earlier cell.
best_overall = config_summary.sort_values('token_f1', ascending=False).iloc[0]
_print_config(
    "1. Best Overall Configuration:",
    best_overall,
    [
        f"Avg F1: {best_overall['token_f1']:.3f}",
        f"Avg Latency: {best_overall['latency_sec']:.2f}s",
    ],
)

# Fastest acceptable quality.
# The threshold must come from the same distribution it is compared against
# (per-configuration mean F1). The 90th percentile of individual per-question
# F1 scores can exceed every configuration's mean, which would leave the
# filter empty and make .iloc[0] raise IndexError.
threshold_f1 = config_summary['token_f1'].quantile(0.9)  # Top 10% of configs
fast_and_good = config_summary[
    config_summary['token_f1'] >= threshold_f1
].sort_values('latency_sec').iloc[0]
_print_config(
    "2. Fastest with Good Quality:",
    fast_and_good,
    [
        f"Avg F1: {fast_and_good['token_f1']:.3f}",
        f"Avg Latency: {fast_and_good['latency_sec']:.2f}s",
    ],
)

# Most efficient. The column is stored on config_summary so it remains
# available to later cells.
config_summary['efficiency'] = config_summary['token_f1'] / config_summary['latency_sec']
most_efficient = config_summary.sort_values('efficiency', ascending=False).iloc[0]
_print_config(
    "3. Most Efficient (F1/latency):",
    most_efficient,
    [f"Efficiency: {most_efficient['efficiency']:.4f}"],
)

print("\n" + "=" * 80)

## Export Detailed Results

Save the detailed per-result data and the per-configuration summary as CSV files for further analysis.

In [None]:
# Write both result tables to CSV in the current working directory.
output_file = 'experiment_results_detailed.csv'
summary_file = 'experiment_summary.csv'

# Row-level results: written without the DataFrame index
df.to_csv(output_file, index=False)
print(f"✓ Detailed results exported to: {output_file}")

# Per-configuration summary: the index is written too, so the configuration
# keys appear as leading columns
config_summary.to_csv(summary_file)
print(f"✓ Configuration summary exported to: {summary_file}")