# LLMOps Evaluation Playground

This notebook demonstrates how to evaluate the LLMOps chatbot system using various metrics and test cases.

## 1. Setup Environment

Import required libraries and set up the evaluation environment.

In [None]:
import json
import math
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Add project root to path
# When the kernel's working directory is named `notebooks`, the project root is
# taken to be its parent; otherwise the working directory itself is used.
project_root = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
# Guard the append so that re-running this cell does not accumulate duplicate
# entries on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import project modules
from app.config import Config
from app.chains import ChatChain
from eval.ragas_eval import RAGASEvaluator
from eval.promptfoo import PromptFooEvaluator

# Echo the resolved project root, then confirm that setup finished.
print(f"Project root: {project_root}", "Environment setup complete!", sep="\n")

## 2. Initialize System Components

Load configuration and initialize the chatbot system.

In [None]:
# Build the configuration object first: the chat chain and the RAGAS evaluator
# are both constructed from it.
config = Config()
chat_chain = ChatChain(config)

# Evaluators: RAGAS receives the config, PromptFoo is created without arguments.
ragas_eval = RAGASEvaluator(config)
promptfoo_eval = PromptFooEvaluator()

print("System components initialized!")

# Only report whether a key is present; the key value itself is never printed.
api_key_status = 'Yes' if config.OPENAI_API_KEY else 'No'
print(f"OpenAI API Key configured: {api_key_status}")

vector_store_path = config.VECTOR_STORE_PATH
print(f"Vector store path: {vector_store_path}")

## 3. Load Test Data

Load the evaluation test samples and examine the data structure.

In [None]:
# Load test samples
test_samples = ragas_eval.load_test_samples()

# Read the question list once with a safe default, so that a missing or empty
# 'questions' entry falls through to the "no samples" message below instead of
# raising KeyError part-way through the cell.
sample_questions = test_samples.get('questions') or []

print(f"Loaded {len(sample_questions)} test questions")
print("\nSample questions:")
for i, question in enumerate(sample_questions[:3]):
    print(f"{i+1}. {question}")

# Create DataFrame for easier analysis
if sample_questions:
    test_df = pd.DataFrame({
        'question': sample_questions,
        'ground_truth': test_samples['ground_truths'],
        # Default to a fresh empty list per row; `[[]] * n` would put the very
        # same list object in every row.
        'context': test_samples.get('contexts', [[] for _ in sample_questions])
    })

    display(test_df.head())
else:
    print("No test samples found. Please check the test_samples.json file.")

## 4. Run Chat System Tests

Test the chatbot system with our evaluation questions.

In [None]:
import asyncio

async def run_chat_tests(questions=None, chain=None):
    """Run the chat system on test questions.

    Args:
        questions: Optional list of question strings. When omitted, the
            'questions' entry of the notebook-level ``test_samples`` is used.
        chain: Optional object exposing an awaitable
            ``process_message(message=..., conversation_id=..., use_rag=...)``.
            When omitted, the notebook-level ``chat_chain`` is used.

    Returns:
        A list with one dict per question, in input order, with the keys
        'question_id', 'question', 'answer', 'sources' and 'conversation_id'.
        If processing a question raises, its 'answer' is ``"Error: <message>"``
        and 'sources' is empty, so one failure does not abort the run.
    """
    pending = (test_samples.get('questions') or []) if questions is None else questions
    bot = chat_chain if chain is None else chain
    total = len(pending)
    results = []

    for i, question in enumerate(pending):
        print(f"Processing question {i+1}/{total}: {question[:50]}...")
        conversation_id = f"test_{i}"

        try:
            # Test with RAG
            result = await bot.process_message(
                message=question,
                conversation_id=conversation_id,
                use_rag=True
            )

            results.append({
                'question_id': i,
                'question': question,
                'answer': result['response'],
                'sources': result.get('sources', []),
                # Fall back to the id we sent: a response that merely omits
                # this key should not be recorded as a failed question.
                'conversation_id': result.get('conversation_id', conversation_id)
            })

        except Exception as e:
            # Deliberately broad: record the failure and carry on with the
            # remaining questions.
            print(f"Error processing question {i}: {e}")
            results.append({
                'question_id': i,
                'question': question,
                'answer': f"Error: {str(e)}",
                'sources': [],
                'conversation_id': conversation_id
            })

    return results

# Run the tests
if test_samples.get('questions'):
    chat_results = await run_chat_tests()
    
    # Display results
    results_df = pd.DataFrame(chat_results)
    print(f"\nGenerated {len(chat_results)} responses")
    display(results_df[['question', 'answer']].head())
else:
    print("No questions to test. Skipping chat tests.")
    chat_results = []

## 5. RAGAS Evaluation

Evaluate the chat responses using RAGAS metrics.

In [None]:
if chat_results and test_samples.get('questions'):
    # Prepare data for RAGAS evaluation
    questions = [r['question'] for r in chat_results]
    answers = [r['answer'] for r in chat_results]

    # Build exactly one context list per question. Flattening the sources of
    # all results into a single list yields one entry per *source*, which only
    # lines up with the questions when every response has exactly one source.
    # Responses without sources get their own placeholder list (not a shared
    # one).
    # NOTE(review): sources are passed through unchanged; confirm they are
    # plain strings if the evaluator expects text contexts.
    contexts = [
        list(r.get('sources') or ['No context available'])
        for r in chat_results
    ]
    ground_truths = test_samples['ground_truths']

    # Run RAGAS evaluation
    ragas_results = ragas_eval.evaluate_rag_system(
        questions=questions,
        answers=answers,
        contexts=contexts,
        ground_truths=ground_truths
    )

    print("RAGAS Evaluation Results:")
    for metric, score in ragas_results.items():
        print(f"  {metric}: {score:.3f}")

    # Generate evaluation report
    report = ragas_eval.generate_evaluation_report(ragas_results)
    print("\n" + "="*50)
    print(report)
else:
    print("No chat results available for RAGAS evaluation.")
    ragas_results = {}

## 6. Visualize Results

Create visualizations of the evaluation metrics.

In [None]:
if ragas_results:
    metrics = list(ragas_results.keys())
    scores = list(ragas_results.values())

    # Explicit figure/axes objects instead of pyplot's implicit "current axes",
    # so each call states which panel it draws on.
    fig = plt.figure(figsize=(12, 6))

    # Left panel: bar plot of the metrics
    ax_bar = fig.add_subplot(1, 2, 1)
    bars = ax_bar.bar(metrics, scores, color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])
    ax_bar.set_title('RAGAS Evaluation Metrics')
    ax_bar.set_ylabel('Score')
    ax_bar.set_ylim(0, 1)
    ax_bar.tick_params(axis='x', labelrotation=45)

    # Add score labels on bars
    for bar, score in zip(bars, scores):
        ax_bar.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                    f'{score:.3f}', ha='center', va='bottom')

    # Right panel: radar chart. One evenly spaced angle per metric, using
    # math.pi rather than a hand-typed approximation of pi.
    ax_radar = fig.add_subplot(1, 2, 2, projection='polar')
    angles = [i * 2 * math.pi / len(metrics) for i in range(len(metrics))]
    angles += angles[:1]  # Repeat the first point to close the polygon
    scores_radar = scores + scores[:1]

    ax_radar.plot(angles, scores_radar, 'o-', linewidth=2)
    ax_radar.fill(angles, scores_radar, alpha=0.25)
    ax_radar.set_xticks(angles[:-1])
    ax_radar.set_xticklabels(metrics)
    ax_radar.set_ylim(0, 1)
    ax_radar.set_title('RAGAS Metrics Radar Chart')

    fig.tight_layout()
    plt.show()

    # Summary statistics
    avg_score = sum(scores) / len(scores)
    best_score = max(scores)
    worst_score = min(scores)
    print("\nSummary Statistics:")
    print(f"Average Score: {avg_score:.3f}")
    print(f"Best Metric: {metrics[scores.index(best_score)]} ({best_score:.3f})")
    print(f"Worst Metric: {metrics[scores.index(worst_score)]} ({worst_score:.3f})")
else:
    print("No RAGAS results to visualize.")

## 7. Response Quality Analysis

Analyze the quality and characteristics of generated responses.

In [None]:
if not chat_results:
    print("No chat results available for analysis.")
else:
    # Per-response text and source metrics. The sentence count is the number of
    # '.', '!' and '?' characters in the answer.
    response_stats = [
        {
            'length': len(entry['answer']),
            'word_count': len(entry['answer'].split()),
            'sentence_count': sum(entry['answer'].count(mark) for mark in '.!?'),
            'has_sources': len(entry.get('sources', [])) > 0,
            'source_count': len(entry.get('sources', []))
        }
        for entry in chat_results
    ]
    stats_df = pd.DataFrame(response_stats)

    # One figure on a 2x3 grid; five of the six slots are used.
    plt.figure(figsize=(15, 10))

    # Slots 1-2: histograms of word count and character length. They differ
    # only in column, labels and bar colour.
    histogram_panels = (
        (1, 'word_count', 'Response Word Count Distribution', 'Word Count', {}),
        (2, 'length', 'Response Character Length Distribution', 'Character Count', {'color': 'orange'}),
    )
    for slot, column, panel_title, x_label, extra_style in histogram_panels:
        plt.subplot(2, 3, slot)
        plt.hist(stats_df[column], bins=10, edgecolor='black', **extra_style)
        plt.title(panel_title)
        plt.xlabel(x_label)
        plt.ylabel('Frequency')

    # Slot 3: how many responses cited 0, 1, 2, ... sources
    plt.subplot(2, 3, 3)
    per_source_count = stats_df['source_count'].value_counts().sort_index()
    plt.bar(per_source_count.index, per_source_count.values, color='green')
    plt.title('Source Count Distribution')
    plt.xlabel('Number of Sources')
    plt.ylabel('Frequency')

    # Slot 4: mean / standard deviation table taken from DataFrame.describe()
    plt.subplot(2, 3, 4)
    plt.axis('off')
    described = stats_df.describe()
    table_rows = [
        [column, f"{described.loc['mean', column]:.1f}", f"{described.loc['std', column]:.1f}"]
        for column in ('word_count', 'length', 'source_count')
    ]
    summary_table = plt.table(
        cellText=table_rows,
        colLabels=['Metric', 'Mean', 'Std Dev'],
        cellLoc='center',
        loc='center',
    )
    summary_table.auto_set_font_size(False)
    summary_table.set_fontsize(10)
    plt.title('Response Statistics Summary')

    # Slot 5: text-only panel previewing the first two question/answer pairs
    plt.subplot(2, 3, 5)
    plt.axis('off')
    plt.text(0.1, 0.9, "Sample Responses:", fontsize=14, fontweight='bold', transform=plt.gca().transAxes)

    for row, entry in enumerate(chat_results[:2]):
        # Shorten long text so it fits the panel (40 chars for the question,
        # 60 for the answer), marking truncation with an ellipsis.
        question_preview = entry['question']
        if len(question_preview) > 40:
            question_preview = question_preview[:40] + "..."
        answer_preview = entry['answer']
        if len(answer_preview) > 60:
            answer_preview = answer_preview[:60] + "..."

        y_pos = 0.7 - row * 0.3
        plt.text(0.1, y_pos, f"Q: {question_preview}", fontsize=10, transform=plt.gca().transAxes)
        plt.text(0.1, y_pos - 0.1, f"A: {answer_preview}", fontsize=9, transform=plt.gca().transAxes,
                 style='italic', color='blue')

    plt.tight_layout()
    plt.show()

    # Print summary
    sourced = stats_df['has_sources']
    print("\nResponse Analysis Summary:")
    print(f"Average response length: {stats_df['word_count'].mean():.1f} words")
    print(f"Responses with sources: {sourced.sum()}/{len(stats_df)} ({sourced.mean()*100:.1f}%)")
    print(f"Average sources per response: {stats_df['source_count'].mean():.1f}")

## 8. Create Custom Test Cases

Create and run custom test cases for specific scenarios.

In [None]:
# Custom test cases. Each case pairs a question with keyword checks on the
# answer: run_custom_tests passes a case when no forbidden keyword appears and,
# if any expected keywords are listed, at least one of them is found
# (case-insensitive substring match).
custom_tests = [
    dict(
        description="Technical question about RAG",
        question="How does chunking affect RAG performance?",
        expected_keywords=["chunk", "performance", "retrieval"],
        forbidden_keywords=["don't know", "cannot"],
    ),
    dict(
        description="Operational question",
        question="What are the best practices for monitoring LLM applications?",
        expected_keywords=["monitoring", "metrics", "performance"],
        forbidden_keywords=["unsure", "unclear"],
    ),
    dict(
        description="Out-of-scope question",
        question="What's the weather like today?",
        expected_keywords=[],
        forbidden_keywords=["sunny", "rainy", "cloudy"],
    ),
]

# Run custom tests
async def run_custom_tests(tests=None, chain=None):
    """Run keyword-based custom test cases against the chat system.

    Args:
        tests: Optional list of case dicts with the keys 'description',
            'question', 'expected_keywords' and 'forbidden_keywords'. When
            omitted, the notebook-level ``custom_tests`` list is used.
        chain: Optional object exposing an awaitable
            ``process_message(message=..., conversation_id=..., use_rag=...)``.
            When omitted, the notebook-level ``chat_chain`` is used.

    Returns:
        A list with one dict per case, in input order, with the keys 'test',
        'question', 'answer', 'expected_found', 'forbidden_found', 'sources'
        and 'pass'. A case passes when no forbidden keyword occurs in the
        answer and, if it lists expected keywords, at least one of them occurs
        (case-insensitive substring match). A case whose processing raises is
        recorded as failed with the answer ``"Error: <message>"``.
    """
    cases = custom_tests if tests is None else tests
    bot = chat_chain if chain is None else chain
    custom_results = []

    for i, test in enumerate(cases):
        print(f"Running custom test {i+1}: {test['description']}")

        try:
            result = await bot.process_message(
                message=test['question'],
                conversation_id=f"custom_test_{i}",
                use_rag=True
            )

            # Check keywords
            answer_lower = result['response'].lower()
            expected_found = [kw for kw in test['expected_keywords'] if kw.lower() in answer_lower]
            forbidden_found = [kw for kw in test['forbidden_keywords'] if kw.lower() in answer_lower]

            # An empty expected list means "nothing required", so only the
            # forbidden check applies to such a case.
            expectation_met = not test['expected_keywords'] or bool(expected_found)

            custom_results.append({
                'test': test['description'],
                'question': test['question'],
                'answer': result['response'],
                'expected_found': expected_found,
                'forbidden_found': forbidden_found,
                'sources': result.get('sources', []),
                'pass': not forbidden_found and expectation_met
            })

        except Exception as e:
            # Report the failure as it happens (as run_chat_tests does) rather
            # than leaving it visible only in the recorded answer text.
            print(f"Error running custom test {i+1}: {e}")
            custom_results.append({
                'test': test['description'],
                'question': test['question'],
                'answer': f"Error: {str(e)}",
                'expected_found': [],
                'forbidden_found': [],
                'sources': [],
                'pass': False
            })

    return custom_results

# Execute custom tests
custom_test_results = await run_custom_tests()

# Display results
print("\nCustom Test Results:")
print("=" * 50)

for result in custom_test_results:
    status = "✅ PASS" if result['pass'] else "❌ FAIL"
    print(f"\n{status} - {result['test']}")
    print(f"Q: {result['question']}")
    print(f"A: {result['answer'][:100]}...")
    if result['expected_found']:
        print(f"✓ Expected keywords found: {result['expected_found']}")
    if result['forbidden_found']:
        print(f"✗ Forbidden keywords found: {result['forbidden_found']}")

# Summary
passed_tests = sum(1 for r in custom_test_results if r['pass'])
print(f"\nCustom Tests Summary: {passed_tests}/{len(custom_test_results)} passed")

## 9. Export Results

Save evaluation results for further analysis and reporting.

In [None]:
# Take one timestamp for the whole export so the JSON payload, both file names
# and the report header all refer to the same moment.
export_time = pd.Timestamp.now()
file_stamp = export_time.strftime("%Y%m%d_%H%M%S")

# Earlier cells may have been skipped, so look each result up by name and fall
# back to an empty value. At cell level globals() is the notebook namespace.
notebook_ns = globals()
ragas_metrics = notebook_ns.get('ragas_results') or {}
standard_results = notebook_ns.get('chat_results') or []
custom_results = notebook_ns.get('custom_test_results') or []
per_response_stats = notebook_ns.get('response_stats') or []
custom_passed = notebook_ns.get('passed_tests', 0)
stats_frame = notebook_ns.get('stats_df')
has_stats = stats_frame is not None and not stats_frame.empty

# Prepare export data
export_data = {
    'timestamp': export_time.isoformat(),
    'ragas_metrics': ragas_metrics,
    'test_results': standard_results,
    'custom_test_results': custom_results,
    'response_statistics': per_response_stats
}

# Save to JSON
output_dir = project_root / 'eval' / 'results'
output_dir.mkdir(parents=True, exist_ok=True)

output_file = output_dir / f'evaluation_results_{file_stamp}.json'

with open(output_file, 'w', encoding='utf-8') as f:
    # default=str stringifies any value json cannot serialise natively.
    json.dump(export_data, f, indent=2, default=str)

print(f"✅ Evaluation results exported to: {output_file}")

# Compute the report fields before building the f-string. Inside a replacement
# field everything after ':' is the format spec, so a conditional placed there
# is not evaluated as Python and raises "Invalid format specifier".
ragas_lines = "\n".join(f'  - {k}: {v:.3f}' for k, v in ragas_metrics.items())
avg_word_count = f"{stats_frame['word_count'].mean():.1f}" if has_stats else 'N/A'
responses_with_sources = stats_frame['has_sources'].sum() if has_stats else 0

if ragas_metrics:
    full_report = ragas_eval.generate_evaluation_report(ragas_metrics)
    # Keep only the text after the assessment heading; if the heading is
    # absent, show the whole report instead of failing on a missing index.
    _, marker, assessment = full_report.partition('Performance Assessment:')
    overall_assessment = assessment if marker else full_report
else:
    overall_assessment = 'No RAGAS evaluation performed'

# Create summary report
summary_report = f"""
LLMOps Evaluation Summary Report
Generated: {export_time.strftime('%Y-%m-%d %H:%M:%S')}

📊 RAGAS Metrics:
{ragas_lines}

📋 Test Results:
  - Standard tests: {len(standard_results)}
  - Custom tests: {len(custom_results)}
  - Custom tests passed: {custom_passed}

📈 Response Statistics:
  - Average word count: {avg_word_count}
  - Responses with sources: {responses_with_sources}

🎯 Overall Assessment:
{overall_assessment}
"""

print(summary_report)

# Save summary report. The report contains emoji, so the encoding is set
# explicitly rather than relying on the platform default, which may not be
# able to encode them.
report_file = output_dir / f'summary_report_{file_stamp}.txt'
with open(report_file, 'w', encoding='utf-8') as f:
    f.write(summary_report)

print(f"\n📄 Summary report saved to: {report_file}")