In [1]:
pip install rouge-score bert_score evaluate accelerate openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
from typing import List, Dict, Union, Any
import numpy as np
from datetime import datetime
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score as bert_score
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import evaluate
import json
import logging
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from elasticsearch import Elasticsearch


In [3]:
class RAGDataPreparator:
    """Turn an intents JSON file into test cases for RAG evaluation.

    The loaded document is read through ``raw_data['intents']``; each intent
    is read through its ``'tag'``, ``'patterns'`` and ``'responses'`` keys.
    """

    def __init__(self, data_path: str):
        """Load the intents document.

        Args:
            data_path: Path to the intents JSON file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        # JSON is UTF-8 by specification; being explicit avoids depending on
        # the platform's default locale encoding.
        with open(data_path, 'r', encoding='utf-8') as f:
            self.raw_data = json.load(f)

    def prepare_evaluation_data(self, num_samples: int = None) -> List[Dict]:
        """Build one test case per pattern of the selected intents.

        Args:
            num_samples: Number of intents (not patterns) to sample at random.
                When falsy, or not smaller than the number of intents, every
                intent is used in file order.

        Returns:
            A list of dicts with keys ``query``, ``ground_truth``, ``tag``,
            ``contexts`` (empty list) and ``generated_response`` (``None``).
            ``ground_truth`` is one randomly chosen response, or ``None`` when
            the intent has no responses.
        """
        intents = self.raw_data['intents']

        # Sample intents only when a smaller subset was requested.
        if num_samples and num_samples < len(intents):
            selected_intents = random.sample(intents, num_samples)
        else:
            selected_intents = intents

        test_cases = []
        for intent in selected_intents:
            responses = intent['responses']
            for pattern in intent['patterns']:
                # random.choice raises IndexError on an empty sequence, which
                # used to abort the whole preparation for a single intent
                # without responses. Such cases now carry no ground truth.
                ground_truth = random.choice(responses) if responses else None
                test_cases.append({
                    'query': pattern,
                    'ground_truth': ground_truth,
                    'tag': intent['tag'],
                    # Filled in later, once the RAG system is available.
                    'contexts': [],
                    'generated_response': None
                })

        return test_cases

    def get_contexts_from_elastic(self, es_client, index_name: str, query: str, k: int = 3) -> List[str]:
        """Fetch up to ``k`` context strings for ``query`` from Elasticsearch.

        Runs a ``multi_match`` (``best_fields``) query over the ``text`` and
        ``original_text`` fields and returns the ``text`` value of each hit
        (``''`` when a hit has no ``text``).

        Args:
            es_client: Object exposing ``search(index=..., body=...)``.
            index_name: Index to search.
            query: Query text.
            k: Maximum number of hits to request.

        Returns:
            The list of context strings, or ``[]`` if the search raised
            (best effort: the error is printed, not propagated).
        """
        search_query = {
            "size": k,
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": ["text", "original_text"],
                    "type": "best_fields"
                }
            }
        }

        try:
            response = es_client.search(index=index_name, body=search_query)
            return [hit['_source'].get('text', '') for hit in response['hits']['hits']]
        except Exception as e:
            print(f"Error retrieving contexts: {str(e)}")
            return []

In [4]:
# Load the intents file and draw a small sample of intents for evaluation
preparator = RAGDataPreparator('/home/jovyan/data/intents.json')
test_cases = preparator.prepare_evaluation_data(num_samples=3)

# Show the first generated test case as pretty-printed JSON
print("\nSample Test Case:")
print(json.dumps(test_cases[0], indent=2))

# Summarise the sample: number of cases and number of distinct intent tags
print("\nDataset Statistics:")
print(f"Total test cases: {len(test_cases)}")
unique_tags = len({case['tag'] for case in test_cases})
print(f"Unique intent tags: {unique_tags}")


Sample Test Case:
{
  "query": "What's the difference between sadness and depression?",
  "ground_truth": "Sadness is a normal reaction to a loss, disappointment, problems, or other difficult situations. Feeling sad from time to time is just another part of being human. In these cases, feelings of sadness go away quickly and you can go about your daily life. Other ways to talk about sadness might be feeling low, feeling down, or feeling blue.A person may say they are feeling depressed, but if it goes away on its own and doesn't impact life in a big way, it probably isn't the illness of depression. Depression is a mental illness that affects your mood, the way you understand yourself, and the way you understand and relate to things around you. It can also go by different names, such as clinical depression, major depressive disorder, or major depression. Depression can come up for no reason, and it lasts for a long time. It's much more than sadness or low mood. People who experience dep

In [5]:
class ComprehensiveRAGEvaluator:
    def __init__(self, llm_client, rag_system):
        """Store the collaborators and build the scoring helpers.

        Args:
            llm_client: Object whose ``generate(prompt)`` is used for judging.
            rag_system: Object whose ``generate_response(query)`` produces
                the answers under evaluation.
        """
        self.llm, self.rag_system = llm_client, rag_system

        # Sentence-embedding model used for the similarity metrics
        embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'
        self.model = SentenceTransformer(embedding_model_name)

        # ROUGE-1 / ROUGE-2 / ROUGE-L with stemming enabled
        rouge_variants = ['rouge1', 'rouge2', 'rougeL']
        self.rouge_scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)

        # Smoothing function handed to NLTK's sentence_bleu
        from nltk.translate.bleu_score import SmoothingFunction
        smoothing_factory = SmoothingFunction()
        self.smoothing = smoothing_factory.method1

        # Logging: configure the root logger at INFO and keep a module logger
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def evaluate_traditional_metrics(self, 
                                  query: str,
                                  generated_response: str,
                                  ground_truth: str,
                                  retrieved_contexts: List[str]) -> Dict[str, float]:
        """Traditional metrics with improved BLEU calculation"""
        metrics = {}
        
        try:
            # ROUGE Scores
            rouge_scores = self.rouge_scorer.score(ground_truth, generated_response)
            metrics['rouge1_f1'] = rouge_scores['rouge1'].fmeasure
            metrics['rouge2_f1'] = rouge_scores['rouge2'].fmeasure
            metrics['rougeL_f1'] = rouge_scores['rougeL'].fmeasure
            
            # BLEU Score with smoothing
            metrics['bleu'] = sentence_bleu(
                [ground_truth.split()],
                generated_response.split(),
                smoothing_function=self.smoothing
            )
            
            # Semantic Similarity using SentenceTransformer
            ground_truth_embedding = self.model.encode(ground_truth)
            response_embedding = self.model.encode(generated_response)
            
            similarity = cosine_similarity(
                [ground_truth_embedding],
                [response_embedding]
            )[0][0]
            metrics['semantic_similarity'] = float(similarity)
            
            # Context Relevance (if contexts available)
            if retrieved_contexts:
                query_embedding = self.model.encode(query)
                context_embeddings = self.model.encode(retrieved_contexts)
                context_similarities = cosine_similarity(
                    [query_embedding],
                    context_embeddings
                )[0]
                metrics['context_relevance'] = float(np.mean(context_similarities))
            else:
                metrics['context_relevance'] = 0.0
                
        except Exception as e:
            self.logger.error(f"Error in metric calculation: {str(e)}")
            metrics = {
                'rouge1_f1': 0.0,
                'rouge2_f1': 0.0,
                'rougeL_f1': 0.0,
                'bleu': 0.0,
                'semantic_similarity': 0.0,
                'context_relevance': 0.0
            }
            
        return metrics

    def _extract_json_from_text(self, text: str) -> Dict:
        """
        Enhanced JSON extraction and cleaning
        """
        try:
            # If text is already a dict, return it
            if isinstance(text, dict):
                return text
                
            # Find JSON pattern
            import re
            json_pattern = r'\{[\s\S]*\}'
            match = re.search(json_pattern, text)
            
            if match:
                json_str = match.group()
                
                # Clean up JSON string
                json_str = (
                    json_str
                    .replace("'", '"')  # Replace single quotes
                    .replace('\n', ' ')  # Remove newlines
                    .replace('None', 'null')  # Replace Python None
                )
                
                # Remove trailing commas before closing braces/brackets
                json_str = re.sub(r',\s*([\]}])', r'\1', json_str)
                
                # Clean up any double commas
                json_str = re.sub(r',\s*,', ',', json_str)
                
                # Remove comments if any
                json_str = re.sub(r'//.*?[\n\r]', '', json_str)
                json_str = re.sub(r'/\*.*?\*/', '', json_str, flags=re.DOTALL)
                
                try:
                    # Try parsing the cleaned JSON
                    return json.loads(json_str)
                except json.JSONDecodeError as e:
                    # If still failing, try more aggressive cleaning
                    # Remove all whitespace between brackets
                    json_str = re.sub(r'\s+(?=[^"]*(?:"[^"]*"[^"]*)*$)', '', json_str)
                    # Ensure property names are quoted
                    json_str = re.sub(r'([{,])\s*([a-zA-Z0-9_]+):', r'\1"\2":', json_str)
                    return json.loads(json_str)
            else:
                raise ValueError("No JSON object found in response")
                
        except Exception as e:
            self.logger.error(f"JSON extraction failed: {str(e)}\nOriginal text: {text}")
            # Return a valid default structure
            return self._get_default_evaluation()
    
    def llm_judge_evaluation(self,
                           query: str,
                           generated_response: str,
                           ground_truth: str = None) -> Dict[str, Any]:
        """
        Improved LLM evaluation with stricter JSON formatting
        """
        try:
            evaluation_prompt = f"""Evaluate this Q&A interaction. Respond with a JSON object exactly matching this structure:
    
    Question: {query}
    Generated Response: {generated_response}
    {"Reference Answer: " + ground_truth if ground_truth else ""}
    
    Required JSON structure:
    {{
        "scores": {{
            "relevance": {{"score": 0, "explanation": "text"}},
            "accuracy": {{"score": 0, "explanation": "text"}},
            "completeness": {{"score": 0, "explanation": "text"}},
            "clarity": {{"score": 0, "explanation": "text"}}
        }},
        "overall_score": 0,
        "feedback": "text"
    }}
    
    Ensure:
    - No trailing commas
    - All property names in double quotes
    - No comments
    - Valid JSON only
    """
    
            response = self.llm.generate(evaluation_prompt)
            evaluation = self._extract_json_from_text(response)
            
            # Validate structure
            if not self._validate_evaluation_structure(evaluation):
                self.logger.warning("Invalid evaluation structure, using default")
                return self._get_default_evaluation()
                
            return evaluation
                
        except Exception as e:
            self.logger.error(f"LLM evaluation failed: {str(e)}")
            return self._get_default_evaluation()

    def _get_default_evaluation(self) -> Dict[str, Any]:
        """Return default evaluation when LLM evaluation fails"""
        return {
            "scores": {
                "relevance": {"score": 0, "explanation": "Evaluation failed"},
                "accuracy": {"score": 0, "explanation": "Evaluation failed"},
                "completeness": {"score": 0, "explanation": "Evaluation failed"},
                "clarity": {"score": 0, "explanation": "Evaluation failed"}
            },
            "overall_score": 0,
            "feedback": "Evaluation failed"
        }

    def _validate_evaluation_structure(self, evaluation: Dict) -> bool:
        """
        Validate the evaluation structure
        """
        try:
            required_fields = {
                'scores': {
                    'relevance': ['score', 'explanation'],
                    'accuracy': ['score', 'explanation'],
                    'completeness': ['score', 'explanation'],
                    'clarity': ['score', 'explanation']
                },
                'overall_score': None,
                'feedback': None
            }
            
            # Check main fields
            if not all(field in evaluation for field in required_fields):
                return False
                
            # Check scores structure
            scores = evaluation.get('scores', {})
            for category, fields in required_fields['scores'].items():
                if category not in scores:
                    return False
                if fields:
                    if not all(field in scores[category] for field in fields):
                        return False
                        
            return True
            
        except Exception:
            return False
    
    def aqa_evaluation(self, original_answer: str) -> Dict[str, float]:
        """
        Improved AQA evaluation with stricter JSON handling
        """
        try:
            comparison_prompt = f"""Compare these answers and provide scores as a JSON object:
    
    Original Answer: {original_answer}
    Generated Answer: {self.rag_system.generate_response(original_answer)}
    
    Required JSON structure:
    {{
        "semantic_similarity": 0.0,
        "factual_consistency": 0.0,
        "information_coverage": 0.0
    }}
    
    Ensure:
    - Scores between 0.0 and 1.0
    - No trailing commas
    - All property names in double quotes
    - Valid JSON only"""
    
            response = self.llm.generate(comparison_prompt)
            scores = self._extract_json_from_text(response)
            
            # Validate and normalize scores
            return self._normalize_scores(scores)
            
        except Exception as e:
            self.logger.error(f"AQA evaluation failed: {str(e)}")
            return {
                "semantic_similarity": 0.0,
                "factual_consistency": 0.0,
                "information_coverage": 0.0
            }


    def _normalize_scores(self, scores: Dict) -> Dict[str, float]:
        """
        Normalize scores to ensure they're valid floats between 0 and 1
        """
        required_fields = ["semantic_similarity", "factual_consistency", "information_coverage"]
        normalized = {}
        
        for field in required_fields:
            try:
                value = float(scores.get(field, 0))
                normalized[field] = max(0.0, min(1.0, value))
            except (TypeError, ValueError):
                normalized[field] = 0.0
                
        return normalized
        
    def _get_default_evaluation(self) -> Dict[str, Any]:
        """
        Return default evaluation with proper score formatting
        """
        return {
            "scores": {
                "relevance": {"score": 0, "explanation": "Evaluation failed"},
                "accuracy": {"score": 0, "explanation": "Evaluation failed"},
                "completeness": {"score": 0, "explanation": "Evaluation failed"},
                "clarity": {"score": 0, "explanation": "Evaluation failed"}
            },
            "overall_score": 0,
            "feedback": "Evaluation failed"
        }
    
    def comprehensive_evaluation(self, 
                               test_cases: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Evaluate every test case with all three methods.

        For each case the generated response is taken from the case itself
        or, when missing/empty, produced by ``self.rag_system``. Traditional
        metrics and the AQA comparison run only when a ground truth is
        present (their entries are ``None`` otherwise); the LLM judge always
        runs.

        Returns:
            A dict with parallel lists ``traditional_metrics``,
            ``llm_judge_results`` and ``aqa_results``, plus
            ``per_query_results`` bundling everything per query.
        """
        collected = {
            'traditional_metrics': [],
            'llm_judge_results': [],
            'aqa_results': [],
            'per_query_results': []
        }

        for case in tqdm(test_cases, desc="Evaluating test cases"):
            query = case['query']
            ground_truth = case.get('ground_truth')
            contexts = case.get('contexts', [])

            # Fall back to the RAG system when the case carries no response
            generated_response = case.get('generated_response')
            if not generated_response:
                generated_response = self.rag_system.generate_response(query)

            # 1. Traditional metrics (need a reference)
            trad_metrics = None
            if ground_truth:
                trad_metrics = self.evaluate_traditional_metrics(
                    query, generated_response, ground_truth, contexts
                )

            # 2. LLM judge (reference optional)
            llm_evaluation = self.llm_judge_evaluation(
                query, generated_response, ground_truth
            )

            # 3. AQA comparison (needs a reference)
            aqa_results = self.aqa_evaluation(ground_truth) if ground_truth else None

            # Aggregate lists
            collected['traditional_metrics'].append(trad_metrics)
            collected['llm_judge_results'].append(llm_evaluation)
            collected['aqa_results'].append(aqa_results)

            # Per-query bundle
            per_query = {
                'query': query,
                'generated_response': generated_response,
                'ground_truth': ground_truth,
                'traditional_metrics': trad_metrics,
                'llm_evaluation': llm_evaluation,
                'aqa_results': aqa_results
            }
            collected['per_query_results'].append(per_query)

        return collected

    def generate_report(self, results: Dict[str, Any], output_file: str = None) -> str:
        """
        Generate evaluation report with better error handling and type conversion
        """
        if not results:
            logger.warning("No evaluation results available")
            return "No results to report"
            
        report = ["# RAG System Comprehensive Evaluation Report"]
        report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
        
        # 1. Traditional Metrics Summary
        report.append("\n## Traditional Metrics Summary")
        if results.get('traditional_metrics'):
            trad_metrics_df = pd.DataFrame([r for r in results['traditional_metrics'] if r is not None])
            if not trad_metrics_df.empty:
                report.append("\nAverage Scores:")
                # Convert to numeric, replacing non-numeric values with 0
                numeric_means = trad_metrics_df.apply(pd.to_numeric, errors='coerce').mean()
                report.append(numeric_means.to_string())
        
        # 2. LLM Judge Summary
        report.append("\n## LLM Judge Evaluation Summary")
        llm_scores = []
        critiques = []
        for r in results.get('llm_judge_results', []):
            if r and isinstance(r, dict):
                if 'overall_score' in r:
                    try:
                        score = float(r['overall_score'])
                        llm_scores.append(score)
                    except (ValueError, TypeError):
                        continue
                if 'feedback' in r:
                    critiques.append(str(r['feedback']))
        
        if llm_scores:
            report.append(f"\nAverage Overall Score: {np.mean(llm_scores):.2f}")
        if critiques:
            report.append("\nSample Feedback:")
            report.extend([f"- {c}" for c in critiques[:3]])  # Show only first 3 critiques
        
        # 3. AQA Results Summary
        report.append("\n## A→Q→A' Evaluation Summary")
        if results.get('aqa_results'):
            aqa_metrics = []
            for r in results['aqa_results']:
                if isinstance(r, dict):
                    # Convert all values to float or use 0.0
                    metrics = {}
                    for k, v in r.items():
                        try:
                            metrics[k] = float(v)
                        except (ValueError, TypeError):
                            metrics[k] = 0.0
                    aqa_metrics.append(metrics)
            
            if aqa_metrics:
                aqa_df = pd.DataFrame(aqa_metrics)
                report.append("\nAverage Scores:")
                report.append(aqa_df.mean().to_string())
        
        # 4. Detailed Analysis
        report.append("\n## Detailed Query Analysis")
        if results.get('per_query_results'):
            for idx, query_result in enumerate(results['per_query_results'], 1):
                if isinstance(query_result, dict):
                    report.append(f"\n### Query {idx}")
                    report.append(f"Query: {query_result.get('query', 'N/A')}")
                    if 'generated_response' in query_result:
                        report.append(f"Generated Response: {query_result['generated_response']}")
                    if 'ground_truth' in query_result:
                        report.append(f"Ground Truth: {query_result['ground_truth']}")
        
        # Save report
        report_text = "\n".join(report)
        if output_file:
            try:
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(report_text)
            except Exception as e:
                self.logger.error(f"Error saving report: {str(e)}")
        
        return report_text

    def _create_visualizations(self, results: Dict[str, Any], output_prefix: str = None):
        """
        Create visualization plots for the results
        """
        # 1. Traditional Metrics Distribution
        trad_metrics_df = pd.DataFrame([r for r in results['traditional_metrics'] if r is not None])
        if not trad_metrics_df.empty:
            plt.figure(figsize=(10, 6))
            sns.boxplot(data=trad_metrics_df)
            plt.xticks(rotation=45)
            plt.title("Distribution of Traditional Metrics")
            if output_prefix:
                plt.savefig(f"{output_prefix}_traditional_metrics.png")
            plt.close()
        
        # 2. LLM Judge Scores
        llm_scores = [r['overall_score'] for r in results['llm_judge_results'] if r and 'overall_score' in r]
        if llm_scores:
            plt.figure(figsize=(8, 6))
            sns.histplot(llm_scores, bins=10)
            plt.title("Distribution of LLM Judge Scores")
            if output_prefix:
                plt.savefig(f"{output_prefix}_llm_scores.png")
            plt.close()
        
        # 3. AQA Metrics
        aqa_metrics_df = pd.DataFrame([r for r in results['aqa_results'] if r is not None])
        if not aqa_metrics_df.empty:
            plt.figure(figsize=(8, 6))
            sns.boxplot(data=aqa_metrics_df[['semantic_similarity', 'factual_consistency', 'information_coverage']])
            plt.title("A→Q→A' Evaluation Metrics")
            if output_prefix:
                plt.savefig(f"{output_prefix}_aqa_metrics.png")
            plt.close()

In [10]:
class LocalLLMClient:
    """Text generator backed by a local Hugging Face ``text-generation`` pipeline."""

    def __init__(self, model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
        """Build the generation pipeline.

        Args:
            model_name: Model identifier passed to the pipeline.
        """
        # Imported here so the class does not depend on another notebook
        # cell having imported torch/transformers first.
        import torch
        from transformers import pipeline

        self.pipeline = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
    
    def generate(self, prompt: str) -> str:
        """Generate a completion for ``prompt``.

        Sampling is enabled (temperature 0.7, top-p 0.9) with at most 200
        new tokens and a single returned sequence.

        Returns:
            The newly generated text only, or ``""`` if generation raised
            (best effort: the error is printed, not propagated).
        """
        try:
            response = self.pipeline(
                prompt,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                num_return_sequences=1,
                # By default the pipeline returns prompt + completion. The
                # prompt would then be scored as part of the answer, and the
                # JSON template inside judge prompts would be picked up by
                # the JSON extractor.
                return_full_text=False
            )
            return response[0]['generated_text']
        except Exception as e:
            print(f"Error in generation: {str(e)}")
            return ""

class RAGSystem:
    """Minimal retrieve-then-generate pipeline over Elasticsearch and an LLM."""

    def __init__(self, es_client, llm_client, index_name: str = "qa_index_2"):
        """Store the collaborators.

        Args:
            es_client: Object exposing ``search(index=..., body=...)``.
            llm_client: Object exposing ``generate(prompt) -> str``.
            index_name: Index searched for contexts. Defaults to the index
                that used to be hard-coded, so existing callers are
                unaffected.
        """
        self.es = es_client
        self.llm = llm_client
        self.index_name = index_name
    
    def generate_response(self, query: str) -> str:
        """Answer ``query`` using retrieved contexts.

        Returns:
            The LLM output, or ``""`` if prompt creation or generation
            raised (best effort: the error is printed, not propagated).
        """
        try:
            # 1. Retrieve contexts
            contexts = self._retrieve_contexts(query)
            
            # 2. Create prompt
            prompt = self._create_prompt(query, contexts)
            
            # 3. Generate response
            return self.llm.generate(prompt)
        except Exception as e:
            print(f"Error in RAG response: {str(e)}")
            return ""
    
    def _retrieve_contexts(self, query: str, k: int = 3) -> list:
        """Return the ``text`` field of up to ``k`` hits for ``query``.

        Uses a ``multi_match`` (``best_fields``) query over ``text`` and
        ``original_text``. Returns ``[]`` when the search raises.
        """
        try:
            search_query = {
                "size": k,
                "query": {
                    "multi_match": {
                        "query": query,
                        "fields": ["text", "original_text"],
                        "type": "best_fields"
                    }
                }
            }
            
            response = self.es.search(index=self.index_name, body=search_query)
            return [hit['_source'].get('text', '') for hit in response['hits']['hits']]
        except Exception as e:
            print(f"Error retrieving contexts: {str(e)}")
            return []
    
    def _create_prompt(self, query: str, contexts: list) -> str:
        """Build the chat-style prompt with the space-joined contexts as system text."""
        context_text = " ".join(contexts)
        return f"""<|system|>
Use this context to answer: {context_text}

<|user|>
{query}

<|assistant|>
"""

In [11]:
# Connect to the Elasticsearch service used for retrieval
es = Elasticsearch("http://elasticsearch:9200")

# Report whether the cluster answers a ping
print(
    "Connected to Elasticsearch successfully!"
    if es.ping()
    else "Failed to connect to Elasticsearch."
)

# Fetch the mapping of qa_index, then show it in full and narrowed to its field properties
index_mappings = es.indices.get_mapping(index="qa_index")
print(index_mappings)
print(index_mappings['qa_index']['mappings']['properties'])

Connected to Elasticsearch successfully!
{'qa_index': {'mappings': {'properties': {'question_text_vector_knn': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'question_vector_knn': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'response': {'type': 'text'}, 'text': {'type': 'text', 'analyzer': 'rag_analyzer'}, 'text_vector_knn': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'vector_combined_knn': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}}}}}
{'question_text_vector_knn': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'cosine', 'index_options': {'type': '

In [12]:
from openai import OpenAI

class OpenAIClient:
    """Thin wrapper exposing ``generate(prompt)`` on top of OpenAI chat completions."""

    def __init__(self, api_key, model: str = "gpt-3.5-turbo"):
        """Create the API client.

        Args:
            api_key: OpenAI API key.
            model: Chat model to call. Defaults to the model that used to be
                hard-coded, so existing callers are unaffected.
        """
        self.client = OpenAI(api_key=api_key)
        self.model = model
    
    def generate(self, prompt: str) -> str:
        """Send ``prompt`` as the user message and return the reply text.

        The system message frames the model as an evaluator of
        question-answering systems. API errors propagate to the caller.

        Returns:
            The content of the first choice, or ``""`` when the API returns
            no content, so the declared ``str`` return type always holds.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are an expert evaluator of question-answering systems."},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content or ""

In [13]:
import torch
from transformers import pipeline


# Initialize all components
import os

# This run uses the OpenAI-backed client both as generator and as judge.
# The key is read from the OPENAI_API_KEY environment variable so that it is
# never stored in the notebook; a missing variable raises KeyError here.
llm_client = OpenAIClient(api_key=os.environ["OPENAI_API_KEY"])
rag_system = RAGSystem(es, llm_client)
evaluator = ComprehensiveRAGEvaluator(llm_client, rag_system)

# Get contexts
for test_case in test_cases:
    contexts = preparator.get_contexts_from_elastic(
        es,
        "qa_index_2",
        test_case['query']
    )
    test_case['contexts'] = contexts

# Run evaluation
results = evaluator.comprehensive_evaluation(test_cases)


# Generate report
report = evaluator.generate_report(results, "rag_evaluation_report_gpt35_new")

Evaluating test cases:  57%|█████▋    | 4/7 [00:27<00:20,  6.91s/it]ERROR:__main__:JSON extraction failed: Expecting ',' delimiter: line 1 column 119 (char 118)
Original text: {
    "scores": {
        "relevance": {"score": 0, "explanation": "The generated response does not directly address the user's request for advice."},
        "accuracy": {"score": 1, "explanation": "The information provided is accurate and relevant to mental health concerns."},
        "clarity": {"score": 1, "explanation": "The response is clear and well-structured, providing information in a supportive manner."}
    },
    "overall_score": 0.75,
    "feedback": "While the generated response provides valuable information on mental health and the importance of seeking help, it does not directly address the user's request for advice. Consider incorporating specific advice or suggestions in the response to better align with the user's initial query."
}
Evaluating test cases: 100%|██████████| 7/7 [00:46<00:00,  6.6

In [14]:
from openai import OpenAI

class OpenAIClient:
    """Thin wrapper exposing ``generate(prompt)`` on top of OpenAI chat completions."""

    def __init__(self, api_key, model: str = "gpt-4o"):
        """Create the API client.

        Args:
            api_key: OpenAI API key.
            model: Chat model to call. Defaults to the model that used to be
                hard-coded in this cell, so existing callers are unaffected.
        """
        self.client = OpenAI(api_key=api_key)
        self.model = model
    
    def generate(self, prompt: str) -> str:
        """Send ``prompt`` as the user message and return the reply text.

        The system message frames the model as an evaluator of
        question-answering systems. API errors propagate to the caller.

        Returns:
            The content of the first choice, or ``""`` when the API returns
            no content, so the declared ``str`` return type always holds.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are an expert evaluator of question-answering systems."},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content or ""

In [15]:
import torch
from transformers import pipeline


# Initialize all components
import os

# This run uses the OpenAI-backed client both as generator and as judge.
# The key is read from the OPENAI_API_KEY environment variable so that it is
# never stored in the notebook; a missing variable raises KeyError here.
llm_client = OpenAIClient(api_key=os.environ["OPENAI_API_KEY"])
rag_system = RAGSystem(es, llm_client)
evaluator = ComprehensiveRAGEvaluator(llm_client, rag_system)

# Get contexts
for test_case in test_cases:
    contexts = preparator.get_contexts_from_elastic(
        es,
        "qa_index_2",
        test_case['query']
    )
    test_case['contexts'] = contexts

# Run evaluation
results = evaluator.comprehensive_evaluation(test_cases)


# Generate report
report = evaluator.generate_report(results, "rag_evaluation_report_gpt4o_new")

Evaluating test cases:  57%|█████▋    | 4/7 [00:46<00:32, 10.96s/it]ERROR:__main__:JSON extraction failed: Expecting ',' delimiter: line 1 column 923 (char 922)
Original text: ```json
{
    "scores": {
        "relevance": {
            "score": 1,
            "explanation": "The response focuses on providing advice related to mental health, which is a specific interpretation of the general request for advice, indicating a narrow but relevant response."
        },
        "accuracy": {
            "score": 2,
        },
        "completeness": {
            "score": 1,
            "explanation": "The response provides a detailed list of signs of mental illness but does not address other types of advice that the user might be seeking."
        },
        "clarity": {
            "score": 2,
            "explanation": "The response clearly lists the signs and symptoms, making it easy for the user to understand the information provided."
        }
    },
    "overall_score": 1,
    "feedb

In [None]:
import torch
from transformers import pipeline


# Build the pipeline: local TinyLlama client, RAG system and evaluator
llm_client = LocalLLMClient()
rag_system = RAGSystem(es, llm_client)
evaluator = ComprehensiveRAGEvaluator(llm_client, rag_system)

# Attach retrieved contexts to every test case
for test_case in test_cases:
    contexts = preparator.get_contexts_from_elastic(
        es_client=es,
        index_name="qa_index_2",
        query=test_case['query']
    )
    test_case['contexts'] = contexts

# Evaluate all test cases
results = evaluator.comprehensive_evaluation(test_cases)

# Render the report and write it to disk
report = evaluator.generate_report(results, output_file="rag_evaluation_report")