In [8]:
"""
LLM Response Evaluation Pipeline

Evaluates AI responses against:
1. Response Relevance & Completeness
2. Hallucination / Factual Accuracy
3. Latency

Compatible with:
- sample-chat-conversation-01.json
- sample_context_vectors-01.json
"""

import json
import time
from typing import Dict, List

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class LLMEvaluationPipeline:
    """End-to-end evaluation pipeline for LLM responses.

    Scores the most recent user/AI exchange of a conversation on semantic
    relevance (reply vs. query) and hallucination risk (reply vs. retrieved
    context chunks), and reports how long the evaluation itself took.
    """

    def __init__(
        self,
        embedding_model: str = "all-MiniLM-L6-v2",
        relevance_threshold: float = 0.75,
        hallucination_threshold: float = 0.30
    ) -> None:
        """
        Args:
            embedding_model: Model name passed to ``SentenceTransformer``.
            relevance_threshold: Minimum relevance score required for PASS.
            hallucination_threshold: Maximum hallucination score allowed
                for PASS.
        """
        self.model = SentenceTransformer(embedding_model)
        self.relevance_threshold = relevance_threshold
        self.hallucination_threshold = hallucination_threshold

    # ------------------------------------------------------------------
    # Utility methods
    # ------------------------------------------------------------------
    def _embed(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the configured model, requesting normalized
        embeddings and no progress bar."""
        return self.model.encode(
            texts,
            normalize_embeddings=True,
            show_progress_bar=False
        )

    @staticmethod
    def _extract_last_user_and_ai_messages(
        conversation_json: Dict
    ) -> Dict[str, str]:
        """
        Extracts the latest AI reply and the user message that precedes it.

        Walks ``conversation_turns`` from the end: the first non-empty
        "AI/Chatbot" message found is the reply, and the next "User" turn
        encountered after that (i.e. earlier in the conversation) is the
        query. Missing or null fields yield empty strings.

        Returns:
            Dict with keys ``user_message`` and ``ai_message``.
        """
        user_message = ""
        ai_message = ""

        # `or []` also covers an explicit JSON null, which `.get(key, [])`
        # would pass through to reversed() and crash on.
        turns = conversation_json.get("conversation_turns") or []

        for turn in reversed(turns):
            role = turn.get("role")
            if not ai_message and role == "AI/Chatbot":
                ai_message = turn.get("message") or ""
            elif ai_message and role == "User":
                user_message = turn.get("message") or ""
                break

        return {
            "user_message": user_message,
            "ai_message": ai_message
        }

    @staticmethod
    def _extract_context_texts(context_json: Dict) -> List[str]:
        """
        Extracts all non-empty context chunk texts from the vector DB
        response, read from ``data.vector_data[*].text``.

        Missing or null ``data`` / ``vector_data`` yield an empty list.
        """
        data = context_json.get("data") or {}
        vectors = data.get("vector_data") or []
        return [
            vector["text"]
            for vector in vectors
            if vector.get("text")
        ]

    # ------------------------------------------------------------------
    # Evaluation metrics
    # ------------------------------------------------------------------
    def compute_relevance(
        self,
        user_message: str,
        ai_message: str
    ) -> float:
        """
        Computes semantic relevance between user query and AI response.

        Returns:
            Cosine similarity of the two embeddings, rounded to 3 decimals;
            0.0 when either message is blank.
        """
        # Two blank strings embed to the same vector and would otherwise
        # score as perfectly relevant, so treat blank input as irrelevant.
        if not (user_message or "").strip() or not (ai_message or "").strip():
            return 0.0

        embeddings = self._embed([user_message, ai_message])
        score = cosine_similarity(
            [embeddings[0]],
            [embeddings[1]]
        )[0][0]
        return round(float(score), 3)

    def compute_hallucination(
        self,
        ai_message: str,
        context_texts: List[str]
    ) -> float:
        """
        Computes hallucination score.
        Higher score => higher hallucination risk.

        The score is ``1 - max cosine similarity`` between the reply and any
        context chunk, clamped to [0, 1] and rounded to 3 decimals.

        Returns:
            1.0 when there is no context or the reply is blank (nothing can
            be grounded); otherwise the clamped score.
        """
        if not context_texts or not (ai_message or "").strip():
            return 1.0

        ai_embedding = self._embed([ai_message])[0]
        context_embeddings = self._embed(context_texts)

        similarities = cosine_similarity(
            [ai_embedding],
            context_embeddings
        )[0]

        max_similarity = float(np.max(similarities))
        # Cosine similarity spans [-1, 1] and can overshoot 1 by float error;
        # clamp so the score stays on the same 0..1 scale as the no-context
        # result above.
        hallucination_score = min(1.0, max(0.0, 1.0 - max_similarity))

        return round(hallucination_score, 3)

    @staticmethod
    def compute_latency(start_time: float) -> int:
        """Returns whole milliseconds elapsed since ``start_time``, which
        must be a ``time.time()`` timestamp."""
        return int((time.time() - start_time) * 1000)

    # ------------------------------------------------------------------
    # Main evaluation
    # ------------------------------------------------------------------
    def evaluate(
        self,
        conversation_json: Dict,
        context_json: Dict
    ) -> Dict:
        """
        Runs the full evaluation pipeline.

        Args:
            conversation_json: Conversation payload with ``conversation_turns``.
            context_json: Vector DB payload with ``data.vector_data``.

        Returns:
            Dict with ``relevance_score``, ``hallucination_score``,
            ``latency_ms`` and ``final_decision`` ("PASS" or "FAIL").
            ``latency_ms`` is the time spent inside this call (extraction
            plus scoring), not the response time of the evaluated LLM.
        """
        start_time = time.time()

        messages = self._extract_last_user_and_ai_messages(
            conversation_json
        )
        context_texts = self._extract_context_texts(context_json)

        relevance_score = self.compute_relevance(
            messages["user_message"],
            messages["ai_message"]
        )

        hallucination_score = self.compute_hallucination(
            messages["ai_message"],
            context_texts
        )

        latency_ms = self.compute_latency(start_time)

        # PASS requires both: relevant enough AND grounded enough.
        final_decision = (
            "PASS"
            if relevance_score >= self.relevance_threshold
            and hallucination_score <= self.hallucination_threshold
            else "FAIL"
        )

        return {
            "relevance_score": relevance_score,
            "hallucination_score": hallucination_score,
            "latency_ms": latency_ms,
            "final_decision": final_decision
        }


# ----------------------------------------------------------------------
# Script entry point
# ----------------------------------------------------------------------
def load_json(file_path: str) -> Dict:
    """Read the UTF-8 text file at ``file_path`` and return its parsed JSON."""
    with open(file_path, mode="r", encoding="utf-8") as source:
        raw_text = source.read()
    return json.loads(raw_text)


def main(
    conversation_path: str = "sample-chat-conversation-01.json",
    context_path: str = "sample_context_vectors-01.json"
) -> None:
    """Evaluate one conversation/context file pair and print the result.

    Args:
        conversation_path: JSON file holding the chat conversation.
        context_path: JSON file holding the retrieved context vectors.

    The defaults are the bundled sample files, so calling ``main()`` with no
    arguments behaves as before.
    """
    conversation_json = load_json(conversation_path)
    context_json = load_json(context_path)

    evaluator = LLMEvaluationPipeline()
    result = evaluator.evaluate(conversation_json, context_json)

    print(json.dumps(result, indent=2))


# Run the sample evaluation when this code executes as the main module,
# which includes running this cell in a notebook kernel.
if __name__ == "__main__":
    main()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

{
  "relevance_score": 0.718,
  "hallucination_score": 0.24,
  "latency_ms": 3309,
  "final_decision": "FAIL"
}


In [7]:
!pip install sentence_transformers --user

