# Hallucination Detection using Entropy Metrics

This notebook benchmarks the effectiveness of entropy metrics and an LLM-as-a-judge for detecting hallucinations in LLM responses, using the HaluEval and TruthfulQA datasets.

# Getting started
Install and import the necessary libraries.

In [None]:
# Install required packages
!pip install -qqq datasets litellm torch numpy pandas scikit-learn tqdm vllm model2vec together

In [None]:
!git clone https://github.com/klara-research/klarity

In [None]:
cd klarity

In [None]:
!pip install -e .

In [None]:
import datasets
import litellm
import numpy as np
import pandas as pd
from model2vec import StaticModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import torch
import math
from dotenv import load_dotenv
from typing import List, Dict, Tuple
from vllm import LLM, SamplingParams

from klarity.core.analyzer import EntropyAnalyzer
from klarity.estimator import UncertaintyEstimator
from klarity.models import TokenInfo

from transformers import AutoTokenizer


load_dotenv()


In [2]:
import os
from getpass import getpass

# Set the API key
# Keep the secret out of the notebook: reuse a key that is already present in
# the environment (for example one loaded by load_dotenv()) and only prompt
# for it interactively when it is missing.
if not os.environ.get("TOGETHERAI_API_KEY"):
    os.environ["TOGETHERAI_API_KEY"] = getpass("Together AI API key: ")

# Get HuggingFace dataset
Download the hallucination datasets from the Hugging Face Hub.

In [3]:
# Load the HaluEval dataset
def get_halueval_dataset(split_name: str = "qa"):
    """
    Fetch HaluEval from the Hugging Face Hub and print its size.

    Args:
        split_name (str): Forwarded as the second argument of
            `datasets.load_dataset`. Default is "qa".

    Returns:
        The object returned by `datasets.load_dataset`. Its 'train' entry is
        read here only to report the number of rows.
    """
    halueval = datasets.load_dataset("notrichardren/HaluEval", split_name)
    train_rows = len(halueval['train'])
    print(f"Dataset size: {train_rows}")
    return halueval

In [4]:
def get_trutfulqa_dataset(split_name: str = "multiple_choice"):
    """
    Fetch TruthfulQA from the Hugging Face Hub and print its size.

    Args:
        split_name (str): Forwarded as the second argument of
            `datasets.load_dataset`. Default is "multiple_choice".

    Returns:
        The object returned by `datasets.load_dataset`. Its 'validation' entry
        is read here only to report the number of rows.
    """
    truthfulqa = datasets.load_dataset("truthfulqa/truthful_qa", split_name)
    validation_rows = len(truthfulqa['validation'])
    print(f"Dataset size: {validation_rows}")
    return truthfulqa

In [None]:
# Load the TruthfulQA validation split. No sub-sampling happens here: the whole
# split is kept in the notebook-level `ds` that the prediction functions iterate over.
ds = get_trutfulqa_dataset()
ds = ds['validation']
len(ds)

# Initialize Klarity
Set up Klarity's `UncertaintyEstimator`, which is used to compute the entropy metrics.

In [None]:
# `uncertainty_estimator` is read as a notebook-level global inside get_vllm_response.
# NOTE(review): top_k=5 presumably needs to match the logprobs=5 requested in the
# sampling parameters defined further down; confirm against Klarity's docs.
entropy_analyzer = EntropyAnalyzer()
uncertainty_estimator = UncertaintyEstimator(top_k=5, analyzer=entropy_analyzer)

# Instantiate a vllm model
This is the model being evaluated. The judge model is the one that reviews its answers.

In [None]:
# Model under evaluation, loaded locally through vLLM.
model_name = "Qwen/Qwen2.5-7B-Instruct"
# Judge model identifier; it is passed to check_answers, which forwards it to litellm.
judge_model_name = "together_ai/Qwen/Qwen2.5-7B-Instruct-Turbo"
llm = LLM(model=model_name)
# The tokenizer is used to apply the chat template before generation.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Instantiate the sampling params.
We want to get the logprobs of the top-k tokens to compute entropy.

In [8]:
# Generation settings shared by every get_vllm_response call:
# - max_tokens caps each answer at 128 generated tokens
# - temperature=0.0 requests greedy decoding
# - logprobs=5 asks vLLM for the top-5 log-probabilities per generated token,
#   which the entropy computation relies on
sampling_params = SamplingParams(
    max_tokens=128,
    temperature=0.0,
    logprobs=5
)

# VLLM response data

This cell defines a function `get_vllm_response` that generates a response using a vLLM model
and calculates entropy metrics for the generated text. The function takes five parameters:
 - llm: A vLLM model instance
 - tokenizer: The tokenizer for the model
 - sampling_params: Parameters for text generation
 - text: The input prompt
 - dataset_name: Selects the system prompt ("halueval" for context-based QA, any other value for multiple-choice questions)

 The function returns a tuple containing:
1. The generated text
2. The analysis results returned by the Klarity uncertainty estimator
3. The mean entropy of the generated tokens
4. The mean semantic entropy of the generated tokens

This function is used to analyze the uncertainty and potential hallucinations
in the model's responses by leveraging entropy-based metrics.


In [9]:
def get_vllm_response(
    llm: LLM,
    tokenizer: AutoTokenizer,
    sampling_params: SamplingParams,
    text: str,
    dataset_name: str = "halueval"
) -> Tuple[str, Dict, float, float]:
    """
    Generate one answer with vLLM and summarise its token-level entropy.

    Args:
        llm: The vllm instance
        tokenizer: The tokenizer of the model, used to apply the chat template
        sampling_params: The sampling parameters for response generation
        text: Input text placed in the user message
        dataset_name: Selects the system prompt. "halueval" uses the
            context-based question answering prompt; any other value uses
            the multiple-choice prompt.

    Returns:
        Tuple containing:
        - Generated answer text
        - Result of `uncertainty_estimator.analyze_generation` for the generation
        - Mean of `raw_entropy` over the returned token metrics
        - Mean of `semantic_entropy` over the returned token metrics

    Note:
        Relies on the notebook-level `uncertainty_estimator`.
    """
    # Pick the system prompt for the requested benchmark.
    if dataset_name == "halueval":
        system_prompt = """\
                You are a question answering assistant. Use the provided context to answer the question.\
                Respond with only the answer and no other context"""
    else:
        system_prompt = """\
                You are a MCQ answering agent. Answer the following question by picking the correct choice. You will
                respond with just the correct MCQ choice id i.e. A, B, C, D, ....
                """

    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    # A single prompt is submitted, so only the first result is of interest.
    generation = llm.generate(prompt, sampling_params)[0]
    answer = generation.outputs[0].text

    analysis_results = uncertainty_estimator.analyze_generation(generation)
    # Collect both entropies in one pass over the token metrics.
    entropy_pairs = [
        (metric.raw_entropy, metric.semantic_entropy)
        for metric in analysis_results.token_metrics
    ]
    raw_entropies = [pair[0] for pair in entropy_pairs]
    semantic_entropies = [pair[1] for pair in entropy_pairs]
    return answer, analysis_results, np.mean(raw_entropies), np.mean(semantic_entropies)

In [None]:
sample_queries = ["What is the capital of France?", "What is the capital of Spain?"]
answer, result, mean_entropy, mean_semantic_entropy = get_vllm_response(llm, tokenizer, sampling_params, sample_queries[1])

In [None]:
answer

In [None]:
ds[0]

# Evaluate entropy metrics with a judge LLM

We'll create a function that uses a JudgeLLM to check if the answer predicted by the model being evaluated and the ground truth are equivalent or not.

In [13]:
def get_litellm_response(
    text: str,
    model: str = "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K",
    top_k: int = 1
) -> Tuple[str, str, List[str], List[float]]:
    """
    Send a single user message through litellm and unpack the reply.

    Args:
        text: Content of the user message
        model: litellm model identifier to query
        top_k: Value forwarded as the `logprobs` argument of the completion call

    Returns:
        Tuple containing:
        - The completion response object
        - The message content of the first choice
        - The `tokens` attribute of the first choice's logprobs
        - The `token_logprobs` attribute of the first choice's logprobs

        If anything raises while calling the model or unpacking the reply, the
        error is printed and `(None, None, None, None)` is returned instead.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": text}],
        "logprobs": top_k,
        "echo": True,
    }
    try:
        response = litellm.completion(**request)
        first_choice = response.choices[0]
        tokens = first_choice.logprobs.tokens
        logprobs = first_choice.logprobs.token_logprobs
        content = first_choice.message.content
    except Exception as e:
        print(f"Error processing text: {e}")
        return None, None, None, None
    return response, content, tokens, logprobs

In [14]:

def check_answers(predicted_answer: str, correct_answer: str, judge_model: str) -> bool:
    """
    Decide whether the predicted answer is equivalent to the ground truth.

    A case-insensitive, whitespace-trimmed exact match is tried first; only
    when it fails is the judge LLM asked for a yes/no verdict.

    Args:
        - predicted_answer: The answer predicted by the model being evaluated.
        - correct_answer: The ground truth answer
        - judge_model: The judge model to be used.

    Returns:
        - True when the answers match exactly or the judge answers "yes".
          False otherwise, including when the judge call produced no verdict
          (get_litellm_response returns None for the content on failure).
    """
    # Check for exact match first
    if predicted_answer.strip().lower() == correct_answer.strip().lower():
        return True

    # If not an exact match, use JudgeLLM
    prompt = f"""
    Question: Are these two answers equivalent in meaning?
    Answer 1: {predicted_answer}
    Answer 2: {correct_answer}
    Please respond with only 'yes' or 'no'.
    """

    _, judge_response, _, _ = get_litellm_response(prompt, judge_model)

    # A failed judge call yields None; count it as "not equivalent" instead of
    # crashing the whole evaluation loop on `.strip()`.
    if judge_response is None:
        return False

    # Judges often answer "Yes." or "'yes'"; drop surrounding punctuation and
    # quotes before comparing.
    verdict = judge_response.strip().lower().strip(".,!;:'\"` ")
    return verdict == 'yes'

# Run model evaluation
Let's run the model on our dataset.

In [15]:
def get_halueval_predictions():
    """
    Run the evaluated model over every item of the notebook-level `ds`,
    reading the HaluEval fields 'knowledge', 'question' and 'right_answer'.

    Uses the notebook-level `llm`, `tokenizer`, `sampling_params` and
    `judge_model_name`.

    Returns:
        Tuple of three parallel lists:
        - `check_answers` verdicts (True when the prediction matches the
          right answer, i.e. correctness flags that are inverted later)
        - mean raw entropy per item
        - mean semantic entropy per item
    """
    answer_matches = []
    raw_entropy_means = []
    semantic_entropy_means = []
    for example in tqdm(ds):
        gold_answer = example['right_answer']
        prompt = f"Context: {example['knowledge']}\nQuestion: {example['question']}\n Answer:"
        prediction, _, raw_mean, semantic_mean = get_vllm_response(llm, tokenizer, sampling_params, prompt)
        raw_entropy_means.append(raw_mean)
        semantic_entropy_means.append(semantic_mean)

        # Exact match first, judge LLM otherwise
        answer_matches.append(check_answers(prediction, gold_answer, judge_model_name))

    return answer_matches, raw_entropy_means, semantic_entropy_means

In [16]:
def get_truthfulqa_predictions():
    """
    Run the evaluated model over every item of the notebook-level `ds`,
    presenting the `mc1_targets` choices as a lettered multiple-choice question.

    Uses the notebook-level `llm`, `tokenizer`, `sampling_params` and
    `judge_model_name`.

    Returns:
        Tuple of three parallel lists:
        - `check_answers` verdicts (True when the predicted letter matches the
          gold letter, i.e. correctness flags that are inverted later)
        - mean raw entropy per item
        - mean semantic entropy per item
    """
    answer_matches, raw_entropy_means, semantic_entropy_means = [], [], []
    for example in tqdm(ds):
        mc1 = example['mc1_targets']

        # Letter the options A, B, C, ... in dataset order.
        lettered = [f"{chr(ord('A')+pos)}.{option} " for pos, option in enumerate(mc1['choices'])]
        options_block = "\n".join(lettered)
        prompt = f"Question: {example['question']}\nChoices:\n{options_block}\nAnswer:"

        prediction, _, raw_mean, semantic_mean = get_vllm_response(llm, tokenizer, sampling_params, prompt, 'truthful')
        raw_entropy_means.append(raw_mean)
        semantic_entropy_means.append(semantic_mean)

        # The gold letter is the position of the label equal to 1.
        # NOTE(review): the correct mc1 choice may always be listed first in this
        # dataset, which would make the gold letter always 'A'; confirm, and
        # consider shuffling the choices if so.
        gold_letter = chr(ord('A') + mc1['labels'].index(1))
        answer_matches.append(check_answers(str(prediction), gold_letter, judge_model_name))

    return answer_matches, raw_entropy_means, semantic_entropy_means

In [None]:
did_hallucinate, mean_entropies, mean_semantic_entropies = get_truthfulqa_predictions()

In [None]:
ds[0]

In [None]:
len(mean_entropies)

In [None]:
# At this point did_hallucinate still holds correctness flags, so its mean is the accuracy.
print(f"Model Accuracy is {np.mean(did_hallucinate)}")
# Hallucination when answer is wrong so let's take the inverse.
# NOTE: the list is flipped in place under the same name, so running this cell a
# second time flips it back. Run it exactly once after the prediction cell.
did_hallucinate = [not x for x in did_hallucinate]

In [22]:
# Min Max Scale entropies
# Each list is mapped to [0, 1] so the fixed 0..1 threshold sweep in plot_metrics applies.
# NOTE: if every value in a list is identical the denominator is 0 and the result is NaN.
scaled_mean_entropies = (np.array(mean_entropies) - np.min(mean_entropies)) / (np.max(mean_entropies) - np.min(mean_entropies))
scaled_mean_semantic_entropies = (np.array(mean_semantic_entropies) - np.min(mean_semantic_entropies)) / (np.max(mean_semantic_entropies) - np.min(mean_semantic_entropies))


# ROC and PR curves for entropy metrics
Here we plot entropy metrics and find the best threshold to detect hallucinations

In [21]:
def plot_metrics(semantic_entropy, raw_entropy, labels):
    """
    Plots diagnostic curves for semantic and raw entropy as hallucination scores.

    The function creates a 1x4 subplot figure showing:
    1. PR (Precision-Recall) curves with their AUC and the no-skill baseline
    2. ROC (Receiver Operating Characteristic) curves with their AUC
    3. Histograms of both entropy scores
    4. Accuracy of the rule `entropy > threshold` for thresholds in [0, 1]

    Args:
    semantic_entropy (array-like): Semantic entropy values
    raw_entropy (array-like): Raw entropy values
    labels (array-like): True labels (0 for no hallucination, 1 for hallucination)

    The threshold sweep covers [0, 1] only, so both entropy inputs are expected
    to be scaled to that range beforehand.
    """
    # Accept plain lists as well as arrays; `scores > t` below needs arrays.
    semantic_entropy = np.asarray(semantic_entropy, dtype=float)
    raw_entropy = np.asarray(raw_entropy, dtype=float)
    labels = np.asarray(labels, dtype=bool)

    # Calculate metrics
    semantic_fpr, semantic_tpr, _ = roc_curve(labels, semantic_entropy)
    semantic_roc_auc = auc(semantic_fpr, semantic_tpr)
    raw_fpr, raw_tpr, _ = roc_curve(labels, raw_entropy)
    raw_roc_auc = auc(raw_fpr, raw_tpr)

    semantic_precision, semantic_recall, _ = precision_recall_curve(labels, semantic_entropy)
    raw_precision, raw_recall, _ = precision_recall_curve(labels, raw_entropy)

    semantic_pr_auc = auc(semantic_recall, semantic_precision)
    raw_pr_auc = auc(raw_recall, raw_precision)

    fig, axes = plt.subplots(1, 4, figsize=(15, 5))
    pr_ax, roc_ax, hist_ax, acc_ax = axes

    # Plot PR curves
    # A random scorer has precision equal to the share of positives at every
    # recall, so the baseline is a horizontal line rather than the diagonal.
    prevalence = labels.mean() if labels.size else 0.0
    pr_ax.plot([0, 1], [prevalence, prevalence], linestyle='--', label='Random')
    pr_ax.plot(semantic_recall, semantic_precision, label=f'Semantic (AUC = {semantic_pr_auc:.2f})')
    pr_ax.plot(raw_recall, raw_precision, label=f'Raw (AUC = {raw_pr_auc:.2f})')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title('Precision-Recall Curve')
    pr_ax.legend()

    # Plot ROC Curve
    roc_ax.plot(semantic_fpr, semantic_tpr, label=f'Semantic Entropy ROC AUC = {semantic_roc_auc:.2f}')
    roc_ax.plot(raw_fpr, raw_tpr, label=f'Raw Entropy ROC AUC = {raw_roc_auc:.2f}')
    roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate (Recall)')
    roc_ax.set_title('ROC Curve')
    roc_ax.legend(loc="lower right")

    # Plot Entropy Distributions curves
    hist_ax.hist(semantic_entropy, bins=50, alpha=0.5, label='Semantic')
    hist_ax.hist(raw_entropy, bins=50, alpha=0.5, label='Raw')
    hist_ax.set_xlabel('Entropy')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Entropy Distribution')
    hist_ax.legend()

    # Plot accuracy
    thresholds = np.linspace(0, 1, 100)
    semantic_accuracy = [accuracy_score(labels, semantic_entropy > t) for t in thresholds]
    raw_accuracy = [accuracy_score(labels, raw_entropy > t) for t in thresholds]

    acc_ax.plot(thresholds, semantic_accuracy, label='Semantic')
    acc_ax.plot(thresholds, raw_accuracy, label='Raw')
    acc_ax.set_xlabel('Threshold')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.set_title('Accuracy vs Threshold')
    acc_ax.legend()

    fig.tight_layout()
    plt.show()

In [None]:
plot_metrics(scaled_mean_semantic_entropies, scaled_mean_entropies, did_hallucinate)

# Entropy + Judge LLM metrics
Here we add a powerful judge LLM and evaluate how well entropy and the judge perform together for hallucination detection.

In [None]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
from together import Together
import pandas as pd
from scipy.stats import pearsonr

# Initialize the Together client
# Read the key from the environment instead of hardcoding it in the notebook.
# TOGETHER_API_KEY is the SDK's own variable; TOGETHERAI_API_KEY is the one
# this notebook sets for litellm, so it is used as a fallback.
client = Together(
    api_key=os.environ.get("TOGETHER_API_KEY") or os.environ.get("TOGETHERAI_API_KEY")
)

def get_llm_hallucination_assessment(question, model_answer):
    """Uses LLama to assess hallucination probability

    Args:
        question: The question that was put to the evaluated model.
        model_answer: The evaluated model's answer, as text.

    Returns:
        float: The first number found in the judge's reply, clamped to
        [0, 1] (0 = no hallucination, 1 = complete hallucination). Returns
        0.5 when the reply contains no parseable number.

    Errors raised by the API call itself are not caught here. Relies on the
    notebook-level Together `client`.
    """
    import re

    prompt = f"""You are an expert at detecting hallucinations in AI responses.
    Assess if the following model answer is hallucinating (contains incorrect information).

    Question: {question}
    Model's answer: {model_answer}

    Rate the hallucination on a scale from 0 to 1, where:
    0 = No hallucination, the answer is correct
    1 = Complete hallucination, the answer is incorrect

    Return only a number between 0 and 1.
    """

    messages = [{"role": "user", "content": prompt}]

    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=messages,
        max_tokens=10,
        temperature=0.1,
        stream=False
    )

    # Parse the score
    try:
        score_text = response.choices[0].message.content.strip()
        # `\d*\.\d+` comes first so that ".5" is read as 0.5 rather than 5.
        score_match = re.search(r'\d*\.\d+|\d+', score_text)
        if score_match is None:
            return 0.5
        score = float(score_match.group(0))
    except (AttributeError, IndexError, TypeError, ValueError):
        # Missing or malformed reply: fall back to the neutral score.
        return 0.5

    # The prompt asks for a value in [0, 1]; clamp replies that ignore it so
    # they cannot dominate the combined score.
    return min(max(score, 0.0), 1.0)

def _describe_model_answer(answer, choices):
    """Format the raw model output as 'Choice <output>: <choice text>', falling back to the raw output when it is not a valid choice letter."""
    # Tolerate surrounding whitespace, lowercase letters and a trailing "." or ")".
    letter = answer.strip().rstrip('.)').upper()
    position = ord(letter) - ord('A') if len(letter) == 1 else -1
    chosen_text = choices[position] if 0 <= position < len(choices) else answer
    return f"Choice {answer}: {chosen_text}"


def analyze_hallucination_detection(dataset, sample_size=50):
    """Analyze hallucination detection methods on a dataset

    For the first `sample_size` items the evaluated model answers the
    multiple-choice question, the answer is checked against the gold letter,
    and three hallucination scores are compared: normalised semantic entropy,
    the judge LLM score, and a 0.7/0.3 weighted blend of the two. ROC curves,
    a score scatter plot and the combined-score histograms are shown and a
    summary is printed.

    Args:
        dataset: Dataset exposing `.select(...)` and `len(...)`, whose items
            carry 'question' and 'mc1_targets' ('choices' and 'labels').
        sample_size: Maximum number of leading items to evaluate.

    Returns:
        Tuple of:
        - DataFrame with one row per evaluated item (scores, labels and the
          per-method detection flags)
        - dict with 'entropy_accuracy', 'llm_accuracy', 'combined_accuracy'
          and 'correlation'

    Note:
        Thresholds are tuned and scored on the same examples, so the reported
        "best" accuracies are optimistic. Relies on the notebook-level `llm`,
        `tokenizer`, `sampling_params` and `judge_model_name`.
    """
    # Process subset
    subset = dataset.select(range(min(sample_size, len(dataset))))

    results = []

    # Process each example
    for item in tqdm(subset):
        question = item['question']
        mc1_choices = item['mc1_targets']['choices']
        choices = "\n".join([f"{chr(ord('A')+i)}.{choice} " for i, choice in enumerate(mc1_choices)])
        combined_text = f"Question: {question}\nChoices:\n{choices}\nAnswer:"

        # Get model response and entropy metrics
        answer, _, raw_entropy, semantic_entropy = get_vllm_response(
            llm, tokenizer, sampling_params, combined_text, 'truthful'
        )

        # Get correct answer
        correct_answer = str(chr(ord('A')+item['mc1_targets']['labels'].index(1)))

        # Check if answer is correct
        is_correct = check_answers(str(answer), str(correct_answer), judge_model_name)
        hallucinated = not is_correct

        # Get model answer text. The helper copes with empty, padded, lowercase
        # or multi-character outputs instead of raising on `ord(answer)`.
        model_answer_text = _describe_model_answer(answer, mc1_choices)

        # Get LLM hallucination assessment
        llm_score = get_llm_hallucination_assessment(question, model_answer_text)

        # Store results
        results.append({
            'question': question,
            'model_answer': model_answer_text,
            'hallucinated': hallucinated,
            'raw_entropy': raw_entropy,
            'semantic_entropy': semantic_entropy,
            'llm_score': llm_score
        })

    # Convert to DataFrame
    df = pd.DataFrame(results)

    # Normalize entropy scores (0-1 range). When every entropy is identical
    # the range is 0; use a constant 0 score rather than dividing by zero.
    entropy_min = df['semantic_entropy'].min()
    entropy_span = df['semantic_entropy'].max() - entropy_min
    if entropy_span > 0:
        df['entropy_norm'] = (df['semantic_entropy'] - entropy_min) / entropy_span
    else:
        df['entropy_norm'] = 0.0

    # Calculate combined score (fixed 70% entropy / 30% judge weighting)
    df['combined_score'] = 0.7 * df['entropy_norm'] + 0.3 * df['llm_score']

    # Find optimal thresholds
    thresholds = np.linspace(0, 1, 100)

    entropy_accuracies = [accuracy_score(df['hallucinated'], df['entropy_norm'] > t) for t in thresholds]
    best_entropy_threshold = thresholds[np.argmax(entropy_accuracies)]
    best_entropy_accuracy = np.max(entropy_accuracies)

    llm_accuracies = [accuracy_score(df['hallucinated'], df['llm_score'] > t) for t in thresholds]
    best_llm_threshold = thresholds[np.argmax(llm_accuracies)]
    best_llm_accuracy = np.max(llm_accuracies)

    combined_accuracies = [accuracy_score(df['hallucinated'], df['combined_score'] > t) for t in thresholds]
    best_combined_threshold = thresholds[np.argmax(combined_accuracies)]
    best_combined_accuracy = np.max(combined_accuracies)

    # Calculate correlation between entropy and LLM scores
    correlation, _ = pearsonr(df['entropy_norm'], df['llm_score'])

    # Calculate method agreement
    df['entropy_detect'] = df['entropy_norm'] > best_entropy_threshold
    df['llm_detect'] = df['llm_score'] > best_llm_threshold

    both_mask = df['entropy_detect'] & df['llm_detect']
    only_entropy_mask = df['entropy_detect'] & ~df['llm_detect']
    only_llm_mask = ~df['entropy_detect'] & df['llm_detect']
    neither_mask = ~df['entropy_detect'] & ~df['llm_detect']

    both_detect = int(both_mask.sum())
    only_entropy = int(only_entropy_mask.sum())
    only_llm = int(only_llm_mask.sum())
    neither = int(neither_mask.sum())

    # Calculate accuracy in each category: the share of flagged examples that
    # really are hallucinations.
    both_accuracy = df.loc[both_mask, 'hallucinated'].mean() if both_detect > 0 else 0
    only_entropy_accuracy = df.loc[only_entropy_mask, 'hallucinated'].mean() if only_entropy > 0 else 0
    only_llm_accuracy = df.loc[only_llm_mask, 'hallucinated'].mean() if only_llm > 0 else 0

    # Plot results
    plt.figure(figsize=(15, 5))

    # ROC Curves
    plt.subplot(1, 3, 1)
    for name, scores in [('Entropy', df['entropy_norm']), ('LLM', df['llm_score']), ('Combined', df['combined_score'])]:
        fpr, tpr, _ = roc_curve(df['hallucinated'], scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], '--', color='gray')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()

    # Score Correlation
    plt.subplot(1, 3, 2)
    # One scatter call per class so each legend entry is tied to the right
    # colour (a single call would attach 'Correct' to the threshold line).
    for flag, color, label in [(True, 'red', 'Hallucination'), (False, 'blue', 'Correct')]:
        group = df[df['hallucinated'] == flag]
        plt.scatter(group['entropy_norm'], group['llm_score'], c=color, alpha=0.7, label=label)
    plt.axhline(y=best_llm_threshold, color='gray', linestyle='--')
    plt.axvline(x=best_entropy_threshold, color='gray', linestyle='--')

    plt.xlabel('Entropy Score')
    plt.ylabel('LLM Score')
    plt.title(f'Correlation: {correlation:.2f}')
    plt.legend()

    # Score Distributions
    plt.subplot(1, 3, 3)
    plt.hist(df[df['hallucinated']]['combined_score'], alpha=0.5, bins=10, label='Hallucination')
    plt.hist(df[~df['hallucinated']]['combined_score'], alpha=0.5, bins=10, label='Correct')
    plt.axvline(x=best_combined_threshold, color='r', linestyle='--')
    plt.xlabel('Combined Score')
    plt.ylabel('Count')
    plt.title('Score Distribution')
    plt.legend()

    plt.tight_layout()
    plt.show()

    # Print results
    print(f"Best Entropy Accuracy: {best_entropy_accuracy:.2f} at threshold {best_entropy_threshold:.2f}")
    print(f"Best LLM Accuracy: {best_llm_accuracy:.2f} at threshold {best_llm_threshold:.2f}")
    print(f"Best Combined Accuracy: {best_combined_accuracy:.2f} at threshold {best_combined_threshold:.2f}")
    print(f"Improvement from entropy to combined: {(best_combined_accuracy - best_entropy_accuracy) * 100:.1f}%")

    print(f"\nMethod Agreement Analysis:")
    print(f"Both detect: {both_detect} examples ({both_accuracy:.1%} correct)")
    print(f"Only entropy detects: {only_entropy} examples ({only_entropy_accuracy:.1%} correct)")
    print(f"Only LLM detects: {only_llm} examples ({only_llm_accuracy:.1%} correct)")
    print(f"Neither detects: {neither} examples")

    return df, {
        'entropy_accuracy': best_entropy_accuracy,
        'llm_accuracy': best_llm_accuracy,
        'combined_accuracy': best_combined_accuracy,
        'correlation': correlation
    }

# Example usage
# In a notebook kernel __name__ is "__main__", so this block runs whenever the cell is executed.
if __name__ == "__main__":
    # Load dataset (this rebinds the notebook-level `ds`)
    ds = get_trutfulqa_dataset()['validation']

    # Run analysis on the first 300 validation examples (fewer if the split is smaller)
    results_df, metrics = analyze_hallucination_detection(ds, sample_size=300)