evaluation using prompt into claude

In [None]:
from anthropic import Anthropic
import json

# Initialize Claude client
client = Anthropic()

query = "What is the capital of France?"
retrieved_context = "France is a country in Europe. Its capital city is Paris."
rag_response = "The capital of France is Paris."

# evaluation prompt
prompt = f"""
You are an evaluator. Rate the following RAG response based on four criteria:
1. Faithfulness: Does it stick to the retrieved context without hallucinating?
2. Relevance: Does it answer the user query?
3. Completeness: Does it cover all necessary aspects of the query?
4. Clarity: Is the answer easy to understand?

Query: {query}
Retrieved Context: {retrieved_context}
Generated Answer: {rag_response}

Return your evaluation as a JSON object like this:
{{
  "faithfulness": 1-5,
  "relevance": 1-5,
  "completeness": 1-5,
  "clarity": 1-5,
  "comments": "short explanation"
}}
"""
# claude's evaluation
response = client.messages.create(
    model="claude-3-5-sonnet-20240620",  
    max_tokens=300,
    messages=[{"role": "user", "content": prompt}],
)

eval_text = response.content[0].text
print("Claude's Raw Output:", eval_text)

try:
    eval_json = json.loads(eval_text)
    print("Parsed Evaluation:", json.dumps(eval_json, indent=2))
except Exception as e:
    print("Could not parse JSON:", e)


Please migrate to a newer model. Visit https://docs.anthropic.com/en/docs/resources/model-deprecations for more information.
  response = client.messages.create(


Claude's Raw Output: Here's my evaluation of the RAG response:

{
  "faithfulness": 5,
  "relevance": 5,
  "completeness": 5,
  "clarity": 5,
  "comments": "The answer is perfectly faithful to the context, directly relevant to the query, complete in addressing the question, and clear in its concise statement. It provides the exact information requested without embellishment or omission."
}
Could not parse JSON: Expecting value: line 1 column 1 (char 0)


through bleu and rouge scores (better if higher) (with respect to refernce answer)

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

reference = "The capital of France is Paris."
generated = rag_response

# bleu score
bleu = sentence_bleu([reference.split()], generated.split())
print("BLEU Score:", bleu)

# rouge score
scorer = rouge_scorer.RougeScorer(['rouge1','rougeL'], use_stemmer=True)
rouge_scores = scorer.score(reference, generated)
print("ROUGE Scores:", rouge_scores)


BLEU Score: 8.38826642100846e-155
ROUGE Scores: {'rouge1': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rougeL': Score(precision=0.6666666666666666, recall=0.6666666666666666, fmeasure=0.6666666666666666)}


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


check for hallucination (if value is low)

In [9]:
from difflib import SequenceMatcher

ratio = SequenceMatcher(None, retrieved_context, rag_response).ratio()
print("Attribution Overlap Ratio:", ratio)


Attribution Overlap Ratio: 0.4090909090909091


other methods which could be used:
- semantic check: with respect to reference answer if it's is given
- fluency check: how natural the response is (through pretrained transformers like gpt)

# updated code

In [None]:
from sentence_transformers import SentenceTransformer, util
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import json

# Initialize models 
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Example RAG data (can be replaced dynamically)
query = "What is the capital of France?"
retrieved_context = "France is a country in Europe. Its capital city is Paris."
rag_response = "The capital of France is Paris."

# can be replaced by this while evaluating on a given RAG data:

# rag_data = {
#     "query": user_input,
#     "context": retrieved_docs,
#     "response": model_output
# }
# result = evaluate_rag_response(rag_data)


# Semantic Similarity Metrics 

def cosine_sim(a, b):
    emb_a = embed_model.encode(a, convert_to_tensor=True)
    emb_b = embed_model.encode(b, convert_to_tensor=True)
    return util.cos_sim(emb_a, emb_b).item()

faithfulness = cosine_sim(rag_response, retrieved_context)
relevance = cosine_sim(rag_response, query)

# Fluency / Perplexity Metric 
def compute_perplexity(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        loss = gpt2_model(**inputs, labels=inputs["input_ids"]).loss
    return torch.exp(loss).item()

perplexity = compute_perplexity(rag_response)
fluency = 1 / perplexity  # invert since lower perplexity = better

# Normalize & Combine 

# Normalize fluency roughly between 0-1
fluency_score = min(1.0, fluency * 10)

# Weighted composite score 
final_score = round((0.4 * faithfulness + 0.4 * relevance + 0.2 * fluency_score), 3)

# Output 

evaluation = {
    "faithfulness": round(faithfulness, 3),
    "relevance": round(relevance, 3),
    "fluency": round(fluency_score, 3),
    "final_score": final_score,
}

print(json.dumps(evaluation, indent=2))


{
  "faithfulness": 0.904,
  "relevance": 0.879,
  "fluency": 0.224,
  "final_score": 0.758
}


-> these scores are semantic and reference-free 

-> higher = better response quality


for input based evaluation:

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the sentence embedding model (runs fine on CPU)
model = SentenceTransformer("all-MiniLM-L6-v2")

def evaluate_rag_response(query, context, response):
    # Encode all inputs
    q_emb = model.encode(query, convert_to_tensor=True)
    c_emb = model.encode(context, convert_to_tensor=True)
    r_emb = model.encode(response, convert_to_tensor=True)

    # Compute semantic similarity
    faithfulness = util.cos_sim(r_emb, c_emb).item()  # response aligns with context
    relevance = util.cos_sim(r_emb, q_emb).item()     # response answers query

    # Weighted score
    final_score = round((faithfulness * 0.6 + relevance * 0.4), 3)

    return {
        "faithfulness": round(faithfulness, 3),
        "relevance": round(relevance, 3),
        "final_score": final_score,
        "comments": "Higher score = better response quality. Semantic, not keyword-based."
    }

# 🧠 Interactive Input Section
print("=== RAG Response Evaluator ===")
query = input("\nEnter the user query: ")
context = input("Enter the retrieved context: ")
response = input("Enter the model-generated response: ")

# Evaluate
result = evaluate_rag_response(query, context, response)
print("\n--- Evaluation Result ---")
for k, v in result.items():
    print(f"{k}: {v}")


=== RAG Response Evaluator ===
