# Creating Advanced LLM Evaluators

## Initial Setup

In [1]:
import yaml
import os

from langsmith.evaluation import EvaluationResult, RunEvaluator
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from typing import Any, Optional
import re
from typing import Any, Optional

from langchain.callbacks.manager import Callbacks
from langchain.chains import LLMChain
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import RUN_KEY
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run
import asyncio

In [2]:
# Load API keys and tracing settings from a local YAML file and export them as
# environment variables for the OpenAI and LangChain/LangSmith clients.
# The context manager guarantees the file handle is closed once parsing is done.
with open("./config.yml") as config_file:
    config = yaml.safe_load(config_file)

# Credentials. The Hub API key is populated from the same LANGCHAIN_API_KEY entry.
os.environ["OPENAI_API_KEY"] = config["OPENAI_API_KEY"]
os.environ["LANGCHAIN_API_KEY"] = config["LANGCHAIN_API_KEY"]
os.environ["LANGCHAIN_HUB_API_KEY"] = config["LANGCHAIN_API_KEY"]

# Endpoints and project name.
os.environ["LANGCHAIN_ENDPOINT"] = config["LANGCHAIN_ENDPOINT"]
os.environ["LANGCHAIN_HUB_API_URL"] = config["LANGCHAIN_HUB_API_URL"]
os.environ["LANGCHAIN_PROJECT"] = config["LANGCHAIN_PROJECT"]

# Tracing flags: environment values must be strings, so the YAML values are
# stringified and lower-cased (a YAML boolean True becomes "true").
os.environ["LANGCHAIN_TRACING_V2"] = str(config["LANGCHAIN_TRACING_V2"]).lower()
os.environ["LANGCHAIN_WANDB_TRACING"] = str(config["LANGCHAIN_WANDB_TRACING"]).lower()

## Example 1: Creating a Simple Evaluator

In [3]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    PromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_openai import ChatOpenAI

In [4]:
# Prompt text for the translation chain; `{sentence}` is filled in at invocation time.
sys_msg = (
    "[INST]You are a helpful assistant that translates English to French. "
    "Your task is to translate as a fluent speaker of both languages.\n"
    "Translate this sentence from English to French: {sentence}\n"
    "[/INST]\n"
)

template = ChatPromptTemplate.from_template(sys_msg)

In [5]:
from langchain.chains import LLMChain
from langchain_openai import OpenAI

In [6]:
# Chain under test: the translation prompt fed to a default OpenAI model.
translator_chain = LLMChain(
    prompt=template,
    llm=OpenAI(),
)

In [7]:
# Smoke-test the translator on a single sentence; the cell displays the returned dict.
translator_chain.invoke(
    {
        "sentence": "Hello, how are you?",
    }
)

{'sentence': 'Hello, how are you?', 'text': '\nBonjour, comment vas-tu ?'}

### Creating a simple translation evaluator

Prompt curation is 50% of the success: the evaluator can only be as good as the rubric and output format it is given.

In [8]:
# Rubric given to the evaluator LLM. `{query}` receives the source text and
# `{result}` the translation; the reply must follow the SCORE / REASONING
# layout so that it can be parsed afterwards.
_LANGUAGE_CONSISTENCY_TEMPLATE = """You are a highly skilled linguist tasked with evaluating the translation quality between a source text and its corresponding translated text. Your objective is to determine the accuracy, fluency, and appropriateness of the translation.

To assess translation quality, carefully consider the following factors:
- Accuracy: Analyze the translated text to ensure it accurately conveys the meaning and intent of the source text. Check for mistranslations, omissions, or additions that alter the original message.
- Fluency: Evaluate the fluency and naturalness of the translated text in the target language. Consider grammar, syntax, idiomatic expressions, and readability. The translation should sound natural to native speakers of the target language.
- Appropriateness: Assess the appropriateness of the translation in terms of tone, style, and cultural context. The translation should maintain the same register and adapt cultural references as needed for the target audience.

Based on your analysis, provide a translation quality score from 0 to 10, where:
0 - Extremely poor translation with significant errors and lack of fluency.
2 - Poor translation with many inaccuracies and low fluency.
4 - Below average translation with some inaccuracies and moderate fluency issues.
6 - Acceptable translation with minor inaccuracies and good overall fluency.
8 - High-quality translation with very few errors and excellent fluency.
10 - Exceptional translation that accurately captures the source text's meaning, reads naturally in the target language, and maintains appropriate tone and style.

In addition to the score, provide a detailed explanation of your assessment. Highlight specific aspects that support your evaluation, such as accuracy issues, fluency observations, and appropriateness considerations.

Source Text: {query}
Translated Text: {result}

Strictly adhere to the following format in your response:
SCORE: [0-10]
REASONING: <Your brief one line reasoning explanation here. No newlines or line breaks>
"""

LANGUAGE_CONSISTENCY_PROMPT = PromptTemplate(
    template=_LANGUAGE_CONSISTENCY_TEMPLATE,
    input_variables=["query", "result"],
)

We need a function to parse the results

In [9]:
def _parse_string_eval_output(text: str) -> dict:
    score_pattern = r"SCORE: (\d+)"
    reasoning_pattern = r"REASONING: (.*)"

    score_match = re.search(score_pattern, text)
    reasoning_match = re.search(reasoning_pattern, text)

    score = int(score_match.group(1)) if score_match else None
    reasoning = reasoning_match.group(1).strip() if reasoning_match else None

    return {"score": score, "reasoning": reasoning}

In [10]:
class BaseEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
    """LLM chain whose raw completion is parsed into a score and a reasoning string."""

    @property
    def requires_reference(self) -> bool:
        """Chains built on this base do not require a reference answer."""
        return False

    @classmethod
    def from_llm(cls, llm: OpenAI, prompt: PromptTemplate, **kwargs: Any) -> "BaseEvalChain":
        """Construct the chain from a model and a prompt.

        Any extra keyword arguments are passed straight to the class constructor.
        """
        chain = cls(llm=llm, prompt=prompt, **kwargs)
        return chain

    def _prepare_output(self, result: dict) -> dict:
        """Turn the chain's raw result dict into ``{"score", "reasoning"}``.

        The entry stored under ``RUN_KEY`` is copied over when the result has one.
        """
        raw_completion = result[self.output_key]
        output = _parse_string_eval_output(raw_completion)

        if RUN_KEY in result:
            output.update({RUN_KEY: result[RUN_KEY]})
        return output

In [11]:
class LanguageConsistencyEvalChain(BaseEvalChain):
    """Evaluation chain that grades a translation against its source text."""

    @property
    def evaluation_name(self) -> str:
        """Human-readable name of this evaluation."""
        return "Language Consistency"

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Run the chain with `input` as the source text and `prediction` as the translation.

        `reference` and any extra keyword arguments are accepted but not used.
        Returns the parsed score/reasoning dict produced by `_prepare_output`.
        """
        chain_inputs = {"query": input, "result": prediction}
        raw_result = self(chain_inputs, callbacks=callbacks, include_run_info=include_run_info)
        return self._prepare_output(raw_result)


Wrap the evaluation chain in a LangSmith `RunEvaluator`, which runs the chain `k` times per run and averages the scores.

In [12]:
class BaseEvaluator(RunEvaluator):
    """Run evaluator that queries an LLM evaluation chain `k` times and averages the scores.

    Asking the evaluator several times smooths out run-to-run variation in
    the LLM's judgement. The reported comment is the reasoning of the single
    evaluation whose score lies closest to the average.
    """

    def __init__(self, llm: Optional[OpenAI] = None, prompt: Optional[PromptTemplate] = None, eval_chain_class: Optional[type[BaseEvalChain]] = None, k: int = 5):
        """Build the underlying evaluation chain.

        Args:
            llm: Model used for evaluation; a default `OpenAI()` is created when omitted.
            prompt: Prompt handed to the evaluation chain.
            eval_chain_class: Chain class exposing `from_llm`; required in practice.
            k: Number of independent evaluations to average; must be at least 1.

        Raises:
            ValueError: If `eval_chain_class` is missing or `k` is smaller than 1.
        """
        # Fail early with a clear message instead of an AttributeError on None
        # (missing chain class) or a ZeroDivisionError while averaging (k == 0).
        if eval_chain_class is None:
            raise ValueError("eval_chain_class must be provided")
        if k < 1:
            raise ValueError("k must be at least 1")

        self.llm = llm or OpenAI()
        self.prompt = prompt
        self.eval_chain_class = eval_chain_class
        self.evaluator = self.eval_chain_class.from_llm(self.llm, prompt=self.prompt)
        self.k = k

    async def _evaluate_run_async(self, run: Run, example: Optional[Example] = None) -> dict:
        """Run the evaluation chain once for `run` and return its parsed output.

        Raises:
            ValueError: If the run has no outputs or the outputs lack a "text" entry.
        """
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None")

        response = run.outputs.get("text")

        if response is None:
            # The message names the key that is actually looked up above.
            raise ValueError("Run outputs must contain 'text' key")

        # The chain call is blocking, so it is pushed to a worker thread to let
        # the k evaluations overlap.
        evaluation = await asyncio.to_thread(
            self.evaluator._evaluate_strings,
            prediction=response,
            input=example.inputs["question"] if example else None,
            context=run.outputs.get("context"),
        )

        return evaluation

    async def evaluate_run_async(self, run: Run, example: Optional[Example] = None) -> EvaluationResult:
        """Evaluate `run` `k` times concurrently and aggregate the results.

        Evaluations whose reply could not be parsed into a score are left out
        of the average. When none of them yields a score, the result carries
        `score=None` and an explanatory comment instead of raising.
        """
        evaluations = await asyncio.gather(
            *(self._evaluate_run_async(run, example) for _ in range(self.k))
        )

        feedback_key = self.evaluator.evaluation_name.lower().replace(" ", "_")

        # A reply that did not follow the SCORE/REASONING format parses to
        # score=None; such entries would make sum() raise a TypeError.
        scored = [evaluation for evaluation in evaluations if evaluation["score"] is not None]
        if not scored:
            return EvaluationResult(
                key=feedback_key,
                score=None,
                comment="No evaluation returned a parsable score",
            )

        avg_score = sum(evaluation["score"] for evaluation in scored) / len(scored)
        # min() keeps the first of equally close candidates.
        closest = min(scored, key=lambda evaluation: abs(evaluation["score"] - avg_score))

        return EvaluationResult(
            key=feedback_key,
            score=avg_score,
            comment=closest["reasoning"],
        )

    def evaluate_run(self, run: Run, example: Optional[Example] = None) -> EvaluationResult:
        """Synchronous entry point; runs `evaluate_run_async` via `asyncio.run`."""
        return asyncio.run(self.evaluate_run_async(run, example))

Now we can define a concrete evaluator and use it inside LangChain.


In [13]:
class LanguageConsistencyEvaluator(BaseEvaluator):
    """Run evaluator for translation quality, backed by `LanguageConsistencyEvalChain`."""

    def __init__(self, llm: Optional[OpenAI] = None, prompt: Optional[PromptTemplate] = None):
        """Use `LANGUAGE_CONSISTENCY_PROMPT` unless a custom prompt is supplied."""
        super().__init__(
            llm=llm,
            prompt=prompt or LANGUAGE_CONSISTENCY_PROMPT,
            eval_chain_class=LanguageConsistencyEvalChain,
        )

In [14]:
# Chat model that acts as the judge in every evaluator below.
EVAL_MODEL_NAME = "gpt-4-turbo-preview"
eval_llm = ChatOpenAI(model=EVAL_MODEL_NAME)

In [15]:
# Instantiate the translation-quality evaluator with the judge model.
langconst_evaluator = LanguageConsistencyEvaluator(
    llm=eval_llm,
)

In [32]:
# Grade a deliberately flawed translation: the English word "good" is left in the French text.
mixed_language_sample = {
    "query": "How are you? You good?",
    "result": "Comment ça va? Tu vas good?",
}
result = langconst_evaluator.evaluator.invoke(mixed_language_sample)

In [33]:
# Display the full chain output: the inputs echoed back plus the raw evaluator reply under "text".
result

{'query': 'How are you? You good?',
 'result': 'Comment ça va? Tu vas good?',
 'text': 'SCORE: 6\nREASONING: Translation maintains casual tone and overall meaning but mixes languages ("Tu vas good?"), affecting fluency and appropriateness in a purely French context.'}

In [34]:
# Print only the evaluator's reply so the SCORE and REASONING lines render separately.
evaluator_reply = result["text"]
print(evaluator_reply)

SCORE: 6
REASONING: Translation maintains casual tone and overall meaning but mixes languages ("Tu vas good?"), affecting fluency and appropriateness in a purely French context.


In [18]:
# Grade the same source sentence again, this time with a correct French translation.
correct_sample = {
    "query": "How are you? You good?",
    "result": "Comment ça va? Tu vas bien?",
}
result = langconst_evaluator.evaluator.invoke(correct_sample)
print(result["text"])

SCORE: 10
REASONING: The translated text accurately conveys the meaning, maintains the informal tone, and reads naturally in French, perfectly matching the source text's intent and style.


## Example 2: Creating a More Advanced Evaluator

In [19]:
# Rubric given to the evaluator LLM for judging retrieved context. `{query}` is
# the user question and `{context}` the retrieved passages. The scale lists
# every even anchor from 0 to 10; the 6 anchor was previously missing, which
# left a gap between "somewhat relevant" (4) and "mostly relevant" (8).
CONTEXT_RELEVANCE_PROMPT = PromptTemplate(
    input_variables=["query", "context"],
    template="""You are an expert evaluator assessing the relevance of retrieved context to a given query. Your task is to carefully analyze the provided context and determine if it contains information pertinent to answering the query. 

Consider the following:
- Does the context directly address the main topics or entities mentioned in the query? 
- Does the context provide background information or details that could help form a comprehensive answer?
- Is the context free of irrelevant or tangential information that could lead to hallucinations?

Provide a relevance score from 0 to 10, where:
0 - Completely irrelevant 
2 - Mostly irrelevant with minor pertinent details
4 - Somewhat relevant, but missing key information 
6 - Moderately relevant, addressing the query but with notable gaps or off-topic content
8 - Mostly relevant, covering main points but lacking some specifics
10 - Highly relevant, containing all necessary information to comprehensively address the query

Along with the score, provide a concise explanation justifying your assessment. Highlight specific aspects of the context that informed your decision.

Query: {query}
Context: {context}

Strictly answer in the following format:
SCORE: [0-10]
REASONING: <Your brief one line reasoning explanation here. No newlines or line breaks!>
"""
)


In [20]:
class ContextRelevanceEvalChain(BaseEvalChain):
    """Evaluation chain that grades how relevant retrieved context is to a query."""

    @property
    def evaluation_name(self) -> str:
        """Human-readable name of this evaluation."""
        return "Context Relevance"

    def _evaluate_strings(
        self,
        *,
        prediction: Optional[str] = None,
        reference: Optional[str] = None,
        context: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Run the chain with `input` as the query and `context` as the retrieved context.

        `prediction`, `reference` and extra keyword arguments are accepted but not used.
        Returns the parsed score/reasoning dict produced by `_prepare_output`.
        """
        chain_inputs = {"query": input, "context": context}
        raw_result = self(chain_inputs, callbacks=callbacks, include_run_info=include_run_info)
        return self._prepare_output(raw_result)

In [21]:
class ContextRelevanceEvaluator(BaseEvaluator):
    """Run evaluator for context relevance, backed by `ContextRelevanceEvalChain`."""

    def __init__(self, llm: Optional[OpenAI] = None, prompt: Optional[PromptTemplate] = None):
        """Use `CONTEXT_RELEVANCE_PROMPT` unless a custom prompt is supplied."""
        super().__init__(
            llm=llm,
            prompt=prompt or CONTEXT_RELEVANCE_PROMPT,
            eval_chain_class=ContextRelevanceEvalChain,
        )

    def evaluate_run(self, run: Run, example: Optional[Example] = None) -> EvaluationResult:
        """Check that the run carries a "context" output, then delegate to the base class.

        Raises:
            ValueError: If the run has no outputs or the outputs lack a "context" entry.
        """
        outputs = run.outputs
        if outputs is None:
            raise ValueError("Run outputs cannot be None")

        if outputs.get("context") is None:
            raise ValueError("Run outputs must contain 'context' key")

        return super().evaluate_run(run, example)

In [22]:
# Instantiate the context-relevance evaluator with the judge model.
context_evaluator = ContextRelevanceEvaluator(
    llm=eval_llm,
)

In [36]:
# Placeholder passages with no real content, used as an irrelevant-context example.
context = ["Bla", "Bla bla", "Bla bla bla"]

In [37]:
# Ask the judge how relevant the placeholder passages are to the question.
irrelevant_inputs = {
    "query": "When was a moon landing?",
    "context": context,
}
result = context_evaluator.evaluator.invoke(irrelevant_inputs)
print(result["text"])

SCORE: 0
REASONING: The provided context consists only of placeholder text with no information relevant to any moon landing.


In [25]:
# Retrieved passages for the moon-landing question: two relevant, one off-topic.
# Each passage must be its own list item. Without the comma after the first
# string, Python's implicit string-literal concatenation fuses the first two
# passages into a single element.
context = [
    "The first successful manned Moon landing was accomplished by the Apollo 11 mission of NASA, with astronauts Neil Armstrong and Buzz Aldrin landing on the Moon on July 20, 1969.",
    "There have been a total of six manned U.S. landings",
    "To make an apple juice you need to peel the apple and ...",
]

In [26]:
# Ask the judge how relevant the moon-landing passages are to the question.
relevant_inputs = {
    "query": "When was a moon landing?",
    "context": context,
}
result = context_evaluator.evaluator.invoke(relevant_inputs)
print(result["text"])

SCORE: 8
REASONING: The context directly addresses the query with the date of the first manned Moon landing but includes unrelated information about making apple juice.


## Running Evaluators on the dataset

Creating a dataset

In [43]:
from langsmith import Client
from langchain.smith import RunEvalConfig, run_on_dataset

# English sentences that make up the translation test set.
dataset_name = "Test_FR"
dataset_inputs = [
    "How are you?",
    "Where is the nearest restaurant?",
    "All in all, actions speak louder than words when you're burning the candle at both ends. ",
]

# LangSmith client used to create the dataset and to run the evaluation.
client = Client()

In [44]:
# Create the dataset and its examples only on the first run, so that re-running
# the notebook reuses the existing dataset instead of trying to create a
# second one under the same name.
# NOTE(review): an existing dataset is assumed to already hold its examples;
# delete it in LangSmith to rebuild from scratch.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Translation test dataset",
    )
    # One example per sentence, stored under the "question" input key.
    client.create_examples(
        inputs=[{"question": q} for q in dataset_inputs],
        dataset_id=dataset.id,
    )

In [45]:
# Configure the evaluation to use the custom LLM-based translation-quality evaluator.
evaluation_config = RunEvalConfig(custom_evaluators=[LanguageConsistencyEvaluator(llm=eval_llm)])



In [47]:
# Run the translator over every dataset example and score each run with the
# configured evaluator; the cell displays the returned results dict.
# NOTE(review): the project name is fixed, so a re-run may require a new name — confirm.
run_on_dataset(
    client=client,
    dataset_name=dataset_name,
    llm_or_chain_factory=translator_chain,
    evaluation=evaluation_config,
    project_name="test-1",
)

View the evaluation results for project 'test-1' at:
https://smith.langchain.com/o/477ed791-8ad2-52b5-a09b-1799811c6988/datasets/770968e1-8519-4ea8-9183-4af571cff33e/compare?selectedSessions=a15e919e-399a-4117-a525-de3c23335894

View all tests for Dataset Test_FR at:
https://smith.langchain.com/o/477ed791-8ad2-52b5-a09b-1799811c6988/datasets/770968e1-8519-4ea8-9183-4af571cff33e
[------------------------------------------------->] 3/3

{'project_name': 'test-1',
 'results': {'2aef3337-04c4-4292-a4aa-21e0796a1c8f': {'input': {'question': 'How are you?'},
   'feedback': [EvaluationResult(key='language_consistency', score=10.0, value=None, comment='The translation "Comment vas-tu?" accurately conveys the meaning, is fluent and natural in French, and appropriately maintains the informal tone of the source text.', correction=None, evaluator_info={}, feedback_config=None, source_run_id=None, target_run_id=None)],
   'execution_time': 0.752339,
   'run_id': '679e1fc6-d071-4788-ab12-4d0a99554991',
   'output': {'sentence': 'How are you?', 'text': '\nComment vas-tu?'}},
  '7c512c45-20c6-41ad-9e6d-31ffbedcc44b': {'input': {'question': 'Where is the nearest restaurant?'},
   'feedback': [EvaluationResult(key='language_consistency', score=10.0, value=None, comment='The translation accurately conveys the meaning of the source text, uses natural and fluent French, and maintains the appropriate tone and style.', correction=None, ev

In [None]:
{'project_name': 'test-1',
 'results': {'2aef3337-04c4-4292-a4aa-21e0796a1c8f': {'input': {'question': 'How are you?'},
   'feedback': [EvaluationResult(key='language_consistency', score=10.0, value=None, comment='The translation "Comment vas-tu?" accurately conveys the meaning, is fluent and natural in French, and appropriately maintains the informal tone of the source text.', correction=None, evaluator_info={}, feedback_config=None, source_run_id=None, target_run_id=None)],
   'execution_time': 0.752339,
   'run_id': '679e1fc6-d071-4788-ab12-4d0a99554991',
   'output': {'sentence': 'How are you?', 'text': '\nComment vas-tu?'}},
  '7c512c45-20c6-41ad-9e6d-31ffbedcc44b': {'input': {'question': 'Where is the nearest restaurant?'},
   'feedback': [EvaluationResult(key='language_consistency', score=10.0, value=None, comment='The translation accurately conveys the meaning of the source text, uses natural and fluent French, and maintains the appropriate tone and style.', correction=None, evaluator_info={}, feedback_config=None, source_run_id=None, target_run_id=None)],
   'execution_time': 0.447339,
   'run_id': '14a28087-46ba-4811-881c-6873f81a5ad9',
...
   'execution_time': 0.617001,
   'run_id': '9cfc3784-889c-403d-8daf-cec685b8a46d',
   'output': {'sentence': "All in all, actions speak louder than words when you're burning the candle at both ends. ",
    'text': "\nDans l'ensemble, les actions parlent plus fort que les mots lorsque vous brûlez la chandelle par les deux bouts."}}},
 'aggregate_metrics': None}