In [1]:
from dotenv import load_dotenv
import os
# Load variables from the .env file one directory up and print load_dotenv's return value.
print(load_dotenv('../.env'))
# Indexing (rather than .get) raises KeyError right away if LANGSMITH_PROJECT is not set.
print(os.environ['LANGSMITH_PROJECT'])
# Set the remaining variables for this process in a single call.
os.environ.update({'LANGSMITH_TRACING': "true", 'USER_AGENT': 'myagent'})

True
agentic-ops


## Custom evaluator 

In [2]:
def correct_label(inputs: dict, reference_outputs: dict, outputs: dict) -> dict:
  """Exact-match evaluator: score 1 when the run's output equals the reference label.

  Args:
    inputs: Example inputs. Unused here; kept so the signature matches the other
      evaluator in this notebook.
    reference_outputs: Reference data; the expected value is read from "label".
    outputs: Run outputs; the predicted value is read from "output".

  Returns:
    {"score": 1 or 0, "key": "correct_label"}.
  """
  # Both keys must actually be present. Comparing two .get() results would treat
  # "no prediction" and "no label" (None == None) as a correct answer.
  has_both = "output" in outputs and "label" in reference_outputs
  score = has_both and outputs["output"] == reference_outputs["label"]
  return {"score": int(score), "key": "correct_label"}

## LLM-as-Judge Evaluation

In [5]:
from openai import OpenAI
from pydantic import BaseModel, Field

# Module-level client used by compare_semantic_similarity below. No arguments are passed,
# so credentials/configuration presumably come from the environment — confirm against the SDK docs.
client = OpenAI()

# Structured-output schema handed to the judge call as `response_format`; it carries a single
# integer score. Documented with a comment rather than a docstring because pydantic may copy a
# class docstring into the generated schema (verify before adding one).
class Similarity_Score(BaseModel):
    similarity_score: int = Field(
        description="Semantic similarity score between 1 and 10, where 1 means unrelated and 10 means identical.",
    )

# NOTE: This is our evaluator
def compare_semantic_similarity(inputs: dict, reference_outputs: dict, outputs: dict) -> dict:
    """Ask an LLM judge how close a run's answer is in meaning to the reference answer.

    Args:
        inputs: Example inputs; the question is read from "question".
        reference_outputs: Reference data; the correct answer is read from "output".
        outputs: Run outputs; the answer being judged is read from "output".

    Returns:
        {"score": <int returned by the judge>, "key": "similarity"}. The prompt asks
        the judge for a score from 1 (completely unrelated) to 10 (identical in meaning).

    Raises:
        KeyError: If a required dict key or the OPENAI_MODEL environment variable is missing.
        ValueError: If the completion carries no parsed score (for example, a refusal).
    """
    input_question = inputs["question"]
    reference_response = reference_outputs["output"]
    run_response = outputs["output"]

    completion = client.beta.chat.completions.parse(
        model=os.environ["OPENAI_MODEL"],
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a semantic similarity evaluator. Compare the meanings of two responses to a question, "
                    "Reference Response and New Response, where the reference is the correct answer, and we are trying to judge if the new response is similar. "
                    "Provide a score between 1 and 10, where 1 means completely unrelated, and 10 means identical in meaning."
                ),
            },
            # The candidate is labelled "New Response" so it matches the name used in the system prompt.
            {"role": "user", "content": f"Question: {input_question}\n Reference Response: {reference_response}\n New Response: {run_response}"},
        ],
        response_format=Similarity_Score,
    )

    message = completion.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # Without this guard the attribute access below would fail with an opaque AttributeError.
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"Judge returned no parsed similarity score (refusal: {refusal!r})")
    return {"score": parsed.similarity_score, "key": "similarity"}


In [6]:
# Hand-written example: one question plus a reference answer to judge against.
inputs = {"question": "Is huggingface an ML library ?"}
reference_outputs = {"output": "No!!, its a rover name from NASA. Nothing to do with AI "}

# Stand-in for the output a run would produce; it disagrees with the reference above.
outputs = {"output": "Yes, huggingface is one of the best AI/LLM library out there and is opensourced !."}

# The evaluator returns a dict ({"score": ..., "key": ...}), which is printed as-is.
similarity_score = compare_semantic_similarity(inputs, reference_outputs, outputs)
print(f"Semantic similarity score: {similarity_score}")

Semantic similarity score: {'score': 1, 'key': 'similarity'}


```
System

You are a semantic similarity evaluator. Compare the meanings of two responses to a question, Reference Response and New Response, where the reference is the correct answer, and we are trying to judge if the new response is similar.

Provide a score between 1 and 10, where 1 means completely unrelated, and 10 means identical in meaning.


Human
Please grade the following example according to the above instructions:

<example>
<input>
{{input}}
</input>

<output>
{{output.output}}
</output>

<reference_outputs>
{{reference}}
</reference_outputs>
</example>
```

# DEMO: Use all of the above to create an LLM-as-a-judge evaluator in the UI
## Then use it in the Playground to test different prompts