In [1]:
from langchain_openai import ChatOpenAI
from langsmith.evaluation import evaluate, LangChainStringEvaluator
from langsmith.schemas import Run, Example
from openai import OpenAI
import json

from dotenv import load_dotenv
load_dotenv()

from langsmith.wrappers import wrap_openai
from langsmith import traceable

In [14]:
# OpenAI SDK client passed through LangSmith's `wrap_openai` wrapper.
# NOTE(review): a later cell ("view examples") rebinds the name `client` to a
# `langsmith.Client`; re-run this cell before any code that expects the
# OpenAI client under this name.
client = wrap_openai(OpenAI())

@traceable
def prompt_compliance_evaluator(run: Run, example: Example) -> dict:
    """Score how well a stored model output complies with its system prompt.

    An LLM judge (``gpt-4o-mini``) is shown the system prompt, the human/AI
    message history, the latest user message and the model output, all read
    from ``example``, and is asked for a 0-10 compliance score plus a short
    explanation in JSON.

    Args:
        run: The run being evaluated. Not read by this function; everything
            is taken from ``example``.
        example: Dataset example. ``example.inputs['messages']`` must be a list
            of dicts with a ``type`` ('system' / 'human' / 'ai') and a
            ``data.content`` entry, and ``example.outputs['generations'][0]['text']``
            must hold the model output to judge.

    Returns:
        A dict with ``"key"`` ("prompt_compliance"), ``"score"`` (the judge's
        0-10 score scaled to 0-1) and ``"reason"`` (the judge's explanation).
        When the judge reply cannot be parsed into that shape, the score is 0
        and the reason is a fixed failure message.
    """
    inputs = example.inputs['messages']
    outputs = example.outputs

    # First system message, or "" when the conversation has none.
    system_prompt = next(
        (msg['data']['content'] for msg in inputs if msg['type'] == 'system'),
        "",
    )

    # Human/AI turns re-labelled with chat roles; other message types are skipped.
    message_history = [
        {
            "role": "user" if msg['type'] == 'human' else "assistant",
            "content": msg['data']['content'],
        }
        for msg in inputs
        if msg['type'] in ('human', 'ai')
    ]

    # Pick the most recent *user* turn; the last history entry may be an AI turn.
    latest_message = next(
        (turn['content'] for turn in reversed(message_history) if turn['role'] == 'user'),
        "",
    )
    model_output = outputs['generations'][0]['text']

    # Plain (non-f) string: braces are written singly so the judge sees valid
    # JSON syntax once this text is interpolated into the f-string below.
    judge_prompt = '''

    Based on the above information, your task is to provide a total rating score based on how well the model output complies with the system prompt and context of the conversation..
    Give your answer on a scale of 0 to 10, where 0 is completely non-compliant and 10 is perfectly compliant.

    Here is the scale you should use:
    0: Completely non-compliant: The model output is completely unrelated to the system prompt and context of the conversation.
    1-3: Mostly non-compliant: The model output is mostly unrelated to the system prompt and context of the conversation.
    4-6: Partially compliant: The model output is partially related to the system prompt and context of the conversation.
    7-9: Mostly compliant: The model output is mostly related to the system prompt and context of the conversation.
    10: Perfectly compliant: The model output is perfectly related to the system prompt and context of the conversation.

    Provide a brief explanation for your score. 

    Respond in the following JSON format:
    {
        "score": <int>,
        "explanation": "<string>"
    }

    '''

    evaluation_prompt = f"""
    System Prompt: {system_prompt}

    Message History:
    {json.dumps(message_history, indent=2)}

    Latest User Message: {latest_message}

    Model Output: {model_output}

    {judge_prompt}


    """

    # Build the judge client here rather than reading the notebook-level
    # `client`, which another cell rebinds to a langsmith Client.
    judge_client = wrap_openai(OpenAI())

    response = judge_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are an AI assistant tasked with evaluating the compliance of model outputs to given prompts and conversation context."},
            {"role": "user", "content": evaluation_prompt}
        ],
        temperature=0.2
    )

    try:
        reply = (response.choices[0].message.content or "").strip()
        if reply.startswith("```"):
            # Judges sometimes wrap the JSON in a markdown code fence.
            reply = reply.removeprefix("```json").removeprefix("```").removesuffix("```")
        result = json.loads(reply)
        # Scale 0-10 to 0-1 and clamp in case the judge strays off the scale.
        score = min(max(float(result["score"]) / 10, 0.0), 1.0)
        # NOTE(review): LangSmith's EvaluationResult appears to use `comment`
        # for free-text explanations; confirm that "reason" is surfaced.
        return {
            "key": "prompt_compliance",
            "score": score,
            "reason": result["explanation"]
        }
    except (KeyError, TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover
        # replies that parse but lack the expected fields or types.
        return {
            "key": "prompt_compliance",
            "score": 0,
            "reason": "Failed to parse evaluator response"
        }

# Evaluators applied to every run; here only the prompt-compliance judge.
evaluators = [prompt_compliance_evaluator]

# LangSmith dataset (name or UUID) that supplies the examples.
data = "tbl-retriever"

# Prefix of the experiment name; the results link shows a suffix appended to it.
experiment_prefix = "Evaluating the retrieved articles"

# The target simply echoes its inputs: the evaluator reads the stored example,
# so no new model output needs to be generated here.
results = evaluate(
    lambda inputs: inputs,
    evaluators=evaluators,
    experiment_prefix=experiment_prefix,
    data=data,
)

print(results)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'Evaluating the retrieved articles-8d730206' at:
https://smith.langchain.com/o/dd4af400-2b48-5bd0-935a-465efe9e148e/datasets/0c4dda56-9418-4c36-8339-cbfb1de6336a/compare?selectedSessions=4f3f4855-0a12-4fe1-9b80-bab7b2ec4c2c




0it [00:00, ?it/s]Error running evaluator <DynamicRunEvaluator prompt_compliance_evaluator> on run cfadeac8-00ff-4325-9f15-a1d3ce49236e: RuntimeError('cannot schedule new futures after shutdown')
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/scrape/lib/python3.12/site-packages/langsmith/evaluation/_runner.py", line 1351, in _run_evaluators
    self.client._log_evaluation_feedback(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/scrape/lib/python3.12/site-packages/langsmith/client.py", line 3975, in _log_evaluation_feedback
    _submit_feedback(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/scrape/lib/python3.12/site-packages/langsmith/client.py", line 3961, in _submit_feedback
    _executor.submit(self.create_feedback, **kwargs)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/scrape/lib/python3.12/site-packages/langsmith/utils.py", line 661, in submit
    return super().submit(
           ^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/

KeyboardInterrupt: 

In [43]:
# view examples
from langsmith import Client

# NOTE(review): this rebinds `client`, which an earlier cell set to the wrapped
# OpenAI client; anything that reads the global afterwards gets this object.
client = Client()

data = "tbl-retriever"
examples = client.list_examples(dataset_name=data, limit=10)

# Pre-bind so both names exist even when the dataset yields no examples.
system_prompt = rag_data = None

# Messages are read by position: 0 -> system prompt, 1 -> RAG data, 2 -> user message.
for example in examples:
    system_prompt = example.inputs['messages'][0]['data']['content']
    rag_data = example.inputs['messages'][1]['data']['content']
    user_message = example.inputs['messages'][2]['data']['content']
    print(f"system prompt: {system_prompt}\n{'-' * 100}")
    print(f"RAG data: {rag_data}\n{'-' * 100}")
    print(f"user message: {user_message}")



system prompt: 
You are an expert medical evidence evaluator with a background in synthesizing research from peer-reviewed medical articles. Your task is to critically evaluate summaries of medical studies, synthesize key findings, and provide accurate, evidence-based answers to user queries based on these summaries. You have access to a dataset of articles focused on temperature management, and you can use relevant information from these summaries to answer questions on the topic.

You may also use the get_citation_count function if the user requests the citation count for a paper.

When engaging with the user, follow these guidelines:

	1.	Focus on Evidence: Base your responses primarily on the summaries of medical articles provided. If the summaries don’t fully address the user’s question, clearly state the limitations and provide the best possible answer based on the available data.
	2.	Structure: Begin each response by directly answering the user’s query. Follow up with a synthesi

In [4]:
import openai
# Judge model name. It is captured as the default of `rag_relevance_evaluator`'s
# `model` parameter when that function is defined, so changing it later requires
# re-running the definition. Names that do not contain "gpt" make the evaluator
# use the OpenAI-compatible endpoint at http://localhost:4000.
model = "gemini-flash"

def rag_relevance_evaluator(run: Run, example: Example, model=model) -> dict:
    """Score how relevant an example's retrieved articles are to the user query.

    An LLM judge is shown the retrieved-article text and the user message from
    ``example`` and asked to count relevant vs. total articles and to give a
    0-10 relevance score with a short explanation, as JSON.

    Args:
        run: The run being evaluated. Not read by this function.
        example: Dataset example. ``example.inputs['messages']`` is read by
            position: index 1 as the RAG data and index 2 as the user message
            (each via ``['data']['content']``).
        model: Judge model name. If it contains "gpt" (case-insensitive) the
            default OpenAI client is used; otherwise requests go to an
            OpenAI-compatible endpoint whose URL and key default to
            ``http://localhost:4000`` / ``sk-1234`` and can be overridden with
            the ``JUDGE_PROXY_BASE_URL`` / ``JUDGE_PROXY_API_KEY`` environment
            variables.

    Returns:
        A dict with ``"key"`` ("rag_relevance"), ``"relevant_articles"``,
        ``"total_articles"``, ``"score"`` (the judge's 0-10 score scaled to
        0-1) and ``"reason"``. When the judge reply cannot be parsed into that
        shape, only ``"key"``, ``"score"`` (0) and a fixed ``"reason"`` are
        returned.
    """
    import os  # local import: only needed for the optional endpoint overrides

    messages = example.inputs['messages']
    rag_data = messages[1]['data']['content']
    user_message = messages[2]['data']['content']

    if "gpt" not in model.lower():
        # Non-GPT models are reached through an OpenAI-compatible endpoint.
        client = wrap_openai(
            openai.Client(
                api_key=os.environ.get("JUDGE_PROXY_API_KEY", "sk-1234"),
                base_url=os.environ.get("JUDGE_PROXY_BASE_URL", "http://localhost:4000"),
            )
        )
    else:
        client = wrap_openai(OpenAI())

    # Plain (non-f) string: braces are written singly so the judge sees valid
    # JSON syntax. The score scale is stated explicitly so that dividing by 10
    # below really yields a 0-1 value.
    judge_prompt = '''

    Based on the above information, evaluate the retrieved articles for relevance to the user's query.
    The goal is to minimize the number of irrelevant retrieved articles that cannot be direclty used to answer the query.
    For example, if the query is regarding steroids and sepsis, but an article concerns only one of those topics, it is not relevant.
    Provide a count of the number of relevant articles, and the number of total retrieved articles.
    Provide a score from 0 to 10 based on the overall relevance of the retrieved articles to the user's query, where 0 means none of the articles are relevant and 10 means all of them are relevant.

    Provide a brief explanation for your score. 

    Respond in the following JSON format:
    {
        "relevant_articles": <int>,
        "total_articles": <int>,
        "score": <int>,
        "explanation": "<string>"
    }

    '''

    # Data first, instructions last, so "the above information" in the judge
    # prompt actually refers to something.
    evaluation_prompt = f"""
    RAG Data: {rag_data}

    User Message: {user_message}

    {judge_prompt}


    """

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an AI assistant tasked with evaluating how relevant retrieved articles are to a user's query."},
            {"role": "user", "content": evaluation_prompt}
        ],
        temperature=0.1
    )

    try:
        reply = (response.choices[0].message.content or "").strip()
        if reply.startswith("```"):
            # Remove a markdown code fence as a prefix/suffix; str.strip(chars)
            # would instead strip any of those characters from both ends.
            reply = reply.removeprefix("```json").removeprefix("```").removesuffix("```")
        result = json.loads(reply)
        # Scale 0-10 to 0-1 and clamp in case the judge strays off the scale.
        score = min(max(float(result["score"]) / 10, 0.0), 1.0)
        return {
            "key": "rag_relevance",
            "relevant_articles": result["relevant_articles"],
            "total_articles": result["total_articles"],
            "score": score,
            "reason": result["explanation"]
        }
    except (KeyError, TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover
        # replies that parse but lack the expected fields or types.
        return {
            "key": "rag_relevance",
            "score": 0,
            "reason": "Failed to parse evaluator response",
        }

# Evaluators applied to every run; here only the RAG-relevance judge.
evaluators = [rag_relevance_evaluator]

# LangSmith dataset (name or UUID) that supplies the examples.
data = "tbl-retriever"

# Prefix of the experiment name; the results link shows a suffix appended to it.
experiment_prefix = "Evaluating the retrieved articles"

# The target simply echoes its inputs: the evaluator reads the stored example,
# so no new model output needs to be generated here.
results = evaluate(
    lambda inputs: inputs,
    evaluators=evaluators,
    experiment_prefix=experiment_prefix,
    data=data,
)

print(results)

View the evaluation results for experiment: 'Evaluating the retrieved articles-3dfc5f7c' at:
https://smith.langchain.com/o/dd4af400-2b48-5bd0-935a-465efe9e148e/datasets/0c4dda56-9418-4c36-8339-cbfb1de6336a/compare?selectedSessions=5f4de9f0-6dbe-4485-afbd-cd05ca3eba58




11it [00:03,  3.38it/s]

<ExperimentResults Evaluating the retrieved articles-3dfc5f7c>





Retriever evaluation