In [197]:
import sys
import os

sys.path.append(os.path.abspath(".."))

from langsmith import Client
from typing import TypedDict
from typing_extensions import Annotated
from langchain.chat_models import init_chat_model
from src.agent import graph

In [198]:
from langsmith import Client
from langchain_core.messages import HumanMessage

client = Client()
dataset_name = "test_the_correct_code_is_generated"

# Original examples (with HumanMessage objects)
examples = [
    {
        "inputs": {
            "messages": [
                {"role": "user", "content": "I want a function to add two numbers"}
            ]
        },
        "outputs": {
            "code": "def add_two_numbers(a: int, b: int) -> int:\n    return a + b"
        },
    },
    {
        "inputs": {
            "messages": [
                {
                    "role": "user",
                    "content": "I want a function to print 'Hello, World!'",
                }
            ]
        },
        "outputs": {"code": "def print_hello_world():\n    print('Hello, World!')"},
    },
    {
        "inputs": {
            "messages": [
                {
                    "role": "user",
                    "content": "I want a function to concatenate two strings with a space in between",
                }
            ]
        },
        "outputs": {
            "code": "def concatenate_strings(a: str, b: str) -> str:\n    return a + ' ' + b"
        },
    },
]


# Create (or append to) the dataset using converted inputs
if not client.has_dataset(dataset_name=dataset_name):
    dataset = client.create_dataset(dataset_name)
else:
    dataset = client.read_dataset(dataset_name=dataset_name)

client.create_examples(dataset_id=dataset.id, examples=examples)

{'example_ids': ['e176addc-0443-4e95-a74e-34db446be41d',
  'fa4f2525-c14c-4063-b033-63e80fd2ee78',
  'c9dd033e-4486-4f4d-be7c-2be8ef38114f'],
 'count': 3}

In [199]:

# LLM as a Judge Instruction
grader_instructions = """ 

You are a teacher grading a  student's code.

You will be given a QUESTION, the GROUND TRUTH (correct) RESPONSE and the STUDENT'S RESPONSE.

Here is the grading criteria to follow:
(1) Grade the student responses based ONLY on their factual accuracy to the ground truth answer.]
(2) Ensure that the student response is a valid Python code.
(3) Ensure the student response does not include any markdown, comments, or extra text outside the JSON.
(4) It is CORRECT if the student response contains more information than the ground truth response, as long as it is factually accurate relative to the  ground truth response.


Correctness:
True means that the students response meets all the criteria.
False means that the students response does not meet all the criteria.

Explain  your reasoning in a step by step manner to ensure your reasoning and  conclusion is correct.
"""

In [200]:
# LLM as a Judge Output schema
class Grade(TypedDict):
    """Compare the expected and actual answer and grade the actual answer"""

    reasoning: Annotated[
        str, ..., "Explain your reasoning whether the actual response is correct or not"
    ]
    is_correct: Annotated[
        bool, ..., "True if the student response is correct, False if it is not"
    ]


grader_llm = init_chat_model(model="gpt-4o", temperature=0).with_structured_output(
    Grade
)


In [None]:
def correctness(
    inputs: dict, outputs: dict, reference_outputs: dict
) -> bool:
    """Evaluate if the final answer is correct"""

    user_instruction = f"""
    QUESTION: {inputs["messages"][-1]}
    GROUND TRUTH RESPONSE: {reference_outputs["code"]}
    STUDENT'S RESPONSE: {outputs}
    """

    grade = grader_llm.invoke(
        [
            {"role": "system", "content": grader_instructions},
            {"role": "user", "content": user_instruction},
        ]
    )

    return grade["is_correct"]


In [202]:
from langchain_core.messages import HumanMessage

inputs = {
    "messages": [
        HumanMessage(content="I want a function to concatenate two strings with a space in between")
    ]
}


In [None]:
from typing import Any, Dict, List
from langchain_core.messages import BaseMessage, HumanMessage

def target_function(inputs: Dict[str, Any]) -> Dict[str, str]:
    
    #Get the messages from the dataset
    msgs: List[Any] = inputs.get("messages", [])

    #Convert to langchain messages object
    msgs = [
        HumanMessage(content=(m.get("content", "") if isinstance(m, dict) else str(m)))
        for m in msgs
    ]

    #Invoke the generate_code node
    result = graph.nodes["generate_code"].invoke({"messages": msgs})
    
    
    return {"code": result.update["code"]}


In [204]:
target_function(inputs)

{'code': 'def concatenate_strings_with_space(str1: str, str2: str) -> str:\n    return str1 + " " + str2'}

In [None]:

experiment_results = client.evaluate(
    target_function,
    data=dataset_name,
    evaluators=[correctness],
    experiment_prefix="test_the_correct_code_is_generated",
)


View the evaluation results for experiment: 'test_the_correct_code_is_generated-1b6d4252' at:
https://smith.langchain.com/o/5e26199c-44b7-5d71-a174-0781dc496380/datasets/f7cffa21-1bc1-4d1b-8932-915778416070/compare?selectedSessions=b83669aa-14f7-45cc-86fa-f38ad1f2f77f




6it [00:18,  3.14s/it]
