In [13]:
import sys
import os

sys.path.append(os.path.abspath(".."))

from langsmith import Client
from typing import TypedDict
from typing_extensions import Annotated
from langchain.chat_models import init_chat_model
from src.agent import graph

In [14]:
from langsmith import Client

# LangSmith client; used below for dataset management and to run the evaluation.
client = Client()
# Name of the LangSmith dataset that stores the reference test suites.
dataset_name = "test_the_correct_tests_are_generated"

def _case(args: list, expect, kwargs=None) -> dict:
    """Build one reference test case: positional args, keyword args, expected result."""
    return {"args": args, "kwargs": {} if kwargs is None else kwargs, "expect": expect}


def _example(code: str, function_name: str, cases: list) -> dict:
    """Pair a function's source code (the input) with its reference test suite (the output)."""
    return {
        "inputs": {"code": code},
        "outputs": {"tests": {"function_name": function_name, "tests": cases}},
    }


# One dataset example per target function: the function source is the input and
# the hand-written test suite is the reference output.
examples = [
    # 1) add_two_numbers
    _example(
        "def add_two_numbers(a: int, b: int) -> int:\n    return a + b",
        "add_two_numbers",
        [
            _case([1, 2], 3),
            _case([0, 0], 0),
            _case([-5, 2], -3),
            _case([10**9, 10**9], 2000000000),
        ],
    ),
    # 2) print_hello_world
    _example(
        "def print_hello_world() -> None:\n    print('Hello, World!')",
        "print_hello_world",
        [
            _case([], None),
        ],
    ),
    # 3) concatenate_strings
    _example(
        "def concatenate_strings(a: str, b: str) -> str:\n    return a + ' ' + b",
        "concatenate_strings",
        [
            _case(["hello", "world"], "hello world"),
            _case(["", ""], " "),
            _case(["hello", ""], "hello "),
            _case(["", "world"], " world"),
        ],
    ),
    # 4) factorial
    _example(
        "def factorial(n: int) -> int:\n    return 1 if n == 0 else n * factorial(n - 1)",
        "factorial",
        [
            _case([0], 1),
            _case([1], 1),
            _case([5], 120),
        ],
    ),
    # 5) is_even
    _example(
        "def is_even(n: int) -> bool:\n    return n % 2 == 0",
        "is_even",
        [
            _case([2], True),
            _case([3], False),
            _case([0], True),
            _case([-4], True),
        ],
    ),
    # 6) reverse_list
    _example(
        "def reverse_list(lst: list) -> list:\n    return lst[::-1]",
        "reverse_list",
        [
            _case([[1, 2, 3]], [3, 2, 1]),
            _case([[]], []),
            _case([["a", "b"]], ["b", "a"]),
        ],
    ),
    # 7) get_max
    _example(
        "def get_max(a: int, b: int) -> int:\n    return a if a > b else b",
        "get_max",
        [
            _case([1, 2], 2),
            _case([5, 5], 5),
            _case([-1, -5], -1),
        ],
    ),
    # 8) join_words
    _example(
        "def join_words(words: list[str], sep: str = ',') -> str:\n    return sep.join(words)",
        "join_words",
        [
            _case([["a", "b", "c"]], "a,b,c"),
            _case([["a", "b"]], "a-b", kwargs={"sep": "-"}),
            _case([[]], ""),
        ],
    ),
]

# Reuse the dataset when it already exists, otherwise create it.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name)

# Upload the examples only while the dataset is still empty. Uploading
# unconditionally appends another full copy of `examples` every time this cell
# is re-run, which duplicates rows and skews later evaluation results.
dataset_is_empty = not any(client.list_examples(dataset_id=dataset.id, limit=1))
upload_response = (
    client.create_examples(dataset_id=dataset.id, examples=examples)
    if dataset_is_empty
    else None
)

# Show the upload response as the cell output (nothing is shown when the upload was skipped).
upload_response


{'example_ids': ['4ea66859-98cd-4102-9b42-962b52c78b56',
  'bf08d7fc-8473-42a9-aa94-32ec51f7899c',
  '817c9cf1-87ae-4de0-8a87-690c9415a5de',
  '4918ef7a-4409-4373-92ee-99eec58b7ac4',
  'b9d7557b-c8eb-4880-aa48-d7b656ecdaf8',
  '558301d9-8d07-4f1e-a9f6-9a2d16a81141',
  '57ca4968-4c99-49bb-a77e-12d85f6e3abf',
  '8da93b5c-500b-490d-aa47-015f44d3b2b5'],
 'count': 8}

In [15]:
# System prompt for the LLM judge: names the three labelled inputs it is given
# (QUESTION, GROUND_TRUTH, STUDENT_RESPONSE) and the rubric behind the
# True/False verdict. The text is sent to the model verbatim, so any wording
# change here changes grading behavior.
grader_instructions_for_tests = """
You are grading a student's JSON test suite for a given Python function.

You are given:
- QUESTION: the task the tests target (the input code’s behavior)
- GROUND_TRUTH: a correct JSON test suite for that function
- STUDENT_RESPONSE: the student's JSON (should be a JSON object)

Grade ONLY for structural validity and factual alignment to ground-truth behavior.

RUBRIC
1) Must be valid JSON (no Markdown/prose). Top-level keys:
   - "function_name": string
   - "tests": array of objects with keys {"args": array, "kwargs": object, "expect": any}
2) All args/kwargs/expect must be JSON-serializable.
3) Tests must cover the happy path and at least two edge cases relevant to the function.
4) Extra tests are OK if consistent with the function’s true behavior.
5) Function name must match the target function in the input code.

Correctness:
True means that the students response meets all the criteria.
False means that the students response does not meet all the criteria.

"""


In [16]:
# Output schema for the LLM-as-a-judge grader.
# NOTE(review): the class docstring and the Annotated description strings are
# presumably forwarded to the model as part of the structured-output schema —
# treat them as prompt text, not as ordinary documentation. TODO confirm.
class Grade(TypedDict):
    """Compare the expected and actual answer and grade the actual answer"""

    # Free-text justification for the verdict.
    reasoning: Annotated[
        str, ..., "Explain your reasoning whether the actual response is correct or not"
    ]
    # Pass/fail verdict.
    is_correct: Annotated[
        bool, ..., "True if the student response is correct, False if it is not"
    ]


# Judge model wrapped so that its replies follow the `Grade` schema.
# temperature=0 is presumably chosen to keep grading as repeatable as possible.
grader_llm = init_chat_model(model="gpt-4o", temperature=0).with_structured_output(
    Grade
)


In [17]:
def correctness(
    inputs: dict, outputs: dict, reference_outputs: dict
) -> bool:
    """Ask the judge LLM whether the generated test suite meets the grading rubric.

    Args:
        inputs: Dataset example inputs; ``inputs["code"]`` is the source of the
            function under test.
        outputs: Result of the target function; ``outputs["tests"]`` is the
            generated test suite.
        reference_outputs: Dataset example outputs; ``reference_outputs["tests"]``
            is the reference test suite.

    Returns:
        The judge's ``is_correct`` verdict as a bool.

    Raises:
        KeyError: If ``"code"`` or ``"tests"`` is missing from the respective dict.
    """
    # Local import keeps this cell self-contained.
    import json

    def _as_json(value) -> str:
        # The rubric requires the response to be valid JSON. Interpolating a
        # Python object directly would show its repr (single quotes, None/True),
        # which is not JSON and can be graded as a structural failure.
        if isinstance(value, str):
            # Already text (e.g. raw JSON produced by the model): pass through untouched.
            return value
        return json.dumps(value, indent=2, default=str)

    # Labels match the ones announced in the grader's system prompt.
    user_instruction = f"""
    QUESTION: {inputs["code"]}
    GROUND_TRUTH: {_as_json(reference_outputs["tests"])}
    STUDENT_RESPONSE: {_as_json(outputs["tests"])}
    """

    grade = grader_llm.invoke(
        [
            {"role": "system", "content": grader_instructions_for_tests},
            {"role": "user", "content": user_instruction},
        ]
    )

    return bool(grade["is_correct"])


In [18]:
def target_function(inputs: dict) -> dict:
    """Run only the ``generate_test`` graph node for one dataset example.

    Args:
        inputs: Example inputs, passed unchanged to the node.

    Returns:
        ``{"tests": ...}`` holding the tests found in the node result's
        ``update`` mapping.
    """
    generate_test_node = graph.nodes["generate_test"]
    node_result = generate_test_node.invoke(inputs)

    # NOTE(review): assumes the node returns an object exposing an `update`
    # mapping (presumably a langgraph Command) — confirm against src.agent.
    generated_tests = node_result.update["tests"]
    return {"tests": generated_tests}


In [19]:

# Run `target_function` over the examples of the named dataset and score each
# output with the `correctness` evaluator.
experiment_results = client.evaluate(
    target_function,
    data=dataset_name,
    evaluators=[correctness],
    experiment_prefix="test_the_correct_tests_are_generated",
)


View the evaluation results for experiment: 'test_the_correct_tests_are_generated-563cf512' at:
https://smith.langchain.com/o/5e26199c-44b7-5d71-a174-0781dc496380/datasets/1cc9ffbc-7579-4b4b-b21f-fde01facb340/compare?selectedSessions=7cf99e8e-d108-4fee-ac5f-385325bb7cd1




1it [00:08,  8.12s/it]Error running target function: Expecting ',' delimiter: line 5 column 28 (char 153)
Traceback (most recent call last):
  File "/Users/jameskanyiri/LANGGRAPH/langgraph_coding_agent_workflow/.venv/lib/python3.13/site-packages/langsmith/evaluation/_runner.py", line 1924, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
    ~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/lx/47jl9ym97js4vymgwnc01kww0000gn/T/ipykernel_46279/2221717457.py", line 4, in target_function
    result = graph.nodes["generate_test"].invoke(inputs)
  File "/Users/jameskanyiri/LANGGRAPH/langgraph_coding_agent_workflow/.venv/lib/python3.13/site-packages/langgraph/pregel/_read.py", line 234, in invoke
    return self.bound.invoke(
           ~~~~~~~~~~~~~~~~~^
        input,
        ^^^^^^
        merge_configs({"metadata": self.metadata, "tags": self.tags}, config),
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        **kwargs,
        ^^^