In [7]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel (mode 2).
# Re-running this cell in a live kernel only prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [108]:
import os

from dotenv import load_dotenv

# Load environment variables via python-dotenv before reading the key.
load_dotenv()

# Fail fast when the Anthropic credential is missing or empty, instead of
# failing later inside the first model call.
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
    raise ValueError("Anthropic API key not found")


In [31]:
import os
from jinja2 import Environment, FileSystemLoader

# Root of the agent-coding-template checkout. It can be overridden with the
# AGENT_CODING_TEMPLATE_ROOT environment variable so the notebook is not tied
# to a single machine; the default keeps the original location.
PROJECT_ROOT = os.environ.get(
    "AGENT_CODING_TEMPLATE_ROOT",
    "/Users/khannguyen/projects/agent-coding-template",
)

# Jinja2 environment that serves the agent's system prompt templates.
system_prompt_env = Environment(
    loader=FileSystemLoader(
        os.path.join(PROJECT_ROOT, "src", "agent_coding_template", "prompts")
    )
)

# Jinja2 environment that serves the use-case (task) prompt templates.
usecase_prompt_env = Environment(
    loader=FileSystemLoader(
        os.path.join(PROJECT_ROOT, "use-case-tests", "nvidia-eval-adaptors")
    )
)


In [62]:
# Import relevant functionality
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import create_react_agent

from agent_coding_template.tools import *

from typing import Dict, TypedDict, Annotated, Sequence
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage
from langgraph.prebuilt import create_react_agent
from langgraph.graph import StateGraph, END, START
from agent_coding_template.tools import *
import uuid
from agent_coding_template.utils.pretty_print import pretty_print_chunk

# In-memory checkpointer handed to the agent so conversation state is kept
# between steps of a run.
memory = MemorySaver()

# Fresh thread id for this run. It is converted to a string so the value
# matches the `thread_id: str` declared by AgentState in this notebook
# instead of passing a raw uuid.UUID object.
config = {"configurable": {"thread_id": str(uuid.uuid4())}}


In [67]:
# Define state schema
class AgentState(TypedDict):
    """State schema with a message sequence and a thread identifier.

    NOTE(review): this schema is not passed to ``create_react_agent`` or
    ``StateGraph`` in the visible cells — confirm whether it is still needed.
    """
    # Conversation messages; annotated as a sequence of HumanMessage objects.
    messages: Sequence[HumanMessage]
    # Presumably the same thread id used in the run config — TODO confirm.
    thread_id: str

# Create tools and model
model = ChatAnthropic(model_name="claude-3-5-sonnet-20241022", max_tokens=8000)
tools = [
    DirectoryMapperTool(),
    FileReaderTool(),
    ReadmeReaderTool(),
    FileEditorTool(),
    CreateDirectoryTool(),
    FileCopyTool(),
    FileRenameTool(),
]

# Render the system prompt from its Jinja template. The rendered text is bound
# only to `system_prompt`; the earlier chained assignment also bound it to
# `template`, a name that is reused below for a Template object.
system_prompt = system_prompt_env.get_template("coding_agent_system_prompt.j2").render()
print(system_prompt)

# Prebuilt ReAct-style agent wired to the model, the tools, the checkpointer
# and the rendered system prompt.
agent = create_react_agent(model, tools, checkpointer=memory, prompt=system_prompt)

# Select the use-case template to run; the earlier steps are kept commented
# out so they can be re-enabled one at a time.
# template = usecase_prompt_env.get_template("1_add_dependencies.j2")
# template = usecase_prompt_env.get_template("2_create_submodule_and_copy_entrypoint.j2")
template = usecase_prompt_env.get_template("3_convert_entrypoint_to_standard_interface.j2")

# Fill in the task prompt with the target repository and instruction file.
content = template.render(
    repo_dir="/Users/khannguyen/projects/evaluations/evalplus",
    adaptor_instruction_filepath="/Users/khannguyen/projects/agent-coding-template/use-case-tests/nvidia-eval-adaptors/nvidia_eval_adaptor_instruction.md",
)
print(content)

# Send the rendered task as the first human message and print every streamed
# chunk as it arrives.
initial_input = {"messages": [HumanMessage(content=content)]}
for chunk in agent.stream(initial_input, config):
    pretty_print_chunk(chunk)

**Code Assistant Protocol**

You are an advanced code editing assistant.

**File Editing Protocol** (MUST FOLLOW when making changes):
1. **Pre-Edit Verification** - ALWAYS use FileReaderTool to:
   - Cross-validate paths using DirectoryMapperTool
   - Get current file state WITH LINE NUMBERS
   - Verify exact location for modifications
   
2. **Modification Rules**:
   - Use FileEditorTool to make changes to the file.
   - NEVER assume line numbers - ALWAYS verify via FileReaderTool
   - NEVER include unchanged content - only send NEW/CHANGED lines
   - Example: To add 1 line at position 42:
     ✅ Correct: insert at line 42 with content
     ❌ Wrong: Rewriting entire file with new line added

3. **Sequential Edit Protocol**:
   - After ANY modification to a file, line numbers MUST be re-verified
   - ALWAYS use FileReaderTool again before making additional edits to the same file
   - Treat each edit as independent, requiring fresh line number validation

**Token Efficiency Guidelines**

In [126]:
from typing import Literal

from langchain_anthropic import ChatAnthropic
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI

from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import MessagesState, StateGraph, START
from langgraph.prebuilt import ToolNode

from langchain_core.messages import (
    AIMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
    trim_messages,
)



# In-memory checkpointer used when the graph is compiled further down.
memory = MemorySaver()

# Fresh thread id for this run. It is converted to a string so the value
# matches the `thread_id: str` declared by AgentState in this notebook
# instead of passing a raw uuid.UUID object.
config = {"configurable": {"thread_id": str(uuid.uuid4())}}

# Tools exposed to the model, plus the graph node that runs them.
tools = [
    DirectoryMapperTool(),
    FileReaderTool(),
    ReadmeReaderTool(),
    FileEditorTool(),
    CreateDirectoryTool(),
    FileCopyTool(),
    FileRenameTool(),
]
tool_node = ToolNode(tools)

model = ChatAnthropic(model="claude-3-5-sonnet-20241022", max_tokens=8000)

# Model with the tool definitions bound to it.
bound_model = model.bind_tools(tools)

def should_continue(state: MessagesState):
    """Choose the next node from the most recent message.

    Returns ``"tools"`` when the last message in ``state["messages"]`` has
    tool calls, and ``END`` when it has none.
    """
    latest = state["messages"][-1]
    return "tools" if latest.tool_calls else END


def filter_messages(messages: list):
    """Trim the conversation history before it is sent to the model.

    Prints the number of messages before and after trimming and returns
    whatever ``trim_messages`` produces for the options below.

    NOTE(review): token counting uses a ChatOpenAI instance, built on every
    call, while the model invoked in this notebook is ChatAnthropic — confirm
    that this approximation is intended.
    """
    print(f"Length of messages before trimming: {len(messages)}")

    trim_options = {
        "max_tokens": 20000,
        "strategy": "last",
        "token_counter": ChatOpenAI(model="gpt-4o"),
        # Chat models generally expect the history to start with a
        # HumanMessage, or with a SystemMessage followed by a HumanMessage.
        "start_on": "human",
        # Keep the SystemMessage when the original history has one, since it
        # carries the special instructions for the model.
        "include_system": True,
        "allow_partial": False,
    }
    messages = trim_messages(messages, **trim_options)

    print(f"Length of messages after trimming: {len(messages)}")
    return messages


# Define the function that calls the model
def call_model(state: MessagesState):
    """Agent node: trim the history, call the tool-bound model, return its reply.

    The reply is returned under the ``"messages"`` key as a single message
    object; it is not wrapped in a list.
    """
    trimmed_history = filter_messages(state["messages"])
    # NOTE(review): presumably the MessagesState reducer merges this single
    # message into the stored history — confirm against the LangGraph docs.
    return {"messages": bound_model.invoke(trimmed_history)}

# Build the agent <-> tools loop as a state graph over MessagesState.
workflow = StateGraph(MessagesState)

# The two nodes the graph cycles between: "agent" runs call_model and
# "tools" runs the tool node.
workflow.add_node("agent", call_model)
workflow.add_node("tools", tool_node)

# Entry point: execution begins at the "agent" node.
workflow.add_edge(START, "agent")

# After "agent" runs, should_continue picks the next hop; the list names
# every node that edge may lead to.
workflow.add_conditional_edges("agent", should_continue, ["tools", END])

# Once the tools have run, control always goes back to "agent".
workflow.add_edge("tools", "agent")

# Compile into a runnable app, with the checkpointer attached.
app = workflow.compile(checkpointer=memory)


In [125]:
# Select the use-case template to run; the earlier steps are kept commented
# out so they can be re-enabled one at a time.
# template = usecase_prompt_env.get_template("1_add_dependencies.j2")
# template = usecase_prompt_env.get_template("2_create_submodule_and_copy_entrypoint.j2")
template = usecase_prompt_env.get_template("3_convert_entrypoint_to_standard_interface.j2")

# Fill in the task prompt with the dependency path, the target repository and
# the instruction file.
content = template.render(
    dependency="../../eval-core-utils",
    repo_dir="/Users/khannguyen/projects/evaluations/ToolTalk",
    adaptor_instruction_filepath="/Users/khannguyen/projects/agent-coding-template/use-case-tests/nvidia-eval-adaptors/nvidia_eval_adaptor_instruction.md",
)
print(content)

# Send the rendered task to the compiled graph and print every streamed chunk
# as it arrives.
initial_input = {"messages": [HumanMessage(content=content)]}
for chunk in app.stream(initial_input, config):
    pretty_print_chunk(chunk)


# Task: Implement NVIDIA Evaluation Adaptor

## Objective
Modify the entry point to use the interface defined in `eval_core_utils`.

## Context
- Repository Directory: /Users/khannguyen/projects/evaluations/evalplus
- Interface conversion instructions: /Users/khannguyen/projects/agent-coding-template/use-case-tests/nvidia-eval-adaptors/nvidia_eval_adaptor_instruction.md
- Entry point file: nvidia_eval_adaptor.py

## Required Actions

1. Read Interface Conversion Instructions
   - Study the interface conversion instructions

2. Edit Entry Point
   - Convert CLI logic to programmatic interface
   - Implement data model transformations between existing benchmark and framework formats

## Key Guidelines
1. Clean Integration
   - NO changes to existing benchmark code. Keep ALL changes within the `nvidia` submodule.
Length of messages before trimming: 11
Length of messages after trimming: 11

=== AGENT MESSAGE ===
{'agent': {'messages': AIMessage(content=[{'text': "I'll help you modify the 