In [1]:
import json
import os
import time
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from copy import deepcopy

from llm_interact import LLMInteractor, LLMConfig
from llm_interact_env import Environment, EnvironmentConfig, Task, run


### Description of the saved checkpoint


The saved checkpoint is a JSON file that tracks all the messages. Its main body is "conversation_history", in which each entry has the following structure:

 - "role": user_agent or assistant_agent
 - "prompt_received": the prompt received by this agent
   - When this entry asks the assistant_agent to generate a summary, this attribute is named "prompt_received_for_summary" instead.
 - "all_messages": the response of this agent. For the assistant agent, it also includes the interaction between the agent and the computer console. This is the direct output of Open Interpreter, which is a list of dicts. **Note that in each dict in "all_messages", the "role" is always "assistant", even for the user_agent. This follows the OpenAI message format used when calling LLM APIs.**
  


## example 1: 

Here we have several predefined tasks, where each task is already a "subtask".
We simply let the user_agent re-describe the task, and the assistant_agent then completes it.
If the task is completed, we continue to the next task; otherwise, we stay on the current task and continue to the next turn.

In [None]:
# --- Example 1 setup: agent configs, prompt templates, environment, tasks ---

# Both agents are loaded from the same TOML file via two separate from_toml
# calls; run_code is then switched on for the assistant config only.
EXAMPLE1_LLM_CONFIG_PATH = "./llm_config.toml"
user_config = LLMConfig.from_toml(EXAMPLE1_LLM_CONFIG_PATH)
assistant_config = LLMConfig.from_toml(EXAMPLE1_LLM_CONFIG_PATH)
assistant_config.run_code = True

# Prompt given to the user agent when it first describes a task.
# Placeholders: {task_list}, {current_task}.
EXAMPLE1_USER_PROMPT = """
    You are a user who needs help completing tasks. You should communicate clearly
    what you need help with based on your current task.

    Your task list:
    {task_list}

    current task:
    {current_task}

    Based on the above, write a message to the assistant with your request.
    Be specific about what you need and reference previous interactions if relevant.
    DO NOT write code yourself, but DO ask for explanations of the code the assistant writes if you think it is necessary.

    Your output should starts with "Task: [description]" (requires your action)
    """

# Prompt given to the user agent when the same task has to be requested again.
# Placeholders: {task_list}, {current_task}, {assistant_summary}.
EXAMPLE1_USER_RETRY_PROMPT = """
    You are a user who needs help completing tasks. You should communicate clearly
    what you need help with based on your current task.

    Your task list:
    {task_list}

    current task:
    {current_task}

    The assistant has already completed the task, but the result is not satisfactory. Here are the assistant's operations:

    Assistant Operation:
    {assistant_summary}

    Based on the above, write a message to the assistant with your request.
    Be specific about what you need and reference previous interactions if relevant.

    Your output should starts with "Task: [description]" (requires your action)
    """

# Environment configuration for the "version1" run.
env_config = EnvironmentConfig(
    user_llm_config=user_config,
    assistant_llm_config=assistant_config,
    assistant_agent_type="base-agent",
    interpreter_config_path="interpreter_config.toml",
    user_prompt_template=EXAMPLE1_USER_PROMPT,
    max_turns=30,
    user_retry_prompt_template=EXAMPLE1_USER_RETRY_PROMPT,
    checkpoint_path="environment_state_version1.json",
)

# Task list: (id, description) pairs; every task uses an empty success_criteria.
tasks = [
    Task(id=task_id, description=task_description, success_criteria="")
    for task_id, task_description in (
        (
            "1",
            "I have a csv file at test_data/Titanic-Dataset-dirty.csv, The dataset description is at test_data/dataset_description.txt. The dataset is dirty, and I want you to help me clean it by dropping missing values and correcting any typos.",
        ),
        (
            "2",
            "Based on the cleaned dataset, I want you to calculate the average survival rate for male and female passengers.",
        ),
    )
]


2025-04-28 13:22:06 - DataSciBench - INFO - Loaded configuration from ./llm_config.toml
2025-04-28 13:22:06 - DataSciBench - INFO - Loaded configuration from ./llm_config.toml


In [5]:

# Build the environment from env_config and the task list, then run it with the
# "version1" strategy. Whatever run() returns is kept in completed_tasks.
environment = Environment(env_config, tasks)
completed_tasks = run(environment, "version1")

2025-04-28 13:22:08 - DataSciBench - INFO - Initializing Environment with 2 tasks
2025-04-28 13:22:08 - DataSciBench - INFO - Initialized LLMInteractor with model: gemini/gemini-2.0-flash, temperature: 0.4
2025-04-28 13:22:08 - DataSciBench - INFO - Initialized LLMInteractor with model: gemini/gemini-2.0-flash, temperature: 0.4
2025-04-28 13:22:08 - DataSciBench - INFO - Starting environment run with version: version1
2025-04-28 13:22:08 - DataSciBench - INFO - Starting interaction between agents
2025-04-28 13:22:08 - DataSciBench - INFO - Starting interaction using version1 strategy
2025-04-28 13:22:08 - DataSciBench - INFO - Starting Task 1: I have a csv file at test_data/Titanic-Dataset-dirty.csv, The dataset description is at test_data/dataset_description.txt. The dataset is dirty, and I want you to help me clean it by dropping missing values and correcting any typos.

--- TASK 1: I have a csv file at test_data/Titanic-Dataset-dirty.csv, The dataset description is at test_data/data

In [4]:
# Display the assistant agent's message list as the cell output.
environment.assistant_agent.messages

[{'role': 'user',
  'type': 'message',
  'content': "Task: I need help cleaning the Titanic dataset. I have a CSV file located at `test_data/Titanic-Dataset-dirty.csv` and a dataset description at `test_data/dataset_description.txt`. The dataset is dirty, and I want you to help me clean it. Specifically, I need you to:\n\n1.  **Drop rows with missing values.**\n2.  **Correct any typos** you find in the data. Please pay close attention to the categorical columns, such as 'Sex', 'Embarked', and 'Cabin'.\n3.  **Explain your cleaning steps.** I want to understand the rationale behind each cleaning operation you perform.\n"},
 {'role': 'assistant',
  'type': 'message',
  'content': '1.  Explanation: Load the dataset using pandas and print the info and the first few rows to understand the data structure and identify missing values and potential typos.\n2.  Code:\n'},
 {'role': 'assistant',
  'type': 'code',
  'format': 'python',
  'content': '\nimport pandas as pd\n\ntry:\n    df = pd.read_c

In [5]:
# Display the conversation history recorded by the environment.
environment.conversation_history

[{'role': 'user agent',
  'prompt_received': '\n    You are a user who needs help completing tasks. You should communicate clearly\n    what you need help with based on your current task.\n\n    Your task list:\n    - [ ] Task 1: I have a csv file at test_data/Titanic-Dataset-dirty.csv, The dataset description is at test_data/dataset_description.txt. The dataset is dirty, and I want you to help me clean it by dropping missing values and correcting any typos.\n- [ ] Task 2: Based on the cleaned dataset, I want you to calculate the average survival rate for male and female passengers.\n\n    current task:\n    CURRENT TASK: I have a csv file at test_data/Titanic-Dataset-dirty.csv, The dataset description is at test_data/dataset_description.txt. The dataset is dirty, and I want you to help me clean it by dropping missing values and correcting any typos.\n\nSuccess criteria: \n\n    Based on the above, write a message to the assistant with your request.\n    Be specific about what you need

## example 2: 

Here we have several predefined subtasks, and we show all of them together to the user LLM.
The user LLM then gives the assistant LLM one subtask at a time; it decides which subtask to allocate next based on a summary of the assistant's response.

In [2]:


# --- Example 2 setup: agent configs, prompt templates, environment, tasks ---

# Both agents are loaded from the same TOML file via two separate from_toml
# calls; run_code is then switched on for the assistant config only.
EXAMPLE2_LLM_CONFIG_PATH = "./llm_config.toml"
user_config = LLMConfig.from_toml(EXAMPLE2_LLM_CONFIG_PATH)
assistant_config = LLMConfig.from_toml(EXAMPLE2_LLM_CONFIG_PATH)
assistant_config.run_code = True

# Opening prompt for the user agent: it sees the whole task list and picks the
# subtask to hand over. Placeholder: {task_list}.
EXAMPLE2_USER_PROMPT = """
    You are a user who needs help completing tasks. You should communicate clearly
    what you need help with based on your current task.
    
    Your task list:
    {task_list}
    
    Based on the above, write a message to the assistant with your request.
    You should specify which subtask to allocate to the assistant based on the past conversation. Please communicate with the assistant in a way that the subtask description is clear and concise.
    You can also ask the assistant to help you with other subtasks that are not in the task list if you think it is necessary.
    When all the tasks are completed, please say "##ALL_TASKS_COMPLETED##", and your conversation with the assistant should be terminated.
    DO NOT write code yourself, but DO ask for explanations of the code the assistant writes if you think it is necessary.
    """

# Follow-up prompt for the user agent on later turns.
# Placeholder: {assistant_summary}.
EXAMPLE2_USER_CONTINUE_PROMPT = """
    The assistant has responded to your last message. The assistant's response is as follows:
    {assistant_summary}

    Now, please continue to the next subtask. When all the tasks are completed, please say "##ALL_TASKS_COMPLETED##", and your conversation with the assistant should be terminated.
    """

# Environment configuration for the "version2" run.
env_config = EnvironmentConfig(
    user_llm_config=user_config,
    assistant_llm_config=assistant_config,
    interpreter_config_path="interpreter_config.toml",
    user_prompt_template=EXAMPLE2_USER_PROMPT,
    user_continue_prompt_template=EXAMPLE2_USER_CONTINUE_PROMPT,
    max_turns=30,
    checkpoint_path="environment_state_version2.json",
)

# Task list: (id, description) pairs; every task uses an empty success_criteria.
tasks = [
    Task(id=task_id, description=task_description, success_criteria="")
    for task_id, task_description in (
        (
            "1",
            "I have a csv file at test_data/Titanic-Dataset-dirty.csv, The dataset description is at test_data/dataset_description.txt. The dataset is dirty, and I want you to help me clean it.",
        ),
        (
            "2",
            "Based on the cleaned dataset, I want you to calculate the average survival rate for male and female passengers.",
        ),
    )
]


2025-04-28 19:41:50 - DataSciBench - INFO - Loaded configuration from ./llm_config.toml
2025-04-28 19:41:50 - DataSciBench - INFO - Loaded configuration from ./llm_config.toml


In [3]:

# Build the environment from env_config and the task list, then run it with the
# "version2" strategy. Whatever run() returns is kept in completed_tasks.
environment = Environment(env_config, tasks)
completed_tasks = run(environment, "version2")

2025-04-28 19:41:51 - DataSciBench - INFO - Initializing Environment with 2 tasks
2025-04-28 19:41:51 - DataSciBench - INFO - Initialized LLMInteractor with model: gemini/gemini-2.0-flash-lite, temperature: 0.4
2025-04-28 19:41:51 - DataSciBench - INFO - Initialized LLMInteractor with model: gemini/gemini-2.0-flash-lite, temperature: 0.4
2025-04-28 19:41:51 - DataSciBench - INFO - Starting environment run with version: version2
2025-04-28 19:41:51 - DataSciBench - INFO - Starting interaction between agents
2025-04-28 19:41:51 - DataSciBench - INFO - Starting interaction using version2 strategy
2025-04-28 19:41:51 - DataSciBench - INFO - Starting task loop with max turns: 30
2025-04-28 19:41:51 - DataSciBench - INFO - Calling LLM API with a message (total length: ~1152 chars)
2025-04-28 19:41:51 - DataSciBench - INFO - LLM API call successful on attempt 1
2025-04-28 19:41:51 - DataSciBench - INFO - Checkpoint saved successfully
2025-04-28 19:41:51 - DataSciBench - INFO - Calling LLM API

In [1]:
import json
import os
import time
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from copy import deepcopy
from llms.llm_interact import LLMConfig
from llm_interact_env import Environment, EnvironmentConfig, Task, run
from logger import logger, configure_global_logger  # Import the logger
import subprocess  # Added for running the script

# --- Logging setup -----------------------------------------------------------
# Let the shared project logger emit DEBUG-level records.
logger.setLevel(logging.DEBUG)

# Optional console output for debug messages. The handler is tagged with a name
# and only attached when no handler carrying that name is present, so that
# re-running this cell in the same kernel does not stack duplicate handlers
# (each extra handler would print every log record one more time).
_CONSOLE_HANDLER_NAME = "notebook-debug-console"
if not any(h.get_name() == _CONSOLE_HANDLER_NAME for h in logger.handlers):
    console_handler = logging.StreamHandler()
    console_handler.set_name(_CONSOLE_HANDLER_NAME)
    console_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
configure_global_logger(log_file="llm_interact_tau_bench_debug.log", level=logging.DEBUG)

# --- Agent configuration -----------------------------------------------------
# The user and the assistant each have their own TOML file; run_code is
# switched on for the assistant only.
user_config = LLMConfig.from_toml("llm_configs/raw/llm_config_user.toml")
assistant_config = LLMConfig.from_toml("llm_configs/raw/llm_config_agent.toml")
assistant_config.run_code = True

# --- Gatekeeper configuration (best effort) ----------------------------------
# gatekeeper_config stays None when its TOML file cannot be loaded; the failure
# is logged as a warning instead of aborting the cell.
gatekeeper_config = None
try:
    gatekeeper_config = LLMConfig.from_toml("llm_configs/raw/llm_config_gatekeeper.toml")

    # Load the reference code from mobile-price-prediction-model/reference_code.md
    # and inject it into the gatekeeper system prompt.
    # NOTE(review): if this inner step fails, gatekeeper_config is still set but
    # its system prompt keeps the unfilled placeholder -- confirm that this is
    # the intended fallback.
    try:
        # Explicit encoding so the file is decoded the same way on every platform.
        with open("mobile-price-prediction-model/reference_code.md", "r", encoding="utf-8") as f:
            reference_code = f.read()
        # str.format fills the {reference_code} placeholder of the system prompt.
        gatekeeper_config.system_prompt = gatekeeper_config.system_prompt.format(reference_code=reference_code)
        logger.info("Loaded gatekeeper configuration with reference code")
    except Exception as e:
        logger.warning(f"Failed to load reference code: {e}")
except Exception as e:
    logger.warning(f"Failed to load gatekeeper configuration: {e}")


2025-05-11 20:56:12 - DataSciBench - INFO - Loaded configuration from llm_configs/raw/llm_config_user.toml
2025-05-11 20:56:12 - DataSciBench - INFO - Loaded configuration from llm_configs/raw/llm_config_agent.toml
2025-05-11 20:56:12 - DataSciBench - INFO - Loaded configuration from llm_configs/raw/llm_config_gatekeeper.toml
2025-05-11 20:56:12 - DataSciBench - INFO - Loaded gatekeeper configuration with reference code


In [2]:
def _trajectory_checkpoint(index):
    """Return the checkpoint file path for run number ``index``.

    The "trajactory" spelling is part of the existing file naming scheme and is
    kept as is.
    """
    return f"walmart_data/trajactory_walmart_{index}.json"


# Probe indices 0, 1, 2, ... and stop at the first path that does not exist on
# disk yet.
i = 0
while os.path.exists(_trajectory_checkpoint(i)):
    i += 1
checkpoint_path = _trajectory_checkpoint(i)

# Prompt template for the user agent. Placeholder: {project_context}.
WALMART_USER_PROMPT = """
    You are a data analyst for Walmart.
    
    {project_context}
    """

# Environment configuration, including the gatekeeper LLM config.
env_config = EnvironmentConfig(
    user_llm_config=user_config,
    assistant_llm_config=assistant_config,
    gatekeeper_llm_config=gatekeeper_config,
    assistant_agent_type="base-agent",
    interpreter_config_path="llm_configs/raw/interpreter_config_agent.toml",
    user_prompt_template=WALMART_USER_PROMPT,
    max_turns=20,
    checkpoint_path=checkpoint_path,
)

In [3]:
# Build the environment with an empty task list. This rebinds `environment`.
environment = Environment(env_config, [])

2025-05-11 20:56:12 - DataSciBench - INFO - Initializing Environment with 0 tasks
2025-05-11 20:56:12 - DataSciBench - INFO - Environment initialization started at 2025-05-11T20:56:12.228970
2025-05-11 20:56:12 - DataSciBench - DEBUG - Environment initialized with current_task_idx=0
2025-05-11 20:56:12 - DataSciBench - DEBUG - Creating user agent
2025-05-11 20:56:12 - DataSciBench - INFO - Initialized LLM agent with model: litellm_proxy/gemini-flash, temperature: 0.4
2025-05-11 20:56:12 - DataSciBench - DEBUG - Creating assistant agent
2025-05-11 20:56:12 - DataSciBench - INFO - Initialized LLM agent with model: litellm_proxy/claude-3-5-sonnet, temperature: 0.4
2025-05-11 20:56:12 - DataSciBench - DEBUG - Creating gatekeeper agent
2025-05-11 20:56:12 - DataSciBench - INFO - Initialized Gatekeeper with model: litellm_proxy/gemini-flash, temperature: 0.7


In [4]:
# Send one sample user instruction straight to the gatekeeper agent; output_raw=True
# is passed so that the raw response object can be inspected in the cells below.
response = environment.gatekeeper.call_llm("Yes, that plan sounds perfect. Converting all prices to USD using the 'Launched Price (USA)' column as a reference is exactly what we need to do to prepare the data for modeling. Let's proceed with that step!", output_raw=True)

In [5]:
# Display the raw response object returned by the gatekeeper call.
response

ModelResponse(id='chatcmpl-ac902268-1c5b-4cef-9f79-ee2cc310fffd', created=1747022172, model='litellm_proxy/gemini-2.5-flash-preview-04-17', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='tool_calls', index=0, message=Message(content=None, role='assistant', tool_calls=[ChatCompletionMessageToolCall(index=0, function=Function(arguments='{"thought": "The user instruction aligns with the reference code\'s goal of converting prices to USD for modeling, specifically mentioning the \'Launched Price (USA)\' column which is used in the code as one of the sources for the average USD price and as the target variable. The instruction does not propose a conflicting method, but rather confirms the approach of working with USD prices derived from the available columns, including the USA price. Thus, it is not contradictory to the reference code\'s logic for preparing price data in USD from multiple sources and using the USA price in the modeling step. ", "follow_up

In [6]:
# Display the message of the first choice in the response.
response.choices[0].message

Message(content=None, role='assistant', tool_calls=[ChatCompletionMessageToolCall(index=0, function=Function(arguments='{"thought": "The user instruction aligns with the reference code\'s goal of converting prices to USD for modeling, specifically mentioning the \'Launched Price (USA)\' column which is used in the code as one of the sources for the average USD price and as the target variable. The instruction does not propose a conflicting method, but rather confirms the approach of working with USD prices derived from the available columns, including the USA price. Thus, it is not contradictory to the reference code\'s logic for preparing price data in USD from multiple sources and using the USA price in the modeling step. ", "follow_up_instruction": null, "contradictory": false}', name='gatekeeper'), id='call_4bb8e9c6-4449-4b8e-9420-879f608b7721', type='function')], function_call=None, provider_specific_fields={'refusal': None})