In [1]:
import json
import os
import time
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from copy import deepcopy
from llms.llm_interact import LLMConfig
from llms.retriever import Retriever
from llm_interact_env import Environment, EnvironmentConfig, Task, run
from logger import logger, configure_global_logger  # Import the logger
import subprocess  # Added for running the script

In [2]:
# Load the LLM settings from the TOML file and construct the retriever that
# the processing cell below calls for every reference file.
RETRIEVER_CONFIG_PATH = "llm_configs/raw_tianyu/llm_config_shards.toml"
retriever_config = LLMConfig.from_toml(RETRIEVER_CONFIG_PATH)
retriever = Retriever(retriever_config)

2025-05-14 01:12:27 - DataSciBench - INFO - Loaded configuration from llm_configs/raw_tianyu/llm_config_shards.toml
2025-05-14 01:12:27 - DataSciBench - INFO - Initialized Gatekeeper with model: litellm_proxy/claude-3-7-sonnet, temperature: 1


In [3]:
import glob
from pathlib import Path
import os
# Discover every gatekeeper reference file under the storage directory, send
# its cleaned-up text to the retriever LLM, and save each response as
# shards.md next to the reference file it was generated from.
storage_dir = "benchmark_final_test/storage"
REFERENCE_FILENAME = "gatekeeper_reference.md"
SHARDS_FILENAME = "shards.md"
# Thinking budget forwarded to the LLM call.
# NOTE(review): 8196 may have been intended as 8192 -- confirm before changing.
THINKING_BUDGET_TOKENS = 8196


def find_reference_files(root_dir, filename=REFERENCE_FILENAME):
    """Return the paths of ``filename`` found in ``instructions`` directories.

    Args:
        root_dir: Directory to search recursively.
        filename: Name of the file expected inside each ``instructions``
            directory.

    Returns:
        A sorted list of matching paths. Sorting makes the processing order
        reproducible, because ``os.walk`` does not guarantee one. The list is
        empty when ``root_dir`` does not exist or contains no match.
    """
    matches = []
    for current_dir, _subdirs, _filenames in os.walk(root_dir):
        # Only directories literally named "instructions" are of interest.
        if os.path.basename(current_dir) != "instructions":
            continue
        candidate = os.path.join(current_dir, filename)
        if os.path.exists(candidate):
            matches.append(candidate)
    return sorted(matches)


def build_prompt(instruction_content):
    """Turn the raw reference text into the prompt sent to the LLM.

    The first line is dropped when it mentions ``submission.csv``, and every
    run of consecutive newlines is collapsed to a single newline so the
    prompt contains no blank lines.

    Args:
        instruction_content: Full text of the reference file.

    Returns:
        The cleaned prompt text.
    """
    first_line, _sep, remainder = instruction_content.partition("\n")
    prompt = remainder if "submission.csv" in first_line else instruction_content
    # A single replace() pass turns "\n\n\n" into "\n\n"; repeat until no
    # blank line is left.
    while "\n\n" in prompt:
        prompt = prompt.replace("\n\n", "\n")
    return prompt


def strip_leading_heading(response):
    """Remove the first line of ``response`` when it starts with ``#``.

    Args:
        response: Text returned by the LLM.

    Returns:
        ``response`` without its first line if that line starts with ``#``,
        otherwise ``response`` unchanged.
    """
    if response.startswith("#"):
        return response.partition("\n")[2]
    return response


instruction_files = find_reference_files(storage_dir)
print(f"Found {len(instruction_files)} instruction files")

# Process each reference file
for instruction_file in instruction_files:
    print(f"Processing: {instruction_file}")

    # Explicit encoding so the result does not depend on the platform default.
    with open(instruction_file, "r", encoding="utf-8") as f:
        instruction_content = f.read()

    modified_content = build_prompt(instruction_content)

    # Call the retriever with the cleaned reference content
    response = retriever.call_llm(
        modified_content,
        thinking={"type": "enabled", "budget_tokens": THINKING_BUDGET_TOKENS},
    )

    # The project name is the directory that contains "instructions/".
    project_name = Path(instruction_file).parts[-3]
    print(f"Project: {project_name}")
    print(f"Response: {response}")
    print("-" * 50)

    # Validate before opening the output file: opening with "w" truncates it,
    # so a missing response must not wipe previously generated shards.
    if not response:
        print(f"Skipping {project_name}: retriever returned no content")
        continue

    # Drop a leading markdown heading, then save next to the reference file.
    response = strip_leading_heading(response)
    shards_path = os.path.join(os.path.dirname(instruction_file), SHARDS_FILENAME)
    with open(shards_path, "w", encoding="utf-8") as f:
        f.write(response)
    print(f"Saved reference insights to: {shards_path}")


Found 2 instruction files
Processing: benchmark_final_test/storage/aarthi93-end-to-end-ml-pipeline/instructions/gatekeeper_reference.md
Project: aarthi93-end-to-end-ml-pipeline
Response: # Detailed Data Science Research Pipeline

1. Review the Ames Housing dataset documentation to understand the feature definitions, expected ranges, and domain context before beginning any analysis.

2. Import standard data science libraries including pandas, numpy, matplotlib, seaborn, and scikit-learn components (preprocessing tools, model selection utilities, ensemble methods, and metrics).

3. Load the Ames Housing dataset CSV file into a pandas DataFrame and perform initial inspection using df.info(), df.head(), and df.describe() to understand basic data characteristics.

4. Create a missing values heatmap or bar chart to visualize the percentage of missing values across all columns, identifying features with particularly high missingness.

5. Analyze columns with high missing percentages (like All