In [1]:
import json
import os
import time
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from copy import deepcopy
from llms.llm_interact import LLMConfig
from llms.retriever import Retriever
from llm_interact_env import Environment, EnvironmentConfig, Task, run
from logger import logger, configure_global_logger  # Import the logger
import subprocess  # Added for running the script

In [2]:
# Build the retriever used by the later cells: first read its LLM settings
# from the TOML file, then hand the resulting config to the Retriever.
retriever_config_path = "llm_configs/raw/llm_config_retriever.toml"
retriever_config = LLMConfig.from_toml(retriever_config_path)
retriever = Retriever(retriever_config)

2025-05-13 10:50:41 - DataSciBench - INFO - Loaded configuration from llm_configs/raw/llm_config_retriever.toml
2025-05-13 10:50:41 - DataSciBench - INFO - Initialized Gatekeeper with model: litellm_proxy/claude-3-7-sonnet, temperature: 1


In [7]:
import glob
from pathlib import Path
import os
import re

# Root directory scanned for `<project>/instructions/instructions.md` files.
storage_dir = "benchmark_final/storage"

# Value sent as `budget_tokens` in the `thinking` option of `retriever.call_llm`.
THINKING_BUDGET_TOKENS = 4096

# Captures the text between the "**Objective**" marker and the next "**" marker.
# The `\Z` alternative keeps the objective when it is the last section of the
# file (without it, a trailing Objective section would not match at all).
OBJECTIVE_PATTERN = re.compile(r'\*\*Objective\*\*(.*?)(?=\*\*|\Z)', re.DOTALL)


def find_instruction_files(root_dir):
    """Return the sorted paths of every `instructions/instructions.md` under `root_dir`.

    Args:
        root_dir: Directory to walk recursively.

    Returns:
        A sorted list of path strings. Empty when `root_dir` does not exist or
        contains no matching file. Sorting makes the processing order
        reproducible, since `os.walk` order depends on the filesystem.
    """
    found = []
    for root, _dirs, _files in os.walk(root_dir):
        # Only directories literally named "instructions" are of interest.
        if os.path.basename(root) != "instructions":
            continue
        candidate = os.path.join(root, "instructions.md")
        # isfile (not exists) so a directory named "instructions.md" is skipped.
        if os.path.isfile(candidate):
            found.append(candidate)
    return sorted(found)


def extract_objective(instruction_content):
    """Extract the text following the `**Objective**` marker.

    Args:
        instruction_content: Full text of an instructions file.

    Returns:
        The stripped objective text followed by a blank line ("\\n\\n"), or an
        empty string when the text has no `**Objective**` marker.
    """
    match = OBJECTIVE_PATTERN.search(instruction_content)
    if match is None:
        return ""
    return match.group(1).strip() + "\n\n"


# Find all instructions.md files in the storage directory
instruction_files = find_instruction_files(storage_dir)
print(f"Found {len(instruction_files)} instruction files")

# Process each instruction file
for instruction_file in instruction_files:
    print(f"Processing: {instruction_file}")

    # Load the instruction content (explicit UTF-8 instead of the platform default)
    with open(instruction_file, "r", encoding="utf-8") as f:
        instruction_content = f.read()

    # Call the retriever with the instruction content
    response = retriever.call_llm(
        instruction_content,
        thinking={"type": "enabled", "budget_tokens": THINKING_BUDGET_TOKENS},
    )

    # The project directory is two levels above the file:
    # <project>/instructions/instructions.md
    project_name = Path(instruction_file).parts[-3]
    print(f"Project: {project_name}")
    print(f"Response: {response}")
    print("-" * 50)

    # Build the full output BEFORE opening the target for writing, so that a
    # failure here (e.g. `response` is not a str — TODO confirm what call_llm
    # returns on error) cannot leave an existing file truncated.
    objective_text = extract_objective(instruction_content)
    reference_insights = objective_text + response

    # Save the response to reference_insights.md next to instructions.md
    reference_insights_path = os.path.join(os.path.dirname(instruction_file), "reference_insights.md")
    with open(reference_insights_path, "w", encoding="utf-8") as f:
        f.write(reference_insights)
    print(f"Saved reference insights to: {reference_insights_path}")


Found 2 instruction files
Processing: benchmark_final/storage/aarthi93-end-to-end-ml-pipeline/instructions/instructions.md
Project: aarthi93-end-to-end-ml-pipeline
Response: # House Price Prediction Reference Insights

Background:
The dataset contains housing information split into train.csv (with target prices) and test.csv (without targets) for evaluating predictions. The data is housed in '/app/datasets' with a required submission format matching sample_submission.csv.

Goal:
Predict house sale prices (SalePrice).

Metric:
Multiple evaluation metrics are used: RMSE, MAE, and R² to assess model performance from different perspectives.

Reference insights:
- Drop columns with high missing value percentages (Alley: 93%, Pool QC: 99%, Fence: 80%, Misc Feature: 96%) rather than attempting imputation.
- Create meaningful age-related features (House_Age, Years_Since_Remodel) using a reference year, as time-based features are important for price prediction.
- Use different preprocessing str