In [1]:
import json
import os
import time
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from copy import deepcopy
from llms.llm_interact import LLMConfig
from llms.retriever import Retriever
from llm_interact_env import Environment, EnvironmentConfig, Task, run
from logger import logger, configure_global_logger  # Import the logger
import subprocess  # Added for running the script

In [2]:
# Load the LLM settings for shard generation from the TOML file, then wrap them
# in the Retriever that later cells use to call the model.
retriever_config = LLMConfig.from_toml(
    "llm_configs/raw_tianyu/llm_config_shards.toml"
)
retriever = Retriever(retriever_config)

2025-05-14 01:12:27 - DataSciBench - INFO - Loaded configuration from llm_configs/raw_tianyu/llm_config_shards.toml
2025-05-14 01:12:27 - DataSciBench - INFO - Initialized Gatekeeper with model: litellm_proxy/claude-3-7-sonnet, temperature: 1


In [3]:
import glob
from pathlib import Path
import os
# Collect every gatekeeper_reference.md that sits inside an "instructions"
# directory anywhere under the storage directory.
storage_dir = "benchmark_final_test/storage"
instruction_files = []


def drop_submission_line(text):
    """Remove the first line of ``text`` when it mentions "submission.csv".

    Args:
        text: Raw instruction text.

    Returns:
        ``text`` without its first line (and that line's newline) when the
        first line contains "submission.csv"; otherwise ``text`` unchanged.
    """
    first_line, _, remainder = text.partition('\n')
    return remainder if "submission.csv" in first_line else text


def strip_leading_heading(text):
    """Remove a Markdown heading line that opens ``text``.

    Leading whitespace is ignored when looking for the heading, so a response
    that begins with blank lines followed by "# Title" is handled as well.

    Args:
        text: Text returned by the model.

    Returns:
        Everything after the heading line when the first non-blank line starts
        with "#"; otherwise ``text`` unchanged.
    """
    trimmed = text.lstrip()
    if trimmed.startswith("#"):
        return trimmed.partition("\n")[2]
    return text


for root, dirs, files in os.walk(storage_dir):
    # os.walk visits entries in filesystem order; sorting the directory list in
    # place makes the traversal (and therefore the processing order) repeatable.
    dirs.sort()
    if os.path.basename(root) != "instructions":
        continue
    instruction_file = os.path.join(root, "gatekeeper_reference.md")
    if os.path.exists(instruction_file):
        instruction_files.append(instruction_file)

print(f"Found {len(instruction_files)} instruction files")

# Send each reference file to the retriever and store the answer next to it.
for instruction_file in instruction_files:
    print(f"Processing: {instruction_file}")

    # Explicit UTF-8 so reading does not depend on the machine's locale.
    with open(instruction_file, "r", encoding="utf-8") as f:
        instruction_content = f.read()

    # Drop a leading "submission.csv" line, then collapse each pair of
    # consecutive newlines into one (runs of three or more are shortened, not
    # fully collapsed, because str.replace does not rescan its own output).
    modified_content = drop_submission_line(instruction_content)
    modified_content = modified_content.replace('\n\n', '\n')

    response = retriever.call_llm(modified_content, thinking={"type": "enabled", "budget_tokens": 8196})

    # The project directory is two levels above the file:
    # <project>/instructions/gatekeeper_reference.md
    project_name = Path(instruction_file).parts[-3]
    print(f"Project: {project_name}")
    print(f"Response: {response}")
    print("-" * 50)

    # Write the response, minus any leading heading line, to shards.md in the
    # same directory as the reference file.
    response = strip_leading_heading(response)
    shards_path = os.path.join(os.path.dirname(instruction_file), "shards.md")
    with open(shards_path, "w", encoding="utf-8") as f:
        f.write(response)
    print(f"Saved reference insights to: {shards_path}")


Found 2 instruction files
Processing: benchmark_final_test/storage/aarthi93-end-to-end-ml-pipeline/instructions/gatekeeper_reference.md
Project: aarthi93-end-to-end-ml-pipeline
Response: # Detailed Data Science Research Pipeline

1. Review the Ames Housing dataset documentation to understand the feature definitions, expected ranges, and domain context before beginning any analysis.

2. Import standard data science libraries including pandas, numpy, matplotlib, seaborn, and scikit-learn components (preprocessing tools, model selection utilities, ensemble methods, and metrics).

3. Load the Ames Housing dataset CSV file into a pandas DataFrame and perform initial inspection using df.info(), df.head(), and df.describe() to understand basic data characteristics.

4. Create a missing values heatmap or bar chart to visualize the percentage of missing values across all columns, identifying features with particularly high missingness.

5. Analyze columns with high missing percentages (like All

In [8]:
import json
import os
import time
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from copy import deepcopy
from llms.llm_interact import LLMConfig
from llms.retriever import Retriever
from llm_interact_env import Environment, EnvironmentConfig, Task, run
from logger import logger, configure_global_logger
import subprocess

# Load the LLM settings for shard generation from the TOML file, then wrap them
# in the Retriever that the processing loop below uses to call the model.
retriever_config = LLMConfig.from_toml(
    "llm_configs/raw_tianyu/llm_config_shards.toml"
)
retriever = Retriever(retriever_config)

import glob
from pathlib import Path
import os

# For every project under the storage directory, locate the two files this cell
# needs inside its "instructions" directory: cleaned_instructions.md (the task
# text) and instructions.md (which carries the "**Your Knowledge**" section).
storage_dir = "benchmark_final/storage"
instruction_files = []
knowledge_files = []

# Marker that opens the knowledge section inside instructions.md.
KNOWLEDGE_HEADER = "**Your Knowledge**"
# Number of projects to process per run; None processes every project found.
MAX_PROJECTS = 2


def extract_knowledge(original_content):
    """Return the knowledge section of an instructions.md document.

    Args:
        original_content: Full text of instructions.md.

    Returns:
        The text from the "**Your Knowledge**" header (inclusive) to the end of
        the document, or None when the header is absent.
    """
    header_idx = original_content.find(KNOWLEDGE_HEADER)
    if header_idx == -1:
        return None
    return original_content[header_idx:]


def build_prompt(extracted_knowledge, instruction_content):
    """Join the knowledge section and the cleaned instructions into one prompt.

    The same string is printed and sent to the model, so what is logged is
    exactly what the model receives.
    """
    return extracted_knowledge + "\n" + "Here are the original instructions:\n" + instruction_content


for root, dirs, files in os.walk(storage_dir):
    # os.walk visits entries in filesystem order; sorting in place makes the
    # traversal repeatable, which matters because only the first MAX_PROJECTS
    # projects are processed below.
    dirs.sort()
    if os.path.basename(root) != "instructions":
        continue

    cleaned_instruction_file = os.path.join(root, "cleaned_instructions.md")
    knowledge_file = os.path.join(root, "instructions.md")
    if not os.path.exists(cleaned_instruction_file):
        print(f"Error processing {root}: cleaned_instructions.md not found in {root}")
        continue
    if not os.path.exists(knowledge_file):
        print(f"Error processing {root}: instructions.md not found in {root}")
        continue

    # Append to both lists only once both files are known to exist, so index i
    # in each list always refers to the same project.
    instruction_files.append(cleaned_instruction_file)
    knowledge_files.append(knowledge_file)

print(f"Found {len(instruction_files)} instruction files")
print(f"Extracted knowledge from {len(knowledge_files)} projects")

# Process each (cleaned instructions, original instructions) pair.
for instruction_file, knowledge_file in zip(instruction_files[:MAX_PROJECTS], knowledge_files[:MAX_PROJECTS]):
    print(f"Processing: {instruction_file} and {knowledge_file}")

    # Explicit UTF-8 so reading does not depend on the machine's locale.
    with open(instruction_file, "r", encoding="utf-8") as f:
        instruction_content = f.read()

    with open(knowledge_file, "r", encoding="utf-8") as f:
        original_content = f.read()

    # The project directory is two levels above the file:
    # <project>/instructions/cleaned_instructions.md
    project_name = Path(instruction_file).parts[-3]

    # Skip the project rather than fail with a NameError or silently reuse the
    # knowledge extracted for the previous project.
    extracted_knowledge = extract_knowledge(original_content)
    if extracted_knowledge is None:
        print(f"Skipping {project_name}: {KNOWLEDGE_HEADER} not found in {knowledge_file}")
        continue

    prompt = build_prompt(extracted_knowledge, instruction_content)
    print(prompt)
    response = retriever.call_llm(prompt, thinking={"type": "enabled", "budget_tokens": 20000})

    print(f"Project: {project_name}")
    print(f"Response: {response}")
    print("-" * 50)

    # Drop a leading Markdown heading line (ignoring blank lines before it),
    # then write the result to shards.md next to the instruction files.
    trimmed_response = response.lstrip()
    if trimmed_response.startswith("#"):
        response = trimmed_response.partition("\n")[2]
    shards_path = os.path.join(os.path.dirname(instruction_file), "shards.md")
    with open(shards_path, "w", encoding="utf-8") as f:
        f.write(response)
    print(f"Saved shards to: {shards_path}")
    

2025-05-14 12:27:11 - DataSciBench - INFO - Loaded configuration from llm_configs/raw_tianyu/llm_config_shards.toml
2025-05-14 12:27:11 - DataSciBench - INFO - Initialized Gatekeeper with model: litellm_proxy/claude-3-7-sonnet, temperature: 1
Found 21 instruction files
Extracted knowledge from 21 projects
Processing: benchmark_final/storage/ugurcan95-brazilian-tweet-sentiment-analysis/instructions/cleaned_instructions.md and benchmark_final/storage/ugurcan95-brazilian-tweet-sentiment-analysis/instructions/instructions.md
**Your Knowledge**
- The code uses Portuguese stopwords rather than English ones, indicating the analysis is specifically tailored for Brazilian Portuguese tweets, which requires language-specific NLP resources.

- The text cleaning function removes URLs, hashtags, and mentions, suggesting these elements don't contribute meaningful sentiment information and might introduce noise in the analysis.

- The implementation uses lemmatization rather than stemming, which prese

In [4]:
# Display the current value of `knowledge_content`.
# NOTE(review): no visible cell assigns `knowledge_content`; this appears to rely
# on kernel state from a cell not shown here — confirm it survives Restart & Run All.
knowledge_content

{'ugurcan95-brazilian-tweet-sentiment-analysis': "**Your Knowledge**\n- The code uses Portuguese stopwords rather than English ones, indicating the analysis is specifically tailored for Brazilian Portuguese tweets, which requires language-specific NLP resources.\n\n- The text cleaning function removes URLs, hashtags, and mentions, suggesting these elements don't contribute meaningful sentiment information and might introduce noise in the analysis.\n\n- The implementation uses lemmatization rather than stemming, which preserves the semantic meaning of words by reducing them to their dictionary form based on part-of-speech, providing more accurate representations for sentiment analysis.\n\n- The TF-IDF vectorizer is configured with ngram_range=(1, 2), capturing both individual words and two-word phrases, which helps preserve contextual information that may be important for sentiment detection.\n\n- Logistic Regression is chosen as the classification model, likely because it works well wi

In [3]:
# Display `instruction_content`, which the processing loops assign on every
# iteration; the value shown is whatever the most recently executed loop read last.
instruction_content

'- Create a comprehensive feature engineering function that:\n1. Extracts titles from passenger names and groups rare titles\n2. Creates family-related features (family size, is alone flag, family group categories)\n3. Processes cabin information (has cabin flag, deck extraction)\n4. Handles ticket information (prefix extraction, numeric conversion)\n5. Bins age and fare into categorical groups\n6. Creates interaction features (Age*Class, Fare per person)\nThen apply this function to both training and test datasets.\n\n- Handle missing values in the datasets by:\n1. Filling missing Age values based on Title, Sex, and Pclass group medians\n2. Imputing missing Embarked values with the mode\n3. Filling missing Fare values in the test set with median fares by Pclass\n4. Recalculating age and fare bins after imputation to ensure consistency\n\n- Prepare the data for modeling by:\n1. Selecting relevant features for prediction\n2. Separating features into numeric and categorical groups\n3. Cr