In [1]:
import json
import os
import re
from datetime import datetime
import numpy as np

In [2]:
def parse_timestamp_from_filename(filename):
    """Extract timestamp from filename.
    Expected format: *_YYYYMMDD_HHMMSS.json"""
    match = re.search(r'_(\d{8}_\d{6})\.json$', filename)
    if match:
        timestamp_str = match.group(1)
        return timestamp_str
    return None

In [3]:
def is_failed(result_dict):
    """Check if a result represents a failed run.
    Failed if skill_score is -10000, nan, or missing."""
    try:
        skill_score = result_dict.get('metrics', {}).get('skill_score')
        return (skill_score == -10000 or 
                (isinstance(skill_score, float) and np.isnan(skill_score)) or
                skill_score is None)
    except:
        return True

In [4]:
# Initialize the results list
all_results = []

# Process all JSON files in the results directory
results_dir = 'experiments/results'
for filename in os.listdir(results_dir):
    if filename.endswith('.json') and filename != 'summary.json' and filename != 'evaluation_only_summary.json':
        file_path = os.path.join(results_dir, filename)
        
        try:
            with open(file_path, 'r') as f:
                result_dict = json.load(f)
                
            # Add timestamp and failed status
            timestamp = parse_timestamp_from_filename(filename)
            if timestamp:
                result_dict['timestamp'] = timestamp
                result_dict['failed'] = is_failed(result_dict)
                all_results.append(result_dict)
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

Error processing abdallaellaithy-titanic-in-space-ml-survival-predictions_sample_agent_deepseek_r1_20250513_121519.json: Expecting value: line 1 column 1 (char 0)
Error processing abdallaellaithy-titanic-in-space-ml-survival-predictions_sample_agent_deepseek_r1_20250513_111645.json: Expecting value: line 1 column 1 (char 0)
Error processing jakubkrasuski-store-sales-forecasting-modeling-with-lightgbm_sample_agent_deepseek_r1_20250513_111645.json: Expecting value: line 1 column 1 (char 0)
Error processing vijaythurimella-bank-subscriptions-predictions-f1-score_sample_agent_deepseek_r1_20250513_111645.json: Expecting value: line 1 column 1 (char 0)
Error processing jakubkrasuski-llm-chatbot-arena-predicting-user-preferences_sample_agent_deepseek_r1_20250513_111645.json: Expecting value: line 1 column 1 (char 0)
Error processing amitsinghbhadoria0-final-qt-project-analysis_deepseek-r1_20250514_215435.json: Expecting value: line 1 column 1 (char 0)
Error processing amitsinghbhadoria0-final

In [5]:
# Display the number of results processed
print(f"Total results processed: {len(all_results)}")

# Display a sample result
if all_results:
    print("\nSample result:")
    print(json.dumps(all_results[0], indent=2, default=str))

Total results processed: 83

Sample result:
{
  "benchmark_id": "ugurcan95-brazilian-tweet-sentiment-analysis",
  "agent_id": "litellm_proxy/deepseek-r1",
  "completion_status": "completed",
  "metrics": {
    "interaction_time_seconds": 505.785724,
    "conversation_turns": 8,
    "code_snippets_count": 0,
    "code_operations": {
      "pandas_operations": 0,
      "plotting": 0,
      "dataframe_creation": 0,
      "file_io": 0,
      "error_handling": 0,
      "loops": 0,
      "functions": 0,
      "imports": 0
    },
    "absolute_metric_score": 0.7493333333333333,
    "skill_score": 0.020067761271826905
  },
  "timestamp": "20250513_144422",
  "failed": false
}


### find out competition/non-competition notebooks

In [6]:
# Get all benchmark IDs from the notebook files
competition_notebooks_dir = "benchmark_data_toSubmit/notebooks/storage"
competition_benchmark = []

# Pattern to convert filenames to benchmark IDs
# Replace ##### with - in filenames
notebook_files = os.listdir(competition_notebooks_dir)
for notebook_file in notebook_files:
    if notebook_file.endswith('.ipynb'):
        # Convert filename to benchmark_id (replace ##### with -)
        benchmark_id = notebook_file[:-6].replace('#####', '-')
        competition_benchmark.append(benchmark_id)

# Print the competition benchmarks
print(f"Found {len(competition_benchmark)} competition benchmarks:")
for benchmark in competition_benchmark:
    print(f"- {benchmark}")

# Create the noncompetition_benchmark list
# This includes benchmark_ids in all_results that aren't in competition_benchmark
noncompetition_benchmark = []

# Get unique benchmark_ids from all_results
all_benchmark_ids = set()
for result in all_results:
    benchmark_id = result.get('benchmark_id')
    if benchmark_id:
        all_benchmark_ids.add(benchmark_id)

# Find benchmarks that aren't in the competition list
for benchmark_id in all_benchmark_ids:
    if benchmark_id not in competition_benchmark:
        noncompetition_benchmark.append(benchmark_id)

# Print the non-competition benchmarks
print(f"\nFound {len(noncompetition_benchmark)} non-competition benchmarks:")
for benchmark in noncompetition_benchmark:
    print(f"- {benchmark}")

Found 11 competition benchmarks:
- vijaythurimella-bank-subscriptions-predictions-f1-score
- patilaakash619-backpack-price-prediction-ml-guide
- esotericdata1-titanickaggle-ds
- dmytrobuhai-eda-rf
- jakubkrasuski-llm-chatbot-arena-predicting-user-preferences
- ugurcan95-brazilian-tweet-sentiment-analysis
- abdallaellaithy-titanic-in-space-ml-survival-predictions
- shaswatatripathy-store-sales-prediction
- jakubkrasuski-store-sales-forecasting-modeling-with-lightgbm
- mightyjiraiya-titanic-survival-prediction
- iseedeep-mission-podcast-listening-prediction

Found 10 non-competition benchmarks:
- ak5047-australia-weather
- patilaakash619-electric-vehicle-population-data-in-the-us
- sasakitetsuya-predicting-startup-valuation-with-machine-learning
- amitsinghbhadoria0-final-qt-project-analysis
- hasangulec-feature-engineering-diabetes
- drpashamd4r-indian-floods-data-exploratory
- aarthi93-end-to-end-ml-pipeline
- umerhayat123-how-i-achieved-83-accuracy
- ayodejiibrahimlateef-integrative-a

### find out the latest result for one agent

In [11]:
def find_latest_result_for_agent(agent_id):
    """
    Find the latest result for each benchmark_id for a given agent_id.
    
    Args:
        agent_id (str): The ID of the agent to search for
        
    Returns:
        list: A list of dictionaries containing the latest results for each benchmark
    """
    # Dictionary to store the latest result for each benchmark_id
    latest_results = {}
    
    # Iterate through all results
    for result in all_results:
        # Skip if this result is not from the specified agent
        if result.get('agent_id') != agent_id:
            continue
            
        benchmark_id = result.get('benchmark_id')
        if not benchmark_id:
            continue
            
        # If we haven't seen this benchmark_id before, or if this result is newer
        if (benchmark_id not in latest_results or 
            result['timestamp'] > latest_results[benchmark_id]['timestamp']):
            if result['timestamp'] < '20250514_000000': # HACK: only for new prompt
                continue
            latest_results[benchmark_id] = result
    
    # Convert the dictionary values to a list
    return list(latest_results.values())

# Example usage:
latest_results = find_latest_result_for_agent("litellm_proxy/deepseek-v3")
print(f"Found {len(latest_results)} latest results")
if latest_results:
    print("\nSample latest result:")
    print(json.dumps(latest_results[0], indent=2, default=str))

Found 15 latest results

Sample latest result:
{
  "benchmark_id": "hanymato-mobile-price-prediction-model",
  "agent_id": "litellm_proxy/deepseek-v3",
  "completion_status": "completed",
  "metrics": {
    "interaction_time_seconds": 590.870069,
    "conversation_turns": 12,
    "code_snippets_count": 22,
    "total_code_executions": 22,
    "code_operations": {
      "pandas_operations": 47,
      "plotting": 0,
      "dataframe_creation": 2,
      "file_io": 19,
      "error_handling": 10,
      "loops": 29,
      "functions": 26,
      "imports": 42,
      "error_count": 1
    },
    "absolute_metric_score": 178.17228369271595,
    "skill_score": 0.14507348460747282
  },
  "timestamp": "20250514_215357",
  "failed": false
}


In [15]:
deepseek_v3_latest_results = find_latest_result_for_agent("litellm_proxy/deepseek-v3")
deepseek_r1_latest_results = find_latest_result_for_agent("litellm_proxy/deepseek-r1")

In [23]:
len(deepseek_v3_latest_results)

15

In [22]:
deepseek_v3_is_failed = np.array([i["failed"] for i in deepseek_v3_latest_results])
deepseek_r1_is_failed = np.array([i["failed"] for i in deepseek_r1_latest_results])
deepseek_v3_skill_scores = np.array([i["metrics"]["skill_score"] for i in deepseek_v3_latest_results])
deepseek_r1_skill_scores = np.array([i["metrics"]["skill_score"] for i in deepseek_r1_latest_results])
deepseek_v3_skill_scores.shape
deepseek_r1_skill_scores.shape

# proportion of positive skill scores
print("proportion of positive skill scores")
print("deepseek-v3: ",np.sum(deepseek_v3_skill_scores >= -0.05) / len(deepseek_v3_skill_scores))
print("deepseek-r1: ",np.sum(deepseek_r1_skill_scores >= -0.05) / len(deepseek_r1_skill_scores))
print("\n")

# proportion of failed runs
print("proportion of failed runs")
print("deepseek-v3: ",np.sum(deepseek_v3_is_failed) / len(deepseek_v3_is_failed))
print("deepseek-r1: ",np.sum(deepseek_r1_is_failed) / len(deepseek_r1_is_failed))
print("\n")

print("average skill score of non-failed runs")
print("deepseek-v3: ",np.mean(deepseek_v3_skill_scores[~deepseek_v3_is_failed]))
print("deepseek-r1: ",np.mean(deepseek_r1_skill_scores[~deepseek_r1_is_failed]))
print("\n")

# proportion of positive skill scores among non-failed runs
print("proportion of positive skill scores among non-failed runs")
print("deepseek-v3: ", np.sum(deepseek_v3_skill_scores[~deepseek_v3_is_failed] >= -0.05) / len(deepseek_v3_skill_scores[~deepseek_v3_is_failed]))
print("deepseek-r1: ", np.sum(deepseek_r1_skill_scores[~deepseek_r1_is_failed] >= -0.05) / len(deepseek_r1_skill_scores[~deepseek_r1_is_failed]))
print("\n")

# proportion of positive skill scores among non-competition benchmarks
print("proportion of positive skill scores among non-competition benchmarks")
deepseek_v3_skill_scores_non_competition = deepseek_v3_skill_scores[np.array([i["benchmark_id"] not in competition_benchmark for i in deepseek_v3_latest_results])]
deepseek_r1_skill_scores_non_competition = deepseek_r1_skill_scores[np.array([i["benchmark_id"] not in competition_benchmark for i in deepseek_r1_latest_results])]
print("deepseek-v3: ", np.sum(deepseek_v3_skill_scores_non_competition >= -0.05) / len(deepseek_v3_skill_scores_non_competition))
print("deepseek-r1: ", np.sum(deepseek_r1_skill_scores_non_competition >= -0.05) / len(deepseek_r1_skill_scores_non_competition))
print("\n")

# proportion of positive skill scores among non-competition benchmarks
print("proportion of positive skill scores among competition benchmarks")
deepseek_v3_skill_scores_competition = deepseek_v3_skill_scores[np.array([i["benchmark_id"] in competition_benchmark for i in deepseek_v3_latest_results])]
deepseek_r1_skill_scores_competition = deepseek_r1_skill_scores[np.array([i["benchmark_id"] in competition_benchmark for i in deepseek_r1_latest_results])]
print("deepseek-v3: ", np.sum(deepseek_v3_skill_scores_competition >= -0.05) / len(deepseek_v3_skill_scores_competition))
print("deepseek-r1: ", np.sum(deepseek_r1_skill_scores_competition >= -0.05) / len(deepseek_r1_skill_scores_competition))
print("\n")



proportion of positive skill scores
deepseek-v3:  0.6
deepseek-r1:  0.5


proportion of failed runs
deepseek-v3:  0.4
deepseek-r1:  0.35714285714285715


average skill score of non-failed runs
deepseek-v3:  0.08559950280510456
deepseek-r1:  -0.3849368674823873


proportion of positive skill scores among non-failed runs
deepseek-v3:  1.0
deepseek-r1:  0.7777777777777778


proportion of positive skill scores among non-competition benchmarks
deepseek-v3:  0.375
deepseek-r1:  0.2857142857142857


proportion of positive skill scores among competition benchmarks
deepseek-v3:  0.8571428571428571
deepseek-r1:  0.7142857142857143




In [19]:
deepseek_r1_skill_scores

array([-1.00000000e+04,  5.19492346e-04,  3.72050817e-02,  4.90654530e-04,
        4.48448333e-02,  7.10923041e-01,  3.92156863e-02, -1.00000000e+04,
       -8.62068966e-02, -1.00000000e+04, -4.25490196e+00,  4.34782609e-02,
       -1.00000000e+04, -1.00000000e+04])

In [21]:
deepseek_r1_skill_scores_competition

array([-1.00000000e+04,  5.19492346e-04,  3.72050817e-02,  4.90654530e-04,
        7.10923041e-01, -4.25490196e+00,  4.34782609e-02])

In [50]:
deepseek_v3_is_failed

array([False, False, False, False, False,  True, False,  True, False,
        True, False, False, False, False, False, False,  True,  True,
       False])

In [20]:
lis = [i for i in all_results if i['agent_id'] == 'litellm_proxy/deepseek-v3']

In [21]:
len(lis)

19