In [94]:
import json
import os
import re
from datetime import datetime
import numpy as np

In [None]:
def parse_timestamp_from_filename(filename):
    """Extract timestamp from filename.
    Expected format: *_YYYYMMDD_HHMMSS.json"""
    match = re.search(r"_(\d{8}_\d{6})\.json$", filename)
    if match:
        timestamp_str = match.group(1)
        return timestamp_str
    return None

In [None]:
def is_failed(result_dict):
    """Check if a result represents a failed run.
    Failed if skill_score is -10000, nan, or missing."""
    try:
        skill_score = result_dict.get("metrics", {}).get("skill_score")
        return (
            skill_score == -10000 or (isinstance(skill_score, float) and np.isnan(skill_score)) or skill_score is None
        )
    except Exception:
        return True

In [None]:
# Initialize the results list
all_results = []

# Process all JSON files in the results directory
results_dir = "experiments/results"
for filename in os.listdir(results_dir):
    if filename.endswith(".json") and filename != "summary.json" and filename != "evaluation_only_summary.json":
        file_path = os.path.join(results_dir, filename)

        try:
            with open(file_path) as f:
                result_dict = json.load(f)

            # Add timestamp and failed status
            timestamp = parse_timestamp_from_filename(filename)
            if timestamp:
                result_dict["timestamp"] = timestamp
                result_dict["failed"] = is_failed(result_dict)
                all_results.append(result_dict)
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

Error processing abdallaellaithy-titanic-in-space-ml-survival-predictions_claude-3-7-sonnet_20250514_182315.json: Expecting value: line 1 column 1 (char 0)
Error processing abdallaellaithy-titanic-in-space-ml-survival-predictions_deepseek-r1_20250514_165611.json: Expecting value: line 1 column 1 (char 0)
Error processing sudalairajkumar-simple-feature-engg-notebook-spooky-author_o3_20250514_185710.json: Expecting value: line 1 column 1 (char 0)
Error processing abdallaellaithy-titanic-in-space-ml-survival-predictions_claude-3-7-sonnet_20250514_182908.json: Expecting value: line 1 column 1 (char 0)
Error processing sudalairajkumar-simple-feature-engg-notebook-spooky-author_o3_20250514_185555.json: Expecting value: line 1 column 1 (char 0)
Error processing abdallaellaithy-titanic-in-space-ml-survival-predictions_deepseek-v3_20250514_165611.json: Expecting value: line 1 column 1 (char 0)
Error processing abdallaellaithy-titanic-in-space-ml-survival-predictions_claude-3-7-sonnet_20250514_1

In [98]:
# Display the number of results processed
print(f"Total results processed: {len(all_results)}")

# Display a sample result
if all_results:
    print("\nSample result:")
    print(json.dumps(all_results[0], indent=2, default=str))

Total results processed: 128

Sample result:
{
  "benchmark_id": "ugurcan95-brazilian-tweet-sentiment-analysis",
  "agent_id": "litellm_proxy/deepseek-r1",
  "completion_status": "completed",
  "metrics": {
    "interaction_time_seconds": 505.785724,
    "conversation_turns": 8,
    "code_snippets_count": 5,
    "total_code_executions": 5,
    "code_operations": {
      "pandas_operations": 10,
      "plotting": 0,
      "dataframe_creation": 0,
      "file_io": 4,
      "error_handling": 0,
      "loops": 3,
      "functions": 3,
      "imports": 22,
      "error_count": 2
    },
    "absolute_metric_score": 0.7493333333333333,
    "skill_score": 0.020067761271826905
  },
  "timestamp": "20250513_144422",
  "failed": false
}


### find out competition/non-competition notebooks

In [None]:
# Get all benchmark IDs from the notebook files
competition_notebooks_dir = "benchmark_data_toSubmit/notebooks/storage"
competition_benchmark = []

# Pattern to convert filenames to benchmark IDs
# Replace ##### with - in filenames
notebook_files = os.listdir(competition_notebooks_dir)
for notebook_file in notebook_files:
    if notebook_file.endswith(".ipynb"):
        # Convert filename to benchmark_id (replace ##### with -)
        benchmark_id = notebook_file[:-6].replace("#####", "-")
        competition_benchmark.append(benchmark_id)

competition_notebooks_dir = "test_data/notebooks/storage"
notebook_files = os.listdir(competition_notebooks_dir)
for notebook_file in notebook_files:
    if notebook_file.endswith(".ipynb"):
        # Convert filename to benchmark_id (replace ##### with -)
        benchmark_id = notebook_file[:-6].replace("#####", "-")
        competition_benchmark.append(benchmark_id)

# Print the competition benchmarks
print(f"Found {len(competition_benchmark)} competition benchmarks:")
for benchmark in competition_benchmark:
    print(f"- {benchmark}")

# Create the noncompetition_benchmark list
# This includes benchmark_ids in all_results that aren't in competition_benchmark
noncompetition_benchmark = []

# Get unique benchmark_ids from all_results
benchmark_dir = "benchmark_final/storage"
all_benchmark_ids = []

# Check if the directory exists
if os.path.exists(benchmark_dir) and os.path.isdir(benchmark_dir):
    for benchmark_folder in os.listdir(benchmark_dir):
        # Assume each benchmark_id has a corresponding subdirectory
        if os.path.isdir(os.path.join(benchmark_dir, benchmark_folder)):
            all_benchmark_ids.append(benchmark_folder)
else:
    print(f"Warning: Directory {benchmark_dir} does not exist or is not a directory!")
all_benchmark_ids = set(all_benchmark_ids)


# Find benchmarks that aren't in the competition list
for benchmark_id in all_benchmark_ids:
    if benchmark_id not in competition_benchmark:
        noncompetition_benchmark.append(benchmark_id)

# Print the non-competition benchmarks
print(f"\nFound {len(noncompetition_benchmark)} non-competition benchmarks:")
for benchmark in noncompetition_benchmark:
    print(f"- {benchmark}")

Found 16 competition benchmarks:
- vijaythurimella-bank-subscriptions-predictions-f1-score
- patilaakash619-backpack-price-prediction-ml-guide
- esotericdata1-titanickaggle-ds
- dmytrobuhai-eda-rf
- jakubkrasuski-llm-chatbot-arena-predicting-user-preferences
- ugurcan95-brazilian-tweet-sentiment-analysis
- abdallaellaithy-titanic-in-space-ml-survival-predictions
- shaswatatripathy-store-sales-prediction
- jakubkrasuski-store-sales-forecasting-modeling-with-lightgbm
- mightyjiraiya-titanic-survival-prediction
- iseedeep-mission-podcast-listening-prediction
- jimmyyeung-spaceship-titanic-xgb-top5
- tetsutani-ps3e9-eda-and-gbdt-catboost-median-duplicatedata
- slimreaper-random-forest-xgb-catboost-ensemble-t40
- sudalairajkumar-simple-feature-engg-notebook-spooky-author
- mohitsital-top-10-bike-sharing-rf-gbm

Found 10 non-competition benchmarks:
- sasakitetsuya-predicting-startup-valuation-with-machine-learning
- drpashamd4r-indian-floods-data-exploratory
- umerhayat123-how-i-achieved-83-

In [29]:
len(all_benchmark_ids)

26

### find out the latest result for one agent

In [None]:
def find_latest_unfailed_result_for_agent(agent_id, start_timestamp=None, end_timestamp=None, file_path="experiments"):
    """
    Find the latest unfailed result for each benchmark_id for a given agent_id within a timestamp range.
    A result is considered unfailed if the corresponding log file contains "Runner script completed".

    Args:
        agent_id (str): The ID of the agent to search for
        start_timestamp (str): Optional start of timestamp range in format YYYYMMDD_HHMMSS
        end_timestamp (str): Optional end of timestamp range in format YYYYMMDD_HHMMSS

    Returns:
        list: A list of dictionaries containing the latest unfailed results for each benchmark
    """
    # Dictionary to store the latest unfailed result for each benchmark_id
    latest_unfailed_results = {}

    # Convert timestamp strings to datetime objects if provided
    start_time = None
    end_time = None
    if start_timestamp:
        start_time = datetime.strptime(start_timestamp, "%Y%m%d_%H%M%S")
    if end_timestamp:
        end_time = datetime.strptime(end_timestamp, "%Y%m%d_%H%M%S")

    all_results = []

    # Process all JSON files in the results directory
    results_dir = f"{file_path}/results"
    for filename in os.listdir(results_dir):
        if filename.endswith(".json") and filename != "summary.json" and filename != "evaluation_only_summary.json":
            f_path = os.path.join(results_dir, filename)

            try:
                with open(f_path) as f:
                    result_dict = json.load(f)

                # Add timestamp and failed status
                timestamp = parse_timestamp_from_filename(filename)
                if timestamp:
                    result_dict["timestamp"] = timestamp
                    result_dict["failed"] = is_failed(result_dict)
                    all_results.append(result_dict)
            except Exception:
                # print(f"Error processing {filename}: {str(e)}")
                pass

    # Iterate through all results
    for result in all_results:
        # Skip if this result is not from the specified agent
        if agent_id not in result.get("agent_id"):
            continue

        benchmark_id = result.get("benchmark_id")
        if not benchmark_id:
            continue
        result_time = None
        # Check if the result is within the timestamp range
        if result.get("timestamp"):
            # Convert timestamp to datetime object for comparison
            if isinstance(result["timestamp"], str):
                try:
                    result_time = datetime.strptime(result["timestamp"], "%Y%m%d_%H%M%S")
                except ValueError:
                    # If timestamp format is different, skip this result
                    continue
            else:
                # If timestamp is already a datetime object
                result_time = result["timestamp"]

            # Skip if the result is outside the specified time range
            if (start_time and result_time < start_time) or (end_time and result_time > end_time):
                continue

        # Check if the run was completed successfully by examining the log file
        log_file_path = f"{file_path}/logs/{benchmark_id}_{agent_id.replace('/', '_')}_{result['timestamp']}.log"
        if os.path.exists(log_file_path):
            try:
                with open(log_file_path, errors="ignore") as f:
                    log_content = f.read()
                    if "Runner script completed" not in log_content:
                        # Skip this result if the run didn't complete successfully
                        continue
            except Exception as e:
                print(f"Warning: Could not read log file {log_file_path}: {e}")
                # Skip if we can't verify completion
                continue
        else:
            # If the log file doesn't exist, we can't verify completion
            print(f"Warning: Log file not found: {log_file_path}")
            continue

        # If we haven't seen this benchmark_id before, or if this result is newer
        if (
            benchmark_id not in latest_unfailed_results
            or result_time > latest_unfailed_results[benchmark_id]["timestamp"]
        ):
            # Store the result with the datetime object for easier comparison later
            result["timestamp"] = result_time
            latest_unfailed_results[benchmark_id] = result

    # Convert the dictionary values to a list and restore string timestamps
    result_list = list(latest_unfailed_results.values())
    for result in result_list:
        # Convert datetime back to string for consistency
        if isinstance(result["timestamp"], datetime):
            result["timestamp"] = result["timestamp"].strftime("%Y%m%d_%H%M%S")

    return result_list


# Example usage:
start_time = "20250514_000000"  # Optional: start of time range
end_time = "20250514_235959"  # Optional: end of time range
file_path = "experiments_old_version"
latest_unfailed_results = find_latest_unfailed_result_for_agent("deepseek-v3", start_time, end_time, file_path)
print(f"Found {len(latest_unfailed_results)} latest unfailed results between {start_time} and {end_time}")
# if latest_unfailed_results:
#     print("\nSample latest unfailed result:")
#     print(json.dumps(latest_unfailed_results[0], indent=2, default=str))

Found 23 latest unfailed results between 20250514_000000 and 20250514_235959


In [None]:
def analyze_agent_results(
    agent_name,
    file_path="experiments_old_version",
    all_benchmark_ids=None,
    competition_benchmark=None,
    skill_score_threshold=-0.05,
):
    results = find_latest_unfailed_result_for_agent(agent_name, file_path=file_path)

    if all_benchmark_ids is not None:
        missing = set(all_benchmark_ids) - set([i["benchmark_id"] for i in results])
        print(f"Missing benchmark_ids for {agent_name}:")
        print(missing)
        print("\n")

    failed_flags = np.array([i["failed"] for i in results])
    skill_scores = np.array([i["metrics"]["skill_score"] for i in results])

    print("proportion of positive skill scores")
    print(f"{agent_name}: ", np.sum(skill_scores >= skill_score_threshold) / len(skill_scores))
    print("\n")

    print("proportion of failed runs")
    print(f"{agent_name}: ", np.sum(failed_flags) / len(failed_flags))
    print("\n")

    print("average skill score of non-failed runs")
    print(f"{agent_name}: ", np.mean(skill_scores[~failed_flags]))
    print("\n")

    print("proportion of positive skill scores among non-failed runs")
    print(
        f"{agent_name}: ",
        np.sum(skill_scores[~failed_flags] >= skill_score_threshold) / len(skill_scores[~failed_flags]),
    )
    print("\n")

    if competition_benchmark is not None:
        is_competition = np.array([i["benchmark_id"] in competition_benchmark for i in results])
        is_non_competition = ~is_competition

        print("proportion of positive skill scores among non-competition benchmarks")
        print(
            f"{agent_name}: ",
            np.sum(skill_scores[is_non_competition] >= skill_score_threshold) / np.sum(is_non_competition),
        )
        print("\n")

        print("proportion of positive skill scores among competition benchmarks")
        print(f"{agent_name}: ", np.sum(skill_scores[is_competition] >= skill_score_threshold) / np.sum(is_competition))
        print("\n")
    return results


results_deepseek_v3 = analyze_agent_results(
    agent_name="deepseek-v3",
    all_benchmark_ids=all_benchmark_ids,
    competition_benchmark=competition_benchmark,
)

Missing benchmark_ids for deepseek-v3:
set()


proportion of positive skill scores
deepseek-v3:  0.46153846153846156


proportion of failed runs
deepseek-v3:  0.23076923076923078


average skill score of non-failed runs
deepseek-v3:  -0.32349105923185995


proportion of positive skill scores among non-failed runs
deepseek-v3:  0.6


proportion of positive skill scores among non-competition benchmarks
deepseek-v3:  0.5


proportion of positive skill scores among competition benchmarks
deepseek-v3:  0.4375




In [109]:
results_deepseek_v3

[{'benchmark_id': 'tetsutani-ps3e9-eda-and-gbdt-catboost-median-duplicatedata',
  'agent_id': 'litellm_proxy/deepseek-v3',
  'completion_status': 'completed',
  'metrics': {'interaction_time_seconds': 590.192362,
   'conversation_turns': 6,
   'code_snippets_count': 7,
   'total_code_executions': 7,
   'code_operations': {'pandas_operations': 3,
    'plotting': 0,
    'dataframe_creation': 2,
    'file_io': 4,
    'error_handling': 0,
    'loops': 3,
    'functions': 2,
    'imports': 8,
    'error_count': 0},
   'absolute_metric_score': 12.318180674246626,
   'skill_score': -0.019854917317326692},
  'timestamp': '20250514_172506',
  'failed': False},
 {'benchmark_id': 'drpashamd4r-indian-floods-data-exploratory',
  'agent_id': 'litellm_proxy/deepseek-v3',
  'completion_status': 'completed',
  'metrics': {'interaction_time_seconds': 395.499356,
   'conversation_turns': 7,
   'code_snippets_count': 12,
   'total_code_executions': 12,
   'code_operations': {'pandas_operations': 3,
    'p

In [105]:
analyze_agent_results(
    agent_name="deepseek-r1",
    all_benchmark_ids=all_benchmark_ids,
    competition_benchmark=competition_benchmark,
)

Missing benchmark_ids for deepseek-r1:
{'jakubkrasuski-llm-chatbot-arena-predicting-user-preferences', 'jimmyyeung-spaceship-titanic-xgb-top5', 'patilaakash619-backpack-price-prediction-ml-guide'}


proportion of positive skill scores
deepseek-r1:  0.2608695652173913


proportion of failed runs
deepseek-r1:  0.43478260869565216


average skill score of non-failed runs
deepseek-r1:  -1.6431253611542571


proportion of positive skill scores among non-failed runs
deepseek-r1:  0.46153846153846156


proportion of positive skill scores among non-competition benchmarks
deepseek-r1:  0.3


proportion of positive skill scores among competition benchmarks
deepseek-r1:  0.23076923076923078




### find out finished benchmarks for a specific model at certain time range


In [None]:
import os
import glob
from collections import defaultdict

# Define target path and time range
logs_dir = "experiments_old_version/logs"
target_model = "o3"
start_timestamp = "20250514_000000"  # Start time, format: YYYYMMDD_HHMMSS
end_timestamp = "20250514_235959"  # End time, format: YYYYMMDD_HHMMSS

# Convert string timestamps to datetime objects
start_time = datetime.strptime(start_timestamp, "%Y%m%d_%H%M%S")
end_time = datetime.strptime(end_timestamp, "%Y%m%d_%H%M%S")

# Convert string timestamps to datetime objects
start_time = datetime.strptime(start_timestamp, "%Y%m%d_%H%M%S")
end_time = datetime.strptime(end_timestamp, "%Y%m%d_%H%M%S")

# Find all log files
log_files = glob.glob(os.path.join(logs_dir, f"*_{target_model}*.log"))

print(f"Found {len(log_files)} {target_model} log files")

# Use a dictionary to record the completion status of each benchmark_id
benchmark_status = defaultdict(lambda: {"completed": False, "runs": [], "ever_run": False})

# Process each found log file
for log_file in log_files:
    filename = os.path.basename(log_file)

    # Extract timestamp from filename
    timestamp_match = re.search(r"_(\d{8}_\d{6})\.log$", filename)
    if not timestamp_match:
        continue

    file_timestamp_str = timestamp_match.group(1)
    try:
        file_timestamp = datetime.strptime(file_timestamp_str, "%Y%m%d_%H%M%S")
    except ValueError:
        continue  # Skip invalid timestamp format

    # Extract benchmark_id from filename (assuming format is benchmark_id_model_timestamp.log)
    benchmark_parts = filename.split("_" + target_model)
    if len(benchmark_parts) < 2:
        continue

    benchmark_id = benchmark_parts[0]

    # Check if timestamp is within the specified range
    if start_time <= file_timestamp <= end_time:
        # Mark this benchmark as having been run
        benchmark_status[benchmark_id]["ever_run"] = True

        # Read the log file content
        with open(log_file, errors="ignore") as f:
            log_content = f.read()

        # Record run status
        is_completed = "Runner script completed" in log_content
        benchmark_status[benchmark_id]["runs"].append(
            {"timestamp": file_timestamp_str, "completed": is_completed, "log_file": log_file}
        )

        # If there is a successful run, mark the entire benchmark as completed
        if is_completed:
            benchmark_status[benchmark_id]["completed"] = True

# Filter out incomplete benchmarks
incomplete_benchmarks = []
never_run_benchmarks = []

# Check each benchmark in all_benchmark_ids
for benchmark_id in all_benchmark_ids:
    # If benchmark has never been run
    if benchmark_id not in benchmark_status or not benchmark_status[benchmark_id]["ever_run"]:
        never_run_benchmarks.append(benchmark_id)
    # If benchmark has been run but not successfully completed
    elif not benchmark_status[benchmark_id]["completed"]:
        # Find all runs for this benchmark_id
        runs = benchmark_status[benchmark_id]["runs"]
        # Sort by timestamp, get the latest
        if runs:
            latest_run = sorted(runs, key=lambda x: x["timestamp"], reverse=True)[0]
            incomplete_benchmarks.append(
                {"benchmark_id": benchmark_id, "latest_timestamp": latest_run["timestamp"], "run_count": len(runs)}
            )

# Output results
print("\nSummary:")
print(f"Found {len(incomplete_benchmarks)} incomplete benchmarks between {start_timestamp} and {end_timestamp}:")
for item in incomplete_benchmarks:
    print(f"- {item['benchmark_id']} (Latest timestamp: {item['latest_timestamp']}, Run count: {item['run_count']})")

print(f"\nFound {len(never_run_benchmarks)} benchmarks that were never run:")
for benchmark_id in never_run_benchmarks:
    print(f"- {benchmark_id}")

找到 65 个 o3 日志文件

总结:
在 20250514_000000 到 20250514_235959 范围内找到 0 个未完成的 benchmark:

发现 0 个从未运行过的 benchmark:


In [75]:
benchmark_status

defaultdict(<function __main__.<lambda>()>,
            {'dmytrobuhai-eda-rf': {'completed': False,
              'runs': [],
              'ever_run': True},
             'jakubkrasuski-store-sales-forecasting-modeling-with-lightgbm': {'completed': False,
              'runs': [],
              'ever_run': True},
             'hasangulec-feature-engineering-diabetes': {'completed': False,
              'runs': [],
              'ever_run': True},
             'tetsutani-ps3e9-eda-and-gbdt-catboost-median-duplicatedata': {'completed': False,
              'runs': [],
              'ever_run': True},
             'ugurcan95-brazilian-tweet-sentiment-analysis': {'completed': False,
              'runs': [],
              'ever_run': True},
             'mohitsital-top-10-bike-sharing-rf-gbm': {'completed': False,
              'runs': [],
              'ever_run': True},
             'jakubkrasuski-llm-chatbot-arena-predicting-user-preferences': {'completed': False,
              'runs':

In [74]:
incomplete_benchmarks

[]

In [None]:
[i["benchmark_id"] for i in incomplete_benchmarks]

['patilaakash619-backpack-price-prediction-ml-guide',
 'drpashamd4r-indian-floods-data-exploratory',
 'iseedeep-mission-podcast-listening-prediction',
 'tetsutani-ps3e9-eda-and-gbdt-catboost-median-duplicatedata',
 'ak5047-australia-weather',
 'ayodejiibrahimlateef-integrative-analysis-early-depression-detection',
 'amitsinghbhadoria0-final-qt-project-analysis',
 'patilaakash619-electric-vehicle-population-data-in-the-us',
 'dmytrobuhai-eda-rf',
 'jakubkrasuski-store-sales-forecasting-modeling-with-lightgbm',
 'jimmyyeung-spaceship-titanic-xgb-top5',
 'shaswatatripathy-store-sales-prediction']

### move results to the target folder

In [None]:
import os
import shutil
import glob
import re
from datetime import datetime


def parse_timestamp(filename):
    """Extract timestamp from filename."""
    # Common timestamp pattern: YYYYMMDD_HHMMSS
    match = re.search(r"_(\d{8}_\d{6})", filename)
    if match:
        timestamp_str = match.group(1)
        try:
            return datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
        except ValueError:
            return None
    return None


def is_in_timestamp_range(timestamp, start_timestamp, end_timestamp):
    """Check if timestamp is within the specified range."""
    if not timestamp:
        return False
    return start_timestamp <= timestamp <= end_timestamp


def move_files(source_dir, target_dir, subdir, start_timestamp_str, end_timestamp_str, model_name=None):
    """Move files in a subdirectory that match the timestamp range."""
    start_timestamp = datetime.strptime(start_timestamp_str, "%Y%m%d_%H%M%S")
    end_timestamp = datetime.strptime(end_timestamp_str, "%Y%m%d_%H%M%S")

    source_path = os.path.join(source_dir, subdir)
    target_path = os.path.join(target_dir, subdir)

    # Create target subdirectory if it doesn't exist
    os.makedirs(target_path, exist_ok=True)

    # Get all files in the source directory
    files = glob.glob(os.path.join(source_path, "*"))

    moved_count = 0
    for file_path in files:
        # Skip directories (optional - remove this if you want to process nested directories)
        if os.path.isdir(file_path):
            continue

        filename = os.path.basename(file_path)
        timestamp = parse_timestamp(filename)

        if is_in_timestamp_range(timestamp, start_timestamp, end_timestamp):
            if model_name is not None:
                if model_name not in filename:
                    continue
            target_file_path = os.path.join(target_path, filename)
            try:
                shutil.move(file_path, target_file_path)
                print(f"Moved: {file_path} -> {target_file_path}")
                moved_count += 1
            except Exception as e:
                print(f"Error moving {file_path}: {e}")

    return moved_count

In [None]:
target_folder = "experiments_old_version"
model_name = "deepseek-v3"
start_timestamp = "20250514_232500"
end_timestamp = "20250514_235900"

# Validate timestamp format
try:
    datetime.strptime(start_timestamp, "%Y%m%d_%H%M%S")
    datetime.strptime(end_timestamp, "%Y%m%d_%H%M%S")
except ValueError:
    print("Error: Timestamps must be in format YYYYMMDD_HHMMSS")
    raise ValueError("Error: Timestamps must be in format YYYYMMDD_HHMMSS")

source_dir = "experiments"
target_dir = target_folder

# Create target directory if it doesn't exist
os.makedirs(target_dir, exist_ok=True)

# List of subdirectories to process
subdirs = ["results", "logs", "checkpoints"]

total_moved = 0
for subdir in subdirs:
    if os.path.exists(os.path.join(source_dir, subdir)):
        moved = move_files(source_dir, target_dir, subdir, start_timestamp, end_timestamp, model_name)
        total_moved += moved
        print(f"Moved {moved} files from {subdir}")
    else:
        print(f"Subdirectory {subdir} not found in {source_dir}")

print(f"Total: Moved {total_moved} files to {target_dir}")

Moved: experiments/results/patilaakash619-backpack-price-prediction-ml-guide_deepseek-v3_20250514_235716.json -> experiments_old_version/results/patilaakash619-backpack-price-prediction-ml-guide_deepseek-v3_20250514_235716.json
Moved: experiments/results/shaswatatripathy-store-sales-prediction_deepseek-v3_20250514_235716.json -> experiments_old_version/results/shaswatatripathy-store-sales-prediction_deepseek-v3_20250514_235716.json
Moved 2 files from results
Moved: experiments/logs/patilaakash619-backpack-price-prediction-ml-guide_deepseek-v3_20250514_235716.log -> experiments_old_version/logs/patilaakash619-backpack-price-prediction-ml-guide_deepseek-v3_20250514_235716.log
Moved: experiments/logs/shaswatatripathy-store-sales-prediction_deepseek-v3_20250514_235716.log -> experiments_old_version/logs/shaswatatripathy-store-sales-prediction_deepseek-v3_20250514_235716.log
Moved 2 files from logs
Moved: experiments/checkpoints/patilaakash619-backpack-price-prediction-ml-guide_deepseek-v3_

In [19]:
os.listdir("benchmark_final/storage")

['jimmyyeung-spaceship-titanic-xgb-top5',
 'dmytrobuhai-eda-rf',
 'tetsutani-ps3e9-eda-and-gbdt-catboost-median-duplicatedata',
 'vijaythurimella-bank-subscriptions-predictions-f1-score',
 'amitsinghbhadoria0-final-qt-project-analysis',
 'jakubkrasuski-store-sales-forecasting-modeling-with-lightgbm',
 'patilaakash619-backpack-price-prediction-ml-guide',
 'esotericdata1-titanickaggle-ds',
 'shaswatatripathy-store-sales-prediction',
 'sudalairajkumar-simple-feature-engg-notebook-spooky-author',
 'slimreaper-random-forest-xgb-catboost-ensemble-t40',
 'drpashamd4r-indian-floods-data-exploratory',
 'ayodejiibrahimlateef-integrative-analysis-early-depression-detection',
 'ak5047-australia-weather',
 'mightyjiraiya-titanic-survival-prediction',
 'mohitsital-top-10-bike-sharing-rf-gbm',
 'sasakitetsuya-predicting-startup-valuation-with-machine-learning',
 'umerhayat123-how-i-achieved-83-accuracy',
 'iseedeep-mission-podcast-listening-prediction',
 'ugurcan95-brazilian-tweet-sentiment-analysis'