In [1]:
import pandas as pd
import numpy as np
import json
import os
from io import StringIO

# --- Configuration ---
# Module-level settings read by the functions and the execution block below.
CSV_FILENAME = "bbo_master_w09.csv"  # Master CSV: looked up in __files__ or read from disk by the execution block
OUTPUT_FILENAME = "week09_clean_inputs.json" # Name of the JSON file the generated queries are written to
OUTPUT_DIR = "add_data" # Directory (created if missing) that receives OUTPUT_FILENAME
ROUND_NUMBER = 9  # Round index; added to the random seed and shown in the log output
NUM_FUNCTIONS = 8  # Function IDs 1..NUM_FUNCTIONS are processed
# --- End Configuration ---

def determine_dimensions(df, func_id):
    """
    Determine the input dimensionality for a given Function ID.

    Function IDs 4-8 have fixed dimensions (4, 4, 5, 6 and 8 respectively)
    and are returned without inspecting the data. For every other ID the
    columns X1..X8 are scanned in order over the rows belonging to
    ``func_id``; the result is the highest column index that holds real
    data, with a minimum of 2.

    A column counts as holding real data when it has at least one numeric
    value whose magnitude exceeds 1e-9, or at least one non-blank value that
    cannot be parsed as a number. Columns that are populated only with zeros
    are treated as padding. Scanning stops at the first column that is
    missing from ``df`` or is entirely NaN for this function.

    Args:
        df: DataFrame with a 'Function ID' column and any of 'X1'..'X8'.
        func_id: Function identifier to look up.

    Returns:
        int: The dimensionality, always >= 2.
    """
    # --- HARDCODED DIMENSION OVERRIDES BASED ON BBO ROUND 9 REQUIREMENTS ---
    # F1-F3 are data-driven; F4-F8 are enforced, so no scan is needed for them.
    fixed_dimensions = {4: 4, 5: 4, 6: 5, 7: 6, 8: 8}
    if func_id in fixed_dimensions:
        return fixed_dimensions[func_id]

    func_data = df[df['Function ID'] == func_id]

    dimension = 0
    for position in range(1, 9):
        col = f'X{position}'
        if col not in func_data.columns:
            break

        column = func_data[col]
        present = column.notna()
        if not present.any():
            # Entirely NaN for this function: no further dimensions follow.
            break

        # Non-numeric entries become NaN here and are handled separately below.
        numeric = pd.to_numeric(column, errors='coerce')
        if (numeric.abs() > 1e-9).any():
            dimension = position
            continue

        # Only values that are present but failed numeric parsing qualify for
        # the text fallback. Testing the whole column as strings would always
        # succeed (0.0 becomes "0.0"), which made zero padding look active.
        unparsed = column[present & numeric.isna()]
        if not unparsed.empty and unparsed.astype(str).str.strip().str.len().gt(0).any():
            dimension = position

    # Default to 2D if fewer than two active dimensions were found.
    return max(dimension, 2)


def generate_random_query(func_id, dimension):
    """
    Generate a reproducible random query point for one function.

    The generator is seeded with ``42 + func_id + dimension + ROUND_NUMBER``,
    so identical arguments always produce the same query within a round.
    Because the seed is a plain sum, two different (func_id, dimension) pairs
    with the same sum share a seed; uniqueness across functions is therefore
    not guaranteed in general.

    Args:
        func_id: Numeric function identifier.
        dimension: Number of coordinates to generate.

    Returns:
        list[float]: ``dimension`` values drawn uniformly from [0, 1), each
        rounded to 6 decimal places.
    """
    seed = 42 + func_id + dimension + ROUND_NUMBER
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.rand(), but leaves the process-wide NumPy random
    # state untouched for any other code that relies on it.
    rng = np.random.RandomState(seed)
    return [round(value, 6) for value in rng.rand(dimension).tolist()]


def _load_master_frame(csv_data_source):
    """Load the master CSV from an existing file path or from raw CSV text."""
    if not isinstance(csv_data_source, str):
        raise ValueError("Invalid CSV data source provided.")

    if os.path.exists(csv_data_source):
        # Case 1: Running locally - read from file path
        print(f"INFO: Reading data from local file: {csv_data_source}")
        df = pd.read_csv(csv_data_source)
    elif "\n" not in csv_data_source and "\r" not in csv_data_source:
        # A single-line string that is not an existing file is a missing path,
        # not CSV content. Parsing it would silently give an empty frame whose
        # only column is the file name, and default queries would be written.
        raise FileNotFoundError(f"File not found: {csv_data_source}")
    else:
        # Case 2: Running in environment - read from raw content
        print("INFO: Reading data from environment content string.")
        df = pd.read_csv(StringIO(csv_data_source))

    # Ensure column names are stripped of whitespace
    df.columns = df.columns.str.strip()
    return df


def generate_queries(csv_data_source):
    """
    Analyze the master data and generate one submission query per function.

    Steps: load the CSV, determine each function's dimensionality, draw a
    seeded random query of that dimensionality, and write the queries (ordered
    F1..F{NUM_FUNCTIONS}) as a JSON list of lists to OUTPUT_DIR/OUTPUT_FILENAME.
    Progress and errors are reported via ``print``.

    Args:
        csv_data_source: Either a path to a CSV file or a string holding raw
            multi-line CSV content.

    Returns:
        dict: Maps 'F1'..'F{NUM_FUNCTIONS}' to the generated query list.
        Returns an empty dict if the data could not be loaded. A failure to
        write the JSON file is reported but the queries are still returned.
    """
    print("--- Starting BBO Query Generation ---")

    try:
        df = _load_master_frame(csv_data_source)
        print(f"INFO: Loaded master data with {len(df)} total rows.")
    except Exception as e:
        # Reporting boundary: any load or parse failure aborts the run cleanly.
        print(f"ERROR: Could not load or parse CSV data: {e}. Please ensure '{CSV_FILENAME}' is in the correct directory.")
        return {}

    # 1. Determine Correct Dimensions for all functions
    function_dims = {}

    print("\n--- Function Dimensionality Analysis ---")

    # BBO default dimensions for function IDs absent from the dataset.
    fallback_dims = {4: 4, 5: 4, 6: 5, 7: 6, 8: 8}

    # Resolve the ID column once; it does not change between iterations.
    function_id_col = 'Function ID' if 'Function ID' in df.columns else 'FunctionID'
    if function_id_col in df.columns:
        id_values = df[function_id_col]
        known_ids = id_values.unique()
        normalized_df = df.rename(columns={function_id_col: 'Function ID'})
    else:
        id_values = known_ids = normalized_df = None

    for func_id in range(1, NUM_FUNCTIONS + 1):
        if known_ids is not None and func_id in known_ids:
            # Get dimension (uses the hardcoded override inside the function)
            dim = determine_dimensions(normalized_df, func_id)
            num_points = int((id_values == func_id).sum())
            print(f"INFO: Function F{func_id} ({dim}D) has {num_points} data points.")
        else:
            dim = fallback_dims.get(func_id, 2)
            print(f"INFO: Function F{func_id} defaulting to: {dim}D (ID not found in current dataset, using BBO default dimension).")

        function_dims[f'F{func_id}'] = (dim, func_id)  # Store both dim and ID

    print("------------------------------------------")

    # 2. Generate Random Queries with the Correct Dimensions
    final_queries = {}
    print(f"--- Generating Queries for Round {ROUND_NUMBER} Submission ({OUTPUT_DIR}/{OUTPUT_FILENAME}) ---")
    for func_name, (dim, func_id) in function_dims.items():
        # Pass both the ID and the dimension for seeding
        query = generate_random_query(func_id, dim)
        final_queries[func_name] = query
        print(f"{func_name} ({dim}D): {query}")

    # 3. Save the result as JSON list of lists in the specified subfolder,
    #    in order F1 to F{NUM_FUNCTIONS}.
    ordered_queries = [final_queries[f'F{i}'] for i in range(1, NUM_FUNCTIONS + 1)]
    output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)

    try:
        # Directory creation is part of saving, so its failures are reported
        # through the same error path instead of escaping as a traceback.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(ordered_queries, f, indent=4)
    except (OSError, TypeError, ValueError) as e:
        print(f"ERROR: Could not save the JSON file to {output_path}. Error: {e}")
    else:
        print("------------------------------------------")
        print(f"SUCCESS: Generated {len(ordered_queries)} FINAL queries with correct dimensions.")
        print(f"File saved to: {output_path}")

    return final_queries


# --- Execution Block ---
# Chooses the data source for the run:
#   1. If the collaborative environment exposes a '__files__' mapping that
#      contains CSV_FILENAME, use the uploaded file's raw content.
#   2. Otherwise (no '__files__', or the file is not in it) read CSV_FILENAME
#      from disk.
# The '__files__' access is guarded by the globals() check, so no NameError
# handler is needed. A handler here would only catch NameErrors raised inside
# generate_queries and silently run the whole pipeline a second time.
if '__files__' in globals() and CSV_FILENAME in __files__:
    generate_queries(__files__[CSV_FILENAME]['content'])
else:
    generate_queries(CSV_FILENAME)

--- Starting BBO Query Generation ---
INFO: Reading data from local file: bbo_master_w09.csv
INFO: Loaded master data with 144 total rows.

--- Function Dimensionality Analysis ---
INFO: Function F1 (2D) has 18 data points.
INFO: Function F2 (2D) has 18 data points.
INFO: Function F3 (3D) has 18 data points.
INFO: Function F4 (4D) has 18 data points.
INFO: Function F5 (4D) has 18 data points.
INFO: Function F6 (5D) has 18 data points.
INFO: Function F7 (6D) has 18 data points.
INFO: Function F8 (8D) has 18 data points.
------------------------------------------
--- Generating Queries for Round 9 Submission (add_data/week09_clean_inputs.json) ---
F1 (2D): [0.420183, 0.363239]
F2 (2D): [0.093108, 0.971656]
F3 (3D): [0.08735, 0.230477, 0.411061]
F4 (4D): [0.924035, 0.157871, 0.866915, 0.084157]
F5 (4D): [0.300873, 0.186946, 0.323183, 0.66575]
F6 (5D): [0.033755, 0.489108, 0.846085, 0.411402, 0.631415]
F7 (6D): [0.379099, 0.567098, 0.595593, 0.449859, 0.45702, 0.311651]
F8 (8D): [0.545851,