In [1]:
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C
from scipy.stats import norm
from scipy.optimize import minimize
import logging
import warnings
import json
import os
from typing import List, Tuple, Dict, Any

In [2]:
# Logging: emit compact "LEVEL: message" lines for INFO and above.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
# Silence every UserWarning-category warning so GPR numerical chatter stays out of the output.
warnings.filterwarnings(action="ignore", category=UserWarning)

# --- Final Submission Configuration ---
# Input data file and the location of the generated submission file.
DATA_FILE_PATH = 'bbo_master_w06.csv'
OUTPUT_DIR = 'add_data'
OUTPUT_FILE_NAME = 'week06_clean_inputs.json'
OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE_NAME)

# Surrogate kernel (Matern, nu=2.5) chosen for the final exploitation strategy.
OPTIMAL_KERNEL = Matern(nu=2.5, length_scale=1.0)
# Number of random restarts used when optimising the acquisition function.
NUM_MULTI_STARTS = 50
# Upper cap applied to every submitted coordinate (mandatory for submission).
MAX_COORD_VALUE = 0.999999

# Input dimensionality of each function, keyed by function ID.
FUNCTION_DIMS: Dict[int, int] = {
    1: 2,
    2: 2,
    3: 3,
    4: 4,
    5: 4,
    6: 5,
    7: 6,
    8: 8,
}
ALL_FUNCTION_IDS = list(FUNCTION_DIMS)
ALL_X_COLS = ['X' + str(idx) for idx in range(1, 9)]



In [3]:
# --- Acquisition Function: Expected Improvement (EI) ---

def expected_improvement(X: np.ndarray, gpr: GaussianProcessRegressor, f_best: float) -> np.ndarray:
    """Compute the Expected Improvement (EI) of a single candidate point.

    EI is evaluated for a maximisation problem:
    ``EI = (mu - f_best) * Phi(Z) + sigma * phi(Z)`` with ``Z = (mu - f_best) / sigma``,
    where ``mu`` and ``sigma`` are the posterior mean and standard deviation
    returned by ``gpr.predict`` at ``X``.

    Args:
        X: Coordinates of one point; it is reshaped to a ``(1, n_dims)`` row
            before being handed to the model.
        gpr: Fitted regressor exposing ``predict(X, return_std=True)``.
        f_best: Best (largest) objective value observed so far.

    Returns:
        A 1-D array of length 1 holding the EI value.
    """
    mu, sigma = gpr.predict(np.asarray(X, dtype=float).reshape(1, -1), return_std=True)
    # Flatten so the result has the same (1,) shape on every code path.
    mu = np.ravel(mu)
    sigma = np.ravel(sigma)
    improvement = mu - f_best

    # Near-zero predictive std: avoid dividing by it. The comparison must be
    # element-wise (`sigma.any() < eps` compares a boolean with eps). With no
    # predictive uncertainty the outcome is deterministic, so EI reduces to
    # max(mu - f_best, 0).
    if np.any(sigma < 1e-10):
        return np.maximum(improvement, 0.0)

    Z = improvement / sigma
    return improvement * norm.cdf(Z) + sigma * norm.pdf(Z)

In [4]:
# --- Optimization Function ---

def optimize_acquisition(func_id: int, gpr: GaussianProcessRegressor, f_best: float, rng: Any = None) -> List[float]:
    """Find the next query point by maximising Expected Improvement.

    Runs ``NUM_MULTI_STARTS`` independent L-BFGS-B minimisations of the negated
    EI, each from a uniformly random start inside the unit hypercube, and keeps
    the converged run with the lowest objective value.

    Args:
        func_id: Function identifier; used to look up the dimensionality in
            ``FUNCTION_DIMS`` and to label the warning message.
        gpr: Fitted surrogate model passed through to ``expected_improvement``.
        f_best: Best objective value observed so far.
        rng: Optional random source exposing ``uniform(low, high, size)`` (for
            example ``np.random.default_rng(seed)``) to make the starts
            reproducible. When ``None`` the global ``np.random`` state is used.

    Returns:
        Coordinates of the selected point as a list of length
        ``FUNCTION_DIMS[func_id]``. If no run reports success, a uniformly
        random point is returned and a warning is logged.
    """
    D = FUNCTION_DIMS[func_id]
    sampler = np.random if rng is None else rng

    def acq_func(x: np.ndarray) -> float:
        # minimize() expects a scalar objective, while EI comes back as a
        # one-element array; negate because we maximise EI by minimising.
        return -float(np.ravel(expected_improvement(x, gpr, f_best))[0])

    bounds = [(0.0, 1.0)] * D  # closed unit hypercube [0, 1]^D

    best_x = None
    best_acq_value = np.inf

    # Multi-start optimization loop: EI is multi-modal, so restart from
    # several random points and keep the best converged result.
    for _ in range(NUM_MULTI_STARTS):
        x0 = sampler.uniform(0, 1, D)
        res = minimize(
            acq_func,
            x0,
            bounds=bounds,
            method='L-BFGS-B'
        )

        if res.success and res.fun < best_acq_value:
            best_acq_value = res.fun
            best_x = res.x

    if best_x is None:
        logging.warning(f"  Warning: Optimization failed for F{func_id}. Using random start.")
        best_x = sampler.uniform(0, 1, D)

    return best_x.tolist()

In [5]:
def generate_week06_queries() -> List[List[float]]:
    """Load the master data, fit one GPR per function and write the next queries.

    For every function ID in ``ALL_FUNCTION_IDS`` the function fits a
    ``GaussianProcessRegressor`` on that function's observations, maximises
    Expected Improvement to pick the next point, then clips, rounds and
    zero-pads the point to ``len(ALL_X_COLS)`` coordinates. All queries are
    written to ``OUTPUT_PATH`` as a JSON array of arrays.

    Returns:
        One padded query per function, in ``ALL_FUNCTION_IDS`` order, or an
        empty list when ``DATA_FILE_PATH`` does not exist.

    Raises:
        ValueError: If the data file holds no rows for one of the functions.
    """

    # 1. Load Data
    try:
        df_master = pd.read_csv(DATA_FILE_PATH)
    except FileNotFoundError:
        logging.error(f"FATAL: Master data file not found at '{DATA_FILE_PATH}'.")
        return []

    df_clean = df_master.copy()
    for col in ALL_X_COLS:
        # Fill non-numeric (NaN/empty string) with 0.0 for GPR training
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce').fillna(0.0)

    n_functions = len(ALL_FUNCTION_IDS)
    n_coords = len(ALL_X_COLS)
    all_queries_raw = []

    logging.info("\n" + "="*70)
    logging.info(f"== Generating {n_functions} FINAL Week 6 Queries (Matern 2.5 + EI + {NUM_MULTI_STARTS} Multi-Starts) ==")
    logging.info("="*70)

    for func_id in ALL_FUNCTION_IDS:
        # 2. Prepare function-specific data
        df_func = df_clean[df_clean['Function ID'] == func_id]
        if df_func.empty:
            # Fail with a clear message instead of an opaque NumPy reduction
            # error on an empty array further down.
            raise ValueError(
                f"No observations found for function {func_id} in '{DATA_FILE_PATH}'."
            )

        D = FUNCTION_DIMS[func_id]
        X_cols_relevant = ALL_X_COLS[:D]
        X = df_func[X_cols_relevant].values
        Y = df_func['Y'].values.reshape(-1, 1)

        f_best = np.max(Y)

        # 3. Train the GPR Model
        # NOTE(review): the model is fit on raw Y with a fixed-amplitude kernel
        # and without normalize_y; confirm that is intended for functions whose
        # outputs are far from unit scale.
        gpr = GaussianProcessRegressor(
            kernel=OPTIMAL_KERNEL,
            n_restarts_optimizer=5,
            alpha=1e-8
        )
        gpr.fit(X, Y)

        # 4. Optimize Acquisition Function
        query_x_list = optimize_acquisition(func_id, gpr, f_best)

        # 5. Format and Finalise Query: Clip (0.999999), Round (6-digit precision), Pad
        # float() keeps the values plain Python floats, matching the declared
        # return type; np.clip alone yields NumPy scalars.
        formatted_query = [
            round(float(np.clip(x, 0.0, MAX_COORD_VALUE)), 6) for x in query_x_list
        ]

        # Pad with 0.0 up to the full number of coordinate columns
        padded_query = formatted_query + [0.0] * (n_coords - D)
        all_queries_raw.append(padded_query)

        # Log the generated query (showing first few dimensions)
        logging.info(f"F{func_id} ({D}D): f_best={f_best:.4f}. Next Query X1-X{D}: {[f'{x:.6f}' for x in formatted_query[:3]]}...")

    # 6. Save the final queries to JSON
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    with open(OUTPUT_PATH, 'w') as f:
        # Plain json.dump of the nested list yields an array of arrays.
        json.dump(all_queries_raw, f, indent=2)

    logging.info("\n" + "-"*70)
    logging.info(f"SUCCESS: Generated {len(all_queries_raw)} FINAL queries for Week 6.")
    logging.info(f"File saved to: '{OUTPUT_PATH}'. This is the file to submit.")
    logging.info("-" * 70)

    return all_queries_raw

In [6]:
# Entry point: run the full query-generation pipeline when this cell/script is
# executed directly. The returned list of queries is discarded here.
if __name__ == '__main__':
    generate_week06_queries()

INFO: 
INFO: == Generating 8 FINAL Week 6 Queries (Matern 2.5 + EI + 50 Multi-Starts) ==
INFO: F1 (2D): f_best=0.1744. Next Query X1-X2: ['0.000000', '0.999999']...
INFO: F2 (2D): f_best=0.6660. Next Query X1-X2: ['0.535953', '0.318272']...
INFO: F3 (3D): f_best=-0.0200. Next Query X1-X3: ['0.999999', '0.999999', '0.000000']...
INFO: F4 (4D): f_best=-2.9741. Next Query X1-X4: ['0.101075', '0.813896', '0.853852']...
INFO: F5 (4D): f_best=4440.4809. Next Query X1-X4: ['0.362040', '0.663433', '0.665456']...
INFO: F6 (5D): f_best=-0.7143. Next Query X1-X5: ['0.000000', '0.000000', '0.999999']...
INFO: F7 (6D): f_best=1.6487. Next Query X1-X6: ['0.540511', '0.127780', '0.698166']...
INFO: F8 (8D): f_best=9.9951. Next Query X1-X8: ['0.067152', '0.242506', '0.000000']...
INFO: 
----------------------------------------------------------------------
INFO: SUCCESS: Generated 8 FINAL queries for Week 6.
INFO: File saved to: 'add_data/week06_clean_inputs.json'. This is the file to submit.
INFO: --