In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy.optimize import minimize
from scipy.optimize import Bounds
import logging
import warnings
import json
import os
from typing import List, Tuple, Dict, Any

# Import necessary components from scikit-learn for GPR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning

In [1]:
# --- Configuration ---
# Input: the master results file, refreshed once the Week 6 results were in.
MASTER_FILE_PATH = 'bbo_master_w06.csv'
# Output: directory and JSON file that receive the generated queries.
OUTPUT_DIR = 'add_data'
OUTPUT_FILE_PATH = os.path.join(OUTPUT_DIR, 'week07_clean_inputs.json')
# Input dimensionality (D) of every function, keyed by Function ID.
FUNCTION_DIMS: Dict[int, int] = {
    1: 2,
    2: 2,
    3: 3,
    4: 4,
    5: 4,
    6: 5,
    7: 6,
    8: 8,
}

# Log at INFO level with a compact "LEVEL: message" layout.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
# Keep the output readable: hide UserWarning and ConvergenceWarning messages
# (the GPR fit can emit numerical warnings of these kinds).
warnings.filterwarnings(action="ignore", category=UserWarning)
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

In [1]:
def clean_data_for_function(df: pd.DataFrame, func_id: int, D: int) -> Tuple[np.ndarray, np.ndarray, float]:
    """
    Extract the clean training set of one function from the master table.

    Rows whose 'Function ID' equals ``func_id`` are selected, the columns
    X1..XD and Y are coerced to numbers, and every row that contains a
    non-finite value (NaN, +inf or -inf) in any of those columns is dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Master table; must contain 'Function ID', 'X1'..'X{D}' and 'Y'.
    func_id : int
        Value of 'Function ID' to select.
    D : int
        Number of input dimensions; only X1..XD are used.

    Returns
    -------
    X : np.ndarray, shape (n, D), float
    Y : np.ndarray, shape (n, 1), float
    f_best : float
        Largest Y among the clean rows.

    Raises
    ------
    KeyError
        If a required column is absent from ``df``.
    ValueError
        If fewer than two clean rows remain.
    """
    # 1. Columns that matter for this function, inputs first and Y last.
    x_cols = [f'X{i}' for i in range(1, D + 1)]
    required_cols = x_cols + ['Y']

    df_func = df.loc[df['Function ID'] == func_id, required_cols].copy()

    # 2. Coerce to numeric: blanks and stray strings become NaN.
    for col in required_cols:
        df_func[col] = pd.to_numeric(df_func[col], errors='coerce')

    # 3. Keep only fully finite rows. isfinite() rejects NaN *and* +/-inf;
    #    an infinite Y would otherwise become f_best, and an infinite X
    #    would make the downstream model fit fail.
    values = df_func.to_numpy(dtype=float)
    values = values[np.isfinite(values).all(axis=1)]

    X = values[:, :D]
    Y = values[:, D:]  # single Y column, kept 2-D as (n, 1)

    if X.shape[0] < 2:
        logging.error(f"Function F{func_id}: Not enough clean data points ({X.shape[0]}) to train GPR.")
        raise ValueError("Insufficient data points after cleaning.")

    # Best (largest) observed output so far.
    f_best = float(Y.max())

    logging.info(f"Function F{func_id}: Loaded {X.shape[0]} clean points. Dimension D={D} enforced. f_best={f_best:.4f}")

    return X, Y, f_best

In [1]:
def expected_improvement(X_candidate: np.ndarray, gpr: GaussianProcessRegressor, f_best: float, xi: float = 0.01) -> np.ndarray:
    """
    Negative Expected Improvement (EI) of one or more candidate points.

    The sign is flipped because the acquisition function is maximised by
    handing this function to a minimiser.

    Parameters
    ----------
    X_candidate : np.ndarray
        A single point as a 1-D array of length D, or a batch of points as
        a 2-D array of shape (n, D).
    gpr : GaussianProcessRegressor
        Fitted model; only ``predict(X, return_std=True)`` is called.
    f_best : float
        Incumbent best value, expressed on the same scale as the model's
        predictions.
    xi : float, default 0.01
        Margin subtracted from the predicted improvement.

    Returns
    -------
    np.ndarray, shape (n,)
        ``-EI`` for each candidate (shape ``(1,)`` for a single point).
        Candidates whose predictive standard deviation is below 1e-10 get
        an EI of exactly 0.
    """
    # A 1-D point becomes a (1, D) row; a 2-D batch is passed through as is.
    X_candidate = np.atleast_2d(np.asarray(X_candidate, dtype=float))

    mu, sigma = gpr.predict(X_candidate, return_std=True)
    mu = np.asarray(mu, dtype=float).reshape(-1)
    sigma = np.asarray(sigma, dtype=float).reshape(-1)

    imp = mu - f_best - xi

    # Divide by a placeholder of 1.0 where the variance has collapsed, so no
    # divide-by-zero / invalid-value warning (or inf/NaN) is ever produced;
    # those entries are overwritten with 0 below.
    degenerate = sigma < 1e-10
    safe_sigma = np.where(degenerate, 1.0, sigma)

    Z = imp / safe_sigma
    ei = imp * norm.cdf(Z) + sigma * norm.pdf(Z)
    # EI is 0 where the variance is extremely small (i.e., already sampled).
    ei = np.where(degenerate, 0.0, ei)

    return -ei

In [1]:
def propose_next_query(gpr: GaussianProcessRegressor, f_best: float, D: int, num_restarts: int = 30) -> List[float]:
    """
    Search the unit hypercube for the point with the highest Expected Improvement.

    The negative EI is minimised with L-BFGS-B from ``num_restarts`` uniformly
    random starting points in [0, 1]^D, and the best point found over all
    restarts is returned as a list of D floats, each clamped to [0.0, 1.0].
    If no restart produces a usable result, a uniformly random point is
    returned instead.
    """
    # Box constraints shared by every restart.
    unit_box = Bounds(0.0, 1.0)

    top_ei = -np.inf
    top_point = None

    for _restart in range(num_restarts):
        # Random starting point inside the box.
        start = np.random.uniform(0.0, 1.0, size=D)

        outcome = minimize(
            expected_improvement,
            start,
            args=(gpr, f_best, 0.01),
            method='L-BFGS-B',
            bounds=unit_box,
        )

        # The objective is -EI, so flip the sign back before comparing.
        found_ei = -outcome.fun
        if found_ei > top_ei:
            top_ei = found_ei
            # Plain Python list rather than a numpy array.
            top_point = outcome.x.tolist()

    logging.info(f"Optimal EI found: {top_ei:.4f}")

    if top_point is None:
        logging.warning("Optimisation failed. Returning random query.")
        top_point = np.random.uniform(0.0, 1.0, size=D).tolist()

    # Defensive clamp into [0.0, 1.0]; the bounded optimiser should already
    # keep every coordinate inside the box.
    clamped = []
    for coord in top_point:
        clamped.append(max(0.0, min(1.0, coord)))
    return clamped

In [1]:
def _pad_and_round(query: List[float], width: int = 8, ndigits: int = 6) -> List[float]:
    """Right-pad ``query`` with 0.0 up to ``width`` entries and round every value to ``ndigits`` decimals."""
    padded = list(query) + [0.0] * (width - len(query))
    return [round(val, ndigits) for val in padded]


def run_optimisation() -> None:
    """
    Main function to run the BBO query generation process.

    Reads MASTER_FILE_PATH and, for every function in FUNCTION_DIMS, fits a
    GPR on the cleaned data and proposes the next query by maximising
    Expected Improvement. The queries, each padded to 8 values and rounded
    to 6 decimals, are written as a JSON list to OUTPUT_FILE_PATH.

    A failure while processing one function is logged and replaced by a
    uniformly random query, so the remaining functions are still processed.
    Returns None; if the master file is missing, nothing is written.
    """
    if not os.path.exists(MASTER_FILE_PATH):
        logging.error(f"Master file not found at: {MASTER_FILE_PATH}")
        return

    df_master = pd.read_csv(MASTER_FILE_PATH)

    all_queries: List[List[float]] = []
    fallback_ids: List[int] = []  # functions that ended up with a random query

    for func_id, D in FUNCTION_DIMS.items():
        logging.info(f"\n--- Processing Function F{func_id} ({D}D) ---")

        try:
            # 1. Clean and filter data
            X, Y, f_best_raw = clean_data_for_function(df_master, func_id, D)

            # 2. Scale Y data (standard practice for GPR); the incumbent must
            #    be expressed on the same scale as the model's targets.
            scaler_y = StandardScaler()
            Y_scaled = scaler_y.fit_transform(Y)
            f_best_scaled = scaler_y.transform([[f_best_raw]])[0, 0]

            # 3. Define the kernel (fixed for version compatibility and robustness)
            kernel = (
                C(1.0, constant_value_bounds=(1e-3, 1e3)) * RBF(length_scale=np.ones(D), length_scale_bounds=(1e-3, 1e3)) +
                WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-7, 1e-1))
            )

            # 4. Train the GPR model (alpha=0.0: noise is left to the WhiteKernel term)
            gpr = GaussianProcessRegressor(
                kernel=kernel,
                alpha=0.0,
                n_restarts_optimizer=15
            )
            gpr.fit(X, Y_scaled.flatten())

            logging.info(f"GPR Model Trained. Log-Marginal-Likelihood: {gpr.log_marginal_likelihood_value_:.4f}")

            # 5. Propose next query using Expected Improvement
            next_X = propose_next_query(gpr, f_best_scaled, D, num_restarts=30)

            # 6. Pad to the submission width and round to 6 decimal places
            padded_query = _pad_and_round(next_X)

            logging.info(f"F{func_id} Proposed Query (D={D}): {padded_query[:D]} (Rounded to 6 DP)")

        except Exception as e:
            # Deliberate best effort: one failing function must not abort the
            # whole batch, so a random query is submitted in its place.
            logging.error(f"FATAL ERROR processing Function F{func_id}: {e}. Returning random query.")
            padded_query = _pad_and_round(np.random.uniform(0.0, 1.0, size=D).tolist())
            fallback_ids.append(func_id)

        all_queries.append(padded_query)

    # --- Final Output Generation (JSON) ---
    try:
        # exist_ok avoids the check-then-create race; keeping it inside the
        # try means a directory problem is reported like any other save failure.
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        # Use simple json.dump, which correctly writes Python floats as JSON numbers
        with open(OUTPUT_FILE_PATH, 'w') as f:
            json.dump(all_queries, f, indent=2)

        logging.info("\n--------------------------------------------------")
        logging.info(f"SUCCESS: Generated {len(all_queries)} FINAL queries for Week 7.")
        logging.info(f"File saved to: '{OUTPUT_FILE_PATH}'. This is the file to submit.")
        logging.info("--------------------------------------------------")

        if fallback_ids:
            # Make it obvious that part of the file is random, not model-driven.
            fallback_names = ", ".join(f"F{fid}" for fid in fallback_ids)
            logging.warning(f"Random fallback queries were used for: {fallback_names}")

    except Exception as e:
        logging.error(f"Failed to save JSON file: {e}")

In [1]:
# Entry point: run the whole query-generation pipeline when this code is
# executed directly rather than imported as a module.
if __name__ == '__main__':
    run_optimisation()

INFO: 
--- Processing Function F1 (2D) ---
INFO: Function F1: Loaded 14 clean points. Dimension D=2 enforced. f_best=0.1744
INFO: GPR Model Trained. Log-Marginal-Likelihood: -16.3722
INFO: Optimal EI found: 0.0002
INFO: F1 Proposed Query (D=2): [0.736739, 1.0] (Rounded to 6 DP)
INFO: 
--- Processing Function F2 (2D) ---
INFO: Function F2: Loaded 13 clean points. Dimension D=2 enforced. f_best=0.6660
INFO: GPR Model Trained. Log-Marginal-Likelihood: -12.1975
INFO: Optimal EI found: 0.0504
INFO: F2 Proposed Query (D=2): [0.702802, 0.967778] (Rounded to 6 DP)
INFO: 
--- Processing Function F3 (3D) ---
INFO: Function F3: Loaded 12 clean points. Dimension D=3 enforced. f_best=-0.0200
INFO: GPR Model Trained. Log-Marginal-Likelihood: -10.6301
INFO: Optimal EI found: 0.2364
INFO: F3 Proposed Query (D=3): [0.051259, 0.0, 0.61341] (Rounded to 6 DP)
INFO: 
--- Processing Function F4 (4D) ---
INFO: Function F4: Loaded 13 clean points. Dimension D=4 enforced. f_best=-2.9741
INFO: GPR Model Trained