In [2]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy.optimize import minimize
from scipy.optimize import Bounds
import logging
import warnings
import json
import os
from typing import List, Tuple, Dict, Any

# Import necessary components from scikit-learn for GPR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, Matern, WhiteKernel
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning

# Configure logging
# Root logger: emit INFO and above, formatted as "<LEVEL>: <message>".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# Suppress every warning of category ConvergenceWarning for the whole process.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Suppress any warning whose text STARTS with the phrase below (`message` is a
# regular expression matched against the beginning of the warning text).
# NOTE(review): confirm this phrase matches the wording of the warning that is
# actually emitted; if that warning is a ConvergenceWarning, the filter above
# already covers it and this line is redundant.
warnings.filterwarnings("ignore", message="The optimal value found for the log-marginal-likelihood is close to the specified upper bound")

# --- CONFIGURATION FOR WEEK 08 ---

# Files: the aggregated master CSV that is read, and where the proposed
# queries for the Week 08 submission are written.
MASTER_FILE_PATH = 'bbo_master_w08.csv'
OUTPUTS_DIR = 'add_data'
OUTPUT_FILENAME = 'week08_clean_inputs.json'

# Problem set: function id -> number of input dimensions.
NUM_FUNCTIONS = 8
FUNCTION_DIMS = {
    1: 2,
    2: 2,
    3: 3,
    4: 4,
    5: 4,
    6: 5,
    7: 6,
    8: 8,
}

# Acquisition settings.
N_RESTARTS = 10  # random restarts used when optimizing the acquisition function
XI = 0.01  # exploration trade-off parameter of Expected Improvement

# Search box: every function is constrained to the [0, 1] hypercube.
LOWER_BOUND = 0.0
UPPER_BOUND = 1.0

def expected_improvement(X: np.ndarray, gp_model: "GaussianProcessRegressor", f_best: float, xi: float = 0.01) -> np.ndarray:
    """Return the NEGATIVE Expected Improvement (maximisation form) at ``X``.

    For each query point, with posterior mean ``mu`` and standard deviation
    ``sigma`` taken from ``gp_model.predict(X, return_std=True)``::

        EI = (mu - f_best - xi) * Phi(Z) + sigma * phi(Z)
        Z  = (mu - f_best - xi) / sigma

    where ``Phi``/``phi`` are the standard normal CDF/PDF. Points with
    ``sigma == 0`` get ``EI = 0``.

    Args:
        X: Query point(s). A 1-D input is treated as a single point and
            reshaped to ``(1, n_features)``.
        gp_model: Fitted model exposing ``predict(X, return_std=True)``.
        f_best: Best objective value observed so far, on the same scale as
            the model's predictions.
        xi: Exploration trade-off subtracted from the improvement.

    Returns:
        1-D array with one value per query point holding ``-EI``; the sign is
        flipped so that a minimiser can be used to maximise EI.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(1, -1)

    mu, sigma = gp_model.predict(X, return_std=True)
    # Flatten both outputs: a column-vector mean of shape (n, 1) combined with
    # an (n,) std would otherwise broadcast into an (n, n) matrix.
    mu = np.asarray(mu, dtype=float).ravel()
    sigma = np.asarray(sigma, dtype=float).ravel()

    improvement = mu - f_best - xi

    # Evaluate the formula only where the model is uncertain. This keeps the
    # "EI is zero when sigma is zero" rule without ever computing x/0 or 0/0.
    ei = np.zeros_like(improvement)
    uncertain = sigma > 0.0
    z = improvement[uncertain] / sigma[uncertain]
    ei[uncertain] = improvement[uncertain] * norm.cdf(z) + sigma[uncertain] * norm.pdf(z)

    # Negated so that minimising the result maximises EI.
    return -ei

def _optimize_acquisition(gp: "GaussianProcessRegressor", x_scaler: "StandardScaler", f_best: float, dim: int) -> "np.ndarray | None":
    """Search the [LOWER_BOUND, UPPER_BOUND]^dim box for the point of highest EI.

    Runs N_RESTARTS bounded L-BFGS-B minimisations of the negative EI, each
    from a uniformly random start, and keeps the run with the lowest objective.

    Returns:
        The best point in unscaled coordinates, or None when no run produced
        an objective value that compares below infinity (e.g. all NaN).
    """
    def acquisition_wrapper(x_unscaled):
        # The optimizer works in the unscaled box; the GP was fitted on
        # standardized inputs, so convert before evaluating.
        x_scaled = x_scaler.transform(x_unscaled.reshape(1, -1))
        neg_ei = expected_improvement(x_scaled, gp, f_best, XI)
        # Hand the optimizer a plain scalar rather than a 1-element array.
        return float(np.ravel(neg_ei)[0])

    bounds_unscaled = Bounds(np.full(dim, LOWER_BOUND), np.full(dim, UPPER_BOUND))

    best_value = np.inf
    best_x = None
    for _ in range(N_RESTARTS):
        x0_unscaled = np.random.uniform(LOWER_BOUND, UPPER_BOUND, size=dim)
        res = minimize(acquisition_wrapper, x0=x0_unscaled, bounds=bounds_unscaled, method='L-BFGS-B')
        if res.fun < best_value:
            best_value = res.fun
            best_x = res.x
    return best_x


def _write_queries(final_queries: List[List[float]]) -> None:
    """Write the queries as JSON to OUTPUTS_DIR/OUTPUT_FILENAME, logging the outcome."""
    output_path = os.path.join(OUTPUTS_DIR, OUTPUT_FILENAME)

    try:
        # exist_ok avoids the check-then-create race, and creating the
        # directory inside the try means a failure is logged, not raised.
        os.makedirs(OUTPUTS_DIR, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(final_queries, f, indent=4)
    except (OSError, TypeError, ValueError) as e:
        logging.error(f"Failed to write output JSON file: {e}")
    else:
        logging.info("\n" + "-"*50)
        logging.info(f"SUCCESS: Generated {len(final_queries)} FINAL queries for Week 8.")
        logging.info(f"File saved to: {output_path}")
        logging.info(f"This is your submission file: {OUTPUT_FILENAME}")
        logging.info("-"*50)


def train_and_query_bbo() -> List[List[float]]:
    """Fit one GP per function and propose the next query point for each.

    Reads MASTER_FILE_PATH, and for every function id in 1..NUM_FUNCTIONS:
    selects its rows, drops rows with a missing 'Y' or missing input column,
    standardizes the inputs, fits a Gaussian Process, and maximises Expected
    Improvement over the [LOWER_BOUND, UPPER_BOUND] box. The resulting list is
    written as JSON to OUTPUTS_DIR/OUTPUT_FILENAME.

    Returns:
        One entry per function, in function-id order. Each entry is a list of
        ``dim`` floats rounded to 6 decimals, or a list of NaN when the
        function has no complete data rows. An empty list is returned when the
        master file does not exist (nothing is written in that case).
    """
    if not os.path.exists(MASTER_FILE_PATH):
        logging.critical(f"FATAL ERROR: Master file '{MASTER_FILE_PATH}' not found. Please run create_bbo_master_w08.py first.")
        return []

    df_master = pd.read_csv(MASTER_FILE_PATH)
    logging.info(f"Loaded master data with {len(df_master)} total rows.")

    final_queries = []

    # Scaled Matern(nu=2.5) kernel plus a WhiteKernel noise term. The same
    # kernel specification is passed to a fresh regressor for every function.
    kernel = C(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-3, 1e3), nu=2.5) + \
             WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-5, 1e-1))

    # --- Iterate through all functions ---
    for f_id in range(1, NUM_FUNCTIONS + 1):
        f_name = f'F{f_id}'
        dim = FUNCTION_DIMS[f_id]

        # 1. Prepare Data: keep only rows of this function that have Y and
        #    all of its first `dim` input columns.
        x_cols = [f'X{d+1}' for d in range(dim)]
        df_f = df_master[df_master['Function ID'] == f_id].dropna(subset=['Y', *x_cols])

        X_raw = df_f[x_cols].values
        # Keep the target 1-D so the GP's predicted mean has the same shape as
        # its predicted std (a column-vector target can yield a 2-D mean on
        # some scikit-learn versions).
        Y_raw = df_f['Y'].values

        if len(X_raw) < 1:
            logging.warning(f"Skipping {f_name}: No complete data points found.")
            # NOTE: json.dump writes these placeholders as the token NaN,
            # which is not part of the JSON standard.
            final_queries.append([np.nan] * dim)
            continue

        # Best observed value so far; this is a maximisation problem.
        f_best_raw = Y_raw.max()
        logging.info(f"\n--- Processing Function {f_name} ({dim}D) ---")
        logging.info(f"Function {f_name}: Loaded {len(X_raw)} clean points. Dimension D={dim} enforced. f_best={f_best_raw:.4f}")

        # 2. Scale Data
        x_scaler = StandardScaler()
        X_scaled = x_scaler.fit_transform(X_raw)

        # 3. Train Gaussian Process Model (normalize_y=True normalizes the
        #    targets internally during fitting).
        gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, normalize_y=True, random_state=42)
        gp.fit(X_scaled, Y_raw)
        logging.info(f"GPR Model Trained. Log-Marginal-Likelihood: {gp.log_marginal_likelihood_value_:.4f}")

        # 4. Find Next Query Point by maximizing the EI acquisition function
        best_x_unscaled = _optimize_acquisition(gp, x_scaler, f_best_raw, dim)

        if best_x_unscaled is not None:
            # Round to 6 decimal places for clean submission
            query_point_clean = [round(x, 6) for x in best_x_unscaled.tolist()]
            final_queries.append(query_point_clean)
            logging.info(f"F{f_id} Proposed Query (D={dim}): {query_point_clean} (Rounded to 6 DP)")
        else:
            # Fallback to a random point if optimization fails
            random_point = np.random.uniform(LOWER_BOUND, UPPER_BOUND, size=dim).tolist()
            final_queries.append([round(x, 6) for x in random_point])
            logging.warning(f"F{f_id}: EI optimization failed. Using random fallback: {final_queries[-1]}")

    # 5. Output Queries to JSON
    _write_queries(final_queries)

    return final_queries

if __name__ == '__main__':
    # Script entry point: run the full pipeline, then echo the proposals.
    generated_queries = train_and_query_bbo()
    if generated_queries:
        print("\n--- Queries for Week 08 Submission (week08_clean_inputs.json) ---")
        # One "F<k>: [...]" line per function, numbered from F1 in list order.
        print("\n".join(f"F{i+1}: {q}" for i, q in enumerate(generated_queries)))

INFO: Loaded master data with 136 total rows.
INFO: 
--- Processing Function F1 (2D) ---
INFO: Function F1: Loaded 16 clean points. Dimension D=2 enforced. f_best=0.1744
INFO: GPR Model Trained. Log-Marginal-Likelihood: -21.9488
INFO: F1 Proposed Query (D=2): [0.662067, 0.47948] (Rounded to 6 DP)
INFO: 
--- Processing Function F2 (2D) ---
INFO: Function F2: Loaded 15 clean points. Dimension D=2 enforced. f_best=0.6660
INFO: GPR Model Trained. Log-Marginal-Likelihood: -15.8999
INFO: F2 Proposed Query (D=2): [0.769782, 1.0] (Rounded to 6 DP)
INFO: 
--- Processing Function F3 (3D) ---
INFO: Function F3: Loaded 14 clean points. Dimension D=3 enforced. f_best=-0.0200
INFO: GPR Model Trained. Log-Marginal-Likelihood: -18.7991
INFO: F3 Proposed Query (D=3): [0.664275, 0.600999, 0.535155] (Rounded to 6 DP)
INFO: 
--- Processing Function F4 (4D) ---
INFO: Function F4: Loaded 15 clean points. Dimension D=4 enforced. f_best=-2.9741
INFO: GPR Model Trained. Log-Marginal-Likelihood: -14.8504
INFO: 


--- Queries for Week 08 Submission (week08_clean_inputs.json) ---
F1: [0.662067, 0.47948]
F2: [0.769782, 1.0]
F3: [0.664275, 0.600999, 0.535155]
F4: [0.428378, 0.369407, 0.345557, 0.399058]
F5: [1.0, 1.0, 1.0, 1.0]
F6: [0.510625, 0.33055, 0.518944, 0.77119, 0.154218]
F7: [0.967968, 0.468373, 0.152233, 0.998512, 0.040256, 0.912553]
F8: [0.128465, 0.244099, 0.0, 0.310018, 0.535545, 0.241708, 0.363253, 0.467537]
