In [145]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy.optimize import minimize
import logging
import warnings
import json
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

In [146]:
# Suppress convergence warnings from GPR kernel optimization for cleaner output
warnings.filterwarnings("ignore", category=UserWarning)

In [147]:
# Set up logging for informative output
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

In [148]:
# --- Configuration ---
DATA_FILE = 'bbo_master_w03.csv'
OUTPUT_QUERY_FILE = 'week03_queries.json' 
RANDOM_STATE = 42
N_FUNCTIONS = 8

# Set random seed for reproducibility
np.random.seed(RANDOM_STATE)

# --- FUNCTION CONFIGURATION (As requested, using D=5, D=6, D=7) ---
FUNCTION_CONFIG = {
    1: {'D': 2, 'cols': ['X1', 'X2'], 'bounds': [(0, 1), (0, 1)]},
    2: {'D': 2, 'cols': ['X1', 'X2'], 'bounds': [(0, 1), (0, 1)]},
    3: {'D': 3, 'cols': ['X1', 'X2', 'X3'], 'bounds': [(0, 1), (0, 1), (0, 1)]},
    4: {'D': 4, 'cols': ['X1', 'X2', 'X3', 'X4'], 'bounds': [(0, 1)] * 4},
    5: {'D': 4, 'cols': ['X1', 'X2', 'X3', 'X4'], 'bounds': [(0, 1)] * 4}, 
    6: {'D': 5, 'cols': ['X1', 'X2', 'X3', 'X4', 'X5'], 'bounds': [(0, 1)] * 5}, 
    7: {'D': 6, 'cols': ['X1', 'X2', 'X3', 'X4', 'X5', 'X6'], 'bounds': [(0, 1)] * 6}, 
    8: {'D': 8, 'cols': ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8'], 'bounds': [(0, 1)] * 8},
}

In [149]:
# --- Data Loading Function (Must be defined once, outside the loop) ---
def get_function_data(df: pd.DataFrame, func_id: int):
    """Filters the dataframe to get clean X and Y data for a specific function ID."""
    if func_id not in FUNCTION_CONFIG:
        raise ValueError(f"Function ID {func_id} not found in configuration.")

    config = FUNCTION_CONFIG[func_id]
    feature_cols = config['cols']
    
    func_df = df[df['Function ID'] == func_id].copy()
    func_df.dropna(subset=feature_cols, how='any', inplace=True)
    
    if func_df.empty:
        raise ValueError(f"Function ID {func_id} has no valid data after NaN removal.")

    X = func_df[feature_cols].values
    Y = func_df['Y'].values.reshape(-1, 1) 
    
    return X, Y, config

In [150]:
# --- Core Bayesian Optimisation Functions ---

def expected_improvement(X, gpr, best_y):
    """
    Calculates the Expected Improvement (EI) for a given point X for MAXIMISATION.
    """
    X = np.atleast_2d(X)
    
    # Predict the mean (mu) and standard deviation (sigma)
    mu, sigma = gpr.predict(X, return_std=True)
    
    # Ensure mu and sigma are 2D column vectors (N, 1) for correct broadcasting
    mu = mu.reshape(-1, 1)
    sigma = sigma.reshape(-1, 1)
    
    # We seek improvement *above* the current maximum (best_y)
    with np.errstate(divide='ignore'):
        # The numerator is (mu - best_y). We want mu to be LARGER than best_y.
        Z = (mu - best_y) / sigma
        
        # EI formula for MAXIMISATION: sigma * (Z * Phi(Z) + phi(Z))
        # Note: The formula structure is the same, but Z is inverted.
        ei = sigma * (Z * norm.cdf(Z) + norm.pdf(Z))
    
    # If sigma is zero (e.g., at an observed point), EI must be zero
    ei[sigma == 0.0] = 0.0
    
    return ei.flatten()

In [151]:
def propose_new_query(acquisition_func, gpr, best_y, config, n_restarts=50, n_random=1000):
    """
    Finds the next point that maximizes the acquisition function using a hybrid 
    multi-start L-BFGS-B and random search strategy.
    """
    
    # Minimisation objective (negative of the acquisition function)
    # Since we want to MAXIMISE EI, we MINIMISE -EI.
    def min_obj(X):
        return -acquisition_func(X, gpr, best_y)

    bounds_list = config['bounds']
    d_dim = config['D']
    
    best_X = None
    best_EI = -np.inf
    
    # 1. Multi-start L-BFGS-B optimization
    for _ in range(n_restarts):
        # Start optimization from a random point
        X0 = np.random.uniform(np.array(bounds_list)[:, 0], np.array(bounds_list)[:, 1], size=d_dim)
        
        res = minimize(fun=min_obj,
                       x0=X0,
                       bounds=bounds_list,
                       method='L-BFGS-B')
        
        # Update best result from local optimization
        if res.success and -res.fun > best_EI:
            best_EI = -res.fun
            best_X = res.x
            
    # 2. Global Random Search (for better exploration)
    X_random = np.random.uniform(np.array(bounds_list)[:, 0], np.array(bounds_list)[:, 1], size=(n_random, d_dim))
    EI_random = acquisition_func(X_random, gpr, best_y)
    
    # Compare local optima with the best random point
    if EI_random.max() > best_EI:
        best_EI = EI_random.max()
        best_X = X_random[np.argmax(EI_random)]
            
    # Fallback: If no successful optimization or better random point found, return a random query
    if best_X is None:
        logging.warning(f"Optimization failed for D={d_dim}. Falling back to random query.")
        best_X = np.random.uniform(np.array(bounds_list)[:, 0], np.array(bounds_list)[:, 1], size=d_dim)
    
    return best_X

In [152]:
def generate_queries(df):
    """Generates one new query point for each function using Bayesian Optimisation."""
    
    new_queries_list = []
    
    logging.info("--- Starting Query Generation for Week 03 (MAXIMISATION) ---")
    
    for func_id in range(1, N_FUNCTIONS + 1):
        
        try:
            # Use the dedicated function for robust data retrieval
            X_data, Y_data, config = get_function_data(df, func_id)
        except ValueError as e:
            logging.error(f"F{func_id}: Failed to load data: {e}")
            continue

        d_dim = config['D']
        N_train = len(X_data)
        
        # *** CRITICAL: Get the MAX Y for MAXIMISATION ***
        best_y = Y_data.max() 

        logging.info(f"F{func_id}: Training on {N_train} data points. Dimensionality (D): {d_dim}.")

        # Define the kernel and GPR model
        kernel = C(1.0, (1e-3, 1e3)) * RBF(np.ones(d_dim), (1e-2, 1e2))
        
        gpr = GaussianProcessRegressor(
            kernel=kernel, 
            alpha=1e-6, 
            n_restarts_optimizer=10, 
            random_state=RANDOM_STATE
        )
        gpr.fit(X_data, Y_data)
        
        # Propose a new query point using the hybrid strategy
        new_X = propose_new_query(expected_improvement, gpr, best_y, config)
        
        # Handle the strict upper boundary X < 1
        new_X_clamped = np.where(new_X >= 1.0, 0.999999, new_X)
        
        # --- Update for 6 Decimal Places ---
        # Round the coordinates to 6 decimal places for final output
        new_X_rounded = np.round(new_X_clamped, 6)
        
        # Construct the JSON output format
        new_queries_list.append([float(x) for x in new_X_rounded])
        
        # Logging the result for confirmation (using :.6f for display)
        query_str = ', '.join([f'{val:.6f}' for val in new_X_rounded])
        logging.info(f"F{func_id}: New query calculated: [{query_str}]... | Best Y (Max): {best_y:.4f}")
        
    logging.info("--- Query Generation Complete ---")
    
    # Print the full query output to console (as requested)
    logging.info("\n--- FULL WEEK 03 QUERY OUTPUT (MAXIMISATION) ---")
    # json.dump will now use the precision of the rounded floats in new_queries_list
    logging.info(json.dumps(new_queries_list, indent=2))
    logging.info("----------------------------------\n")
    
    # Save the queries as a JSON file
    with open(OUTPUT_QUERY_FILE, 'w') as f:
        json.dump(new_queries_list, f, indent=2)
        
    logging.info(f"Successfully generated {len(new_queries_list)} new queries. Saved to {OUTPUT_QUERY_FILE}")


In [153]:
def main():
    """Main function to load data and generate new queries."""
    try:
        df = pd.read_csv(DATA_FILE)
        logging.info(f"Successfully loaded data from {DATA_FILE}. Total rows: {len(df)}")
        
        generate_queries(df)
        
    except FileNotFoundError:
        logging.error(f"ERROR: The data file '{DATA_FILE}' was not found. Please ensure it is uploaded.")
    except Exception as e:
        logging.error(f"An unexpected error occurred during execution: {e}")

In [154]:
def main():
    """Main function to load data and generate new queries."""
    try:
        df = pd.read_csv(DATA_FILE)
        logging.info(f"Successfully loaded data from {DATA_FILE}. Total rows: {len(df)}")
        
        generate_queries(df)
        
    except FileNotFoundError:
        logging.error(f"ERROR: The data file '{DATA_FILE}' was not found. Please ensure it is uploaded.")
    except Exception as e:
        logging.error(f"An unexpected error occurred during execution: {e}")


In [155]:
if __name__ == "__main__":
    main()

INFO: Successfully loaded data from bbo_master_w03.csv. Total rows: 96
INFO: --- Starting Query Generation for Week 03 (MAXIMISATION) ---
INFO: F1: Training on 12 data points. Dimensionality (D): 2.
INFO: F1: New query calculated: [0.625357, 0.708073]... | Best Y (Max): 0.0690
INFO: F2: Training on 12 data points. Dimensionality (D): 2.
INFO: F2: New query calculated: [0.698199, 0.000000]... | Best Y (Max): 0.6112
INFO: F3: Training on 12 data points. Dimensionality (D): 3.
INFO: F3: New query calculated: [0.039280, 0.000000, 0.649545]... | Best Y (Max): -0.0200
INFO: F4: Training on 12 data points. Dimensionality (D): 4.
INFO: F4: New query calculated: [0.000000, 0.000000, 0.000000, 0.000000]... | Best Y (Max): -2.9741
INFO: F5: Training on 12 data points. Dimensionality (D): 4.
INFO: F5: New query calculated: [0.216684, 0.827187, 0.670580, 0.894390]... | Best Y (Max): 1091.3153
INFO: F6: Training on 12 data points. Dimensionality (D): 5.
INFO: F6: New query calculated: [0.517761, 0.0