In [112]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy.optimize import minimize
import logging
import warnings
import json
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

In [113]:
# Suppress warnings for cleaner output. The intent is to hide the convergence
# warnings raised during GPR kernel optimization, but note that this filter
# matches the whole UserWarning category, so every UserWarning raised by code
# run after this cell is silenced as well.
warnings.filterwarnings("ignore", category=UserWarning)

In [114]:
# Set up logging for informative output: the root logger emits INFO and above,
# and each record is rendered as "<LEVELNAME>: <message>".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

In [115]:
# --- Run-wide settings ---
DATA_FILE = 'bbo_master_w03.csv'            # CSV holding the observations of every function
OUTPUT_QUERY_FILE = 'week03_queries.json'   # JSON file the proposed queries are written to
N_FUNCTIONS = 8                             # function IDs are 1..N_FUNCTIONS
RANDOM_STATE = 42

# Seed NumPy's global generator so random restarts and samples are repeatable.
np.random.seed(RANDOM_STATE)

# --- Per-function search space ---
# Input dimensionality of each function ID. The inputs of a D-dimensional
# function are the columns X1..XD, each bounded to the interval (0, 1).
# NOTE(review): functions 5, 6 and 7 are configured with D=4, 5 and 6 here;
# confirm these against the columns actually populated in the data file.
_FUNCTION_DIMS = {1: 2, 2: 2, 3: 3, 4: 4, 5: 4, 6: 5, 7: 6, 8: 8}

FUNCTION_CONFIG = {
    func_id: {
        'D': n_dims,
        'cols': [f'X{idx}' for idx in range(1, n_dims + 1)],
        'bounds': [(0, 1)] * n_dims,
    }
    for func_id, n_dims in _FUNCTION_DIMS.items()
}

In [116]:
# --- Data Loading Function (Must be defined once, outside the loop) ---
def get_function_data(df: pd.DataFrame, func_id: int) -> tuple:
    """Return the clean training data for one function ID.

    Rows belonging to ``func_id`` are selected, and any row with a missing
    value in one of the function's feature columns or in the target column
    ``Y`` is discarded. The input frame is never modified.

    Args:
        df: Frame with a 'Function ID' column, a 'Y' column and the feature
            columns listed in FUNCTION_CONFIG for the requested function.
        func_id: Key into FUNCTION_CONFIG.

    Returns:
        ``(X, Y, config)`` where X is an (n, D) array of the feature columns,
        Y is an (n, 1) column array of targets and config is the
        FUNCTION_CONFIG entry for ``func_id``.

    Raises:
        ValueError: If ``func_id`` is not configured, or if no row survives
            the NaN filtering.
    """
    if func_id not in FUNCTION_CONFIG:
        raise ValueError(f"Function ID {func_id} not found in configuration.")

    config = FUNCTION_CONFIG[func_id]
    feature_cols = config['cols']

    # A row is only usable for training when every feature AND the target are
    # present: a NaN target would otherwise poison Y.min() and the GP fit.
    # Building a new list keeps the configured column list untouched.
    required_cols = list(feature_cols) + ['Y']
    func_df = df.loc[df['Function ID'] == func_id].dropna(subset=required_cols, how='any')

    if func_df.empty:
        raise ValueError(f"Function ID {func_id} has no valid data after NaN removal.")

    X = func_df[feature_cols].to_numpy()
    Y = func_df['Y'].to_numpy().reshape(-1, 1)

    return X, Y, config

In [117]:
# --- Core Bayesian Optimisation Functions ---

def expected_improvement(X, gpr, best_y):
    """
    Calculates the Expected Improvement (EI) at the point(s) X for MINIMISATION.

    Args:
        X: A single point (1-D) or a batch of points (2-D, one point per row).
        gpr: Fitted surrogate model exposing ``predict(X, return_std=True)``,
            which returns the predictive mean and standard deviation.
        best_y: Lowest objective value observed so far (the incumbent).

    Returns:
        1-D array with one EI value per point. Points whose predictive
        standard deviation is exactly zero get an EI of 0.
    """
    X = np.atleast_2d(X)

    # Predict the mean (mu) and standard deviation (sigma)
    mu, sigma = gpr.predict(X, return_std=True)

    # Column vectors (N, 1) so both arrays line up regardless of the shape
    # the model returns them in.
    mu = np.asarray(mu).reshape(-1, 1)
    sigma = np.asarray(sigma).reshape(-1, 1)

    # Start from EI = 0 everywhere and fill in only the points that have
    # predictive uncertainty. Skipping sigma == 0 avoids the 0/0 and inf*0
    # evaluations (and their RuntimeWarnings) that dividing first would cause.
    ei = np.zeros(sigma.shape, dtype=float)
    uncertain = sigma != 0.0

    if np.any(uncertain):
        sigma_u = sigma[uncertain]
        # We seek improvement *below* the current minimum (best_y)
        Z = (best_y - mu[uncertain]) / sigma_u
        # EI formula for minimisation: sigma * (Z * Phi(Z) + phi(Z))
        ei[uncertain] = sigma_u * (Z * norm.cdf(Z) + norm.pdf(Z))

    return ei.flatten()

In [118]:
def propose_new_query(acquisition_func, gpr, best_y, config, n_restarts=50, n_random=1000):
    """
    Finds the next point that maximizes the acquisition function using a hybrid
    multi-start L-BFGS-B and random search strategy.

    Args:
        acquisition_func: Callable ``f(X, gpr, best_y)`` returning one
            acquisition value per row of X (higher is better).
        gpr: Surrogate model, passed through to ``acquisition_func``.
        best_y: Incumbent objective value, passed through to ``acquisition_func``.
        config: Mapping with 'bounds' (list of (low, high) pairs, one per
            dimension) and 'D' (number of dimensions).
        n_restarts: Number of random starting points for L-BFGS-B. May be 0.
        n_random: Number of uniformly sampled candidates. May be 0.

    Returns:
        1-D array of length D with the best point found. If neither stage
        produced a candidate, a uniformly random point inside the bounds is
        returned and a warning is logged.
    """
    bounds_list = config['bounds']
    d_dim = config['D']

    # The bound vectors do not change between restarts, so build them once.
    bounds_arr = np.asarray(bounds_list)
    lower, upper = bounds_arr[:, 0], bounds_arr[:, 1]

    # Minimisation objective (negative of the acquisition function).
    # scipy.optimize.minimize expects a scalar, while the acquisition function
    # returns an array with one entry per point, so unwrap the single value.
    def min_obj(x):
        return -float(np.ravel(acquisition_func(x, gpr, best_y))[0])

    best_X = None
    best_EI = -np.inf

    # 1. Multi-start L-BFGS-B optimization
    for _ in range(n_restarts):
        # Start optimization from a random point
        X0 = np.random.uniform(lower, upper, size=d_dim)

        res = minimize(fun=min_obj,
                       x0=X0,
                       bounds=bounds_list,
                       method='L-BFGS-B')

        # Update best result from local optimization
        if res.success and -res.fun > best_EI:
            best_EI = -res.fun
            best_X = res.x

    # 2. Global Random Search (for better exploration)
    # Skipped when no random candidates are requested: taking the maximum of
    # an empty array would raise.
    if n_random > 0:
        X_random = np.random.uniform(lower, upper, size=(n_random, d_dim))
        EI_random = np.ravel(acquisition_func(X_random, gpr, best_y))

        # Compare local optima with the best random point
        top = int(np.argmax(EI_random))
        if EI_random[top] > best_EI:
            best_EI = EI_random[top]
            best_X = X_random[top]

    # Fallback: If no successful optimization or better random point found, return a random query
    if best_X is None:
        logging.warning(f"Optimization failed for D={d_dim}. Falling back to random query.")
        best_X = np.random.uniform(lower, upper, size=d_dim)

    return best_X

In [119]:
def generate_queries(df):
    """Propose one new query point per function and save them all as JSON.

    For every function ID from 1 to N_FUNCTIONS the function's data is
    loaded, a Gaussian-process surrogate is fitted to it, and the point
    maximising Expected Improvement is proposed. Functions whose data cannot
    be loaded are logged and skipped, so the saved list may be shorter than
    N_FUNCTIONS. The result is a list of coordinate lists written to
    OUTPUT_QUERY_FILE and echoed to the log.

    Minimisation is assumed: the incumbent is the smallest observed Y.
    """
    proposals = []

    logging.info("--- Starting Query Generation for Week 03 ---")

    for func_id in range(1, N_FUNCTIONS + 1):

        # Load and clean this function's observations; skip it on failure.
        try:
            inputs, targets, config = get_function_data(df, func_id)
        except ValueError as load_err:
            logging.error(f"F{func_id}: Failed to load data: {load_err}")
            continue

        n_dims = config['D']
        n_obs = len(inputs)

        # Lower Y is better, so the incumbent is the minimum.
        incumbent = targets.min()

        logging.info(f"F{func_id}: Training on {n_obs} data points. Dimensionality (D): {n_dims}.")

        # Surrogate: scaled RBF kernel with one length scale per input dimension.
        surrogate = GaussianProcessRegressor(
            kernel=C(1.0, (1e-3, 1e3)) * RBF(np.ones(n_dims), (1e-2, 1e2)),
            alpha=1e-6,
            n_restarts_optimizer=10,
            random_state=RANDOM_STATE,
        )
        surrogate.fit(inputs, targets)

        # Maximise EI with the hybrid local/random search.
        candidate = propose_new_query(expected_improvement, surrogate, incumbent, config)

        # Plain Python floats keep the entry JSON-serialisable.
        proposals.append(list(map(float, candidate)))

        formatted = ', '.join(f'{coord:.4f}' for coord in candidate)
        logging.info(f"F{func_id}: New query calculated: [{formatted}]... | Best Y: {incumbent:.4f}")

    logging.info("--- Query Generation Complete ---")

    # Echo the complete result to the log.
    logging.info("\n--- FULL WEEK 03 QUERY OUTPUT ---")
    logging.info(json.dumps(proposals, indent=2))
    logging.info("----------------------------------\n")

    # Persist the queries as a list of lists.
    with open(OUTPUT_QUERY_FILE, 'w') as handle:
        json.dump(proposals, handle, indent=2)

    logging.info(f"Successfully generated {len(proposals)} new queries. Saved to {OUTPUT_QUERY_FILE}")

In [120]:
def main():
    """Load the data file, propose one query per function and save them as CSV.

    For each function ID from 1 to N_FUNCTIONS a Gaussian-process surrogate is
    fitted and the Expected-Improvement maximiser is proposed via
    ``propose_new_query``. Each output row is the function ID followed by the
    query coordinates rounded to 4 decimals and padded with NaN up to the
    widest supported dimensionality. A failure on one function is logged and
    does not stop the remaining functions.

    NOTE: a later cell of this notebook defines another ``main``; when both
    cells are run, that later definition is the one that gets called.
    """
    import os  # local import: only needed to derive the CSV file name below

    # --- Step 0: Setup and Data Loading ---
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

    try:
        logging.info(f"--- Loading data from {DATA_FILE} ---")
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        logging.error(f"FATAL ERROR: Data file {DATA_FILE} not found.")
        return

    # Define the columns for the output file
    column_names = ['Function ID', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
    max_dims = len(column_names) - 1

    # QUERIES_FILE is not defined in the configuration cell. Honour it if some
    # other cell defines it; otherwise write next to the JSON output using a
    # .csv extension, since this function saves CSV.
    queries_file = globals().get(
        'QUERIES_FILE', os.path.splitext(OUTPUT_QUERY_FILE)[0] + '.csv'
    )

    # Initialise the list that will store all new queries
    new_queries = []

    logging.info("--- Starting Week 03 Query Generation via Bayesian Optimisation ---")

    # --- CORE BAYESIAN OPTIMISATION LOOP ---
    for func_id in range(1, N_FUNCTIONS + 1):
        # The log format already prints the level name, so the prefix only
        # identifies the function.
        info_prefix = f"F{func_id}: "

        try:
            # Step 1: DATA INGESTION & CLEANING
            # The third element returned is the function's configuration entry.
            X, Y, config = get_function_data(df, func_id)

            # expected_improvement is written for minimisation, so the
            # incumbent must be the lowest observed value.
            Y_best = Y.min()

            D = X.shape[1]

            logging.info(f"{info_prefix}Training on {len(X)} data points. Dimensionality (D): {D}.")

            # Step 2: GPR INITIALISATION AND TRAINING
            # D is used to define the correct size of the RBF length_scale vector
            kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=[1.0] * D, length_scale_bounds=(1e-5, 1e4))
            gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-6, n_restarts_optimizer=20, random_state=RANDOM_STATE)

            gpr.fit(X, Y.ravel())

            # Step 3: OPTIMISE ACQUISITION FUNCTION
            # Reuse the shared multi-start optimiser, which calls
            # expected_improvement with the (X, gpr, best_y) signature it defines.
            new_X = propose_new_query(expected_improvement, gpr, Y_best, config)

            # Extract the new query point
            new_query = np.round(new_X, 4).tolist()

            logging.info(f"{info_prefix}New query calculated: {new_query}... | Best Y: {Y_best:.4f}")

            # Store the new query (pad with NaNs up to the widest function)
            padding = [np.nan] * (max_dims - D)
            new_queries.append([func_id] + new_query + padding)

        except Exception as e:
            # Best effort: report the failure and move on to the next function.
            logging.error(f"F{func_id} General Error: {e}")

    # --- Step 4: Save the New Queries ---
    df_queries = pd.DataFrame(new_queries, columns=column_names)
    df_queries.to_csv(queries_file, index=False)

    logging.info("--- Query Generation Complete ---")
    logging.info(f"Successfully generated {len(df_queries)} new queries. Saved to {queries_file}")


In [121]:
def main():
    """Entry point: read DATA_FILE and pass the frame to generate_queries().

    Nothing is raised to the caller. A missing file and any other failure
    (from loading or from query generation) are each reported through a
    dedicated error log message.
    """
    try:
        observations = pd.read_csv(DATA_FILE)
        logging.info(f"Successfully loaded data from {DATA_FILE}. Total rows: {len(observations)}")

        generate_queries(observations)

    except Exception as exc:
        # A missing file gets its own, more actionable message.
        if isinstance(exc, FileNotFoundError):
            logging.error(f"ERROR: The data file '{DATA_FILE}' was not found. Please ensure it is uploaded.")
        else:
            logging.error(f"An unexpected error occurred during execution: {exc}")

In [122]:
# Run the pipeline when this code is executed as the top-level program
# (which includes running this cell in a notebook kernel). `main` refers to
# whichever definition of that name was executed most recently.
if __name__ == "__main__":
    main()

INFO: Successfully loaded data from bbo_master_w03.csv. Total rows: 96
INFO: --- Starting Query Generation for Week 03 ---
INFO: F1: Training on 12 data points. Dimensionality (D): 2.
INFO: F1: New query calculated: [0.6614, 0.1852]... | Best Y: -0.0036
INFO: F2: Training on 12 data points. Dimensionality (D): 2.
INFO: F2: New query calculated: [0.1013, 0.3683]... | Best Y: -0.0656
INFO: F3: Training on 12 data points. Dimensionality (D): 3.
INFO: F3: New query calculated: [0.1546, 1.0000, 1.0000]... | Best Y: -0.3989
INFO: F4: Training on 12 data points. Dimensionality (D): 4.
INFO: F4: New query calculated: [1.0000, 0.7682, 0.7896, 0.3504]... | Best Y: -22.1083
INFO: F5: Training on 1 data points. Dimensionality (D): 5.
INFO: F5: New query calculated: [0.1803, 0.7241, 0.8122, 0.6791, 0.6069]... | Best Y: 30.7598
INFO: F6: Training on 1 data points. Dimensionality (D): 6.
INFO: F6: New query calculated: [0.7307, 0.6480, 0.8231, 0.6730, 0.5128, 0.1986]... | Best Y: -0.8670
INFO: F7: Tr