In [93]:
# Capstone_week02.ipynb
# Generates 8 new queries (one for each function) for Week 2 of the BBO challenge.
# Strategy is adapted based on Week 1 results:
# F4 & F6 (poor results) swapped to UCB (Explore).
# F5 (excellent result) swapped to EI (Exploit).

import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from scipy.stats import norm
import warnings
import os

# Suppress sklearn warnings during optimisation
warnings.filterwarnings("ignore")

In [94]:
# --- 1. Configuration ---

# NOTE: This file name MUST match the output of data merger script.
MASTER_DATA_FILE = 'bbo_master_w02.csv'

# Output file for the new queries
QUERIES_OUTPUT_FILE = 'week02_queries.csv'

# Define the acquisition function strategy:
# Alternate between EI and UCB to balance exploitation and exploration.
# with adjustments based on Week 1 performance.
ACQUISITION_STRATEGY = {
    # F1 (2D, simpler, low-D) -> Exploit (EI)
    1: 'EI',
    # F2 (2D, simpler, low-D) -> Exploit (EI)
    2: 'EI',
    # F3 (3D, moderate) -> Explore (UCB)
    3: 'UCB',
    # --- based on Week 1 results: F4 is poor, needs EXPLORATION.
    # F5 is great, needs EXPLOITATION. ---
    # F4 (4D, moderate) -> Explore (UCB)
    4: 'UCB',
    # F5 (4D, moderate) -> Exploit (EI)
    5: 'EI',
    # F6 (5D, poor result: -1.02, needs aggressinve EXPLORATION)
    6: 'UCB',
    # F7 (6D, higher, low result: 0.54, continue EXPLORATION)
    7: 'UCB',
    # F8 (8D, high-D, good result: 9.89, continue EXPLORATION due to high D)
    8: 'UCB',
}

# UCB exploration parameter (kappa):
# Higher kappa encourages more exploration (searching uncertain areas).
# Use a higher kappa for high-D functions where uncertainty is vast.

UCB_KAPPA = {
    'low_d': 1.5,  # For F1, F2
    'high_d': 2.5  # For F3-F8
}

In [95]:
# --- 2. ACQUISITION FUNCTIONS ---

def expected_improvement(X, model, current_best_y):
    """
    Expected Improvement (EI) for exploitation.
    It measures the expected gain from sampling X.
    """
    mu, sigma = model.predict(X, return_std=True)
   
    # Handle zero standard deviation to prevent errors
    with np.errstate(divide='ignore'):
        Z = (current_best_y - mu) / sigma
        # Calculate EI
        ei = (current_best_y - mu) * norm.cdf(Z) + sigma * norm.pdf(Z)
        ei[sigma == 0.0] = 0.0  # if uncertainty is zero, EI is zero
        
    # try to *maximise* EI, so return the positive value
    return ei

In [96]:
def upper_confidence_bound(X, model, kappa):
    """
    Upper Confidence Bound (UCB) for exploration
    It balances mean prediction (mu) and uncertainty (sigma).
    """
    mu, sigma = model.predict(X, return_std=True)
    
    # UCB = Mean + Kappa * Standard Deviation
    # Higher kappa encourages more exploration of uncertain areas.
    return mu + kappa * sigma


In [97]:
def next_query_point(acq_func, model, bounds, current_best_y, kappa=None, n_random_samples=10000):
    """
    Uses random search to find the point (X) that maximises the acquisition function.
    """
    
    # 1. Generate many random candidate points across the search space [0, 1]
    # n_dims is the no. of input dimensions for the function (e.g., 2 for F1, 8 for F8)
    n_dims = bounds.shape[0]
    candidate_x = np.random.uniform(bounds[:, 0], bounds[:, 1], size=(n_random_samples, n_dims))
    
    # 2. Evaluate the acquisition function for all candidate points
    if acq_func == 'EI':
        scores = expected_improvement(candidate_x, model, current_best_y)
    elif acq_func == 'UCB':
        scores = upper_confidence_bound(candidate_x, model, kappa)
    else:
        raise ValueError(f"Unknown acquisition function: {acq_func}")
        
    # 3. Find the candidate point that yields the maximum score
    best_x = candidate_x[np.argmax(scores)]
    
    return best_x.reshape(1, -1)  # Return as a 2D array (1 row, N columns)

In [98]:
# --- 3. MAIN FUNCTION ---

def generate_week02_queries():
    """
    Loads data, trains GP models, and generates the 8 new queries.
    """
    print(f"Loading master data from {MASTER_DATA_FILE}...")
    if not os.path.exists(MASTER_DATA_FILE):
        print(f"Error: {MASTER_DATA_FILE} not found. Ensure the data merger script was run.")
        return
    
    # Set up the base DataFrame
    df_master = pd.read_csv(MASTER_DATA_FILE)
    x_cols = [f'X{i}' for i in range(1, 9)]
    
    # Store all generated rows in a list of dictionaries
    all_new_rows = []
    
    # Loop through all 8 functions
    for func_id in range(1, 9):
        # 3.1 Data Filtering and Preparation
        df_func = df_master[df_master['Function ID'] == func_id].copy()
        
        # Determine the no. of dimensions (e.g., 2 for F1, 8 for F8)
        # Count non-NaN X columns
        n_dims = 0
        for col in x_cols:
            # The dimension is the index of the last X column that has non-NaN values
            if df_func[col].notna().any():
                n_dims += 1
            else:
                # Since X columns are sequential (X1, X2, ...), we can stop
                # as soon as hitting a column that is entirely NaN for this function.
                break
        
        if n_dims == 0:
            print(f"Skipping F{func_id}: No valid data points found.")
            continue
        
        # Isolate the relevant X (inputs) and Y (outputs)
        X = df_func[x_cols[:n_dims]].values
        Y = df_func['Y'].values.reshape(-1, 1)  # Reshape Y to be 2D
        
        # Current best observation (for EI) - to MAXIMISE Y
        current_best_y = Y.max()
        
        # The search space bounds are always [0, 1] for all dimensions
        bounds = np.array([[0.0, 1.0]] * n_dims)
        
        # 3.2 Model Training
        # Use a Matern kernel, which is highly flexible for BBO
        kernel = Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=2.5)
        model = GaussianProcessRegressor(
            kernel = kernel,
            alpha = 1e-6,  # Small noise term for numerical stability
            n_restarts_optimizer=20,
            normalize_y = True  # Essential for robust GP performance
        )
        
        try:
            model.fit(X, Y)
        except ValueError as e:
            print(f"Error fitting GP for F{func_id}: {e}")
            continue
            
        # 3.3 Acquisition Function Selection and Parameter Setup
        acq_func = ACQUISITION_STRATEGY[func_id]
        
        if acq_func == 'UCB':
            # Use .get() for safe dictionary access, providing defaults to prevent KeyError
            high_d_kappa = UCB_KAPPA.get('high_d', 2.5)
            low_d_kappa = UCB_KAPPA.get('low_d', 1.5)
            kappa_val = high_d_kappa if n_dims > 2 else low_d_kappa
            print(f"F{func_id} ({n_dims}D): Strategy = UCB (Kappa = {kappa_val})")
        else:
            kappa_val = None
            print(f"F{func_id} ({n_dims}D): Strategy = EI (Current Best Y={current_best_y:.4f})")
            
        # 3.4 Query Generation
        new_x_array = next_query_point(
            acq_func = acq_func,
            model = model,
            bounds = bounds,
            current_best_y = current_best_y,
            kappa = kappa_val
        )
        
        # 3.5 Format and Append Result
        new_row = {'Function ID': func_id}  # Y is NaN until submitted/returned
        
        # Initialise all X columns to NaN
        for col in x_cols:
            new_row[col] = np.nan
        
        # Convert the generated coordinates array to a flat list for clean assignment
        generated_coords = new_x_array.flatten()      
        
        # Overwrite the relevant X values with the generated query
        # Iterates over the generated_coords list
        for i in range(n_dims):
            col_name = f'X{i+1}'
            # Ensure the coordinate is a standard Python float
            new_row[col_name] = float(generated_coords[i])
            
        # Append the dictionary to the list
        all_new_rows.append(new_row)
            
        
    # 3.6 Create Final DataFrame, Save, and Print the Queries
    
    # Define the exact columns required for the submission file
    submission_cols = ['Function ID'] + [f'X{i}' for i in range(1, 9)]
    
    # Create the final DataFrame using the list of rows and select/reorder columns
    df_queries = pd.DataFrame(all_new_rows)
    df_queries = df_queries[submission_cols]
    
    # Round the X columns to 6 dec. places for the submission format.
    x_cols_to_round = [col for col in submission_cols if col.startswith('X')]
    df_queries[x_cols_to_round] = df_queries[x_cols_to_round].round(6)
    
    # Save the final queries
    df_queries.to_csv(QUERIES_OUTPUT_FILE, index=False)
    
    # Print the queries to the console 
    print("\n" + "=" * 60)
    print(f"| GENERATED QUERIES FOR WEEK 2 ({QUERIES_OUTPUT_FILE}) |")
    print("=" * 60)
    print(df_queries.to_string(index=False))
    print("=" * 60 + "\n")
    
    print("-" * 50)
    print(f"SUCCESS: Generated {len(df_queries)} queries for Week 2.")
    print(f"File saved as '{QUERIES_OUTPUT_FILE}'.")
    print("This file contains the X inputs to submit to the Capstone portal.")
    print("-" * 50)
    

In [99]:
if __name__ == '__main__':
    generate_week02_queries()

Loading master data from bbo_master_w02.csv...
F1 (2D): Strategy = EI (Current Best Y=0.0000)
F2 (2D): Strategy = EI (Current Best Y=0.6112)
F3 (3D): Strategy = UCB (Kappa = 2.5)
F4 (4D): Strategy = UCB (Kappa = 2.5)
F5 (4D): Strategy = EI (Current Best Y=1091.3153)
F6 (5D): Strategy = UCB (Kappa = 2.5)
F7 (6D): Strategy = UCB (Kappa = 2.5)
F8 (8D): Strategy = UCB (Kappa = 2.5)

| GENERATED QUERIES FOR WEEK 2 (week02_queries.csv) |
 Function ID       X1       X2       X3       X4       X5       X6      X7       X8
           1 0.630822 0.662337      NaN      NaN      NaN      NaN     NaN      NaN
           2 0.154270 0.343620      NaN      NaN      NaN      NaN     NaN      NaN
           3 0.802457 0.736185 0.510802      NaN      NaN      NaN     NaN      NaN
           4 0.371967 0.308509 0.271705 0.299497      NaN      NaN     NaN      NaN
           5 0.317247 0.690020 0.262847 0.360195      NaN      NaN     NaN      NaN
           6 0.691590 0.163316 0.826321 0.658325 0.024349   