In [1]:
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C
from scipy.stats import norm
from scipy.optimize import minimize
import logging
import warnings
import json
import os
from typing import List, Tuple, Union, Callable

# Log records are rendered as "LEVEL: message", INFO and above.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
# Hide every UserWarning and RuntimeWarning so that GPR numerical and
# convergence chatter does not clutter the log output.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

# --- Week 7 (final exploitation phase) file locations ---
DATA_FILE_PATH = 'bbo_master_w07.csv'  # master CSV that is read as input
ADD_DATA_DIR = 'add_data'  # directory that receives the output file
OUTPUT_FILE_PATH = os.path.join(ADD_DATA_DIR, 'week07_clean_inputs.json')  # generated queries

# Function ID (F1..F8) -> number of active input dimensions D.
FUNCTION_DIMS = {
    1: 2,
    2: 2,
    3: 3,
    4: 4,
    5: 4,
    6: 5,
    7: 6,
    8: 8,
}

# Every function is labelled with the same acquisition strategy, 'EI'
# (Expected Improvement), for the final round.
ACQUISITION_STRATEGIES = dict.fromkeys(FUNCTION_DIMS, 'EI')

# --- Acquisition Function Definitions ---

def expected_improvement(X_query: np.ndarray, gpr: "GaussianProcessRegressor", Y_best: Union[float, None]) -> float:
    """
    Negative Expected Improvement (EI) at a single query point, for MAXIMIZATION.

    EI = (mu - Y_best) * Phi(z) + sigma * phi(z), with z = (mu - Y_best) / sigma,
    where mu and sigma are the posterior mean and standard deviation returned
    by gpr.predict(..., return_std=True).

    Args:
        X_query: One query point. It is promoted to 2D with np.atleast_2d and
            only the first predicted row is used.
        gpr: Fitted model exposing predict(X, return_std=True).
        Y_best: Best known *maximum* value, on the same scale as the model's
            predictions. If None, 0.0 is returned.

    Returns:
        The NEGATIVE EI as a float, because scipy.optimize.minimize is used to
        find the maximum of EI.
    """
    if Y_best is None:
        return 0.0

    # Predict mean (mu) and standard deviation (sigma) for the single point.
    X_query = np.atleast_2d(X_query)
    mu, sigma = gpr.predict(X_query, return_std=True)
    mu, sigma = mu[0], sigma[0]

    improvement = mu - Y_best

    # Degenerate posterior (e.g. an already sampled point): as sigma -> 0 the
    # EI formula tends to max(mu - Y_best, 0), so use that limit directly
    # instead of dividing by a vanishing sigma.
    if sigma < 1e-6:
        return -float(improvement) if improvement > 0 else 0.0

    z = improvement / sigma

    # Phi(z) is the standard normal CDF, phi(z) the standard normal PDF.
    ei = improvement * norm.cdf(z) + sigma * norm.pdf(z)

    # Negate so that a MINIMIZER maximizes EI.
    return float(-ei)

# --- Core BBO Functions ---

def load_and_preprocess_data(file_path: str) -> pd.DataFrame:
    """
    Read the master CSV and prepare it for modelling.

    Missing values in columns X1..X8 are replaced by 0.0, and a 'Y_Scaled'
    column is added holding Y shifted by the minimum Y of the whole file plus
    1e-6. The shift is a constant offset, so the ordering of Y is preserved.

    Returns an empty DataFrame (after logging an error) when file_path does
    not exist.
    """
    if not os.path.exists(file_path):
        logging.error(f"Data file not found at: {file_path}. Please ensure '{DATA_FILE_PATH}' is available.")
        return pd.DataFrame()

    frame = pd.read_csv(file_path)

    # Unused (padded) input dimensions arrive as NaN in the master file.
    input_columns = ['X' + str(idx) for idx in range(1, 9)]
    frame[input_columns] = frame[input_columns].fillna(0.0)

    # Constant offset: the smallest observation maps to 1e-6.
    y_floor = frame['Y'].min()
    frame['Y_Scaled'] = frame['Y'] - y_floor + 1e-6

    return frame

def optimise_acquisition(
    gpr: GaussianProcessRegressor, 
    D: int, 
    acquisition_func: Callable, 
    Y_best: Union[float, None] = None
) -> Tuple[np.ndarray, float]:
    """
    Minimizes the NEGATIVE acquisition function to find the next query point
    that MAXIMIZES the acquisition. The search runs ONLY over the D active
    dimensions, each bounded to [0, 1].

    Args:
        gpr: Fitted model, passed through to acquisition_func.
        D: Number of active dimensions (at most 8).
        acquisition_func: Callable(x, gpr, Y_best) returning the negative
            acquisition value at x.
        Y_best: Best observed value, passed through to acquisition_func.

    Returns:
        (next_query_8d, acquisition_value): the best point found, padded with
        zeros to length 8, and the (positive) acquisition value at that point.

    Raises:
        ValueError: if acquisition_func is expected_improvement and Y_best is None.
    """
    if acquisition_func is expected_improvement and Y_best is None:
        raise ValueError("Y_best must be provided for Expected Improvement.")

    def target_func(x):
        # Objective handed to the minimizer.
        return acquisition_func(x, gpr, Y_best)

    # Bounds: [0, 1] for every active dimension.
    bounds_active = [(0, 1)] * D

    n_restarts = 10
    best_x_active = None
    min_acq_value = np.inf

    for _ in range(n_restarts):
        # Random starting point in the unit hypercube.
        x0_active = np.random.uniform(0, 1, D)

        # L-BFGS-B supports the box bounds.
        res = minimize(
            fun=target_func,
            x0=x0_active,
            bounds=bounds_active,
            method='L-BFGS-B'
        )

        if res.success and res.fun < min_acq_value:
            min_acq_value = res.fun
            best_x_active = res.x

    if best_x_active is None:
        # No restart succeeded: fall back to a random point and evaluate the
        # acquisition there, so the returned value is finite and describes
        # the returned point (instead of reporting -inf).
        best_x_active = np.random.uniform(0, 1, D)
        min_acq_value = float(target_func(best_x_active))

    # Pad the D-dimensional result with zeros to build the 8D query array.
    next_query_8d = np.concatenate([best_x_active, np.zeros(8 - D)])

    # Flip the sign back: the minimizer worked on the negative acquisition.
    return next_query_8d, -min_acq_value

def constrain_queries(queries: List[List[float]], epsilon: float = 1e-6) -> List[List[float]]:
    """
    APPLIES THE CRITICAL CONSTRAINT: 0 <= x_i < 1.

    Every value above 1.0 - epsilon is replaced by 1.0 - epsilon, and every
    value at or below zero is replaced by 0.0. Capping at 1.0 - epsilon
    (rather than only at 1.0) matters because values such as 0.9999997 are
    below 1.0 but would be written as '1.000000' once formatted to six
    decimal places.

    Args:
        queries: List of query rows (lists of floats). Not modified.
        epsilon: Margin kept below 1.0.

    Returns:
        A new list of rows with every value inside [0, 1 - epsilon].
    """
    upper = 1.0 - epsilon
    # The final `else` also catches NaN (all comparisons are False), which is
    # mapped to the upper cap exactly as the previous `x < 1.0` test did.
    return [
        [0.0 if x <= 0.0 else (x if x <= upper else upper) for x in query]
        for query in queries
    ]

def run_optimisation():
    """
    Main function to orchestrate the data loading, model training, and query generation.

    For every function in FUNCTION_DIMS a GPR with a Matern (nu=2.5) kernel is
    fitted on the active dimensions, Expected Improvement is optimised, and
    the resulting 8D query is collected. The queries are then constrained and
    written to OUTPUT_FILE_PATH as a JSON list of lists of 6-decimal strings.

    If GPR training fails for a function, a uniformly random query is used for
    it instead of dropping the row, so that row i of the output always belongs
    to the i-th function in FUNCTION_DIMS.

    Returns None. Does nothing if the input data could not be loaded.
    """
    df_master = load_and_preprocess_data(DATA_FILE_PATH)
    if df_master.empty:
        return

    all_queries = []

    for f_id, D in FUNCTION_DIMS.items():
        logging.info(f"\n--- Processing Function F{f_id} (D={D}) ---")

        # 1. Prepare data for the current function
        df_func = df_master[df_master['Function ID'] == f_id].copy()

        # Only the active dimensions X1..X_D are used for training
        x_cols = [f'X{i}' for i in range(1, D + 1)]
        X_train = df_func[x_cols].values
        Y_train = df_func['Y_Scaled'].values

        # 2. Train Gaussian Process Regressor (GPR)
        # Matern kernel (nu=2.5) with one length scale per active dimension
        kernel = C(1.0, (1e-3, 1e3)) * Matern(length_scale=np.ones(D), length_scale_bounds=(1e-3, 1e3), nu=2.5)
        gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-6, n_restarts_optimizer=10, normalize_y=True)

        try:
            gpr.fit(X_train, Y_train)
            logging.info(f"GPR Model Trained. Log-Marginal-Likelihood: {gpr.log_marginal_likelihood():.4f}")
        except Exception as e:
            # Keep one row per function: skipping here would shift every
            # later row onto the wrong function in the output file.
            logging.error(f"Error training GPR for F{f_id}: {e}. Using a random fallback query.")
            fallback_8d = np.concatenate([np.random.uniform(0, 1, D), np.zeros(8 - D)])
            all_queries.append([float(x) for x in fallback_8d])
            continue

        # 3. Optimise Acquisition Function (Expected Improvement for Week 7)
        strategy_name = ACQUISITION_STRATEGIES[f_id]
        acq_func = expected_improvement

        # The incumbent is the maximum of the (shifted) training targets
        Y_best_scaled = np.max(Y_train)

        next_query_8d, optimal_value = optimise_acquisition(gpr, D, acq_func, Y_best_scaled)

        # 4. Store the 8D query as plain Python floats
        all_queries.append([float(x) for x in next_query_8d])

        logging.info(f"Optimal {strategy_name} found: {optimal_value:.4f}")
        logging.info(f"F{f_id} ({D}D): f_best={df_func['Y'].max():.4f}. Next Query X1-X{D}: {[f'{x:.6f}' for x in next_query_8d[:D]]}...")

    # 5. Apply the Constraint: 0 <= x_i < 1
    all_queries_constrained = constrain_queries(all_queries, epsilon=1e-6)

    # 6. Save to JSON file
    logging.info("\n" + "*" * 70)
    logging.info("SUCCESS: Applying constraint (1.0 -> 0.999999) and saving FINAL queries for Week 7...")

    # exist_ok avoids a failure if the directory already exists
    os.makedirs(ADD_DATA_DIR, exist_ok=True)

    # Each value is written as a string with six decimal places
    output_data = [[f'{x:.6f}' for x in query] for query in all_queries_constrained]

    with open(OUTPUT_FILE_PATH, 'w') as f:
        json.dump(output_data, f, indent=4)

    logging.info(f"File saved to: '{OUTPUT_FILE_PATH}'. This is the file to submit.")
    logging.info("*" * 70)
    
# Run the whole pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    run_optimisation()

INFO: 
--- Processing Function F1 (D=2) ---
INFO: GPR Model Trained. Log-Marginal-Likelihood: -19.8863
INFO: Optimal EI found: 0.0000
INFO: F1 (2D): f_best=0.1744. Next Query X1-X2: ['0.239825', '0.486074']...
INFO: 
--- Processing Function F2 (D=2) ---
INFO: GPR Model Trained. Log-Marginal-Likelihood: -15.3497
INFO: Optimal EI found: 0.0019
INFO: F2 (2D): f_best=0.6660. Next Query X1-X2: ['0.917077', '0.459697']...
INFO: 
--- Processing Function F3 (D=3) ---
INFO: GPR Model Trained. Log-Marginal-Likelihood: -19.6805
INFO: Optimal EI found: 0.0229
INFO: F3 (3D): f_best=-0.0200. Next Query X1-X3: ['0.528145', '0.578868', '0.510550']...
INFO: 
--- Processing Function F4 (D=4) ---
INFO: GPR Model Trained. Log-Marginal-Likelihood: -13.6857
INFO: Optimal EI found: 1.8692
INFO: F4 (4D): f_best=-2.9741. Next Query X1-X4: ['0.375246', '0.298707', '0.306354', '0.661866']...
INFO: 
--- Processing Function F5 (D=4) ---
INFO: GPR Model Trained. Log-Marginal-Likelihood: -12.9647
INFO: Optimal EI fo

In [1]:
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C
from scipy.stats import norm
from scipy.optimize import minimize
import logging
import warnings
import json
import os
from typing import List, Tuple, Union, Callable

# Log records are rendered as "LEVEL: message", INFO and above.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
# Hide every UserWarning and RuntimeWarning so that GPR numerical and
# convergence chatter does not clutter the log output.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

# --- Week 7 (final exploitation phase) file locations ---
DATA_FILE_PATH = 'bbo_master_w07.csv'  # master CSV that is read as input
ADD_DATA_DIR = 'add_data'  # directory that receives the output file
OUTPUT_FILE_PATH = os.path.join(ADD_DATA_DIR, 'week07_clean_inputs.json')  # generated queries

# Function ID (F1..F8) -> number of active input dimensions D.
FUNCTION_DIMS = {
    1: 2,
    2: 2,
    3: 3,
    4: 4,
    5: 4,
    6: 5,
    7: 6,
    8: 8,
}

# Every function is labelled with the same acquisition strategy, 'EI'
# (Expected Improvement), for the final round.
ACQUISITION_STRATEGIES = dict.fromkeys(FUNCTION_DIMS, 'EI')

# --- Acquisition Function Definitions ---

def expected_improvement(X_query: np.ndarray, gpr: "GaussianProcessRegressor", Y_best: Union[float, None]) -> float:
    """
    Negative Expected Improvement (EI) at a single query point, for MAXIMIZATION.

    EI = (mu - Y_best) * Phi(z) + sigma * phi(z), with z = (mu - Y_best) / sigma,
    where mu and sigma are the posterior mean and standard deviation returned
    by gpr.predict(..., return_std=True).

    Args:
        X_query: One query point. It is promoted to 2D with np.atleast_2d and
            only the first predicted row is used.
        gpr: Fitted model exposing predict(X, return_std=True).
        Y_best: Best known *maximum* value, on the same scale as the model's
            predictions. If None, 0.0 is returned.

    Returns:
        The NEGATIVE EI as a float, because scipy.optimize.minimize is used to
        find the maximum of EI.
    """
    if Y_best is None:
        return 0.0

    # Predict mean (mu) and standard deviation (sigma) for the single point.
    X_query = np.atleast_2d(X_query)
    mu, sigma = gpr.predict(X_query, return_std=True)
    mu, sigma = mu[0], sigma[0]

    improvement = mu - Y_best

    # Degenerate posterior (e.g. an already sampled point): as sigma -> 0 the
    # EI formula tends to max(mu - Y_best, 0), so use that limit directly
    # instead of dividing by a vanishing sigma.
    if sigma < 1e-6:
        return -float(improvement) if improvement > 0 else 0.0

    z = improvement / sigma

    # Phi(z) is the standard normal CDF, phi(z) the standard normal PDF.
    ei = improvement * norm.cdf(z) + sigma * norm.pdf(z)

    # Negate so that a MINIMIZER maximizes EI.
    return float(-ei)

# --- Core BBO Functions ---

def load_and_preprocess_data(file_path: str) -> pd.DataFrame:
    """
    Read the master CSV and prepare it for modelling.

    Missing values in columns X1..X8 are replaced by 0.0, and a 'Y_Scaled'
    column is added holding Y shifted by the minimum Y of the whole file plus
    1e-6. The shift is a constant offset, so the ordering of Y is preserved.

    Returns an empty DataFrame (after logging an error) when file_path does
    not exist.
    """
    if not os.path.exists(file_path):
        logging.error(f"Data file not found at: {file_path}. Please ensure '{DATA_FILE_PATH}' is available.")
        return pd.DataFrame()

    frame = pd.read_csv(file_path)

    # Unused (padded) input dimensions arrive as NaN in the master file.
    input_columns = ['X' + str(idx) for idx in range(1, 9)]
    frame[input_columns] = frame[input_columns].fillna(0.0)

    # Constant offset: the smallest observation maps to 1e-6.
    y_floor = frame['Y'].min()
    frame['Y_Scaled'] = frame['Y'] - y_floor + 1e-6

    return frame

def optimise_acquisition(
    gpr: GaussianProcessRegressor, 
    D: int, 
    acquisition_func: Callable, 
    Y_best: Union[float, None] = None
) -> Tuple[np.ndarray, float]:
    """
    Minimizes the NEGATIVE acquisition function to find the next query point
    that MAXIMIZES the acquisition. The search runs ONLY over the D active
    dimensions, each bounded to [0, 1].

    Args:
        gpr: Fitted model, passed through to acquisition_func.
        D: Number of active dimensions (at most 8).
        acquisition_func: Callable(x, gpr, Y_best) returning the negative
            acquisition value at x.
        Y_best: Best observed value, passed through to acquisition_func.

    Returns:
        (next_query_8d, acquisition_value): the best point found, padded with
        zeros to length 8, and the (positive) acquisition value at that point.

    Raises:
        ValueError: if acquisition_func is expected_improvement and Y_best is None.
    """
    if acquisition_func is expected_improvement and Y_best is None:
        raise ValueError("Y_best must be provided for Expected Improvement.")

    def target_func(x):
        # Objective handed to the minimizer.
        return acquisition_func(x, gpr, Y_best)

    # Bounds: [0, 1] for every active dimension.
    bounds_active = [(0, 1)] * D

    n_restarts = 10
    best_x_active = None
    min_acq_value = np.inf

    for _ in range(n_restarts):
        # Random starting point in the unit hypercube.
        x0_active = np.random.uniform(0, 1, D)

        # L-BFGS-B supports the box bounds.
        res = minimize(
            fun=target_func,
            x0=x0_active,
            bounds=bounds_active,
            method='L-BFGS-B'
        )

        if res.success and res.fun < min_acq_value:
            min_acq_value = res.fun
            best_x_active = res.x

    if best_x_active is None:
        # No restart succeeded: fall back to a random point and evaluate the
        # acquisition there, so the returned value is finite and describes
        # the returned point (instead of reporting -inf).
        best_x_active = np.random.uniform(0, 1, D)
        min_acq_value = float(target_func(best_x_active))

    # Pad the D-dimensional result with zeros to build the 8D query array.
    next_query_8d = np.concatenate([best_x_active, np.zeros(8 - D)])

    # Flip the sign back: the minimizer worked on the negative acquisition.
    return next_query_8d, -min_acq_value

def constrain_queries(queries: List[List[float]], epsilon: float = 1e-6) -> List[List[float]]:
    """
    APPLIES THE CRITICAL CONSTRAINT: 0 <= x_i < 1.

    Every value above 1.0 - epsilon is replaced by 1.0 - epsilon, and every
    value at or below zero is replaced by 0.0. Capping at 1.0 - epsilon
    (rather than only at 1.0) matters because values such as 0.9999997 are
    below 1.0 but would be written as '1.000000' once formatted to six
    decimal places.

    Args:
        queries: List of query rows (lists of floats). Not modified.
        epsilon: Margin kept below 1.0.

    Returns:
        A new list of rows with every value inside [0, 1 - epsilon].
    """
    upper = 1.0 - epsilon
    # The final `else` also catches NaN (all comparisons are False), which is
    # mapped to the upper cap exactly as the previous `x < 1.0` test did.
    return [
        [0.0 if x <= 0.0 else (x if x <= upper else upper) for x in query]
        for query in queries
    ]

def run_optimisation():
    """
    Main function to orchestrate the data loading, model training, and query generation.

    For every function in FUNCTION_DIMS a GPR with a Matern (nu=2.5) kernel is
    fitted on the active dimensions, Expected Improvement is optimised, and
    the resulting 8D query is collected. The queries are then constrained and
    written to OUTPUT_FILE_PATH as a JSON list of lists of 6-decimal strings.

    If GPR training fails for a function, a uniformly random query is used for
    it instead of dropping the row, so that row i of the output always belongs
    to the i-th function in FUNCTION_DIMS.

    Returns None. Does nothing if the input data could not be loaded.
    """
    df_master = load_and_preprocess_data(DATA_FILE_PATH)
    if df_master.empty:
        return

    all_queries = []

    for f_id, D in FUNCTION_DIMS.items():
        logging.info(f"\n--- Processing Function F{f_id} (D={D}) ---")

        # 1. Prepare data for the current function
        df_func = df_master[df_master['Function ID'] == f_id].copy()

        # Only the active dimensions X1..X_D are used for training
        x_cols = [f'X{i}' for i in range(1, D + 1)]
        X_train = df_func[x_cols].values
        Y_train = df_func['Y_Scaled'].values

        # 2. Train Gaussian Process Regressor (GPR)
        # Matern kernel (nu=2.5) with one length scale per active dimension
        kernel = C(1.0, (1e-3, 1e3)) * Matern(length_scale=np.ones(D), length_scale_bounds=(1e-3, 1e3), nu=2.5)
        gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-6, n_restarts_optimizer=10, normalize_y=True)

        try:
            gpr.fit(X_train, Y_train)
            logging.info(f"GPR Model Trained. Log-Marginal-Likelihood: {gpr.log_marginal_likelihood():.4f}")
        except Exception as e:
            # Keep one row per function: skipping here would shift every
            # later row onto the wrong function in the output file.
            logging.error(f"Error training GPR for F{f_id}: {e}. Using a random fallback query.")
            fallback_8d = np.concatenate([np.random.uniform(0, 1, D), np.zeros(8 - D)])
            all_queries.append([float(x) for x in fallback_8d])
            continue

        # 3. Optimise Acquisition Function (Expected Improvement for Week 7)
        strategy_name = ACQUISITION_STRATEGIES[f_id]
        acq_func = expected_improvement

        # The incumbent is the maximum of the (shifted) training targets
        Y_best_scaled = np.max(Y_train)

        next_query_8d, optimal_value = optimise_acquisition(gpr, D, acq_func, Y_best_scaled)

        # 4. Store the 8D query as plain Python floats
        all_queries.append([float(x) for x in next_query_8d])

        logging.info(f"Optimal {strategy_name} found: {optimal_value:.4f}")
        logging.info(f"F{f_id} ({D}D): f_best={df_func['Y'].max():.4f}. Next Query X1-X{D}: {[f'{x:.6f}' for x in next_query_8d[:D]]}...")

    # 5. Apply the Constraint: 0 <= x_i < 1
    all_queries_constrained = constrain_queries(all_queries, epsilon=1e-6)

    # 6. Save to JSON file
    logging.info("\n" + "*" * 70)
    logging.info("SUCCESS: Applying constraint (1.0 -> 0.999999) and saving FINAL queries for Week 7...")

    # exist_ok avoids a failure if the directory already exists
    os.makedirs(ADD_DATA_DIR, exist_ok=True)

    # Each value is written as a string with six decimal places
    output_data = [[f'{x:.6f}' for x in query] for query in all_queries_constrained]

    with open(OUTPUT_FILE_PATH, 'w') as f:
        json.dump(output_data, f, indent=4)

    logging.info(f"File saved to: '{OUTPUT_FILE_PATH}'. This is the file to submit.")
    logging.info("*" * 70)
    
# Run the whole pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    run_optimisation()

INFO: 
--- Processing Function F1 (D=2) ---
INFO: GPR Model Trained. Log-Marginal-Likelihood: -19.8863
INFO: Optimal EI found: 0.0000
INFO: F1 (2D): f_best=0.1744. Next Query X1-X2: ['0.239825', '0.486074']...
INFO: 
--- Processing Function F2 (D=2) ---
INFO: GPR Model Trained. Log-Marginal-Likelihood: -15.3497
INFO: Optimal EI found: 0.0019
INFO: F2 (2D): f_best=0.6660. Next Query X1-X2: ['0.917077', '0.459697']...
INFO: 
--- Processing Function F3 (D=3) ---
INFO: GPR Model Trained. Log-Marginal-Likelihood: -19.6805
INFO: Optimal EI found: 0.0229
INFO: F3 (3D): f_best=-0.0200. Next Query X1-X3: ['0.528145', '0.578868', '0.510550']...
INFO: 
--- Processing Function F4 (D=4) ---
INFO: GPR Model Trained. Log-Marginal-Likelihood: -13.6857
INFO: Optimal EI found: 1.8692
INFO: F4 (4D): f_best=-2.9741. Next Query X1-X4: ['0.375246', '0.298707', '0.306354', '0.661866']...
INFO: 
--- Processing Function F5 (D=4) ---
INFO: GPR Model Trained. Log-Marginal-Likelihood: -12.9647
INFO: Optimal EI fo