In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy.optimize import minimize, Bounds
import logging
import warnings
import json
import os
import sys

# Import necessary components from scikit-learn for GPR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, Matern, WhiteKernel
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning

# --- CONFIGURATION ---
# We use the Week 11 master file (which contains data up to Round 10)
INPUT_MASTER_FILE = 'bbo_master_w11.csv'
OUTPUT_DIR = 'add_data'
OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'week11_clean_inputs.json')

# Trust Region Radii (The "Cluster Size")
# Tight (0.05) for boundary/corner solutions (F5, F8)
# Moderate (0.15) for internal peaks (F4, F6)
# Loose (0.30) for scattered/uncertain functions (F1, F2, F3, F7)
TRUST_RADII = {
    1: 0.30, 
    2: 0.30, 
    3: 0.30, 
    4: 0.15, 
    5: 0.05, 
    6: 0.15, 
    7: 0.30, 
    8: 0.05 
}

# Dimensions
FUNCTION_DIMS = {1: 2, 2: 2, 3: 3, 4: 4, 5: 4, 6: 5, 7: 6, 8: 8}

# Optimization Settings
N_RESTARTS = 25       # Number of optimizer restarts within the trust region
XI = 0.001            # Very low exploration (Exploitation focus within the cluster)

# Logging Setup
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s', stream=sys.stdout)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

def get_function_data(df, func_id):
    """Extracts X and Y data for a specific function."""
    df_f = df[df['Function ID'] == func_id].dropna(subset=['Y']).copy()
    dim = FUNCTION_DIMS[func_id]
    x_cols = [f'X{i+1}' for i in range(dim)]
    
    # Ensure cols exist
    for col in x_cols:
        if col not in df_f.columns:
            df_f[col] = np.nan
            
    df_f.dropna(subset=x_cols, inplace=True)
    
    X = df_f[x_cols].values
    Y = df_f['Y'].values
    return X, Y, dim

def expected_improvement(X, gpr, f_best, xi=0.01):
    """Calculates EI for Maximization."""
    X = X.reshape(1, -1)
    mu, sigma = gpr.predict(X, return_std=True)
    
    with np.errstate(divide='ignore'):
        Z = (mu - f_best - xi) / sigma
        ei = (mu - f_best - xi) * norm.cdf(Z) + sigma * norm.pdf(Z)
        ei[sigma == 0.0] = 0.0
    return -ei.flatten() # Return negative for minimization

def get_trust_region_bounds(x_best, radius, dim):
    """
    Creates a hypercube bound around the best point.
    Clips to [5] to ensure we stay in the valid domain.
    """
    lower = np.maximum(0.0, x_best - radius)
    upper = np.minimum(1.0, x_best + radius)
    
    # Create a Bounds object for Scipy
    return Bounds(lower, upper), lower, upper

def generate_week11_queries():
    if not os.path.exists(INPUT_MASTER_FILE):
        logging.error(f"Input file {INPUT_MASTER_FILE} not found.")
        return

    df_master = pd.read_csv(INPUT_MASTER_FILE)
    logging.info(f"Loaded {len(df_master)} rows from {INPUT_MASTER_FILE}")

    final_queries = []

    for f_id in range(1, 9):
        X_raw, Y_raw, dim = get_function_data(df_master, f_id)
        
        if len(X_raw) == 0:
            logging.warning(f"F{f_id}: No data found. Using random fallback.")
            final_queries.append([0.5]*dim)
            continue

        # 1. Identify the Cluster Centroid (x_best)
        best_idx = np.argmax(Y_raw)
        x_best = X_raw[best_idx]
        f_best = Y_raw[best_idx]
        
        # 2. Define Trust Region
        radius = TRUST_RADII[f_id]
        trust_bounds, lb, ub = get_trust_region_bounds(x_best, radius, dim)
        
        logging.info(f"--- F{f_id} (Radius {radius}) ---")
        logging.info(f"   Best Y so far: {f_best:.4f}")
        logging.info(f"   Centroid (x_best): {np.round(x_best, 3)}")
        logging.info(f"   Trust Region: [{np.round(lb,2)}, {np.round(ub,2)}] (showing dim 1)")

        # 3. Fit GP Model
        # Standardize X for better GP performance
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_raw)
        
        kernel = C(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, normalize_y=True)
        gpr.fit(X_scaled, Y_raw)

        # 4. Optimize within Trust Region
        # Wrapper handles scaling transparently
        def acq_wrapper(x):
            # Scale input x before passing to GP
            x_s = scaler.transform(x.reshape(1, -1))
            return expected_improvement(x_s, gpr, f_best, xi=XI)

        best_acq = np.inf
        next_x = None

        for _ in range(N_RESTARTS):
            # Sample random start point WITHIN the trust region
            x0 = np.random.uniform(lb, ub)
            
            res = minimize(
                acq_wrapper, 
                x0=x0, 
                bounds=trust_bounds, 
                method='L-BFGS-B'
            )
            
            if res.fun < best_acq:
                best_acq = res.fun
                next_x = res.x

        if next_x is None:
            logging.warning("Optimization failed. Returning centroid.")
            next_x = x_best

        # 5. Format Output
        query_list = [round(x, 6) for x in next_x]
        final_queries.append(query_list)
        logging.info(f"   Next Query: {query_list}")

    # Save to JSON
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(final_queries, f, indent=4)
        
    logging.info(f"\nSUCCESS: Saved 8 queries to {OUTPUT_FILE}")

if __name__ == "__main__":
    generate_week11_queries()

INFO: Loaded 160 rows from bbo_master_w11.csv
INFO: --- F1 (Radius 0.3) ---
INFO:    Best Y so far: 0.1744
INFO:    Centroid (x_best): [0.599 0.631]
INFO:    Trust Region: [[0.3  0.33], [0.9  0.93]] (showing dim 1)
INFO:    Next Query: [np.float64(0.488381), np.float64(0.661088)]
INFO: --- F2 (Radius 0.3) ---
INFO:    Best Y so far: 0.6660
INFO:    Centroid (x_best): [0.685 1.   ]
INFO:    Trust Region: [[0.38 0.7 ], [0.98 1.  ]] (showing dim 1)
INFO:    Next Query: [np.float64(0.684632), np.float64(0.968359)]
INFO: --- F3 (Radius 0.3) ---
INFO:    Best Y so far: -0.0122
INFO:    Centroid (x_best): [0.664 0.601 0.535]
INFO:    Trust Region: [[0.36 0.3  0.24], [0.96 0.9  0.84]] (showing dim 1)
INFO:    Next Query: [np.float64(0.616525), np.float64(0.739696), np.float64(0.508262)]
INFO: --- F4 (Radius 0.15) ---
INFO:    Best Y so far: 0.4940
INFO:    Centroid (x_best): [0.428 0.369 0.346 0.399]
INFO:    Trust Region: [[0.28 0.22 0.2  0.25], [0.58 0.52 0.5  0.55]] (showing dim 1)
INFO:   