In [2]:
import pandas as pd
import numpy as np
import json
import os
import logging
import warnings
from scipy.stats import norm
from scipy.optimize import minimize, Bounds
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C, WhiteKernel
from sklearn.preprocessing import StandardScaler

# --- Runtime setup (course reference: Module 21, Transparency) ---
# NOTE: this silences every warning category for the whole process.
warnings.filterwarnings("ignore")
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)

# Input observations (CSV) and destination of the proposed queries (JSON).
MASTER_FILE = "bbo_master_w10.csv"
OUTPUT_FILE = "add_data/week10_clean_inputs.json"

# Function ID -> number of input dimensions (columns X1..Xn) for that function.
FUNCTION_DIMS = {
    1: 2,
    2: 2,
    3: 3,
    4: 4,
    5: 4,
    6: 5,
    7: 6,
    8: 8,
}

# Strategy parameters [Source 26]
# XI is the margin subtracted inside Expected Improvement; author's rationale:
# tighter exploitation focus after the Wk9 random search.
XI = 0.01
# Number of random starting points used when optimising the acquisition.
N_RESTARTS = 20
# Fixed seed for reproducibility [Source 14]
RANDOM_SEED = 42

def expected_improvement(X, gp, f_best, xi=XI):
    """Return the negated Expected Improvement (EI) at one or more points.

    The sign is flipped so the result can be handed directly to a minimiser.

    Args:
        X: A single candidate as a 1-D array of features, or a 2-D array of
            shape (n_points, n_features). Must be expressed in the same input
            space the model was fitted on.
        gp: A fitted model exposing ``predict(X, return_std=True)`` that
            returns the posterior mean and standard deviation.
        f_best: Incumbent (best observed) target value, on the same scale as
            the targets ``gp`` was fitted on.
        xi: Margin subtracted from the predicted improvement.

    Returns:
        Array of ``-EI`` values, one per evaluated point (a length-1 array for
        a single candidate). Points with zero predictive std get ``-0.0``.
    """
    X = np.asarray(X)
    if X.ndim < 2:
        # A lone candidate becomes a single-row batch, as predict() expects.
        X = X.reshape(1, -1)

    mu, sigma = gp.predict(X, return_std=True)
    imp = mu - f_best - xi

    # sigma == 0 produces x/0 (divide) or 0/0 (invalid); both are expected
    # here and are overwritten just below, so silence both warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        z = imp / sigma
        ei = imp * norm.cdf(z) + sigma * norm.pdf(z)

    # With no predictive uncertainty there is nothing to gain from sampling.
    ei = np.where(sigma == 0.0, 0.0, ei)
    return -ei

def _clean_function_data(df, f_id, dim):
    """Return ``(X, Y)`` arrays for one function, dropping non-numeric rows.

    ``X`` has one column per input dimension (``X1``..``X{dim}``) and ``Y`` is
    a column vector of the observed outputs.
    """
    x_cols = [f'X{i+1}' for i in range(dim)]
    used_cols = x_cols + ['Y']

    df_f = df[df['Function ID'] == f_id].copy()
    # Strict cleaning for reproducibility: anything unparsable becomes NaN
    # and the whole row is discarded.
    for col in used_cols:
        df_f[col] = pd.to_numeric(df_f[col], errors='coerce')
    df_f = df_f.dropna(subset=used_cols)

    return df_f[x_cols].values, df_f['Y'].values.reshape(-1, 1)


def _fit_surrogate(X_s, Y_s):
    """Fit the GP surrogate (Matern 2.5 + WhiteKernel) on standardised data."""
    # Kernel selection [Source 26], author's rationale:
    #   Matern      - copes with rough / complex response surfaces
    #   WhiteKernel - absorbs the observation noise seen between rounds 8 and 9
    kernel = C(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
    gp = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=10,
        normalize_y=False,  # targets are already standardised by the caller
        random_state=RANDOM_SEED,  # reproducible hyper-parameter restarts
    )
    gp.fit(X_s, Y_s)
    return gp


def _maximize_acquisition(gp, scaler_x, f_best_s, dim):
    """Search the unit hypercube for the EI maximiser with random restarts.

    Returns the best point found in raw (unscaled) coordinates, or ``None``
    when no restart produced a value comparable below ``inf`` (e.g. all NaN).
    """
    def acq(x):
        # Candidates live in raw [0, 1] space; the GP was fitted on scaled X.
        x_scaled = scaler_x.transform(x.reshape(1, -1))
        # Hand the optimiser a plain scalar rather than a size-1 array.
        return float(np.ravel(expected_improvement(x_scaled, gp, f_best_s))[0])

    bounds = Bounds([0.0] * dim, [1.0] * dim)
    best_val = np.inf
    best_x = None
    # NOTE(review): ties keep the earliest restart (strict '<'), so where the
    # acquisition is numerically flat the first random start is returned
    # unchanged - consider seeding restarts from a candidate sweep.
    for _ in range(N_RESTARTS):
        x0 = np.random.uniform(0, 1, dim)
        res = minimize(acq, x0, method='L-BFGS-B', bounds=bounds)
        if res.fun < best_val:
            best_val = res.fun
            best_x = res.x
    return best_x


def generate_queries():
    """Propose one new query per function via GP-based Bayesian optimisation.

    Reads all observations from ``MASTER_FILE``, fits a GP surrogate per
    function on standardised inputs/outputs, maximises Expected Improvement
    inside ``[0, 1]^dim`` and writes the list of proposed points (one list of
    floats per function, rounded to 6 decimals) to ``OUTPUT_FILE`` as JSON.

    Logs an error and returns without writing anything when ``MASTER_FILE``
    does not exist. A function with no usable observations, or for which the
    acquisition search fails, receives a uniformly random query instead.

    Returns:
        None.
    """
    # Global NumPy seed drives the acquisition restarts and random fallbacks.
    np.random.seed(RANDOM_SEED)

    if not os.path.exists(MASTER_FILE):
        logging.error(f"Master file {MASTER_FILE} not found.")
        return

    df = pd.read_csv(MASTER_FILE)
    queries = []

    logging.info("--- Generating Week 10 Queries (Bayesian Optimization) ---")
    logging.info(f"Strategy: Matern 2.5 + WhiteKernel | XI={XI} | Seed={RANDOM_SEED}")

    for f_id, dim in FUNCTION_DIMS.items():
        X, Y = _clean_function_data(df, f_id, dim)

        if X.shape[0] == 0:
            # Nothing to fit a model on: scaling and np.max would both raise.
            logging.warning(f"F{f_id}: No valid observations, using random.")
            best_x = np.random.uniform(0, 1, dim)
            f_best_label = "n/a"
        else:
            # Scaling (critical for kernel performance)
            scaler_x = StandardScaler()
            scaler_y = StandardScaler()
            X_s = scaler_x.fit_transform(X)
            Y_s = scaler_y.fit_transform(Y).flatten()
            # Incumbent expressed in the same standardised space as the GP.
            f_best_s = np.max(Y_s)

            gp = _fit_surrogate(X_s, Y_s)
            best_x = _maximize_acquisition(gp, scaler_x, f_best_s, dim)

            # Fallback
            if best_x is None:
                logging.warning(f"F{f_id}: Optimization failed, using random.")
                best_x = np.random.uniform(0, 1, dim)
            f_best_label = f"{np.max(Y):.4f}"

        # Formatting: plain Python floats so the result is JSON-serialisable.
        q_clean = [float(round(x, 6)) for x in best_x]
        queries.append(q_clean)

        logging.info(f"F{f_id}: f_best (raw)={f_best_label} | Proposed: {q_clean}")

    # Make sure the destination folder exists so the finished run is not lost
    # to a FileNotFoundError at the very last step.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(queries, f, indent=4)
    logging.info(f"Saved to {OUTPUT_FILE}")

# Entry point: generate the queries only when run directly, not on import.
if __name__ == "__main__":
    generate_queries()

INFO: --- Generating Week 10 Queries (Bayesian Optimization) ---
INFO: Strategy: Matern 2.5 + WhiteKernel | XI=0.01 | Seed=42
INFO: F1: f_best (raw)=0.1744 | Proposed: [0.37454, 0.950714]
INFO: F2: f_best (raw)=0.6660 | Proposed: [0.657063, 0.99151]
INFO: F3: f_best (raw)=-0.0122 | Proposed: [1.0, 0.51776, 0.694931]
INFO: F4: f_best (raw)=0.4940 | Proposed: [0.449744, 0.39173, 0.446812, 0.416164]
INFO: F5: f_best (raw)=8662.4050 | Proposed: [0.940459, 0.953929, 0.914864, 0.370159]
INFO: F6: f_best (raw)=-0.2555 | Proposed: [0.355536, 0.313193, 0.465424, 0.893483, 0.081739]
INFO: F7: f_best (raw)=1.6487 | Proposed: [0.103124, 0.902553, 0.505252, 0.826457, 0.32005, 0.895523]
INFO: F8: f_best (raw)=9.9951 | Proposed: [1.0, 1.0, 0.636102, 1.0, 1.0, 0.887155, 1.0, 1.0]
INFO: Saved to add_data/week10_clean_inputs.json
