In [None]:
# Setup cell: imports, display options, project path and the candidate grid.
import os
import sys

import pandas as pd
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
import numpy as np

import random

# Extend the import path with the project's source directory. This has to run
# before the `bo_tools` import below (presumably that is where bo_tools lives
# -- confirm against the repository layout).
sys.path.append(os.path.expanduser("~/CO2-to-C3/src"))
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import uniform, randint
from bo_tools import expected_improvement, random_selection_from_top_EIvalues

import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including library deprecation warnings that would flag API changes.
warnings.filterwarnings("ignore")

# Load the grid of candidate compositions used by all later cells.
bayesopt_grid = pd.read_csv("~/data/BayesOpt_grid.csv") # BayesOpt_grid.csv available at https://doi.org/10.5281/zenodo.15107045

In [None]:
# Report how many rows the grid holds, then preview its first rows.
n_compositions = len(bayesopt_grid)
print("Number of compositions: ", n_compositions)
bayesopt_grid.head()

In [None]:
# ---- Run configuration -------------------------------------------------------

# Ensemble / target
number_of_models = 10               # number of XGBoost models in each ensemble
TARGET = 'log10_jC3H6_grid'         # column the models are trained to predict

# Campaign layout
experiments = 10                    # independent repetitions of the whole campaign
initial_BATCH = [0]                 # Batch 0 means seed dataset
BATCH_SIZE = 6                      # compositions requested per iteration
iteration = 150                     # iterations per experiment

# Sampling / acquisition
random_sampling_ratio = 0.75        # fraction of the screened data drawn for each model
alpha = 0.5                         # for Acquisition function (EI)

# Feature columns: three 'f' columns followed by one column per
# (element slot, species) pair, ordered ele1_*, ele2_*, ele3_*.
species_columns = ['*OCHO', '*COOH', 'CO*COH-2*CO', '*CHO-*CO', '*C-*CHO']
train_cols = ['f1', 'f2', 'f3'] + [
    f'ele{slot}_' + species
    for slot in (1, 2, 3)
    for species in species_columns
]

In [None]:
# Simulated Bayesian-optimisation campaign.
#
# For each experiment:
#   1. build the starting ("screened") dataset,
#   2. per iteration, train an ensemble of tuned XGBoost regressors on bootstrap
#      resamples of the screened data,
#   3. use the ensemble mean / std over the not-yet-screened grid rows to
#      compute the acquisition value and pick the next batch,
#   4. add the picked rows to the screened data and record the best target so far.
# One CSV per experiment is written at the end.

# Hyper-parameter search space shared by every ensemble member. The frozen
# scipy distributions are sampled through RandomizedSearchCV's own
# random_state, so building the dict once is equivalent to rebuilding it for
# every model.
param_dist = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'min_child_weight': randint(1, 7),
    'gamma': uniform(0, 0.5),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1)
}

for exp in range(experiments):

    # seeds[0] is shared (initial sample, model / search random_state, batch
    # selection); seeds[1:] give each ensemble member its own bootstrap and
    # train/validation split.
    seeds = [random.randint(0, 10001) for _ in range(number_of_models + 1)]

    print(f'Experiment {exp+1} (Random Seeds:{seeds})')

    if not initial_BATCH:
        # No seed batch configured: start from a random draw of the grid.
        if BATCH_SIZE == 1:
            initial_df = bayesopt_grid.sample(n=2, random_state=seeds[0]).copy()
        else:
            initial_df = bayesopt_grid.sample(n=BATCH_SIZE, random_state=seeds[0]).copy()
    else:
        initial_df = bayesopt_grid.loc[bayesopt_grid['batch'].isin(initial_BATCH)].copy()

    max_value_list = []

    searched_new_compositions_in_each_iteration = []

    screened_df = initial_df.copy()

    for i in range(iteration):

        # Rows of the grid that have not been screened yet. screened_df is only
        # ever built from slices of bayesopt_grid, so its index labels identify
        # the screened rows exactly. (A column-wise DataFrame.isin(dict) test
        # would also drop unscreened rows whose individual values merely occur
        # somewhere in screened_df.)
        remaining_df = bayesopt_grid.loc[~bayesopt_grid.index.isin(screened_df.index)]

        if remaining_df.empty:
            print(f"Experiment {exp+1} - Iteration {i+1}: no unscreened compositions left, stopping early")
            break

        # Prepare Ensemble XGBoost models
        ensemble_models = []
        for j in range(number_of_models):

            base_model = XGBRegressor(
                missing=np.nan,
                random_state=seeds[0],
                eval_metric='rmse',
                callbacks=[EarlyStopping(rounds=5, save_best=True, maximize=False)] # Early stopping - Suhas
            )

            # Random sampling (with replacement) from the training dataset,
            # i.e. screened_df. Features and target are sliced from the same
            # sampled frame so they stay aligned row by row.
            bootstrap_df = screened_df.sample(n=int(len(screened_df) * random_sampling_ratio + 0.5),
                                              random_state=seeds[j+1], replace=True)
            X_train = bootstrap_df[train_cols].copy()
            y_train = bootstrap_df[TARGET].copy()

            # Splitting into training and validation sets for early stopping
            X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
                X_train, y_train, test_size=0.2, random_state=seeds[j + 1]
            )

            # Cross-validation runs on X_train_split, so the fold count must be
            # chosen from its size: 5 folds need at least 5 rows, otherwise
            # fall back to 2 folds.
            cv = 5 if len(X_train_split) >= 5 else 2

            random_search = RandomizedSearchCV(
                estimator=base_model,
                param_distributions=param_dist,
                n_iter=50,      # Number of parameter settings sampled
                cv=cv,          # 5-fold (or 2-fold for tiny training sets) cross validation
                scoring='neg_root_mean_squared_error',
                n_jobs=-1,      # Use all available cores
                verbose=0,      # Low verbosity or use '2'
                random_state=seeds[0]
            )

            # Early stopping is driven by the EarlyStopping callback configured
            # on the estimator; the fit-time `early_stopping_rounds` argument is
            # deprecated and rejected by recent XGBoost releases, so only the
            # validation set is passed here.
            random_search.fit(X_train_split, y_train_split, eval_set=[(X_val_split, y_val_split)], verbose=False)

            ensemble_models.append(random_search.best_estimator_)

        # Prediction for rest of the compositions
        X_search = remaining_df[train_cols].copy()
        mu_array = np.array([model.predict(X_search) for model in ensemble_models])

        # Ensemble mean and spread for each remaining row.
        mean_values = np.mean(mu_array, axis=0)
        std_values = np.std(mu_array, axis=0)

        # Calculate EI against the best target value screened so far
        max_value = screened_df[TARGET].max()

        ei = expected_improvement(mean_values, std_values, max_value, alpha=alpha)

        # Randomly select compositions based on HIGH EI values.
        # The returned values are used as positions into remaining_df.
        rand_idx = random_selection_from_top_EIvalues(ei, n=BATCH_SIZE, seed=seeds[0])

        selected_next_compositions = []
        new_rows = []
        for idx in rand_idx:
            composition = remaining_df.iloc[idx]['composition_nominal']
            selected_next_compositions.append(composition)
            # Every remaining row that shares the selected nominal composition is added.
            new_rows.append(remaining_df[remaining_df['composition_nominal'] == composition])

        # Append the batch in one concat (same row order as appending one by
        # one) and drop rows that were picked twice because two selected
        # positions share a composition.
        screened_df = pd.concat([screened_df, *new_rows], axis=0)
        screened_df = screened_df[~screened_df.index.duplicated(keep='first')]

        new_max_value = screened_df[TARGET].max()

        # TARGET is a log10 column (per its name); 10** reports it on the linear scale.
        max_value_list.append(10**new_max_value)
        searched_new_compositions_in_each_iteration.append(selected_next_compositions)

        print(f"Experiment {exp+1} - Iteration {i+1} (#ofExp. = {len(screened_df)}) - New MAX value: {10**new_max_value} ({selected_next_compositions})")

    # One row per completed iteration (fewer than `iteration` only if the grid ran out).
    result_df = pd.DataFrame({'iteration': range(1, len(max_value_list) + 1), 'max_value': max_value_list, 'screened_compositions': searched_new_compositions_in_each_iteration})
    # NOTE(review): the file label uses exp+10 while the log messages use exp+1 -- confirm the offset is intended.
    result_df.to_csv(f"BO_BatchSize{BATCH_SIZE}_{TARGET}_Iter_{iteration}_Initial_{initial_BATCH}_exp{exp+10}_Seed_{seeds}.csv", index=False)