In [1]:
import math
import gc
import pickle
import random
from copy import deepcopy
from tqdm import tqdm
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
import optuna
from optuna.samplers import RandomSampler, TPESampler, GPSampler
import warnings
warnings.filterwarnings("ignore")
# import multiprocessing
# max_n_jobs = multiprocessing.cpu_count()
import shap
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader, Sampler
import torch.nn as nn
import torch.optim as optim
import mlx.core as mx
import mlx.nn as nnmx
import mlx.optimizers as optimmx

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set the device
device = "mps" if torch.backends.mps.is_available() else "cpu"

In [3]:
feature_version = 2
# 1 for pc feature, 
# 2 for label correlation feature # seems to work most consistently
# 3 for best features based on combination rank
# 4 for including time features (in case we want to reverse engineer the masked timestamp)
# 5 for increasing number of correlation features + only use those that are in the same cluster
# 6 is for 2 but more features, use when I want to use more features for larger models or AE approaches

In [4]:
default_random_state = 101
random.seed(default_random_state)
np.random.seed(default_random_state)
torch.manual_seed(default_random_state)
torch.mps.manual_seed(default_random_state)
mx.random.seed(default_random_state)

#### Import train data and popular features

In [5]:
train_df = pd.read_parquet(f"data/cleaned/cleaned_train_{feature_version}.parquet")
train_df.head()

Unnamed: 0,X751,X473,X472,X451,X226,X219,X205,X445,X444,X27,...,X758_X219_X89_interaction,X508_X465_X226_interaction,X508_X466_X226_interaction,X445_X466_X226_interaction,X465_X466_X226_interaction,X466_X226_X444_interaction,X95_X169_X272_interaction,X89_X169_X272_interaction,__index_level_0__,label
0,0.000617,0.362816,0.255354,0.625153,-0.893206,-0.654146,-1.250753,0.755891,0.625328,1.714323,...,0.356342,-0.060688,-0.088965,-0.232081,-0.071993,-0.191994,0.432966,0.495033,2023-03-01 00:00:00,0.562539
1,0.013388,0.378391,0.274621,0.63725,-0.738291,-0.634723,-1.100357,0.760472,0.633046,1.396133,...,0.342402,-0.051643,-0.07449,-0.195497,-0.062058,-0.162739,0.460443,0.525698,2023-03-01 00:01:00,0.533686
2,-0.016807,0.382337,0.279272,0.640437,-0.71342,-0.631882,-1.073226,0.761631,0.635009,1.205921,...,0.339868,-0.048644,-0.069951,-0.189648,-0.060436,-0.158119,0.468474,0.534105,2023-03-01 00:02:00,0.546505
3,-0.036622,0.387473,0.28475,0.642831,-0.644172,-0.612901,-0.982398,0.761936,0.635508,1.419536,...,0.32869,-0.042121,-0.060503,-0.171468,-0.054733,-0.143016,0.461758,0.525697,2023-03-01 00:03:00,0.357703
4,-0.053322,0.39082,0.289431,0.648175,-0.62884,-0.607648,-0.952145,0.76477,0.640311,1.408936,...,0.324915,-0.039559,-0.056454,-0.168882,-0.054339,-0.141398,0.450168,0.51177,2023-03-01 00:04:00,0.362452


In [6]:
popular_features_train = pd.read_parquet("data/cleaned/popular_features_train.parquet")
popular_features_train.head()

Unnamed: 0,volume,bid_qty,ask_qty,buy_qty,sell_qty
0,221.389,15.283,8.425,176.405,44.984
1,847.796,38.59,2.336,525.846,321.95
2,295.596,0.442,60.25,159.227,136.369
3,460.705,4.865,21.016,335.742,124.963
4,142.818,27.158,3.451,98.411,44.407


#### Implement some helper function

In [7]:
# First need to split into some fold
train_df["__index_level_0__"] = pd.to_datetime(train_df["__index_level_0__"])

default_cv = 4
default_cv_type = "full"
# NOTE: default_cv must set to 1 instead of 3 based on consistency with LB score contains 49% of test data
# NOTE: 3 cv with gap is slightly better or almost equal

def create_cv(train_df, features=None, cv=default_cv):
    if features is not None:
        train_df = train_df[features + ["__index_level_0__", "label"]]
    X_train_arr = []
    X_test_arr = []
    Y_train_arr = []
    Y_test_arr = []
    for i in range(cv):
        # if i == 0:
        #     train_month = [3, 4, 5, 6, 7, 8]
        #     test_month = [9, 10, 11, 12, 1, 2]
        # else:
        train_month = list(range(3 + i, 7 + i))
        test_month = list(map(lambda x: x % 12 if x > 12 else x, list(range(8 + i, 12 + i))))
        print(train_month, test_month)
        # print(train_month, test_month)
        train = train_df[train_df["__index_level_0__"].dt.month.isin(train_month)].reset_index().drop("index", axis = 1)
        test = train_df[train_df["__index_level_0__"].dt.month.isin(test_month)].reset_index().drop("index", axis = 1)
        X_train_arr.append(train.drop(["__index_level_0__", "label"], axis = 1))
        X_test_arr.append(test.drop(["__index_level_0__", "label"], axis = 1))
        Y_train_arr.append(train["label"])
        Y_test_arr.append(test["label"])  
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

# def create_cv_random_test(train_df, features=None, test_cv=10):
#     # randomize so that we have 1 train, but try it on 10 different test 
#     if features is not None:
#         train_df = train_df[features + ["timestamp", "label"]]
#     X_train_arr = []
#     X_test_arr = []
#     Y_train_arr = []
#     Y_test_arr = []

#     # Create train data
#     train_month = [3, 4, 5, 6, 7, 8]
#     train = train_df[train_df["timestamp"].dt.month.isin(train_month)] 
#     X_train_arr.append(train.drop(["timestamp", "label"], axis = 1))
#     Y_train_arr.append(train["label"])

#     test_month = [9, 10, 11, 12, 1, 2]
#     test = train_df[train_df["timestamp"].dt.month.isin(test_month)]
#     # Create test data
#     for _ in range(test_cv):
#         random_test = test.sample(frac = 0.5, random_state = default_random_state)
#         X_test_arr.append(random_test.drop(["timestamp", "label"], axis = 1))
#         Y_test_arr.append(random_test["label"])

#     return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr 

# class [-1, 0, 1] -> [0, 1, 2] => < -0.2 => neg, > 0.2 => pos, else => neutral
def create_classification_class(label):
    if label < -0.4: return 0
    elif label < 0: return 1
    elif label < 0.4: return 2
    return 3

def create_cv_classification(train_df, features=None, cv=default_cv):
    if features is not None:
        train_df = train_df[features + ["__index_level_0__", "label"]]
    X_train_arr = []
    X_test_arr = []
    Y_train_arr = []
    Y_test_arr = []
    for i in range(cv):
        train_month = list(range(3 + i, 7 + i))
        # train_month = [3, 4, 5, 6, 7, 8]
        test_month = list(map(lambda x: x % 12 if x > 12 else x, list(range(8 + i, 12 + i))))
        print(train_month, test_month)
        # test_month = [9, 10, 11, 12, 1, 2] # try to make a gap to see if there is any differences in cv-lb correlation
        # print(train_month, test_month)
        train = train_df[train_df["__index_level_0__"].dt.month.isin(train_month)].reset_index().drop("index", axis = 1)
        test = train_df[train_df["__index_level_0__"].dt.month.isin(test_month)].reset_index().drop("index", axis = 1)
        X_train_arr.append(train.drop(["__index_level_0__", "label"], axis = 1))
        X_test_arr.append(test.drop(["__index_level_0__", "label"], axis = 1))
        Y_train_arr.append(train["label"].apply(lambda x: create_classification_class(x)))
        Y_test_arr.append(test["label"].apply(lambda x: create_classification_class(x)))  
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

In [8]:
def pearson_score(Y_test, Y_pred):
    if isinstance(Y_test, pd.Series) or isinstance(Y_test, pd.DataFrame):
        Y_test = Y_test.values
    if isinstance(Y_pred, pd.Series) or isinstance(Y_pred, pd.DataFrame):
        Y_pred = Y_pred.values
    Y_test = np.ravel(Y_test)
    Y_pred = np.ravel(Y_pred)
    pearson = np.corrcoef(Y_test, Y_pred)[0, 1]
    if np.isnan(pearson):
        if np.std(Y_pred) == 0:
            print(Y_pred)
            print("Error: zero variance prediction")
        elif np.isnan(Y_pred).any():
            print("Error: nan prediction")
        return -1
    else:
        return pearson

In [9]:
# Make function specifically for cross validation
def train_eval_cv(model, cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, scoring_function=pearson_score):
    cv_score = 0

    for i in range(cv):
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        cv_score += scoring_function(Y_test, Y_pred)
    
    return cv_score / cv

def train_eval_cv_random_test(model, cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, scoring_function=pearson_score, test_cv = 10):
    cv_score = 0

    for i in range(cv):
        curr_cv_score = 0

        # Conduct fitting
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        model.fit(X_train, Y_train)
        
        # sampling and testing
        len_test = X_test.shape[0]
        for seed in tqdm(range(test_cv)):
            np.random.seed(seed)
            test_index = np.random.choice(len_test, size = len_test // 2, replace = False) 
            X_test_sample = X_test.loc[test_index, :]
            Y_test_sample = Y_test[test_index]
            Y_pred_sample = model.predict(X_test_sample)
            curr_cv_score += scoring_function(Y_test_sample, Y_pred_sample)
        
        cv_score += curr_cv_score / test_cv
    
    np.random.seed(default_random_state)
    return cv_score

In [10]:
default_n_trees = 1000
# Finetuning XGBoost
def objective_xgboost(trial):
    params = {
        "n_estimators": default_n_trees,
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log = True), # 0.001 - 0.1 -> 0.01 - 0.05 
        "verbosity": 0,
        "subsample": trial.suggest_float("subsample", 0.05, 1.0), # 1.0 -> 0.2
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.05, 1), 
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1), 
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "enable_categorical": True,
        "random_state": default_random_state
    }

    xgbr = XGBRegressor(**params)
    cv_pearson = train_eval_cv(xgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, pearson_score)
    return cv_pearson

def objective_lightgbm(trial):
    params = {
        "n_estimators": default_n_trees,
        "verbosity": -1,
        "max_depth": trial.suggest_int("max_depth", 2, 10), # 1 - 10 => 1 - 5
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 -> 0.005 - 0.02
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_state": default_random_state
    }

    lgbr = LGBMRegressor(**params)
    cv_pearson = train_eval_cv(lgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, pearson_score)
    return cv_pearson

def objective_catboost(trial):
    params = {
        "iterations": default_n_trees,
        "verbose": False,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 => 0.01 - 0.1
        "depth": trial.suggest_int("depth", 1, 10), #  1 - 10 => 5 - 15
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 600),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_seed": default_random_state
    }

    cbr = CatBoostRegressor(**params)
    cv_pearson = train_eval_cv(cbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, pearson_score)
    return cv_pearson

In [11]:
# Finetuning XGBoost
def objective_xgboost_classification(trial):
    params = {
        "n_estimators": default_n_trees,
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log = True), # 0.001 - 0.1 -> 0.01 - 0.05 
        "verbosity": 0,
        "subsample": trial.suggest_float("subsample", 0.05, 1.0), # 1.0 -> 0.2
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.05, 1), 
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1), 
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "enable_categorical": True,
        "random_state": default_random_state
    }

    xgbr = XGBClassifier(**params)
    cv_acc = train_eval_cv(xgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, accuracy_score)
    return cv_acc

def objective_lightgbm_classification(trial):
    params = {
        "n_estimators": default_n_trees,
        "verbosity": -1,
        "max_depth": trial.suggest_int("max_depth", 2, 10), # 1 - 10 => 1 - 5
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 -> 0.005 - 0.02
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_state": default_random_state
    }

    lgbr = LGBMClassifier(**params)
    cv_acc = train_eval_cv(lgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, accuracy_score)
    return cv_acc

def objective_catboost_classification(trial):
    params = {
        "iterations": default_n_trees,
        "verbose": False,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 => 0.01 - 0.1
        "depth": trial.suggest_int("depth", 1, 10), #  1 - 10 => 5 - 15
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 600),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_seed": default_random_state
    }

    cbr = CatBoostRegressor(**params)
    cv_acc = train_eval_cv(cbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, accuracy_score)
    return cv_acc

In [12]:
default_n_trials = 100
default_n_jobs = 1

def optimize_xgboost(study_name, storage_name, objective_function=objective_xgboost, n_trials = default_n_trials, n_jobs = default_n_jobs):
    print("Conduct hyperparam opt for XGBoost")
    study = optuna.create_study(
        study_name = study_name,
        direction ='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

def optimize_lightgbm(study_name, storage_name, objective_function=objective_lightgbm, n_trials = default_n_trials, n_jobs = default_n_jobs):
    print("Conduct hyperparam opt for LightGBM")
    study = optuna.create_study(
        study_name = study_name,
        direction='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

def optimize_catboost(study_name, storage_name, objective_function=objective_catboost, n_trials = default_n_trials, n_jobs = default_n_jobs):
    print("Conduct hyperparam opt for CatBoost")
    study = optuna.create_study(
        study_name = study_name,
        direction='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

#### First iteration: training with all features from the collection, no popular features

In [None]:
original_features = [f for f in train_df.columns if "X" in f]

X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_df, original_features)

In [None]:
best_params_xgboost = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study"
)

In [None]:
best_params_lightgbm = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study"
)

In [None]:
# best_params_catboost = optimize_catboost(
#     f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study",
#     f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study"
# )
# # Need to take down as catboost might not work well in this situation

Analyze params - cv relationship

In [13]:
def get_study_df(filename):
    study = optuna.load_study(
        study_name = filename,
        storage = f"sqlite:///{filename}.db"
    )
    study_df = []
    for trial in study.trials:
        trial_dict = trial.params
        trial_dict["value"] = trial.value
        study_df.append(trial_dict)

    return pd.DataFrame(study_df)

In [14]:
def params_value_viz(study_df):
    nrows = (study_df.shape[1] - 1) // 3 + ((study_df.shape[1] - 1) % 3 > 0)
    fig, ax = plt.subplots(nrows = nrows, ncols = 3, figsize = (14, 5 * nrows))
    for inx, var in enumerate(study_df.columns):
        x, y = inx // 3, inx % 3
        if var != "value":
            sns.regplot(study_df, x = var, y = "value", ax = ax[x][y], lowess=True, line_kws={'color': 'green'}, ci = 95)
    plt.show()

In [None]:
study_df_xgboost = get_study_df(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")   
params_value_viz(study_df_xgboost)

In [None]:
study_df_lightgbm = get_study_df(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
params_value_viz(study_df_lightgbm)

In [None]:
# study_df_catboost = get_study_df(f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
# params_value_viz(study_df_catboost)

Analyze feature importance + CV performance

In [15]:
def get_best_params_from_file(filename):
    study = optuna.load_study(
        study_name = filename,
        storage = f"sqlite:///{filename}.db"
    )
    return study.best_params

In [16]:
def get_shap_values(model, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, sample_size=10000):
    mean_abs_shap_all = np.zeros(X_train_arr[0].shape[1])
    for i in range(default_cv):
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        model.fit(X_train, Y_train)
        X_test_sample = X_test.sample(sample_size, random_state = default_random_state)
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        mean_abs_shap = np.mean(np.abs(shap_values), axis = 0)
        mean_abs_shap_all += mean_abs_shap
    mean_abs_shap_all /= default_cv
    return mean_abs_shap_all

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
for p in best_params_xgboost:
    params[p] = best_params_xgboost[p]

xgboost_feature_importances = {}

xgbr = XGBRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    xgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, xgbr.predict(X_test)))
    features = xgbr.feature_names_in_.tolist()
    # features_i = xgbr.feature_importances_.tolist()
    features_i = get_shap_values(xgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]

# print(feature_importances)
plt.hist(xgboost_feature_importances.values())
# Seems like only COD features are important (can try to only use 4-8 hours if 4-13 hours does not work well)

In [None]:
print([f for f in xgboost_feature_importances if xgboost_feature_importances[f] > 0.01])

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state
}
best_params_lightgbm = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
for p in best_params_lightgbm:
    params[p] = best_params_lightgbm[p]

lightgbm_feature_importances = {}

lgbr = LGBMRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    lgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, lgbr.predict(X_test)))
    features = lgbr.feature_names_in_.tolist()
    # features_i = lgbr.feature_importances_.tolist()
    features_i = get_shap_values(lgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

plt.hist(lightgbm_feature_importances.values())
# seems to pick up time features not as good as past 4 hours features

In [None]:
print([f for f in lightgbm_feature_importances if lightgbm_feature_importances[f] >= 0.01])

In [None]:
# params = {
#     "iterations": default_n_trees,
#     "verbose": False,
#     "random_seed": default_random_state
# }
# best_params_catboost = get_best_params_from_file(f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
# for p in best_params_catboost:
#     params[p] = best_params_catboost[p]

# catboost_feature_importances = {}

# cbr = CatBoostRegressor(**params)
# cv_rmse = 0

# for i in range(default_cv):
#     X_train, X_test = X_train_arr[i], X_test_arr[i]
#     Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
#     cbr.fit(X_train, Y_train)
#     print(pearson_score(Y_test, cbr.predict(X_test)))
#     features = cbr.feature_names_
#     # features_i = cbr.feature_importances_.tolist()
#     features_i = get_shap_values(cbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
#     for inx, feat in enumerate(features):
#         catboost_feature_importances[feat] = catboost_feature_importances.get(feat, 0) + features_i[inx]

# plt.hist(catboost_feature_importances.values())
# # can pick up a combination of both past cod and tss, not good at picking up ph, temp

In [None]:
# print([f for f in catboost_feature_importances if catboost_feature_importances[f] >= 0.02])

Get top 20 important features in all of them

In [None]:
xgboost_feature_importances_df = pd.DataFrame(
    {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
)
xgboost_feature_importances_df["importance"] /= default_cv
# xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
lightgbm_feature_importances_df = pd.DataFrame(
    {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
)
lightgbm_feature_importances_df["importance"] /= default_cv
# lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# catboost_feature_importances_df = pd.DataFrame(
#     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# )
# catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
feature_importances_df = xgboost_feature_importances_df.merge(
    lightgbm_feature_importances_df,
    on="var",
    how="inner",
    suffixes=("_xgboost", "_lightgbm")
)
# feature_importances_df = feature_importances_df.merge(
#     catboost_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("", "_catboost")
# )
# feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])

feature_importances_df["importance"] = 1/2 * (feature_importances_df["importance_xgboost"] + feature_importances_df["importance_lightgbm"])
feature_importances_df = feature_importances_df.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
feature_importances_df[:50]

In [None]:
feature_importances_df.to_csv("feature_importances_df.csv", index = False)

In [None]:
feature_importances_df = pd.read_csv("feature_importances_df.csv")
feature_importances_df

In [None]:
print(feature_importances_df.loc[:49, "var"].tolist())

#### Second Iteration: adding popular feature in addition to original features correlated to label

In [None]:
train_added_df = pd.concat([train_df, popular_features_train], axis = 1)

X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df)

In [None]:
best_params_xgboost_popular_feature = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study"
)

In [None]:
best_params_lightgbm_popular_feature = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study"
)

Check for feature importance

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost_popular_feature = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_xgboost_popular_feature:
    params[p] = best_params_xgboost_popular_feature[p]

xgboost_feature_importances = {}

xgbr = XGBRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    xgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, xgbr.predict(X_test)))
    features = xgbr.feature_names_in_.tolist()
    # features_i = xgbr.feature_importances_.tolist()
    features_i = get_shap_values(xgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]

# print(feature_importances)
plt.hist(xgboost_feature_importances.values())
# Seems like only COD features are important (can try to only use 4-8 hours if 4-13 hours does not work well)

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state
}
best_params_lightgbm_popular_feature = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_lightgbm_popular_feature:
    params[p] = best_params_lightgbm_popular_feature[p]

lightgbm_feature_importances = {}

lgbr = LGBMRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    lgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, lgbr.predict(X_test)))
    features = lgbr.feature_names_in_.tolist()
    # features_i = lgbr.feature_importances_.tolist()
    features_i = get_shap_values(lgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

plt.hist(lightgbm_feature_importances.values())
# seems to pick up time features not as good as past 4 hours features

In [None]:
xgboost_feature_importances_df = pd.DataFrame(
    {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
)
xgboost_feature_importances_df["importance"] /= default_cv
# xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
lightgbm_feature_importances_df = pd.DataFrame(
    {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
)
lightgbm_feature_importances_df["importance"] /= default_cv
# lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# catboost_feature_importances_df = pd.DataFrame(
#     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# )
# catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
feature_importances_df = xgboost_feature_importances_df.merge(
    lightgbm_feature_importances_df,
    on="var",
    how="inner",
    suffixes=("_xgboost", "_lightgbm")
)
# feature_importances_df = feature_importances_df.merge(
#     catboost_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("", "_catboost")
# )
# feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])
feature_importances_df["importance"] = 1/2 * (feature_importances_df["importance_xgboost"] + feature_importances_df["importance_lightgbm"])
feature_importances_df = feature_importances_df.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
feature_importances_df

In [None]:
best_xgboost_score = optuna.load_study(
    study_name = "xgboost_2_4_101_1000_popular_feature_study",
    storage = f"sqlite:///xgboost_2_4_101_1000_popular_feature_study.db"
).best_value
best_lightgbm_score = optuna.load_study(
    study_name = "lightgbm_2_4_101_1000_popular_feature_study",
    storage = f"sqlite:///lightgbm_2_4_101_1000_popular_feature_study.db"
).best_value
feature_importances_df["weighted_importance"] = (best_xgboost_score * feature_importances_df["importance_xgboost"] + best_lightgbm_score * feature_importances_df["importance_lightgbm"]) / (best_xgboost_score + best_lightgbm_score)
feature_importances_df = feature_importances_df.sort_values("weighted_importance", ascending=False, ignore_index=True)
feature_importances_df

In [None]:
feature_importances_df.to_csv("feature_importances_df.csv", index = False)

In [None]:
feature_importances_df = pd.read_csv("feature_importances_df.csv")
feature_importances_df

In [None]:
feature_importances_df[~feature_importances_df["var"].str.contains("X")]

In [None]:
print(feature_importances_df.sort_values("importance", ignore_index=True, ascending=False).head(30)["var"].tolist())

In [None]:
print(feature_importances_df.sort_values("weighted_importance", ignore_index=True, ascending=False).head(30)["var"].tolist())

In [None]:
s1 = set(feature_importances_df.sort_values("importance", ignore_index=True, ascending=False).head(20)["var"].tolist())
s2 = set(feature_importances_df.sort_values("weighted_importance", ignore_index=True, ascending=False).head(20)["var"].tolist())
print(s1 - s2)
print(s2 - s1)

In [None]:
feature_importances_df.sort_values("importance", ignore_index=True, ascending=False).head(50)

In [None]:
feature_importances_df.sort_values("weighted_importance", ignore_index=True, ascending=False).head(50)

#### Third Iteration: a common truncated version using good features across all models + popular features

In [None]:
best_features = ['X757', 'X758', 'X759', 'X508', 'X614', 'X752', 'X331', 'X445', 'X465', 'X385', 
                 'X466', 'X95', 'X23', 'X219', 'X31', 'X373', 'X379', 'X284', 'X750', 'X652', 
                 'X279', 'X89', 'X169', 'X753', 'X226', 'X28', 'X444', 'X272', 'X271', 'X218']
# Best is at 30 features with no popular features
best_features = list(set(best_features))
train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

XGBoost

In [None]:
best_xgboost_params_common_truncated = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_study"
) 

LightGBM

In [None]:
best_lightgbm_params_common_truncated = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_study"
)

Catboost

In [None]:
# best_catboost_params_common_truncated = optimize_catboost(
#     f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study",
#     f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study"
# )

Analyze model performance and feature importance across train and test

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
for p in best_params_xgboost:
    params[p] = best_params_xgboost[p]

xgbr_arr = []

for i in tqdm(range(default_cv)):
    xgbr = XGBRegressor(**params)
    xgbr.fit(X_train_arr[i], Y_train_arr[i])
    xgbr_arr.append(xgbr)

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state,
}
best_params_lightgbm = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
for p in best_params_lightgbm:
    params[p] = best_params_lightgbm[p]

lgbr_arr = []

for i in tqdm(range(default_cv)):
    lgbr = LGBMRegressor(**params)
    lgbr.fit(X_train_arr[i], Y_train_arr[i])
    lgbr_arr.append(lgbr)

In [None]:
xgboost_feature_importances = {}
lightgbm_feature_importances = {}

for i in tqdm(range(default_cv)):
    features = xgbr_arr[i].feature_names_in_.tolist()
    features_i = get_shap_values(xgbr_arr[i], X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]
    features = lgbr_arr[i].feature_names_in_.tolist()
    features_i = get_shap_values(xgbr_arr[i], X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

xgboost_feature_importances_df = pd.DataFrame(
    {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
)
xgboost_feature_importances_df["importance"] /= default_cv
# xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
lightgbm_feature_importances_df = pd.DataFrame(
    {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
)
lightgbm_feature_importances_df["importance"] /= default_cv
# lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# catboost_feature_importances_df = pd.DataFrame(
#     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# )
# catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
feature_importances_df_common_truncated = xgboost_feature_importances_df.merge(
    lightgbm_feature_importances_df,
    on="var",
    how="inner",
    suffixes=("_xgboost", "_lightgbm")
)
# feature_importances_df = feature_importances_df.merge(
#     catboost_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("", "_catboost")
# )
# feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])
feature_importances_df_common_truncated["importance"] = 1/2 * (feature_importances_df_common_truncated["importance_xgboost"] + feature_importances_df_common_truncated["importance_lightgbm"])
feature_importances_df_common_truncated = feature_importances_df_common_truncated.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
feature_importances_df_common_truncated

In [None]:
best_xgboost_score = optuna.load_study(
    study_name = "xgboost_2_4_101_1000_common_truncated_20_study",
    storage = f"sqlite:///xgboost_2_4_101_1000_common_truncated_20_study.db"
).best_value
best_lightgbm_score = optuna.load_study(
    study_name = "lightgbm_2_4_101_1000_common_truncated_20_study",
    storage = f"sqlite:///lightgbm_2_4_101_1000_common_truncated_20_study.db"
).best_value
feature_importances_df_common_truncated["weighted_importance"] = (best_xgboost_score * feature_importances_df_common_truncated["importance_xgboost"] + best_lightgbm_score * feature_importances_df_common_truncated["importance_lightgbm"]) / (best_xgboost_score + best_lightgbm_score)
feature_importances_df_common_truncated = feature_importances_df_common_truncated.sort_values("weighted_importance", ascending=False, ignore_index=True)
feature_importances_df_common_truncated

#### Fourth iteration: Adding popular feature on top of truncated X

In [None]:
best_features = ['X757', 'X758', 'X759', 'X508', 'X614', 'X752', 'X331', 'X445', 'X465', 'X385', 
                 'X466', 'X95', 'X23', 'X219', 'X31', 'X373', 'X379', 'X284', 'X750', 'X652', 
                 'X279', 'X89', 'X169', 'X753', 'X226', 'X28', 'X444', 'X272', 'X271', 'X218'] + \
                ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] 
# Since the features are already normalized, we cannot use the newly created features like order_flow_imbalance,
# since they lose their meanings already, but we can still use the old popular features
best_features = list(set(best_features))
train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

XGBoost

In [None]:
best_xgboost_params_common_truncated_popular_feature = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features) - 5}_popular_feature_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features) - 5}_popular_feature_study"
) 

LightGBM

In [None]:
best_lightgbm_params_common_truncated_popular_feature = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_popular_feature_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_popular_feature_study"
) 

#### Fifth Iteration Instead of using GBDT, can we use MLP on these features

Convert from normal CV to torch type CV

In [None]:
# Create the CV data, seems to be better with only anonymized features
best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531',
                 'X385', 'X23', 'X465', 'X284', 'X331', 'X95', 'X169', 'X285', 'X137', 'X31']
                # ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                # [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
best_features = list(set(best_features))
train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

In [None]:
# Extra code to "reduce" from float64 to float32
def float64_to_float32(data):
    if isinstance(data, pd.DataFrame):
        for col in data.columns:
            data[col] = data[col].astype("float32")
    elif isinstance(data, pd.Series):
        data = data.astype("float32")
    return data

for i in range(default_cv):
    X_train_arr[i] = float64_to_float32(X_train_arr[i])
    X_test_arr[i] = float64_to_float32(X_test_arr[i])
    Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
    Y_test_arr[i] = float64_to_float32(Y_test_arr[i])

In [None]:
def normal_cv_to_torch_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, cv=default_cv):
    train_arr = []
    test_arr = []
    for i in range(cv):
        # First shuffle the data
        X_train, Y_train = X_train_arr[i], Y_train_arr[i]
        X_train["label"] = Y_train
        # Instead of shuffle the training data when create the dataloader, try to shuffle beforehand
        # X_train = X_train.sample(frac = 1, random_state = default_random_state)
        # not shuffle, keep it by date
        Y_train = X_train["label"]
        X_train = X_train.drop("label", axis = 1)

        # Then normalize
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train.values)

        # Create train dataset
        X_train, Y_train = torch.from_numpy(X_train), torch.from_numpy(Y_train.values)
        train_dataset = TensorDataset(X_train, Y_train)
        train_arr.append(train_dataset)

        # Normalize X_test
        X_test = scaler.transform(X_test_arr[i].values)

        # Create test dataset
        X_test, Y_test = torch.from_numpy(X_test), torch.from_numpy(Y_test_arr[i].values)
        test_dataset = TensorDataset(X_test, Y_test)
        test_arr.append(test_dataset)
        
    return train_arr, test_arr

In [None]:
train_arr, test_arr = normal_cv_to_torch_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

Define the model

In [None]:
# Define the model
class MLP(nn.Module):
    def __init__(self, num_features, hidden_layers_size, dropout):
        super(MLP, self).__init__()

        # Initialize layers & batchnorm
        last_layer = num_features
        self.layers = nn.ModuleList()
        for current_layer in hidden_layers_size:
            self.layers.append(nn.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.layers.append(nn.Linear(last_layer, 1))

        # Initialize activation
        self.activation = nn.ReLU()

        # Initialze dropout
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, x):
        for inx, layer in enumerate(self.layers):
            if inx == len(self.layers) - 1:
                x = layer(x)
            else:
                x = layer(x)
                x = self.activation(x)
                x = self.dropout(x)
        return x

    def reset(self):
        for layer in self.layers:
            layer.reset_parameters()

Train model with CV and evaluate

In [None]:
# Separate function for train & eval step
def train_mlp(model, criterion, optimizer, train_dataloader, num_epochs):
    model.train()
    for _ in tqdm(range(num_epochs)):
        for (inputs, targets) in train_dataloader:
            # Load to device
            inputs, targets= inputs.to(device), targets.to(device)
            # Forward step
            outputs = model(inputs)
            # get error
            error = criterion(outputs, targets)
            # Zero out the past gradient
            optimizer.zero_grad()
            # Backprop
            error.backward()
            # Gradient Descent
            optimizer.step()

def eval_mlp(model, test_dataloader):
    outputs_all = np.zeros(0)
    targets_all = np.zeros(0)
    model.eval()
    with torch.no_grad():
        for _, (inputs, targets) in enumerate(test_dataloader):
            # Load to device
            inputs = inputs.to(device)
            # Forward step
            outputs = model(inputs).detach().cpu().numpy().flatten()
            # Load to overall Y_test, Y_pred to calculate pearson score later
            outputs_all = np.concatenate([outputs_all, outputs])
            targets_all = np.concatenate([targets_all, targets])
    return pearson_score(targets_all, outputs_all)

In [None]:
def train_eval_cv_torch(model, lr, cv, train_arr, test_arr, batch_size, num_epochs):
    cv_pearson = 0
    for i in range(cv):
        # Get the dataloader
        train_dataset = train_arr[i]
        train_dataloader = DataLoader(train_dataset, batch_size = batch_size, num_workers=0)
        test_dataset = test_arr[i]
        test_dataloader = DataLoader(test_dataset, batch_size = batch_size, num_workers=0)

        # Reinitialize the model
        model.reset()
        model.to(device)

        # Initialize the loss function
        criterion = nn.MSELoss()

        # Reinitialize the optimizer
        optimizer = optim.Adam(model.parameters(), lr = lr)

        # Train the model
        train_mlp(model, criterion, optimizer, train_dataloader, num_epochs)

        # Test the model
        pearson = eval_mlp(model, test_dataloader)
        print(pearson)
        cv_pearson += pearson
    return cv_pearson / cv

In [None]:
# Training process of the default config
hidden_layers_size = [16, 8, 4]
lr = 0.001
batch_size = 60
num_epochs = 10

mlpr = MLP(len(best_features), hidden_layers_size=hidden_layers_size, dropout = 0.3)

train_eval_cv_torch(mlpr, lr, default_cv, train_arr, test_arr, batch_size, num_epochs)

#### Sixth Iteration: Change this into a classification problem

In [None]:
original_features = [f for f in train_df.columns if "X" in f]

X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_classification(train_df, original_features)

In [None]:
best_params_xgboost_classification = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    objective_xgboost_classification
)

In [None]:
best_params_lightgbm_classification = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    objective_lightgbm_classification
)

#### Seventh Iteration: Search for the best way to train

In [None]:
def search_training_scheme(model, train_df, cv = default_cv, features = None):
    folds_trial = [
        # level 1
        [[0, 1, 2, 3]], 
        [[0, 1]], [[1, 2]], [[2, 3]],
        [[0]], [[1]], [[2]], [[3]],
        [[0, 1], [1, 2], [2, 3]],
        [[0, 1], [2, 3]],
        [[0], [1], [2], [3]],
        # level 2
        [[0, 1, 2, 3], [0, 1]],
        [[0, 1, 2, 3], [1, 2]],
        [[0, 1, 2, 3], [2, 3]],
        [[0, 1, 2, 3], [0, 1], [2, 3]],
        [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3]],
        [[0, 1, 2, 3], [0], [1], [2], [3]],
        [[0, 1], [2, 3], [0], [1], [2], [3]],
        [[0, 1], [1, 2], [2, 3], [0], [1], [2], [3]],
        # level 3
        [[0, 1, 2, 3], [0, 1], [0]],
        [[0, 1, 2, 3], [2, 3], [3]],
        [[0, 1, 2, 3], [0, 1], [2, 3], [0], [1], [2], [3]],
        [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3], [0], [1], [2], [3]],
    ]

    if features is not None:
        train_df = train_df[features + ["timestamp", "label"]]

    for folds in folds_trial:
        print(f"Current folds list is {folds}")
        model_lst = [deepcopy(model)] * len(folds)
        cv_pearson = []
        for i in range(cv):
            train_month = list(range(3 + i, 7 + i))
            test_month = list(map(lambda x: x % 12 if x > 12 else x, list(range(8 + i, 12 + i))))
            test = train_df[train_df["timestamp"].dt.month.isin(test_month)].reset_index().drop("index", axis = 1)
            X_test, Y_test = test.drop(["timestamp", "label"], axis = 1), test["label"]
            Y_pred = np.zeros(Y_test.shape[0])
            for j in range(len(folds)):
                fold = folds[j]
                model = model_lst[j]
                train_month_curr = [train_month[f] for f in fold]
                train_curr = train_df[train_df["timestamp"].dt.month.isin(train_month_curr)].reset_index().drop("index", axis = 1)
                X_train, Y_train = train_curr.drop(["timestamp", "label"], axis = 1), train_curr["label"]
                model.fit(X_train, Y_train)
                Y_pred += model.predict(X_test)
            Y_pred /= len(folds)
            cv_pearson.append(pearson_score(Y_test, Y_pred))
            print(f"Finish fold {i} with score: {pearson_score(Y_test, Y_pred)}")
        print(f"Finish trial with mean score: {np.mean(np.array(cv_pearson))}")
        print(f"Finish trial with std score: {np.std(np.array(cv_pearson))}")
        print()

In [None]:
train_added_df = pd.concat([train_df, popular_features_train], axis = 1)

params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost_popular_feature = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_xgboost_popular_feature:
    params[p] = best_params_xgboost_popular_feature[p]

xgbr = XGBRegressor(**params)
search_training_scheme(xgbr, train_added_df)
# Notable
# [[0, 1, 2, 3]]
# [[0, 1, 2, 3], [1, 2]]
# [[0, 1, 2, 3], [0, 1], [2, 3]]
# [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3]]
# [[0, 1, 2, 3], [0], [1], [2], [3]] 
# [[0, 1, 2, 3], [0, 1], [2, 3], [0], [1], [2], [3]]
# [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3], [0], [1], [2], [3]]

In [None]:
train_added_df = pd.concat([train_df, popular_features_train], axis = 1)

params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state
}
best_params_lightgbm_popular_feature = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_lightgbm_popular_feature:
    params[p] = best_params_lightgbm_popular_feature[p]

lgbr = LGBMRegressor(**params)
search_training_scheme(lgbr, train_added_df)
# [[0, 1, 2, 3]]
# [[0, 1, 2, 3], [0, 1]]
# [[0, 1, 2, 3], [0, 1], [2, 3]]
# [[0, 1, 2, 3], [0], [1], [2], [3]]
# [[0, 1, 2, 3], [0, 1], [0]]
# [[0, 1, 2, 3], [0, 1], [2, 3], [0], [1], [2], [3]]

#### Eighth Iteration: rewrite the code for MLP training using MLX

Create the data for training + custom batch iteration

In [17]:
# Create the CV data, seems to be better with only anonymized features
# best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531', 
#                  'X385', 'X23', 'X284', 'X465', 'X331', 'X95', 'X285', 'X31', 'X169', 'X137', 
#                 'X379', 'X186', 'X852', 'X302', 'X868', 'X89', 'X219', 'X855', 'X540', 'X301'] 
                #  'X198', 'X373', 'X524', 'X291', 'X444', 'X279', 'X300', 'X181', 'X367', 'X538', 
                #  'X288', 'X226', 'X857', 'X860', 'X205', 'X298', 'X272', 'X472', 'X28', 'X754']
                # ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                # [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
# best_features = list(set(best_features))
# best_features = [col for col in train_df.columns if "X" in col]
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)
# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

In [18]:
# Extra code to "reduce" from float64 to float32
def float64_to_float32(data):
    if isinstance(data, pd.DataFrame):
        for col in data.columns:
            data[col] = data[col].astype("float32")
    elif isinstance(data, pd.Series):
        data = data.astype("float32")
    return data

# for i in range(default_cv):
#     X_train_arr[i] = float64_to_float32(X_train_arr[i])
#     X_test_arr[i] = float64_to_float32(X_test_arr[i])
#     Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
#     Y_test_arr[i] = float64_to_float32(Y_test_arr[i])

In [19]:
def normal_cv_to_mlx_cv(X_train_arr = None, X_test_arr = None, Y_train_arr = None, Y_test_arr = None, cv=default_cv):
    for i in range(cv):
        # Normalize first
        if X_train_arr is not None and X_test_arr is not None:
            scaler = StandardScaler()
            X_train_arr[i] = scaler.fit_transform(X_train_arr[i].values)
            X_test_arr[i] = scaler.transform(X_test_arr[i].values)

        # Convert to mlx format
        if X_train_arr is not None: X_train_arr[i] = mx.array(X_train_arr[i])
        if X_test_arr is not None: X_test_arr[i] = mx.array(X_test_arr[i])
        if Y_train_arr is not None: Y_train_arr[i] = mx.array(Y_train_arr[i].values)
        if Y_test_arr is not None: Y_test_arr[i] = mx.array(Y_test_arr[i].values)
        
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

Define the model

In [20]:
# Define the model
# We do not use the reset method this time so you have to create the model at each fold
class MLPMLX(nnmx.Module):
    def __init__(self, num_features, hidden_layers_size, dropout):
        super().__init__()

        # Initialize layers & batchnorm
        last_layer = num_features
        self.layers = []
        for current_layer in hidden_layers_size:
            self.layers.append(nnmx.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.layers.append(nnmx.Linear(last_layer, 1))

        # Initialize activation
        self.activation = nnmx.ReLU()

        # Initialize dropout
        self.dropout = nnmx.Dropout(p = dropout)

    def __call__(self, x):
        for inx, layer in enumerate(self.layers):
            x = layer(x)
            if inx != len(self.layers) - 1:
                x = self.activation(x)
                x = self.dropout(x)
        return x

Train model with CV and evaluate

In [21]:
# Custom function for batch iteration
def batch_iterate(batch_size, X, Y, shuffle = True):
    for i in range(0, Y.size, batch_size):
        X_curr = X[i: min(i + batch_size, Y.size), :]
        Y_curr = Y[i: min(i + batch_size, Y.size)]
        if shuffle:
            inx_lst = mx.random.permutation(batch_size)
            X_curr = X_curr[inx_lst, :]
            Y_curr = Y_curr[inx_lst]
        yield X_curr, Y_curr

In [22]:
# Separate function for train & eval step
def train_mlp_mlx(model, loss_and_grad_fn, optimizer, X_train, Y_train, batch_size, num_epochs):
    model.train()
    for _ in tqdm(range(num_epochs)):
        for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
            _, grads = loss_and_grad_fn(model, inputs, targets)
            # Update the optimizer state and model parameters in a single call
            optimizer.update(model, grads)
            # Force a graph evaluation
            mx.eval(model.parameters(), optimizer.state)

def eval_mlp_mlx(model, X_test, Y_test, batch_size):
    outputs_all = np.zeros(0)
    targets_all = np.zeros(0)
    model.eval()
    for (inputs, targets) in batch_iterate(batch_size, X_test, Y_test, shuffle=False):
        outputs = model(inputs).reshape(-1)
        # convert back to numpy
        outputs, targets = np.array(outputs), np.array(targets)
        # Load to overall Y_test, Y_pred to calculate pearson score later
        outputs_all = np.concatenate([outputs_all, outputs])
        targets_all = np.concatenate([targets_all, targets])
    return pearson_score(targets_all, outputs_all)

In [23]:
def train_eval_cv_mlx(num_features, hidden_layers_size, dropout, lr, cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size, num_epochs):
    cv_pearson = 0
    for _, (X_train, X_test, Y_train, Y_test) in enumerate(zip(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)):
        # initialize the model
        mx.random.seed(default_random_state)
        model = MLPMLX(num_features, hidden_layers_size, dropout)

        # Initialize the loss function
        def loss_fn(model, X, Y):
            Y_pred = model(X).reshape(-1)
            return mx.mean(nnmx.losses.mse_loss(Y_pred, Y))
            # Y_centered = Y - mx.mean(Y)
            # Y_pred_centered = Y_pred - mx.mean(Y_pred)
            # return mx.sum(Y_centered * Y_pred_centered) / mx.sqrt(mx.sum(Y_centered ** 2) * mx.sum(Y_pred_centered ** 2))
        loss_and_grad_fn = nnmx.value_and_grad(model, loss_fn)

        # Reinitialize the optimizer
        optimizer = optimmx.Adam(learning_rate = lr)

        # Train the model
        train_mlp_mlx(model, loss_and_grad_fn, optimizer, X_train, Y_train, batch_size, num_epochs)

        # Test the model
        pearson = eval_mlp_mlx(model, X_test, Y_test, batch_size)
        print(pearson)
        if pearson == -1:
            return pearson
        cv_pearson += pearson
    return cv_pearson / cv

Conduct training and evaluating process of the model

In [24]:
# # Training process of the default config
# num_features = len(best_features)
# hidden_layers_size = [8, 8, 8]
# dropout = 0.2
# lr = 0.001
# batch_size = 180
# num_epochs = 10

# train_eval_cv_mlx(num_features, hidden_layers_size, dropout, lr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size, num_epochs)

Conduct Bayesian Optimization on this

In [25]:
default_num_layers = 2

In [26]:
def objective_mlp_mlx(trial):
    # First initialize the parameters
    num_features = len(best_features)
    num_layers = default_num_layers
    log_2_hidden_layers_size = []
    for i in range(num_layers):
        if len(log_2_hidden_layers_size) == 0:
            log_2_hidden_layers_size.append(trial.suggest_int(f"log2_hidden_layer_{i}", 2, 7))
        else:
            log_2_hidden_layers_size.append(trial.suggest_int(f"log2_hidden_layer_{i}", 2, log_2_hidden_layers_size[-1]))
    hidden_layers_size = [2**l for l in log_2_hidden_layers_size]
    dropout = trial.suggest_categorical("dropout", [0.2, 0.3, 0.4, 0.5, 0.6, 0.7])
    lr = trial.suggest_float("lr", 0.0001, 0.01, log=True)
    batch_size = trial.suggest_categorical("batch_size", [30, 60, 120, 180, 360, 720])
    num_epochs = trial.suggest_categorical("num_epochs", [5 * i for i in range(1, 7)])
    
    # Conduct training based on those parameters
    return train_eval_cv_mlx(num_features, hidden_layers_size, dropout, lr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size, num_epochs)

In [30]:
def optimize_mlp_mlx(study_name, storage_name, objective_function=objective_mlp_mlx, n_trials = 100, n_jobs = 1):
    print("Conduct hyperparam opt for MLP")
    study = optuna.create_study(
        study_name = study_name,
        direction ='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

In [31]:
# Create the CV data, seems to be better with only anonymized features
best_features = ['X757', 'X758', 'X759', 'X508', 'X614', 'X752', 'X331', 'X445', 'X465', 'X385', 
                 'X466', 'X95', 'X23', 'X219', 'X31', 'X373', 'X379', 'X284', 'X750', 'X652', 
                 'X279', 'X89', 'X169', 'X753', 'X226', 'X28', 'X444', 'X272', 'X271', 'X218']
                # ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                # [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
best_features = list(set(best_features))
# best_features = [col for col in train_df.columns if "X" in col]
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_df, best_features)

# Convert to float32
for i in range(default_cv):
    X_train_arr[i] = float64_to_float32(X_train_arr[i])
    X_test_arr[i] = float64_to_float32(X_test_arr[i])
    Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
    Y_test_arr[i] = float64_to_float32(Y_test_arr[i])

# Convert to MLX
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

[3, 4, 5, 6] [8, 9, 10, 11]
[4, 5, 6, 7] [9, 10, 11, 12]
[5, 6, 7, 8] [10, 11, 12, 1]
[6, 7, 8, 9] [11, 12, 1, 2]


In [33]:
optimize_mlp_mlx(
    f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{default_num_layers}_common_truncated_{len(best_features)}_study",
    f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{default_num_layers}_common_truncated_{len(best_features)}_study"
)

[I 2025-07-16 15:41:17,316] Using an existing study with name 'mlp_mlx_2_4_101_2_common_truncated_30_study' instead of creating a new one.


Conduct hyperparam opt for MLP


100%|██████████| 15/15 [00:19<00:00,  1.29s/it]


0.10823771394955192


100%|██████████| 15/15 [00:18<00:00,  1.22s/it]


0.1362515892015448


100%|██████████| 15/15 [00:17<00:00,  1.18s/it]


0.12264849532264227


100%|██████████| 15/15 [00:17<00:00,  1.14s/it]
[I 2025-07-16 15:42:31,309] Trial 201 finished with value: 0.12404366176247959 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011633586993360199, 'batch_size': 120, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12903684857617942


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10976230454463237


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13705518842178296


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12143838324947191


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
[I 2025-07-16 15:43:18,080] Trial 202 finished with value: 0.12445559286570673 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011909476811058479, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12956649524693967


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10954759802947815


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.1365205051553584


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12119348641469403


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 15:44:04,840] Trial 203 finished with value: 0.12406122370356212 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012937379172166014, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1289833052147179


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10972556197594202


100%|██████████| 15/15 [00:12<00:00,  1.21it/s]


0.1367610135211504


100%|██████████| 15/15 [00:12<00:00,  1.18it/s]


0.12143616229774357


100%|██████████| 15/15 [00:11<00:00,  1.26it/s]
[I 2025-07-16 15:44:54,334] Trial 204 finished with value: 0.12427074990297465 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012025554123008208, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12916026181706267


100%|██████████| 15/15 [00:11<00:00,  1.26it/s]


0.10942904031736583


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.1367364769028468


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.12147781930151501


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 15:45:41,026] Trial 205 finished with value: 0.12427567792198166 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012092669646350956, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.129459375166199


100%|██████████| 15/15 [00:02<00:00,  5.11it/s]


0.10325604016697588


100%|██████████| 15/15 [00:02<00:00,  5.15it/s]


0.13125696733583264


100%|██████████| 15/15 [00:02<00:00,  5.03it/s]


0.12024679100879555


100%|██████████| 15/15 [00:02<00:00,  5.06it/s]
[I 2025-07-16 15:45:53,113] Trial 206 finished with value: 0.12067584487866309 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.2, 'lr': 0.0001554759126488311, 'batch_size': 720, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12794358100304834


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.10997919179142633


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.1356241816788654


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.12114135540759116


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 15:46:38,610] Trial 207 finished with value: 0.12309407105134207 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001436149426644261, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12563155532748535


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.10928368425016069


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.13669421087486225


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.12124859009562396


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 15:47:24,389] Trial 208 finished with value: 0.12412609736729163 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012179760306804725, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12927790424851968


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.10996116697827349


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.13637511259836338


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.1215068942350027


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 15:48:10,080] Trial 209 finished with value: 0.12420311520879876 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001331031204541797, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12896928702355545


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.10918452484239595


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.1296095619322234


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.11677544418910087


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
[I 2025-07-16 15:48:56,768] Trial 210 finished with value: 0.12039715879771576 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.3, 'lr': 0.00013363850076826022, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12601910422714283


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10964065132305145


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.1373832139794256


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12165287662386207


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 15:49:43,573] Trial 211 finished with value: 0.12456095299047631 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011663532309823442, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1295670700355661


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10965712949650323


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13723702356637132


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.1213002168898127


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 15:50:30,367] Trial 212 finished with value: 0.12428600161145958 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011605050021170596, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12894963649315114


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10951683016478327


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.13736310032899124


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12200759828765802


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 15:51:17,149] Trial 213 finished with value: 0.12463635509742446 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011861869137546292, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12965789160826532


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.10147069758568479


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.10851926407787937


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.09698348161161664


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]
[I 2025-07-16 15:52:04,463] Trial 214 finished with value: 0.10482551983554178 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 5, 'dropout': 0.6, 'lr': 0.00014015082465932517, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.11232863606698636


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.10937027371251697


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13622824106625536


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.1219173985828268


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 15:52:51,135] Trial 215 finished with value: 0.12437626718642186 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012593149935000218, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12998915538408834


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.10994335275559285


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13625574226317871


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.12082958512670078


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
[I 2025-07-16 15:53:37,794] Trial 216 finished with value: 0.12309022887565106 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001551475116474636, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1253322353571319


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.10962870222456865


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.13644140091222196


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12160814447119332


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 15:54:24,570] Trial 217 finished with value: 0.12429018256798116 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001288072573597085, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1294824826639407


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.10821141499206273


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.12316757320025859


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.09916582615594297


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 15:55:11,342] Trial 218 finished with value: 0.11256005041141998 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0003947288125609249, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.11969538729741559


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.10956873651491666


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.1373989707974023


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12147711042491204


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
[I 2025-07-16 15:55:58,114] Trial 219 finished with value: 0.12445368294867522 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001178527661279943, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1293699140574699


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10966788046905564


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.13662096558656822


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12144865441643118


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 15:56:44,886] Trial 220 finished with value: 0.1241556550637239 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001206375792887652, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12888511978284062


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.11008068055551341


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.1353437646129422


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.1218643943006492


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]
[I 2025-07-16 15:57:31,837] Trial 221 finished with value: 0.12334076212088022 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00014317464449559658, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12607420901441602


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10944062872789442


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.13730272716447517


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.121700366142994


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 15:58:19,092] Trial 222 finished with value: 0.12442875341286182 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.000117846855942609, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1292712916160837


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.10947243673721245


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.13717617770697274


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.12171411422141112


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 15:59:05,793] Trial 223 finished with value: 0.12441889481174045 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011804985337537073, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12931285058136557


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.10974214920507762


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13671377895798392


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.12158699467741958


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
[I 2025-07-16 15:59:52,944] Trial 224 finished with value: 0.12425014781048782 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012081286544695676, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12895766840147013


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.10947546413118638


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.13732175895173568


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.12179147620786619


100%|██████████| 15/15 [00:12<00:00,  1.19it/s]
[I 2025-07-16 16:00:41,560] Trial 225 finished with value: 0.12451761037586075 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011800015587103919, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12948174221265474


100%|██████████| 15/15 [00:12<00:00,  1.25it/s]


0.10960114342634862


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.13704521531071723


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.12146552379956624


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]
[I 2025-07-16 16:01:29,955] Trial 226 finished with value: 0.12436005515055704 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012739735446649065, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1293283380655961


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.10949572025730561


100%|██████████| 15/15 [00:11<00:00,  1.26it/s]


0.13650887025470032


100%|██████████| 15/15 [00:12<00:00,  1.20it/s]


0.12131234288456211


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]
[I 2025-07-16 16:02:18,874] Trial 227 finished with value: 0.12408964488477936 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00013000864447987315, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12904164614254945


100%|██████████| 15/15 [00:12<00:00,  1.21it/s]


0.10758945943931487


100%|██████████| 15/15 [00:12<00:00,  1.24it/s]


0.13247366739717392


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.11676081295068119


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 16:03:07,576] Trial 228 finished with value: 0.12098731020489166 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.2, 'lr': 0.00011568814141110338, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12712530103239666


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.11014007861230522


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13776133737345772


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.12018470132650473


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]
[I 2025-07-16 16:03:54,789] Trial 229 finished with value: 0.12331430045250313 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00015870706587877657, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1251710844977449


100%|██████████| 15/15 [01:06<00:00,  4.46s/it]


0.10152945300025627


100%|██████████| 15/15 [01:08<00:00,  4.54s/it]


0.11697246774435562


100%|██████████| 15/15 [01:08<00:00,  4.59s/it]


0.10416573943716705


100%|██████████| 15/15 [01:07<00:00,  4.49s/it]
[I 2025-07-16 16:08:31,475] Trial 230 finished with value: 0.10976218293185644 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00014306288001108758, 'batch_size': 30, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.11638107154564682


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.10951578596572911


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.13641648153516256


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.12187123142763366


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 16:09:18,861] Trial 231 finished with value: 0.1242750842275861 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012632653693525348, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12929683798181904


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10955143585013138


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13660242641184794


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.12141670645599492


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 16:10:05,780] Trial 232 finished with value: 0.12417255020029813 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012716320470678322, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12911963208321828


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.10963829029289178


100%|██████████| 15/15 [00:12<00:00,  1.21it/s]


0.137328291562471


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.12185256649294919


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
[I 2025-07-16 16:10:53,951] Trial 233 finished with value: 0.1245593268828104 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011663600389552263, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12941815918292965


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.10956480407800553


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.13728430434313119


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12154429594611497


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
[I 2025-07-16 16:11:40,782] Trial 234 finished with value: 0.1244522754756053 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011674905411139321, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1294156975351695


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.10957609861655873


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.1360822461687969


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12128817939305671


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 16:12:27,588] Trial 235 finished with value: 0.12395891724389974 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001340725475607006, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12888914479718658


100%|██████████| 15/15 [00:33<00:00,  2.25s/it]


0.1055248320621387


100%|██████████| 15/15 [00:35<00:00,  2.34s/it]


0.13250962983594536


100%|██████████| 15/15 [00:36<00:00,  2.41s/it]


0.12059808282903418


100%|██████████| 15/15 [00:33<00:00,  2.26s/it]
[I 2025-07-16 16:14:49,412] Trial 236 finished with value: 0.12185444729404807 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011510432035117655, 'batch_size': 60, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12878524444907405


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.10942447926800818


100%|██████████| 15/15 [00:12<00:00,  1.22it/s]


0.13659805371010272


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12184646949599873


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 16:15:37,271] Trial 237 finished with value: 0.12436456302154851 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012596269255270252, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12958924961208446


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10985962227916699


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13582106889246434


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.12130887977791356


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 16:16:23,690] Trial 238 finished with value: 0.12320297415069881 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00014426158992407407, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12582232565325036


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.1097096309301643


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13655119233919016


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.12142695707094808


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 16:17:09,971] Trial 239 finished with value: 0.12412590790799466 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00013031364902371475, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12881585129167614


100%|██████████| 15/15 [00:16<00:00,  1.11s/it]


0.10797850684711906


100%|██████████| 15/15 [00:16<00:00,  1.10s/it]


0.1349888604363863


100%|██████████| 15/15 [00:16<00:00,  1.12s/it]


0.12143887958198771


100%|██████████| 15/15 [00:16<00:00,  1.10s/it]
[I 2025-07-16 16:18:17,998] Trial 240 finished with value: 0.12170689963593742 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.7, 'lr': 0.00011682930390894848, 'batch_size': 120, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12242135167825664


100%|██████████| 15/15 [00:02<00:00,  5.20it/s]


0.11052131197506244


100%|██████████| 15/15 [00:02<00:00,  5.21it/s]


0.13523129060167197


100%|██████████| 15/15 [00:02<00:00,  5.00it/s]


0.1179332614183531


100%|██████████| 15/15 [00:02<00:00,  5.03it/s]
[I 2025-07-16 16:18:30,066] Trial 241 finished with value: 0.1223114450161545 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.5, 'lr': 0.00012979768746667562, 'batch_size': 720, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12555991606953049


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.10973004167470611


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.13718415786550908


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.1211226873019905


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 16:19:16,236] Trial 242 finished with value: 0.12431304910225946 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011596080680679834, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12921530956683214


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.10986753291120067


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.13736260475800435


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.12119807811300369


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 16:20:02,219] Trial 243 finished with value: 0.12440332294257542 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011464845738115424, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12918507598809295


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.11006721922842574


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.13726406915899414


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.1211015310626933


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 16:20:48,681] Trial 244 finished with value: 0.12442460048485302 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011344920379171952, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12926558248929884


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]


0.10980643124776658


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.13751442153352963


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.12150503904472165


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]
[I 2025-07-16 16:21:34,155] Trial 245 finished with value: 0.12445950968610689 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011451351912610334, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12901214691840968


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.10997233319158861


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.1371614335599749


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.12140546819661631


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 16:22:19,930] Trial 246 finished with value: 0.12439456953841474 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011437471552477292, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12903904320547915


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.11007311881038843


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.13727120496571313


100%|██████████| 15/15 [00:11<00:00,  1.25it/s]


0.1210648915566051


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 16:23:07,044] Trial 247 finished with value: 0.1243774021659945 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011278933086783025, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12910039333127135


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.11020183281258104


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.13707355826321857


100%|██████████| 15/15 [00:10<00:00,  1.37it/s]


0.12082700479595355


100%|██████████| 15/15 [00:10<00:00,  1.36it/s]
[I 2025-07-16 16:23:52,400] Trial 248 finished with value: 0.12435931177323421 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011273292344295445, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1293348512211837


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.10990936881642946


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.13722149664221187


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.12096002256620703


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 16:24:38,451] Trial 249 finished with value: 0.12432302767226376 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00010954632637849735, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1292012226642067


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.10957369792661444


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.13733343192947395


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.12120484611103463


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 16:25:24,326] Trial 250 finished with value: 0.12434538907741663 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011514167554086136, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12926958034254354


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.10757729505030215


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13225058020788794


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.11715857970036796


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 16:26:10,462] Trial 251 finished with value: 0.12114297677911462 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.2, 'lr': 0.00011234777545534446, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12758545215790038


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.10986842025806366


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.13716076361972057


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.12108608523765448


100%|██████████| 15/15 [00:12<00:00,  1.23it/s]
[I 2025-07-16 16:26:57,576] Trial 252 finished with value: 0.12435533314869572 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011214011293804533, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1293060634793442


100%|██████████| 15/15 [00:12<00:00,  1.20it/s]


0.11010841242084153


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.13343947793002237


100%|██████████| 15/15 [00:10<00:00,  1.36it/s]


0.11997332134488449


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]
[I 2025-07-16 16:27:44,425] Trial 253 finished with value: 0.12307076320267543 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.3, 'lr': 0.00011236677711288617, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12876184111495334


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.10969469396129496


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.1355326147783822


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.12144286720605846


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 16:28:30,553] Trial 254 finished with value: 0.123726866432031 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00013962411361134576, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1282372897823884


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.10995267182101641


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.13728197012814897


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.12110329721964946


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
[I 2025-07-16 16:29:16,377] Trial 255 finished with value: 0.12436971232317351 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011240168494422822, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12914091012387915


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.10996632444465752


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.13709097841879472


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.12103571963703791


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 16:30:02,468] Trial 256 finished with value: 0.12435089846440595 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011226311034614339, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12931057135713367


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.10939956895933496


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.13643665534032987


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.12186881773427953


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 16:30:48,200] Trial 257 finished with value: 0.12426667045369433 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012517402299947605, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12936163978083295


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.11004314582147245


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.1369226062503653


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.12101707407303501


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 16:31:34,239] Trial 258 finished with value: 0.1242772232201566 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011262059019476778, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1291260667357536


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]


0.06996073279973634


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.07394850025153696


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.10312152758941183


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 16:32:20,183] Trial 259 finished with value: 0.08265265353755831 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.001228508424504244, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.08357985350954812


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.10998060569841613


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.1357060845245054


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.12114183571725959


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
[I 2025-07-16 16:33:06,232] Trial 260 finished with value: 0.12391892078214238 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001384560465823377, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1288471571883884


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.10963106399606841


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.1365159591208573


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.12134144353239969


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
[I 2025-07-16 16:33:51,849] Trial 261 finished with value: 0.12423526839226748 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012312032149408035, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12945260691974456


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


0.1099151184483202


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


0.13718289746879667


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.12087098507526961


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 16:34:37,709] Trial 262 finished with value: 0.12427310948174386 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001119735741159236, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12912343693458891


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.10934194074622906


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]


0.136687043826277


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.1216643567628999


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]
[I 2025-07-16 16:35:24,584] Trial 263 finished with value: 0.1242967017417555 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012533042828586922, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1294934656316161


100%|██████████| 15/15 [00:34<00:00,  2.31s/it]


0.08314847307111636


100%|██████████| 15/15 [00:32<00:00,  2.17s/it]


0.05047342282232793


100%|██████████| 15/15 [00:33<00:00,  2.26s/it]


0.08489403857855254


100%|██████████| 15/15 [00:34<00:00,  2.27s/it]
[I 2025-07-16 16:37:42,675] Trial 264 finished with value: -1.0 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.002797288036642771, 'batch_size': 60, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


[0.02650189 0.02650189 0.02650189 ... 0.02650189 0.02650189 0.02650189]
Error: zero variance prediction
-1


100%|██████████| 15/15 [00:10<00:00,  1.36it/s]


0.10814560634320736


100%|██████████| 15/15 [00:10<00:00,  1.36it/s]


0.12724699606709902


100%|██████████| 15/15 [00:10<00:00,  1.37it/s]


0.10627655104505612


100%|██████████| 15/15 [00:10<00:00,  1.37it/s]
[I 2025-07-16 16:38:27,572] Trial 265 finished with value: 0.11386025036670443 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00030427667256170194, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.11377184801145519


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]


0.10534684222784388


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]


0.12708060562571524


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.112979526839531


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
[I 2025-07-16 16:39:13,387] Trial 266 finished with value: 0.11691811921400613 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.2, 'lr': 0.00014650964159515034, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1222655021629344


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10998704204732601


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.13736264387865582


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12142592682001722


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
[I 2025-07-16 16:40:00,266] Trial 267 finished with value: 0.12447919462420842 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001137879190956964, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1291411657508346


100%|██████████| 30/30 [00:33<00:00,  1.12s/it]


0.10290684013717513


100%|██████████| 30/30 [00:33<00:00,  1.12s/it]


0.11775661826131904


100%|██████████| 30/30 [00:35<00:00,  1.17s/it]


0.10628635257133753


100%|██████████| 30/30 [00:34<00:00,  1.15s/it]
[I 2025-07-16 16:42:18,848] Trial 268 finished with value: 0.11100388679578532 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001277596946627527, 'batch_size': 120, 'num_epochs': 30}. Best is trial 63 with value: 0.12508327601680538.


0.11706573621330962


100%|██████████| 15/15 [01:08<00:00,  4.56s/it]
[I 2025-07-16 16:43:28,852] Trial 269 finished with value: -1.0 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00588119691411629, 'batch_size': 30, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


[0.03728813 0.03728813 0.03728813 ... 0.03728813 0.03728813 0.03728813]
Error: zero variance prediction
-1


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.11148035501981643


100%|██████████| 15/15 [00:12<00:00,  1.24it/s]


0.13475249125653793


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.11512856761902569


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 16:44:16,767] Trial 270 finished with value: 0.12156536187938048 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.7, 'lr': 0.00013754501635531338, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12490003362214185


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.10986097624577552


100%|██████████| 15/15 [00:12<00:00,  1.22it/s]


0.13717115144566655


100%|██████████| 15/15 [00:11<00:00,  1.26it/s]


0.12116689041545159


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]
[I 2025-07-16 16:45:05,693] Trial 271 finished with value: 0.12434771459810524 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011144098044288219, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12919184028552733


100%|██████████| 15/15 [00:03<00:00,  4.90it/s]


0.10548830539581436


100%|██████████| 15/15 [00:03<00:00,  4.92it/s]


0.12803345430596644


100%|██████████| 15/15 [00:03<00:00,  4.76it/s]


0.10845236699568934


100%|██████████| 15/15 [00:03<00:00,  4.90it/s]
[I 2025-07-16 16:45:18,342] Trial 272 finished with value: 0.11403437673074125 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012050833503827964, 'batch_size': 720, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1141633802254949


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10957143963927951


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.131534915112456


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.11901452086436476


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 16:46:05,201] Trial 273 finished with value: 0.12192145670401923 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.5, 'lr': 0.00016485564522617326, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12756495119997663


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.07232423440291107


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.05714770446451864


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.08575241678408067


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
[I 2025-07-16 16:46:52,012] Trial 274 finished with value: 0.07576195778190055 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.001680081018592326, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.0878234754760918


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10999174262228742


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.13608776708476647


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.12165690006266887


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
[I 2025-07-16 16:47:38,911] Trial 275 finished with value: 0.12414912977523213 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00013418044910131012, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12886010933120573


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.11002484859909684


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.13733709466312452


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.12103793733688052


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
[I 2025-07-16 16:48:26,223] Trial 276 finished with value: 0.12439299937890688 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011173274086653974, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12917211691652564


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.10999015035692486


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.13699814803424024


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.1208347388619045


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
[I 2025-07-16 16:49:12,523] Trial 277 finished with value: 0.12429111543478846 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011000473027417827, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12934142448608424


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.11005895930087815


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.13736439131078768


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.1208969324647608


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]
[I 2025-07-16 16:49:59,124] Trial 278 finished with value: 0.12435571123207478 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011021450414380659, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1291025618518725


100%|██████████| 10/10 [00:07<00:00,  1.33it/s]


0.1080974608870095


100%|██████████| 10/10 [00:07<00:00,  1.32it/s]


0.13318179807637778


100%|██████████| 10/10 [00:07<00:00,  1.33it/s]


0.11426247843719618


100%|██████████| 10/10 [00:07<00:00,  1.32it/s]
[I 2025-07-16 16:50:30,309] Trial 279 finished with value: 0.12031058850779575 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001200646822659516, 'batch_size': 180, 'num_epochs': 10}. Best is trial 63 with value: 0.12508327601680538.


0.12570061663059953


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.10837617372173429


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.12408182760711818


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.11502418232381245


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]
[I 2025-07-16 16:51:17,306] Trial 280 finished with value: 0.11778503469700877 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.3, 'lr': 0.00015363744176272775, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12365795513537012


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.10759072899886529


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.13245883995953694


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.11725362018803041


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
[I 2025-07-16 16:52:04,453] Trial 281 finished with value: 0.12121959501407988 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.2, 'lr': 0.00011209258163485654, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12757519090988684


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.10937794654861105


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


0.13674512334254826


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.12125894095376617


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]
[I 2025-07-16 16:52:51,189] Trial 282 finished with value: 0.12424734227692813 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001222834829305946, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.129607358262787


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.10993238001778559


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.1355999531129868


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.12123008600151576


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
[I 2025-07-16 16:53:39,011] Trial 283 finished with value: 0.12389622031455587 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.000139658321137908, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12882246212593534


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.09700866969346504


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.08828458504184457


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


0.08162096798083351


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
[I 2025-07-16 16:54:25,494] Trial 284 finished with value: 0.08778901972701382 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0007677327150968788, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.08424185619191217


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]


0.10987214553664305


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.13737518318490916


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.12103900456537674


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]
[I 2025-07-16 16:55:12,912] Trial 285 finished with value: 0.12436497550049114 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.000112009424548044, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1291735687150356


100%|██████████| 5/5 [00:03<00:00,  1.25it/s]


0.03724020156125369


100%|██████████| 5/5 [00:03<00:00,  1.26it/s]


0.08453643064332486


100%|██████████| 5/5 [00:03<00:00,  1.30it/s]


0.06985752731946972


100%|██████████| 5/5 [00:03<00:00,  1.30it/s]
[I 2025-07-16 16:55:29,604] Trial 286 finished with value: 0.06987349506615706 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011049791207411446, 'batch_size': 180, 'num_epochs': 5}. Best is trial 63 with value: 0.12508327601680538.


0.08785982074057994


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.10994294323461348


100%|██████████| 15/15 [00:11<00:00,  1.25it/s]


0.1373873573737127


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.12113403685454556


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]
[I 2025-07-16 16:56:17,688] Trial 287 finished with value: 0.12440256281063347 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011145343621297955, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12914591377966217


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.10949865683447445


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.1368618422896572


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.1214436230422384


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
[I 2025-07-16 16:57:05,087] Trial 288 finished with value: 0.12425345830204698 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00012105268195505145, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1292097110418179


100%|██████████| 15/15 [00:33<00:00,  2.24s/it]


0.10611838042165242


100%|██████████| 15/15 [00:34<00:00,  2.27s/it]


0.13284866905272744


100%|██████████| 15/15 [00:34<00:00,  2.32s/it]


0.12112447637600784


100%|██████████| 15/15 [00:35<00:00,  2.36s/it]
[I 2025-07-16 16:59:25,872] Trial 289 finished with value: 0.12245050757834175 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011102662706357468, 'batch_size': 60, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1297105044629793


100%|██████████| 15/15 [00:11<00:00,  1.26it/s]


0.10970524049809063


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.1361470103803945


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.12155203800114184


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]
[I 2025-07-16 17:00:13,806] Trial 290 finished with value: 0.12412584316630637 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00013340511283891786, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12909908378559856


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.10950394174669913


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


0.13679277445962174


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.12144223977804117


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
[I 2025-07-16 17:01:01,369] Trial 291 finished with value: 0.12430452543501648 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001229671442269548, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1294791457557039


100%|██████████| 15/15 [01:06<00:00,  4.43s/it]


0.08760645132083425


100%|██████████| 15/15 [01:08<00:00,  4.56s/it]


0.10257728871219547


100%|██████████| 15/15 [01:07<00:00,  4.53s/it]


0.09425918164405729


100%|██████████| 15/15 [01:07<00:00,  4.49s/it]
[I 2025-07-16 17:05:37,294] Trial 292 finished with value: 0.09782793757790088 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.2, 'lr': 0.00011047920023735171, 'batch_size': 30, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.10686882863451654


100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


0.10549305994164192


100%|██████████| 30/30 [00:23<00:00,  1.28it/s]


0.11960417094482147


100%|██████████| 30/30 [00:23<00:00,  1.28it/s]


0.1095391378041443


100%|██████████| 30/30 [00:22<00:00,  1.30it/s]
[I 2025-07-16 17:07:11,333] Trial 293 finished with value: 0.11128386474903937 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00014611749199025767, 'batch_size': 180, 'num_epochs': 30}. Best is trial 63 with value: 0.12508327601680538.


0.1104990903055498


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.11002142024176892


100%|██████████| 15/15 [00:12<00:00,  1.20it/s]


0.13731533816900865


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.12078392336758031


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]
[I 2025-07-16 17:07:59,979] Trial 294 finished with value: 0.1243178652971855 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00010991886579824455, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12915077941038414


100%|██████████| 15/15 [00:03<00:00,  4.83it/s]


0.10529594114034786


100%|██████████| 15/15 [00:03<00:00,  4.84it/s]


0.1275346873680676


100%|██████████| 15/15 [00:03<00:00,  4.82it/s]


0.1085170324795008


100%|██████████| 15/15 [00:03<00:00,  4.95it/s]
[I 2025-07-16 17:08:12,642] Trial 295 finished with value: 0.11375652951936002 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011947558627172488, 'batch_size': 720, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.11367845708952382


100%|██████████| 15/15 [00:17<00:00,  1.16s/it]


0.10706153073777978


100%|██████████| 15/15 [00:17<00:00,  1.14s/it]


0.1355495271570918


100%|██████████| 15/15 [00:17<00:00,  1.18s/it]


0.12149482492383537


100%|██████████| 15/15 [00:17<00:00,  1.16s/it]
[I 2025-07-16 17:09:23,681] Trial 296 finished with value: 0.12306566176935342 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00013357442368446397, 'batch_size': 120, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12815676425870676


100%|██████████| 15/15 [00:11<00:00,  1.26it/s]


0.11204055745832554


100%|██████████| 15/15 [00:12<00:00,  1.24it/s]


0.13587236476326017


100%|██████████| 15/15 [00:11<00:00,  1.25it/s]


0.11410795220946793


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]
[I 2025-07-16 17:10:12,380] Trial 297 finished with value: 0.1210641064405907 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.7, 'lr': 0.00012007786108649006, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12223555133130909


100%|██████████| 15/15 [00:11<00:00,  1.29it/s]


0.10997452037588266


100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


0.13711398126176103


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.12097868833908557


100%|██████████| 15/15 [00:11<00:00,  1.28it/s]
[I 2025-07-16 17:11:00,064] Trial 298 finished with value: 0.1243126743236353 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.00011102807430281982, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.1291835073178119


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.042695017740730264


100%|██████████| 15/15 [00:12<00:00,  1.22it/s]


0.047256997735913436


100%|██████████| 15/15 [00:11<00:00,  1.25it/s]


0.06474061259040477


100%|██████████| 15/15 [00:11<00:00,  1.26it/s]
[I 2025-07-16 17:11:49,251] Trial 299 finished with value: 0.058269648140522046 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.5, 'lr': 0.0022746213501426437, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.0783859644950397


100%|██████████| 15/15 [00:11<00:00,  1.26it/s]


0.1096943070740738


100%|██████████| 15/15 [00:12<00:00,  1.25it/s]


0.13649623645663564


100%|██████████| 15/15 [00:11<00:00,  1.27it/s]


0.12172668056126239


100%|██████████| 15/15 [00:11<00:00,  1.26it/s]
[I 2025-07-16 17:12:38,057] Trial 300 finished with value: 0.12423102372845835 and parameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.6, 'lr': 0.0001290187070108035, 'batch_size': 180, 'num_epochs': 15}. Best is trial 63 with value: 0.12508327601680538.


0.12900687082186155
Best hyperparameters: {'log2_hidden_layer_0': 5, 'log2_hidden_layer_1': 3, 'dropout': 0.2, 'lr': 0.0001159469267719501, 'batch_size': 360, 'num_epochs': 15}
Best Pearson score: 0.12508327601680538


{'log2_hidden_layer_0': 5,
 'log2_hidden_layer_1': 3,
 'dropout': 0.2,
 'lr': 0.0001159469267719501,
 'batch_size': 360,
 'num_epochs': 15}

#### Nineth Iteration: AE + MLP instead of GBDT feature selection + MLP (train together)

Define the model

In [34]:
# Define gaussian noise for autoencoder
class GaussianNoise(nnmx.Module):
    def __init__(self, mean: float = 0.0, stddev: float = 0.01):
        super().__init__()
        self.mean = mean
        self.stddev = stddev

    def __call__(self, x, training = True):
        if training:
            x += mx.random.normal(loc=self.mean, scale=self.stddev, shape=x.shape)
        return x

In [35]:
# Define the model
# We do not use the reset method this time so you have to create the model at each fold
class AEMLX(nnmx.Module):
    def __init__(self, num_features, hidden_layers_size, latent_size, dropout):
        super().__init__()

        # Initialize layers for encoder
        last_layer = num_features
        self.encoder_layers = []
        for current_layer in hidden_layers_size:
            self.encoder_layers.append(nnmx.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.encoder_layers.append(nnmx.Linear(last_layer, latent_size))

        # Initialize layers for decoder
        last_layer = latent_size
        self.decoder_layers = []
        for current_layer in hidden_layers_size[::-1]:
            self.decoder_layers.append(nnmx.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.decoder_layers.append(nnmx.Linear(last_layer, num_features))

        # Initialize activation
        self.activation = nnmx.ReLU()

        # Initialze gaussian noise to apply upon training
        # self.gaussian_noise = GaussianNoise()

        # Initialize dropout
        self.dropout = nnmx.Dropout(p = dropout)

    def __call__(self, x, training = True):
        # if training:
        #     x = self.gaussian_noise(x)
            
        for inx, layer in enumerate(self.encoder_layers):
            x = layer(x)
            x = self.activation(x)
            x = self.dropout(x)
        for inx, layer in enumerate(self.decoder_layers):
            if inx == len(self.decoder_layers) - 1:
                x = layer(x)
            else:
                x = layer(x)
                x = self.activation(x)
                x = self.dropout(x)
        return x

    def get_latent(self, x):
        for inx, layer in enumerate(self.encoder_layers):
            x = layer(x)
            x = self.activation(x)
        return x

Train model with CV and evaluate

In [36]:
# Separate function for train & eval step
def train_aemlp_mlx(ae_model, ae_loss_and_grad_fn, ae_optimizer, ae_num_epochs,
                     mlp_model, mlp_loss_and_grad_fn, mlp_optimizer, mlp_num_epochs,
                     X_train, Y_train, batch_size):
    # Train ae first
    ae_model.train()
    for _ in tqdm(range(ae_num_epochs)):
        for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
            # get gradients for ae, output is the inputs itself
            _, ae_grads = ae_loss_and_grad_fn(ae_model, inputs, inputs)

            # Update the optimizer state and model parameters in a single call
            ae_optimizer.update(ae_model, ae_grads)

            # Force a graph evaluation
            mx.eval(ae_model.parameters(), ae_optimizer.state)

    # Train mlp later
    mlp_model.train()
    for _ in tqdm(range(mlp_num_epochs)):
        for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
            # get the latent representation for X_train
            latent_inputs = ae_model.get_latent(inputs)
            used_inputs = mx.concatenate([inputs, latent_inputs], axis=1)
            # get gradients for mlp
            _, mlp_grads = mlp_loss_and_grad_fn(mlp_model, used_inputs, targets)

            # Update the optimizer state and model parameters in a single call
            mlp_optimizer.update(mlp_model, mlp_grads)

            # Force a graph evaluation
            mx.eval(mlp_model.parameters(), mlp_optimizer.state)

    # # Train ae and mlp together
    # ae_model.train()
    # mlp_model.train()
    # for _ in tqdm(range(ae_num_epochs)):
    #     for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
    #         # get gradients for ae, output is the inputs itself
    #         _, ae_grads = ae_loss_and_grad_fn(ae_model, inputs, inputs)

    #         # Update the optimizer state and model parameters in a single call
    #         ae_optimizer.update(ae_model, ae_grads)

    #         # Force a graph evaluation
    #         mx.eval(ae_model.parameters(), ae_optimizer.state)

    #         # get gradients for mlp
    #         latent_inputs = ae_model.get_latent(inputs)
    #         used_inputs = mx.concatenate([inputs, latent_inputs], axis=1)
    #         _, mlp_grads = mlp_loss_and_grad_fn(mlp_model, used_inputs, targets)

    #         # Update the optimizer state and model parameters in a single call
    #         mlp_optimizer.update(mlp_model, mlp_grads)

    #         # Force a graph evaluation
    #         mx.eval(mlp_model.parameters(), mlp_optimizer.state)

def eval_aemlp_mlx(ae_model, mlp_model, X_test, Y_test, batch_size):
    outputs_all = np.zeros(0)
    targets_all = np.zeros(0)
    ae_model.eval()
    mlp_model.eval()
    for (inputs, targets) in batch_iterate(batch_size, X_test, Y_test, shuffle=False):
        latent_inputs = ae_model.get_latent(inputs)
        used_inputs = mx.concatenate([inputs, latent_inputs], axis=1)
        outputs = mlp_model(used_inputs).reshape(-1)
        # convert back to numpy
        outputs, targets = np.array(outputs), np.array(targets)
        # Load to overall Y_test, Y_pred to calculate pearson score later
        outputs_all = np.concatenate([outputs_all, outputs])
        targets_all = np.concatenate([targets_all, targets])
    return pearson_score(targets_all, outputs_all)

In [37]:
def train_eval_cv_mlx_aemlp(num_features, ae_hidden_layers_size, ae_latent_size, ae_dropout, ae_lr, ae_num_epochs,
                            mlp_hidden_layers_size, mlp_dropout, mlp_lr, mlp_num_epochs,
                            cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size):
    cv_pearson = 0
    for _, (X_train, X_test, Y_train, Y_test) in enumerate(zip(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)):
        # initialize the model
        mx.random.seed(default_random_state)
        ae_model = AEMLX(num_features, ae_hidden_layers_size, ae_latent_size, ae_dropout)

        mx.random.seed(default_random_state)
        mlp_model = MLPMLX(ae_latent_size + num_features, mlp_hidden_layers_size, mlp_dropout)

        # Initialize the loss function (both use same loss function)
        def loss_fn(model, X, Y):
            Y_pred = model(X).reshape(-1)
            Y = Y.reshape(-1)
            return mx.mean(nnmx.losses.mse_loss(Y_pred, Y))
        ae_loss_and_grad_fn = nnmx.value_and_grad(ae_model, loss_fn)
        mlp_loss_and_grad_fn = nnmx.value_and_grad(mlp_model, loss_fn)

        # Reinitialize the optimizer
        ae_optimizer = optimmx.Adam(learning_rate = ae_lr)
        mlp_optimizer = optimmx.Adam(learning_rate = mlp_lr)

        # Train the model
        train_aemlp_mlx(ae_model, ae_loss_and_grad_fn, ae_optimizer, ae_num_epochs,
                        mlp_model, mlp_loss_and_grad_fn, mlp_optimizer, mlp_num_epochs,
                        X_train, Y_train, batch_size)

        # Test the model
        pearson = eval_aemlp_mlx(ae_model, mlp_model, X_test, Y_test, batch_size)
        print(pearson)
        if pearson == -1:
            return pearson
        cv_pearson += pearson
    return cv_pearson / cv

Conduct training and evaluating process of the model

In [38]:
# # Create the CV data, seems to be better with only anonymized features
# best_features = [col for col in train_df.columns if "X" in col] + \
#                 ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
#                 [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)
# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)
# for i in range(default_cv):
#     X_train_arr[i] = float64_to_float32(X_train_arr[i])
#     X_test_arr[i] = float64_to_float32(X_test_arr[i])
#     Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
#     Y_test_arr[i] = float64_to_float32(Y_test_arr[i])
# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

In [39]:
# # # Training process of the default config
# num_features = len(best_features)
# ae_hidden_layers_size = [64]
# ae_latent_size = 16
# mlp_hidden_layers_size = [4, 2]
# lr = 0.0005
# dropout = 0.5
# batch_size = 180
# num_epochs = 30

# train_eval_cv_mlx_aemlp(num_features, ae_hidden_layers_size, ae_latent_size, 
#                         mlp_hidden_layers_size, dropout, 
#                         lr, default_cv,
#                         X_train_arr, X_test_arr, Y_train_arr, Y_test_arr,
#                         batch_size, num_epochs)

Optimize with bayesian optimization

In [40]:
ae_default_num_layers = 2
mlp_default_num_layers = 1

In [41]:
def objective_aemlp_mlx(trial):
    # First initialize the parameters
    num_features = len(best_features)

    # initialize ae layers
    ae_num_layers = ae_default_num_layers
    ae_log_2_hidden_layers_size = []
    for i in range(ae_num_layers):
        if len(ae_log_2_hidden_layers_size) == 0:
            ae_log_2_hidden_layers_size.append(trial.suggest_int(f"ae_log2_hidden_layer_{i}", 3, int(math.ceil(math.log2(num_features)))))
        else:
            ae_log_2_hidden_layers_size.append(trial.suggest_int(f"ae_log2_hidden_layer_{i}", 3, ae_log_2_hidden_layers_size[-1]))
    ae_hidden_layers_size = [2**i for i in ae_log_2_hidden_layers_size]
    ae_latent_size = 2**trial.suggest_int("ae_log2_latent_size", 3, ae_log_2_hidden_layers_size[-1])
    ae_dropout = trial.suggest_categorical("ae_dropout", [0.2, 0.3, 0.4, 0.5, 0.6, 0.7])
    ae_lr = trial.suggest_float("ae_lr", 0.0001, 0.01, log=True)
    ae_num_epochs = trial.suggest_categorical("num_epochs", [5 * i for i in range(1, 7)])

    # initialize mlp layers
    mlp_num_layers = mlp_default_num_layers
    mlp_log_2_hidden_layers_size = []
    for i in range(mlp_num_layers):
        if len(mlp_log_2_hidden_layers_size) == 0:
            mlp_log_2_hidden_layers_size.append(trial.suggest_int(f"mlp_log2_hidden_layer_{i}", 2, int(math.ceil(math.log2(ae_latent_size + num_features)))))
        else:
            mlp_log_2_hidden_layers_size.append(trial.suggest_int(f"mlp_log2_hidden_layer_{i}", 2, mlp_log_2_hidden_layers_size[-1]))
    mlp_hidden_layers_size = [2**i for i in mlp_log_2_hidden_layers_size]
    mlp_dropout = trial.suggest_categorical("mle_dropout", [0.2, 0.3, 0.4, 0.5, 0.6, 0.7])
    mlp_lr = trial.suggest_float("mlp_lr", 0.0001, 0.01, log=True)
    # mlp_num_epochs = trial.suggest_categorical("mlp_num_epochs", [10, 20, 30, 40, 50])
    mlp_num_epochs = ae_num_epochs

    # batch size
    batch_size = trial.suggest_categorical("batch_size", [30, 60, 120, 180, 360, 720, 1440])
    
    # Conduct training based on those parameters
    cv_pearson = train_eval_cv_mlx_aemlp(num_features, ae_hidden_layers_size, ae_latent_size, ae_dropout, ae_lr, ae_num_epochs,
                                         mlp_hidden_layers_size, mlp_dropout, mlp_lr, mlp_num_epochs,
                                         default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr,
                                         batch_size)
    
    return cv_pearson

In [42]:
def optimize_aemlp_mlx(study_name, storage_name, objective_function=objective_aemlp_mlx, n_trials = 100, n_jobs = 1):
    print("Conduct hyperparam opt for AE-MLP")
    study = optuna.create_study(
        study_name = study_name,
        direction ='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

In [43]:
best_features = [col for col in train_df.columns if "X" in col and "interaction" not in col]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_df, best_features)
for i in range(default_cv):
    X_train_arr[i] = float64_to_float32(X_train_arr[i])
    X_test_arr[i] = float64_to_float32(X_test_arr[i])
    Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
    Y_test_arr[i] = float64_to_float32(Y_test_arr[i])
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

[3, 4, 5, 6] [8, 9, 10, 11]
[4, 5, 6, 7] [9, 10, 11, 12]
[5, 6, 7, 8] [10, 11, 12, 1]
[6, 7, 8, 9] [11, 12, 1, 2]


In [None]:
optimize_aemlp_mlx(
    f"aemlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{ae_default_num_layers}_{mlp_default_num_layers}_study",
    f"aemlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{ae_default_num_layers}_{mlp_default_num_layers}_study"
)

[I 2025-07-16 17:12:39,809] A new study created in RDB with name: aemlp_mlx_2_4_101_2_1_study


Conduct hyperparam opt for AE-MLP


100%|██████████| 25/25 [02:07<00:00,  5.12s/it]
100%|██████████| 25/25 [00:44<00:00,  1.79s/it]


0.07083862792685322


100%|██████████| 25/25 [02:04<00:00,  5.00s/it]
100%|██████████| 25/25 [00:44<00:00,  1.80s/it]


0.08354669320165632


100%|██████████| 25/25 [02:09<00:00,  5.20s/it]
100%|██████████| 25/25 [00:46<00:00,  1.85s/it]


0.06824357669370684


100%|██████████| 25/25 [02:09<00:00,  5.18s/it]
100%|██████████| 25/25 [00:44<00:00,  1.77s/it]
[I 2025-07-16 17:24:16,372] Trial 0 finished with value: 0.07453052749901101 and parameters: {'ae_log2_hidden_layer_0': 5, 'ae_log2_hidden_layer_1': 4, 'ae_log2_latent_size': 3, 'ae_dropout': 0.6, 'ae_lr': 0.00023981586388374809, 'num_epochs': 25, 'mlp_log2_hidden_layer_0': 2, 'mle_dropout': 0.3, 'mlp_lr': 0.0001886854774519709, 'batch_size': 60}. Best is trial 0 with value: 0.07453052749901101.


0.07549321217382765


100%|██████████| 15/15 [00:03<00:00,  4.54it/s]
100%|██████████| 15/15 [00:01<00:00, 13.82it/s]


0.08104492241165681


100%|██████████| 15/15 [00:03<00:00,  4.33it/s]
100%|██████████| 15/15 [00:01<00:00, 13.64it/s]


0.09775593386103013


100%|██████████| 15/15 [00:03<00:00,  4.53it/s]
100%|██████████| 15/15 [00:01<00:00, 13.78it/s]


0.06886802567778894


100%|██████████| 15/15 [00:03<00:00,  4.35it/s]
100%|██████████| 15/15 [00:01<00:00, 13.69it/s]
[I 2025-07-16 17:24:34,526] Trial 1 finished with value: 0.08502817703736065 and parameters: {'ae_log2_hidden_layer_0': 7, 'ae_log2_hidden_layer_1': 5, 'ae_log2_latent_size': 4, 'ae_dropout': 0.2, 'ae_lr': 0.00015642024418779235, 'num_epochs': 15, 'mlp_log2_hidden_layer_0': 2, 'mle_dropout': 0.2, 'mlp_lr': 0.0014138722599267763, 'batch_size': 1440}. Best is trial 1 with value: 0.08502817703736065.


0.09244382619896671


100%|██████████| 25/25 [00:51<00:00,  2.05s/it]
100%|██████████| 25/25 [00:15<00:00,  1.63it/s]


0.08648335596086322


100%|██████████| 25/25 [00:50<00:00,  2.01s/it]
100%|██████████| 25/25 [00:15<00:00,  1.66it/s]


0.11099534281035474


100%|██████████| 25/25 [00:52<00:00,  2.08s/it]
100%|██████████| 25/25 [00:15<00:00,  1.63it/s]


0.08250940073057213


100%|██████████| 25/25 [03:19<00:00,  7.99s/it]
100%|██████████| 25/25 [00:33<00:00,  1.36s/it]
[I 2025-07-16 17:31:48,953] Trial 2 finished with value: 0.0896077357429978 and parameters: {'ae_log2_hidden_layer_0': 7, 'ae_log2_hidden_layer_1': 5, 'ae_log2_latent_size': 4, 'ae_dropout': 0.7, 'ae_lr': 0.002190386936542009, 'num_epochs': 25, 'mlp_log2_hidden_layer_0': 6, 'mle_dropout': 0.7, 'mlp_lr': 0.0002553026020716765, 'batch_size': 180}. Best is trial 2 with value: 0.0896077357429978.


0.0784428434702011


100%|██████████| 15/15 [00:07<00:00,  2.14it/s]
100%|██████████| 15/15 [00:02<00:00,  5.49it/s]


0.07318641422861


100%|██████████| 15/15 [00:06<00:00,  2.18it/s]
100%|██████████| 15/15 [00:02<00:00,  5.73it/s]


0.08580227541894896


100%|██████████| 15/15 [00:06<00:00,  2.21it/s]
100%|██████████| 15/15 [00:01<00:00,  7.72it/s]


0.1026645784449774


100%|██████████| 15/15 [00:03<00:00,  4.96it/s]
100%|██████████| 15/15 [00:20<00:00,  1.34s/it]
[I 2025-07-16 17:32:40,425] Trial 3 finished with value: 0.08383331130074101 and parameters: {'ae_log2_hidden_layer_0': 4, 'ae_log2_hidden_layer_1': 3, 'ae_log2_latent_size': 3, 'ae_dropout': 0.2, 'ae_lr': 0.00048407598264737844, 'num_epochs': 15, 'mlp_log2_hidden_layer_0': 7, 'mle_dropout': 0.7, 'mlp_lr': 0.002971547402265493, 'batch_size': 1440}. Best is trial 2 with value: 0.0896077357429978.


0.07367997711042767


100%|██████████| 10/10 [16:46<00:00, 100.66s/it] 
100%|██████████| 10/10 [00:36<00:00,  3.67s/it]


0.07546328370534132


100%|██████████| 10/10 [01:38<00:00,  9.83s/it]
100%|██████████| 10/10 [00:36<00:00,  3.62s/it]


0.1305374859474908


100%|██████████| 10/10 [01:37<00:00,  9.77s/it]
100%|██████████| 10/10 [00:48<00:00,  4.87s/it]


0.08393361594295543


100%|██████████| 10/10 [01:39<00:00,  9.97s/it]
100%|██████████| 10/10 [00:39<00:00,  3.93s/it]
[I 2025-07-16 17:57:12,342] Trial 4 finished with value: 0.08832444586508256 and parameters: {'ae_log2_hidden_layer_0': 4, 'ae_log2_hidden_layer_1': 3, 'ae_log2_latent_size': 3, 'ae_dropout': 0.6, 'ae_lr': 0.0005164211587438335, 'num_epochs': 10, 'mlp_log2_hidden_layer_0': 7, 'mle_dropout': 0.6, 'mlp_lr': 0.0003406790268619332, 'batch_size': 30}. Best is trial 2 with value: 0.0896077357429978.


0.06336339786454272


100%|██████████| 10/10 [00:19<00:00,  1.98s/it]
100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.023975748118211473


100%|██████████| 10/10 [00:19<00:00,  1.99s/it]
100%|██████████| 10/10 [00:06<00:00,  1.50it/s]


0.1282754136920142


100%|██████████| 10/10 [00:20<00:00,  2.00s/it]
100%|██████████| 10/10 [00:06<00:00,  1.56it/s]


0.05885243718775736


100%|██████████| 10/10 [00:20<00:00,  2.01s/it]
100%|██████████| 10/10 [00:06<00:00,  1.60it/s]
[I 2025-07-16 17:58:59,161] Trial 5 finished with value: 0.06881929988436095 and parameters: {'ae_log2_hidden_layer_0': 7, 'ae_log2_hidden_layer_1': 4, 'ae_log2_latent_size': 4, 'ae_dropout': 0.3, 'ae_lr': 0.0012412583088355978, 'num_epochs': 10, 'mlp_log2_hidden_layer_0': 6, 'mle_dropout': 0.3, 'mlp_lr': 0.004275613445812877, 'batch_size': 180}. Best is trial 2 with value: 0.0896077357429978.


0.06417360053946074


100%|██████████| 15/15 [00:26<00:00,  1.75s/it]
100%|██████████| 15/15 [00:09<00:00,  1.60it/s]


0.09401447185093115


100%|██████████| 15/15 [00:26<00:00,  1.74s/it]
100%|██████████| 15/15 [00:09<00:00,  1.57it/s]


0.10843063297083129


100%|██████████| 15/15 [00:26<00:00,  1.79s/it]
100%|██████████| 15/15 [00:09<00:00,  1.58it/s]


0.10334287156824214


100%|██████████| 15/15 [00:28<00:00,  1.91s/it]
100%|██████████| 15/15 [00:14<00:00,  1.00it/s]
[I 2025-07-16 18:01:31,821] Trial 6 finished with value: 0.0981172522534084 and parameters: {'ae_log2_hidden_layer_0': 4, 'ae_log2_hidden_layer_1': 3, 'ae_log2_latent_size': 3, 'ae_dropout': 0.3, 'ae_lr': 0.0004236017919707298, 'num_epochs': 15, 'mlp_log2_hidden_layer_0': 5, 'mle_dropout': 0.6, 'mlp_lr': 0.000551919594602697, 'batch_size': 180}. Best is trial 6 with value: 0.0981172522534084.


0.08668103262362903


100%|██████████| 20/20 [00:37<00:00,  1.89s/it]
100%|██████████| 20/20 [00:17<00:00,  1.18it/s]


0.08933723682373983


100%|██████████| 20/20 [00:33<00:00,  1.68s/it]
100%|██████████| 20/20 [00:16<00:00,  1.22it/s]


0.04718401179415049


100%|██████████| 20/20 [01:05<00:00,  3.26s/it]
100%|██████████| 20/20 [00:27<00:00,  1.38s/it]


0.006569495779550258


100%|██████████| 20/20 [00:31<00:00,  1.57s/it]
100%|██████████| 20/20 [10:59<00:00, 32.98s/it] 
[I 2025-07-16 18:16:22,213] Trial 7 finished with value: 0.04379681758856209 and parameters: {'ae_log2_hidden_layer_0': 3, 'ae_log2_hidden_layer_1': 3, 'ae_log2_latent_size': 3, 'ae_dropout': 0.6, 'ae_lr': 0.001651927639666053, 'num_epochs': 20, 'mlp_log2_hidden_layer_0': 4, 'mle_dropout': 0.2, 'mlp_lr': 0.008607749987514188, 'batch_size': 180}. Best is trial 6 with value: 0.0981172522534084.


0.032096525956807766


 53%|█████▎    | 16/30 [02:42<02:15,  9.67s/it]

#### Tenth Trial: Adding interaction terms to models

In [None]:
best_features = ['X757', 'X758', 'X759', 'X508', 'X614', 'X752', 'X331', 'X445', 'X465', 'X385', 
                 'X466', 'X95', 'X23', 'X219', 'X31', 'X373', 'X379', 'X284', 'X750', 'X652', 
                 'X279', 'X89', 'X169', 'X753', 'X226', 'X28', 'X444', 'X272', 'X271', 'X218']
best_interaction_features = ['X758_X508_X219_interaction', 'X758_X508_X218_interaction', 'X758_X219_X89_interaction']

X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_df, best_features + best_interaction_features)

XGBoost

In [None]:
best_xgboost_params_common_truncated_with_interaction = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_with_interaction_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_with_interaction_study"
) 

LightGBM

In [None]:
best_lightgbm_params_common_truncated_with_interaction = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_with_interaction_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_with_interaction_study"
) 

MLP

In [None]:
default_num_layers = 2
# Create the CV data, seems to be better with only anonymized features
best_features = ['X757', 'X758', 'X759', 'X508', 'X614', 'X752', 'X331', 'X445', 'X465', 'X385', 
                 'X466', 'X95', 'X23', 'X219', 'X31', 'X373', 'X379', 'X284', 'X750', 'X652', 
                 'X279', 'X89', 'X169', 'X753', 'X226', 'X28', 'X444', 'X272', 'X271', 'X218']
                # ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                # [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
best_interaction_features = ['X758_X508_X219_interaction', 'X758_X508_X218_interaction', 'X758_X219_X89_interaction']
best_features = list(set(best_features + best_interaction_features))
# best_features = [col for col in train_df.columns if "X" in col]
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_df, best_features)

# Convert to float32
for i in range(default_cv):
    X_train_arr[i] = float64_to_float32(X_train_arr[i])
    X_test_arr[i] = float64_to_float32(X_test_arr[i])
    Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
    Y_test_arr[i] = float64_to_float32(Y_test_arr[i])

# Convert to MLX
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

In [None]:
optimize_mlp_mlx(
    f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{default_num_layers}_common_truncated_30_with_interaction_study",
    f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{default_num_layers}_common_truncated_30_with_interaction_study"
)

#### Eleventh Trial: AE-MLP but MLP step is concat of good features + AE features

In [None]:
# Custom function for batch iteration
def batch_iterate_with_concat(batch_size, X, Y, X_good, shuffle = True):
    for i in range(0, Y.size, batch_size):
        X_curr = X[i: min(i + batch_size, Y.size), :]
        Y_curr = Y[i: min(i + batch_size, Y.size)]
        X_good_curr = X_good[i: min(i + batch_size, Y.size), :]
        if shuffle:
            inx_lst = mx.random.permutation(batch_size)
            X_curr = X_curr[inx_lst, :]
            Y_curr = Y_curr[inx_lst]
            X_good_curr = X_good_curr[inx_lst, :]
        yield X_curr, Y_curr, X_good_curr

In [None]:
# Separate function for train & eval step
def train_aemlp_mlx_with_concat(ae_model, ae_loss_and_grad_fn, ae_optimizer, ae_num_epochs,
                                mlp_model, mlp_loss_and_grad_fn, mlp_optimizer, mlp_num_epochs,
                                X_train, Y_train, X_train_good, batch_size):
    # Train ae first
    ae_model.train()
    for _ in tqdm(range(ae_num_epochs)):
        for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
            # get gradients for ae, output is the inputs itself
            _, ae_grads = ae_loss_and_grad_fn(ae_model, inputs, inputs)

            # Update the optimizer state and model parameters in a single call
            ae_optimizer.update(ae_model, ae_grads)

            # Force a graph evaluation
            mx.eval(ae_model.parameters(), ae_optimizer.state)

    # Train mlp later
    mlp_model.train()
    for _ in tqdm(range(mlp_num_epochs)):
        for (inputs, targets, inputs_good) in batch_iterate_with_concat(batch_size, X_train, Y_train, X_train_good):
            # get the latent representation for X_train
            latent_inputs = ae_model.get_latent(inputs)
            used_inputs = mx.concatenate([inputs_good, latent_inputs], axis=1)
            # get gradients for mlp
            _, mlp_grads = mlp_loss_and_grad_fn(mlp_model, used_inputs, targets)

            # Update the optimizer state and model parameters in a single call
            mlp_optimizer.update(mlp_model, mlp_grads)

            # Force a graph evaluation
            mx.eval(mlp_model.parameters(), mlp_optimizer.state)

    # # Train ae and mlp together
    # ae_model.train()
    # mlp_model.train()
    # for _ in tqdm(range(ae_num_epochs)):
    #     for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
    #         # get gradients for ae, output is the inputs itself
    #         _, ae_grads = ae_loss_and_grad_fn(ae_model, inputs, inputs)

    #         # Update the optimizer state and model parameters in a single call
    #         ae_optimizer.update(ae_model, ae_grads)

    #         # Force a graph evaluation
    #         mx.eval(ae_model.parameters(), ae_optimizer.state)

    #         # get gradients for mlp
    #         latent_inputs = ae_model.get_latent(inputs)
    #         used_inputs = mx.concatenate([inputs, latent_inputs], axis=1)
    #         _, mlp_grads = mlp_loss_and_grad_fn(mlp_model, used_inputs, targets)

    #         # Update the optimizer state and model parameters in a single call
    #         mlp_optimizer.update(mlp_model, mlp_grads)

    #         # Force a graph evaluation
    #         mx.eval(mlp_model.parameters(), mlp_optimizer.state)

def eval_aemlp_mlx_with_concat(ae_model, mlp_model, X_test, Y_test, X_test_good, batch_size):
    outputs_all = np.zeros(0)
    targets_all = np.zeros(0)
    ae_model.eval()
    mlp_model.eval()
    for (inputs, targets, inputs_good) in batch_iterate_with_concat(batch_size, X_test, Y_test, X_test_good, shuffle=False):
        latent_inputs = ae_model.get_latent(inputs)
        used_inputs = mx.concatenate([inputs_good, latent_inputs], axis=1)
        outputs = mlp_model(used_inputs).reshape(-1)
        # convert back to numpy
        outputs, targets = np.array(outputs), np.array(targets)
        # Load to overall Y_test, Y_pred to calculate pearson score later
        outputs_all = np.concatenate([outputs_all, outputs])
        targets_all = np.concatenate([targets_all, targets])
    return pearson_score(targets_all, outputs_all)

In [None]:
def train_eval_cv_mlx_aemlp_with_concat(num_features, ae_hidden_layers_size, ae_latent_size, ae_dropout, ae_lr, ae_num_epochs,
                                        mlp_hidden_layers_size, mlp_dropout, mlp_lr, mlp_num_epochs,
                                        cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, 
                                        X_train_good_arr, X_test_good_arr, batch_size):
    cv_pearson = 0
    for _, (X_train, X_test, Y_train, Y_test, X_train_good, X_test_good) in enumerate(zip(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, X_train_good_arr, X_test_good_arr)):
        # initialize the model
        mx.random.seed(default_random_state)
        ae_model = AEMLX(num_features, ae_hidden_layers_size, ae_latent_size, ae_dropout)

        mx.random.seed(default_random_state)
        mlp_model = MLPMLX(ae_latent_size + X_train_good.shape[1], mlp_hidden_layers_size, mlp_dropout)

        # Initialize the loss function (both use same loss function)
        def loss_fn(model, X, Y):
            Y_pred = model(X).reshape(-1)
            Y = Y.reshape(-1)
            return mx.mean(nnmx.losses.mse_loss(Y_pred, Y))
        ae_loss_and_grad_fn = nnmx.value_and_grad(ae_model, loss_fn)
        mlp_loss_and_grad_fn = nnmx.value_and_grad(mlp_model, loss_fn)

        # Reinitialize the optimizer
        ae_optimizer = optimmx.Adam(learning_rate = ae_lr)
        mlp_optimizer = optimmx.Adam(learning_rate = mlp_lr)

        # Train the model
        train_aemlp_mlx_with_concat(ae_model, ae_loss_and_grad_fn, ae_optimizer, ae_num_epochs,
                                    mlp_model, mlp_loss_and_grad_fn, mlp_optimizer, mlp_num_epochs,
                                    X_train, Y_train, X_train_good, batch_size)

        # Test the model
        pearson = eval_aemlp_mlx_with_concat(ae_model, mlp_model, X_test, Y_test, X_test_good, batch_size)
        print(pearson)
        if pearson == -1:
            return pearson
        cv_pearson += pearson
    return cv_pearson / cv

In [None]:
def objective_aemlp_mlx_with_concat(trial):
    # First initialize the parameters
    num_features = len(best_features)

    # initialize ae layers
    ae_num_layers = ae_default_num_layers
    ae_log_2_hidden_layers_size = []
    for i in range(ae_num_layers):
        if len(ae_log_2_hidden_layers_size) == 0:
            ae_log_2_hidden_layers_size.append(trial.suggest_int(f"ae_log2_hidden_layer_{i}", 3, int(math.ceil(math.log2(num_features)))))
        else:
            ae_log_2_hidden_layers_size.append(trial.suggest_int(f"ae_log2_hidden_layer_{i}", 3, ae_log_2_hidden_layers_size[-1]))
    ae_hidden_layers_size = [2**i for i in ae_log_2_hidden_layers_size]
    ae_latent_size = 2**trial.suggest_int("ae_log2_latent_size", 3, ae_log_2_hidden_layers_size[-1])
    ae_dropout = trial.suggest_categorical("ae_dropout", [0.2, 0.3, 0.4, 0.5, 0.6, 0.7])
    ae_lr = trial.suggest_float("ae_lr", 0.0001, 0.01, log=True)
    ae_num_epochs = trial.suggest_categorical("num_epochs", [10, 20, 30, 40, 50])

    # initialize mlp layers
    mlp_num_layers = mlp_default_num_layers
    mlp_log_2_hidden_layers_size = []
    for i in range(mlp_num_layers):
        if len(mlp_log_2_hidden_layers_size) == 0:
            mlp_log_2_hidden_layers_size.append(trial.suggest_int(f"mlp_log2_hidden_layer_{i}", 2, int(math.ceil(math.log2(ae_latent_size + num_features)))))
        else:
            mlp_log_2_hidden_layers_size.append(trial.suggest_int(f"mlp_log2_hidden_layer_{i}", 2, mlp_log_2_hidden_layers_size[-1]))
    mlp_hidden_layers_size = [2**i for i in mlp_log_2_hidden_layers_size]
    mlp_dropout = trial.suggest_categorical("mlp_dropout", [0.2, 0.3, 0.4, 0.5, 0.6, 0.7])
    mlp_lr = trial.suggest_float("mlp_lr", 0.0001, 0.01, log=True)
    # mlp_num_epochs = trial.suggest_categorical("mlp_num_epochs", [10, 20, 30, 40, 50])
    mlp_num_epochs = ae_num_epochs

    # batch size
    batch_size = trial.suggest_categorical("batch_size", [30, 60, 120, 180, 360, 720, 1440])
    
    # Conduct training based on those parameters
    cv_pearson = train_eval_cv_mlx_aemlp_with_concat(num_features, ae_hidden_layers_size, ae_latent_size, ae_dropout, ae_lr, ae_num_epochs,
                                                     mlp_hidden_layers_size, mlp_dropout, mlp_lr, mlp_num_epochs,
                                                     default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr,
                                                     X_train_good_arr, X_test_good_arr, batch_size)
    
    return cv_pearson

In [None]:
best_features = [col for col in train_df.columns if "X" in col and "interaction" not in col]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_df, best_features)
for i in range(default_cv):
    X_train_arr[i] = float64_to_float32(X_train_arr[i])
    X_test_arr[i] = float64_to_float32(X_test_arr[i])
    Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
    Y_test_arr[i] = float64_to_float32(Y_test_arr[i])
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
best_features_good = ['X757', 'X758', 'X759', 'X508', 'X614', 'X752', 'X331', 'X445', 'X465', 'X385', 
                      'X466', 'X95', 'X23', 'X219', 'X31', 'X373', 'X379', 'X284', 'X750', 'X652', 
                      'X279', 'X89', 'X169', 'X753', 'X226', 'X28', 'X444', 'X272', 'X271', 'X218']
X_train_good_arr, X_test_good_arr, _, _ = create_cv(train_df, best_features_good)
for i in range(default_cv):
    X_train_good_arr[i] = float64_to_float32(X_train_good_arr[i])
    X_test_good_arr[i] = float64_to_float32(X_test_good_arr[i])
X_train_good_arr, X_test_good_arr, _, _ = normal_cv_to_mlx_cv(X_train_good_arr, X_test_good_arr)

In [None]:
ae_default_num_layers = 2
mlp_default_num_layers = 1

In [None]:
optimize_aemlp_mlx(
    f"aemlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{ae_default_num_layers}_{mlp_default_num_layers}_with_good_concat_study",
    f"aemlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{ae_default_num_layers}_{mlp_default_num_layers}_with_good_concat_study",
    objective_function = objective_aemlp_mlx_with_concat
)