In [1]:
import math
import gc
import pickle
import random
from copy import deepcopy
from tqdm import tqdm
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
import optuna
from optuna.samplers import RandomSampler, TPESampler, GPSampler
import warnings
warnings.filterwarnings("ignore")
# import multiprocessing
# max_n_jobs = multiprocessing.cpu_count()
import shap
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader, Sampler
import torch.nn as nn
import torch.optim as optim
import mlx.core as mx
import mlx.nn as nnmx
import mlx.optimizers as optimmx

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set the device
device = "mps" if torch.backends.mps.is_available() else "cpu"

In [3]:
feature_version = 2
# 1 for pc feature, 
# 2 for label correlation feature # seems to work most consistently
# 3 for best features based on combination rank
# 4 for including time features (in case we want to reverse engineer the masked timestamp)
# 5 for increasing number of correlation features + only use those that are in the same cluster

In [4]:
default_random_state = 101
random.seed(default_random_state)
np.random.seed(default_random_state)
torch.manual_seed(default_random_state)
torch.mps.manual_seed(default_random_state)
mx.random.seed(default_random_state)

#### Import train data and popular features

In [5]:
train_df = pd.read_parquet(f"data/cleaned/cleaned_train_{feature_version}.parquet")
train_df.head()

Unnamed: 0,X473,X205,X198,X444,X466,X445,X472,X26,X29,X217,...,normalized_buy_volume,normalized_sell_volume,liquidity_adjusted_imbalance,pressure_spread_interaction,trade_direction_ratio,net_buy_volume,bid_skew,ask_skew,timestamp,label
0,-0.201346,-1.978504,-1.700689,-0.142546,-0.163476,-0.128331,-0.126241,1.406392,1.474789,-0.981975,...,11.542564,5.339347,0.063569,-0.230493,0.79681,131.421,0.644635,0.355365,2023-03-01 00:00:00,0.562539
1,-0.186231,-1.830295,-1.669471,-0.135499,-0.159388,-0.12479,-0.115015,1.003783,1.312735,-0.94019,...,13.626484,137.821061,0.01161,-0.549445,0.620251,203.896,0.942921,0.057079,2023-03-01 00:01:00,0.533686
2,-0.182398,-1.80354,-1.662645,-0.133705,-0.158627,-0.123891,-0.112303,0.760801,1.219124,-0.933071,...,360.242073,2.263386,0.015877,0.530818,0.538664,22.858,0.007283,0.992717,2023-03-01 00:02:00,0.546505
3,-0.177415,-1.714013,-1.620037,-0.133251,-0.158334,-0.123658,-0.109113,0.955549,1.353001,-0.891216,...,69.011716,5.946089,0.025702,0.45478,0.728757,210.779,0.187976,0.812024,2023-03-01 00:03:00,0.357703
4,-0.174164,-1.68417,-1.600188,-0.128862,-0.156668,-0.121464,-0.106383,0.90546,1.36188,-0.878711,...,3.623647,12.867864,0.081042,-0.533689,0.689066,54.004,0.887255,0.112745,2023-03-01 00:04:00,0.362452


In [6]:
popular_features_train = pd.read_parquet("data/cleaned/popular_features_train.parquet")
popular_features_train.head()

Unnamed: 0,volume,bid_qty,ask_qty,buy_qty,sell_qty
0,221.389,15.283,8.425,176.405,44.984
1,847.796,38.59,2.336,525.846,321.95
2,295.596,0.442,60.25,159.227,136.369
3,460.705,4.865,21.016,335.742,124.963
4,142.818,27.158,3.451,98.411,44.407


#### Implement some helper function

In [7]:
# First need to split into some fold
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])

default_cv = 4
default_cv_type = "full"
# NOTE: default_cv must set to 1 instead of 3 based on consistency with LB score contains 49% of test data
# NOTE: 3 cv with gap is slightly better or almost equal

def create_cv(train_df, features=None, cv=default_cv):
    if features is not None:
        train_df = train_df[features + ["timestamp", "label"]]
    X_train_arr = []
    X_test_arr = []
    Y_train_arr = []
    Y_test_arr = []
    for i in range(cv):
        train_month = list(range(3 + i, 7 + i))
        # train_month = [3, 4, 5, 6, 7, 8]
        test_month = list(map(lambda x: x % 12 if x > 12 else x, list(range(8 + i, 12 + i))))
        print(train_month, test_month)
        # test_month = [9, 10, 11, 12, 1, 2] # try to make a gap to see if there is any differences in cv-lb correlation
        # print(train_month, test_month)
        train = train_df[train_df["timestamp"].dt.month.isin(train_month)].reset_index().drop("index", axis = 1)
        test = train_df[train_df["timestamp"].dt.month.isin(test_month)].reset_index().drop("index", axis = 1)
        X_train_arr.append(train.drop(["timestamp", "label"], axis = 1))
        X_test_arr.append(test.drop(["timestamp", "label"], axis = 1))
        Y_train_arr.append(train["label"])
        Y_test_arr.append(test["label"])  
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

# def create_cv_random_test(train_df, features=None, test_cv=10):
#     # randomize so that we have 1 train, but try it on 10 different test 
#     if features is not None:
#         train_df = train_df[features + ["timestamp", "label"]]
#     X_train_arr = []
#     X_test_arr = []
#     Y_train_arr = []
#     Y_test_arr = []

#     # Create train data
#     train_month = [3, 4, 5, 6, 7, 8]
#     train = train_df[train_df["timestamp"].dt.month.isin(train_month)] 
#     X_train_arr.append(train.drop(["timestamp", "label"], axis = 1))
#     Y_train_arr.append(train["label"])

#     test_month = [9, 10, 11, 12, 1, 2]
#     test = train_df[train_df["timestamp"].dt.month.isin(test_month)]
#     # Create test data
#     for _ in range(test_cv):
#         random_test = test.sample(frac = 0.5, random_state = default_random_state)
#         X_test_arr.append(random_test.drop(["timestamp", "label"], axis = 1))
#         Y_test_arr.append(random_test["label"])

#     return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr 

# class [-1, 0, 1] -> [0, 1, 2] => < -0.2 => neg, > 0.2 => pos, else => neutral
def create_classification_class(label):
    if label < -0.4: return 0
    elif label < 0: return 1
    elif label < 0.4: return 2
    return 3

def create_cv_classification(train_df, features=None, cv=default_cv):
    if features is not None:
        train_df = train_df[features + ["timestamp", "label"]]
    X_train_arr = []
    X_test_arr = []
    Y_train_arr = []
    Y_test_arr = []
    for i in range(cv):
        train_month = list(range(3 + i, 7 + i))
        # train_month = [3, 4, 5, 6, 7, 8]
        test_month = list(map(lambda x: x % 12 if x > 12 else x, list(range(8 + i, 12 + i))))
        print(train_month, test_month)
        # test_month = [9, 10, 11, 12, 1, 2] # try to make a gap to see if there is any differences in cv-lb correlation
        # print(train_month, test_month)
        train = train_df[train_df["timestamp"].dt.month.isin(train_month)].reset_index().drop("index", axis = 1)
        test = train_df[train_df["timestamp"].dt.month.isin(test_month)].reset_index().drop("index", axis = 1)
        X_train_arr.append(train.drop(["timestamp", "label"], axis = 1))
        X_test_arr.append(test.drop(["timestamp", "label"], axis = 1))
        Y_train_arr.append(train["label"].apply(lambda x: create_classification_class(x)))
        Y_test_arr.append(test["label"].apply(lambda x: create_classification_class(x)))  
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

In [8]:
def pearson_score(Y_test, Y_pred):
    if isinstance(Y_test, pd.Series) or isinstance(Y_test, pd.DataFrame):
        Y_test = Y_test.values
    if isinstance(Y_pred, pd.Series) or isinstance(Y_pred, pd.DataFrame):
        Y_pred = Y_pred.values
    Y_test = np.ravel(Y_test)
    Y_pred = np.ravel(Y_pred)
    pearson = np.corrcoef(Y_test, Y_pred)[0, 1]
    if np.isnan(pearson):
        if np.std(Y_pred) == 0:
            print(Y_pred)
            print("Error: zero variance prediction")
        elif np.isnan(Y_pred).any():
            print("Error: nan prediction")
        return -1
    else:
        return pearson

In [9]:
# Make function specifically for cross validation
def train_eval_cv(model, cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, scoring_function=pearson_score):
    cv_score = 0

    for i in range(cv):
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        cv_score += scoring_function(Y_test, Y_pred)
    
    return cv_score / cv

def train_eval_cv_random_test(model, cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, scoring_function=pearson_score, test_cv = 10):
    cv_score = 0

    for i in range(cv):
        curr_cv_score = 0

        # Conduct fitting
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        model.fit(X_train, Y_train)
        
        # sampling and testing
        len_test = X_test.shape[0]
        for seed in tqdm(range(test_cv)):
            np.random.seed(seed)
            test_index = np.random.choice(len_test, size = len_test // 2, replace = False) 
            X_test_sample = X_test.loc[test_index, :]
            Y_test_sample = Y_test[test_index]
            Y_pred_sample = model.predict(X_test_sample)
            curr_cv_score += scoring_function(Y_test_sample, Y_pred_sample)
        
        cv_score += curr_cv_score / test_cv
    
    np.random.seed(default_random_state)
    return cv_score

In [10]:
default_n_trees = 2000
# Finetuning XGBoost
def objective_xgboost(trial):
    params = {
        "n_estimators": default_n_trees,
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log = True), # 0.001 - 0.1 -> 0.01 - 0.05 
        "verbosity": 0,
        "subsample": trial.suggest_float("subsample", 0.05, 1.0), # 1.0 -> 0.2
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.05, 1), 
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1), 
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "enable_categorical": True,
        "random_state": default_random_state
    }

    xgbr = XGBRegressor(**params)
    cv_pearson = train_eval_cv(xgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, pearson_score)
    return cv_pearson

def objective_lightgbm(trial):
    params = {
        "n_estimators": default_n_trees,
        "verbosity": -1,
        "max_depth": trial.suggest_int("max_depth", 2, 10), # 1 - 10 => 1 - 5
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 -> 0.005 - 0.02
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_state": default_random_state
    }

    lgbr = LGBMRegressor(**params)
    cv_pearson = train_eval_cv(lgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, pearson_score)
    return cv_pearson

def objective_catboost(trial):
    params = {
        "iterations": default_n_trees,
        "verbose": False,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 => 0.01 - 0.1
        "depth": trial.suggest_int("depth", 1, 10), #  1 - 10 => 5 - 15
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 600),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_seed": default_random_state
    }

    cbr = CatBoostRegressor(**params)
    cv_pearson = train_eval_cv(cbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, pearson_score)
    return cv_pearson

In [11]:
# Finetuning XGBoost
def objective_xgboost_classification(trial):
    params = {
        "n_estimators": default_n_trees,
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log = True), # 0.001 - 0.1 -> 0.01 - 0.05 
        "verbosity": 0,
        "subsample": trial.suggest_float("subsample", 0.05, 1.0), # 1.0 -> 0.2
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.05, 1), 
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1), 
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "enable_categorical": True,
        "random_state": default_random_state
    }

    xgbr = XGBClassifier(**params)
    cv_acc = train_eval_cv(xgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, accuracy_score)
    return cv_acc

def objective_lightgbm_classification(trial):
    params = {
        "n_estimators": default_n_trees,
        "verbosity": -1,
        "max_depth": trial.suggest_int("max_depth", 2, 10), # 1 - 10 => 1 - 5
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 -> 0.005 - 0.02
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_state": default_random_state
    }

    lgbr = LGBMClassifier(**params)
    cv_acc = train_eval_cv(lgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, accuracy_score)
    return cv_acc

def objective_catboost_classification(trial):
    params = {
        "iterations": default_n_trees,
        "verbose": False,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 => 0.01 - 0.1
        "depth": trial.suggest_int("depth", 1, 10), #  1 - 10 => 5 - 15
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 600),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_seed": default_random_state
    }

    cbr = CatBoostRegressor(**params)
    cv_acc = train_eval_cv(cbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, accuracy_score)
    return cv_acc

In [12]:
default_n_trials = 100
default_n_jobs = 1

def optimize_xgboost(study_name, storage_name, objective_function=objective_xgboost, n_trials = default_n_trials, n_jobs = default_n_jobs):
    print("Conduct hyperparam opt for XGBoost")
    study = optuna.create_study(
        study_name = study_name,
        direction ='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

def optimize_lightgbm(study_name, storage_name, objective_function=objective_lightgbm, n_trials = default_n_trials, n_jobs = default_n_jobs):
    print("Conduct hyperparam opt for LightGBM")
    study = optuna.create_study(
        study_name = study_name,
        direction='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

def optimize_catboost(study_name, storage_name, objective_function=objective_catboost, n_trials = default_n_trials, n_jobs = default_n_jobs):
    print("Conduct hyperparam opt for CatBoost")
    study = optuna.create_study(
        study_name = study_name,
        direction='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

#### First iteration: training with all features from the collection, no popular features

In [13]:
original_features = [f for f in train_df.columns if "X" in f]

X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_df, original_features)

[3, 4, 5, 6] [8, 9, 10, 11]
[4, 5, 6, 7] [9, 10, 11, 12]
[5, 6, 7, 8] [10, 11, 12, 1]
[6, 7, 8, 9] [11, 12, 1, 2]


In [None]:
best_params_xgboost = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study"
)

In [None]:
best_params_lightgbm = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study"
)

In [None]:
# best_params_catboost = optimize_catboost(
#     f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study",
#     f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study"
# )
# # Need to take down as catboost might not work well in this situation

Analyze params - cv relationship

In [14]:
def get_study_df(filename):
    study = optuna.load_study(
        study_name = filename,
        storage = f"sqlite:///{filename}.db"
    )
    study_df = []
    for trial in study.trials:
        trial_dict = trial.params
        trial_dict["value"] = trial.value
        study_df.append(trial_dict)

    return pd.DataFrame(study_df)

In [15]:
def params_value_viz(study_df):
    nrows = (study_df.shape[1] - 1) // 3 + ((study_df.shape[1] - 1) % 3 > 0)
    fig, ax = plt.subplots(nrows = nrows, ncols = 3, figsize = (14, 5 * nrows))
    for inx, var in enumerate(study_df.columns):
        x, y = inx // 3, inx % 3
        if var != "value":
            sns.regplot(study_df, x = var, y = "value", ax = ax[x][y], lowess=True, line_kws={'color': 'green'}, ci = 95)
    plt.show()

In [None]:
study_df_xgboost = get_study_df(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")   
params_value_viz(study_df_xgboost)

In [None]:
study_df_lightgbm = get_study_df(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
params_value_viz(study_df_lightgbm)

In [None]:
# study_df_catboost = get_study_df(f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
# params_value_viz(study_df_catboost)

Analyze feature importance + CV performance

In [16]:
def get_best_params_from_file(filename):
    study = optuna.load_study(
        study_name = filename,
        storage = f"sqlite:///{filename}.db"
    )
    return study.best_params

In [17]:
def get_shap_values(model, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, sample_size=10000):
    mean_abs_shap_all = np.zeros(X_train_arr[0].shape[1])
    for i in range(default_cv):
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        model.fit(X_train, Y_train)
        X_test_sample = X_test.sample(sample_size, random_state = default_random_state)
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        mean_abs_shap = np.mean(np.abs(shap_values), axis = 0)
        mean_abs_shap_all += mean_abs_shap
    mean_abs_shap_all /= default_cv
    return mean_abs_shap_all

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
for p in best_params_xgboost:
    params[p] = best_params_xgboost[p]

xgboost_feature_importances = {}

xgbr = XGBRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    xgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, xgbr.predict(X_test)))
    features = xgbr.feature_names_in_.tolist()
    # features_i = xgbr.feature_importances_.tolist()
    features_i = get_shap_values(xgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]

# print(feature_importances)
plt.hist(xgboost_feature_importances.values())
# Seems like only COD features are important (can try to only use 4-8 hours if 4-13 hours does not work well)

In [None]:
print([f for f in xgboost_feature_importances if xgboost_feature_importances[f] > 0.01])

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state
}
best_params_lightgbm = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
for p in best_params_lightgbm:
    params[p] = best_params_lightgbm[p]

lightgbm_feature_importances = {}

lgbr = LGBMRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    lgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, lgbr.predict(X_test)))
    features = lgbr.feature_names_in_.tolist()
    # features_i = lgbr.feature_importances_.tolist()
    features_i = get_shap_values(lgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

plt.hist(lightgbm_feature_importances.values())
# seems to pick up time features not as good as past 4 hours features

In [None]:
print([f for f in lightgbm_feature_importances if lightgbm_feature_importances[f] >= 0.01])

In [None]:
# params = {
#     "iterations": default_n_trees,
#     "verbose": False,
#     "random_seed": default_random_state
# }
# best_params_catboost = get_best_params_from_file(f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
# for p in best_params_catboost:
#     params[p] = best_params_catboost[p]

# catboost_feature_importances = {}

# cbr = CatBoostRegressor(**params)
# cv_rmse = 0

# for i in range(default_cv):
#     X_train, X_test = X_train_arr[i], X_test_arr[i]
#     Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
#     cbr.fit(X_train, Y_train)
#     print(pearson_score(Y_test, cbr.predict(X_test)))
#     features = cbr.feature_names_
#     # features_i = cbr.feature_importances_.tolist()
#     features_i = get_shap_values(cbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
#     for inx, feat in enumerate(features):
#         catboost_feature_importances[feat] = catboost_feature_importances.get(feat, 0) + features_i[inx]

# plt.hist(catboost_feature_importances.values())
# # can pick up a combination of both past cod and tss, not good at picking up ph, temp

In [None]:
# print([f for f in catboost_feature_importances if catboost_feature_importances[f] >= 0.02])

Get top 20 important features in all of them

In [None]:
xgboost_feature_importances_df = pd.DataFrame(
    {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
)
xgboost_feature_importances_df["importance"] /= default_cv
# xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
lightgbm_feature_importances_df = pd.DataFrame(
    {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
)
lightgbm_feature_importances_df["importance"] /= default_cv
# lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# catboost_feature_importances_df = pd.DataFrame(
#     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# )
# catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
feature_importances_df = xgboost_feature_importances_df.merge(
    lightgbm_feature_importances_df,
    on="var",
    how="inner",
    suffixes=("_xgboost", "_lightgbm")
)
# feature_importances_df = feature_importances_df.merge(
#     catboost_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("", "_catboost")
# )
# feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])

feature_importances_df["importance"] = 1/2 * (feature_importances_df["importance_xgboost"] + feature_importances_df["importance_lightgbm"])
feature_importances_df = feature_importances_df.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
feature_importances_df[:50]

In [None]:
feature_importances_df.to_csv("feature_importances_df.csv", index = False)

In [None]:
feature_importances_df = pd.read_csv("feature_importances_df.csv")
feature_importances_df

In [None]:
print(feature_importances_df.loc[:49, "var"].tolist())

#### Second Iteration: adding popular feature in addition to original features correlated to label

In [None]:
train_added_df = pd.concat([train_df, popular_features_train], axis = 1)

X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df)

In [None]:
best_params_xgboost_popular_feature = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study"
)

In [None]:
best_params_lightgbm_popular_feature = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study"
)

Check for feature importance

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost_popular_feature = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_xgboost_popular_feature:
    params[p] = best_params_xgboost_popular_feature[p]

xgboost_feature_importances = {}

xgbr = XGBRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    xgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, xgbr.predict(X_test)))
    features = xgbr.feature_names_in_.tolist()
    # features_i = xgbr.feature_importances_.tolist()
    features_i = get_shap_values(xgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]

# print(feature_importances)
plt.hist(xgboost_feature_importances.values())
# Seems like only COD features are important (can try to only use 4-8 hours if 4-13 hours does not work well)

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state
}
best_params_lightgbm_popular_feature = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_lightgbm_popular_feature:
    params[p] = best_params_lightgbm_popular_feature[p]

lightgbm_feature_importances = {}

lgbr = LGBMRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    lgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, lgbr.predict(X_test)))
    features = lgbr.feature_names_in_.tolist()
    # features_i = lgbr.feature_importances_.tolist()
    features_i = get_shap_values(lgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

plt.hist(lightgbm_feature_importances.values())
# seems to pick up time features not as good as past 4 hours features

In [None]:
xgboost_feature_importances_df = pd.DataFrame(
    {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
)
xgboost_feature_importances_df["importance"] /= default_cv
# xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
lightgbm_feature_importances_df = pd.DataFrame(
    {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
)
lightgbm_feature_importances_df["importance"] /= default_cv
# lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# catboost_feature_importances_df = pd.DataFrame(
#     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# )
# catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
feature_importances_df = xgboost_feature_importances_df.merge(
    lightgbm_feature_importances_df,
    on="var",
    how="inner",
    suffixes=("_xgboost", "_lightgbm")
)
# feature_importances_df = feature_importances_df.merge(
#     catboost_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("", "_catboost")
# )
# feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])
feature_importances_df["importance"] = 1/2 * (feature_importances_df["importance_xgboost"] + feature_importances_df["importance_lightgbm"])
feature_importances_df = feature_importances_df.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
feature_importances_df

In [None]:
feature_importances_df.to_csv("feature_importances_df.csv", index = False)

In [None]:
feature_importances_df = pd.read_csv("feature_importances_df.csv")
feature_importances_df

In [None]:
feature_importances_df[~feature_importances_df["var"].str.contains("X")]

In [None]:
print(feature_importances_df.loc[:29, "var"].tolist())

#### Third iteration: a more truncated version from the first collection

In [None]:
best_xgboost_score = optuna.load_study(
    study_name = "xgboost_2_4_101_1000_popular_feature_study",
    storage = f"sqlite:///xgboost_2_4_101_1000_popular_feature_study.db"
).best_value
best_lightgbm_score = optuna.load_study(
    study_name = "lightgbm_2_4_101_1000_popular_feature_study",
    storage = f"sqlite:///lightgbm_2_4_101_1000_popular_feature_study.db"
).best_value
feature_importances_df["weighted_importance"] = (best_xgboost_score * feature_importances_df["importance_xgboost"] + best_lightgbm_score * feature_importances_df["importance_lightgbm"]) / (best_xgboost_score + best_lightgbm_score)
feature_importances_df = feature_importances_df.sort_values("weighted_importance", ascending=False, ignore_index=True)
feature_importances_df

In [None]:
print(feature_importances_df.loc[:49, "var"].tolist())

XGBoost

In [None]:
# xgboost_importance_threshold = 0.011
# xgboost_best_features = [
#     f for f in xgboost_feature_importances if xgboost_feature_importances[f] > xgboost_importance_threshold
# ] + ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"]
# print(len(xgboost_best_features))
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)

# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, xgboost_best_features)

In [None]:
# best_xgboost_params_truncated = optimize_xgboost(
#     f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_truncated_study",
#     f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_truncated_study"
# ) # much worse than using all features  

LightGBM

In [None]:
# lightgbm_importance_threshold = 20
# lightgbm_best_features = [
#     f for f in lightgbm_feature_importances if lightgbm_feature_importances[f] > lightgbm_importance_threshold
# ] + ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"]
# print(len(lightgbm_best_features))
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)

# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, lightgbm_best_features)

In [None]:
# best_lightgbm_params_truncated = optimize_lightgbm(
#     f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_truncated_study",
#     f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_truncated_study"
# )
# # also much worse 

#### Fourth Iteration: a common truncated version using good features across all models + popular features

In [None]:
best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531', 
                  'X385', 'X23', 'X284', 'X465', 'X331', 'X95', 'X285', 'X31', 'X169', 'X137'] + \
                ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
best_features = list(set(best_features))
train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

XGBoost

In [None]:
best_xgboost_params_common_truncated = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study"
) 

LightGBM

In [None]:
best_lightgbm_params_common_truncated = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study"
)

Catboost

In [None]:
best_catboost_params_common_truncated = optimize_catboost(
    f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study",
    f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study"
)

Analyze model performance and feature importance across train and test

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
for p in best_params_xgboost:
    params[p] = best_params_xgboost[p]

xgbr_arr = []

for i in tqdm(range(default_cv)):
    xgbr = XGBRegressor(**params)
    xgbr.fit(X_train_arr[i], Y_train_arr[i])
    xgbr_arr.append(xgbr)

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state,
}
best_params_lightgbm = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
for p in best_params_lightgbm:
    params[p] = best_params_lightgbm[p]

lgbr_arr = []

for i in tqdm(range(default_cv)):
    lgbr = LGBMRegressor(**params)
    lgbr.fit(X_train_arr[i], Y_train_arr[i])
    lgbr_arr.append(lgbr)

In [None]:
xgboost_feature_importances = {}
lightgbm_feature_importances = {}

for i in tqdm(range(default_cv)):
    features = xgbr_arr[i].feature_names_in_.tolist()
    features_i = get_shap_values(xgbr_arr[i], X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]
    features = lgbr_arr[i].feature_names_in_.tolist()
    features_i = get_shap_values(xgbr_arr[i], X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

xgboost_feature_importances_df = pd.DataFrame(
    {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
)
xgboost_feature_importances_df["importance"] /= default_cv
# xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
lightgbm_feature_importances_df = pd.DataFrame(
    {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
)
lightgbm_feature_importances_df["importance"] /= default_cv
# lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# catboost_feature_importances_df = pd.DataFrame(
#     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# )
# catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
feature_importances_df_common_truncated = xgboost_feature_importances_df.merge(
    lightgbm_feature_importances_df,
    on="var",
    how="inner",
    suffixes=("_xgboost", "_lightgbm")
)
# feature_importances_df = feature_importances_df.merge(
#     catboost_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("", "_catboost")
# )
# feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])
feature_importances_df_common_truncated["importance"] = 1/2 * (feature_importances_df_common_truncated["importance_xgboost"] + feature_importances_df_common_truncated["importance_lightgbm"])
feature_importances_df_common_truncated = feature_importances_df_common_truncated.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
feature_importances_df_common_truncated

In [None]:
best_xgboost_score = optuna.load_study(
    study_name = "xgboost_2_4_101_1000_common_truncated_20_study",
    storage = f"sqlite:///xgboost_2_4_101_1000_common_truncated_20_study.db"
).best_value
best_lightgbm_score = optuna.load_study(
    study_name = "lightgbm_2_4_101_1000_common_truncated_20_study",
    storage = f"sqlite:///lightgbm_2_4_101_1000_common_truncated_20_study.db"
).best_value
feature_importances_df_common_truncated["weighted_importance"] = (best_xgboost_score * feature_importances_df_common_truncated["importance_xgboost"] + best_lightgbm_score * feature_importances_df_common_truncated["importance_lightgbm"]) / (best_xgboost_score + best_lightgbm_score)
feature_importances_df_common_truncated = feature_importances_df_common_truncated.sort_values("weighted_importance", ascending=False, ignore_index=True)
feature_importances_df_common_truncated

#### Fifth Iteration Instead of using GBDT, can we use MLP on these features

Convert from normal CV to torch type CV

In [None]:
# Create the CV data, seems to be better with only anonymized features
best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531',
                 'X385', 'X23', 'X465', 'X284', 'X331', 'X95', 'X169', 'X285', 'X137', 'X31']
                # ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                # [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
best_features = list(set(best_features))
train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

In [None]:
# Extra code to "reduce" from float64 to float32
def float64_to_float32(data):
    if isinstance(data, pd.DataFrame):
        for col in data.columns:
            data[col] = data[col].astype("float32")
    elif isinstance(data, pd.Series):
        data = data.astype("float32")
    return data

for i in range(default_cv):
    X_train_arr[i] = float64_to_float32(X_train_arr[i])
    X_test_arr[i] = float64_to_float32(X_test_arr[i])
    Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
    Y_test_arr[i] = float64_to_float32(Y_test_arr[i])

In [None]:
def normal_cv_to_torch_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, cv=default_cv):
    train_arr = []
    test_arr = []
    for i in range(cv):
        # First shuffle the data
        X_train, Y_train = X_train_arr[i], Y_train_arr[i]
        X_train["label"] = Y_train
        # Instead of shuffle the training data when create the dataloader, try to shuffle beforehand
        # X_train = X_train.sample(frac = 1, random_state = default_random_state)
        # not shuffle, keep it by date
        Y_train = X_train["label"]
        X_train = X_train.drop("label", axis = 1)

        # Then normalize
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train.values)

        # Create train dataset
        X_train, Y_train = torch.from_numpy(X_train), torch.from_numpy(Y_train.values)
        train_dataset = TensorDataset(X_train, Y_train)
        train_arr.append(train_dataset)

        # Normalize X_test
        X_test = scaler.transform(X_test_arr[i].values)

        # Create test dataset
        X_test, Y_test = torch.from_numpy(X_test), torch.from_numpy(Y_test_arr[i].values)
        test_dataset = TensorDataset(X_test, Y_test)
        test_arr.append(test_dataset)
        
    return train_arr, test_arr

In [None]:
train_arr, test_arr = normal_cv_to_torch_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

Define the model

In [None]:
# Define the model
class MLP(nn.Module):
    def __init__(self, num_features, hidden_layers_size, dropout):
        super(MLP, self).__init__()

        # Initialize layers & batchnorm
        last_layer = num_features
        self.layers = nn.ModuleList()
        for current_layer in hidden_layers_size:
            self.layers.append(nn.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.layers.append(nn.Linear(last_layer, 1))

        # Initialize activation
        self.activation = nn.ReLU()

        # Initialze dropout
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, x):
        for inx, layer in enumerate(self.layers):
            if inx == len(self.layers) - 1:
                x = layer(x)
            else:
                x = layer(x)
                x = self.activation(x)
                x = self.dropout(x)
        return x

    def reset(self):
        for layer in self.layers:
            layer.reset_parameters()

Train model with CV and evaluate

In [None]:
# Separate function for train & eval step
def train_mlp(model, criterion, optimizer, train_dataloader, num_epochs):
    model.train()
    for _ in tqdm(range(num_epochs)):
        for (inputs, targets) in train_dataloader:
            # Load to device
            inputs, targets= inputs.to(device), targets.to(device)
            # Forward step
            outputs = model(inputs)
            # get error
            error = criterion(outputs, targets)
            # Zero out the past gradient
            optimizer.zero_grad()
            # Backprop
            error.backward()
            # Gradient Descent
            optimizer.step()

def eval_mlp(model, test_dataloader):
    outputs_all = np.zeros(0)
    targets_all = np.zeros(0)
    model.eval()
    with torch.no_grad():
        for _, (inputs, targets) in enumerate(test_dataloader):
            # Load to device
            inputs = inputs.to(device)
            # Forward step
            outputs = model(inputs).detach().cpu().numpy().flatten()
            # Load to overall Y_test, Y_pred to calculate pearson score later
            outputs_all = np.concatenate([outputs_all, outputs])
            targets_all = np.concatenate([targets_all, targets])
    return pearson_score(targets_all, outputs_all)

In [None]:
def train_eval_cv_torch(model, lr, cv, train_arr, test_arr, batch_size, num_epochs):
    cv_pearson = 0
    for i in range(cv):
        # Get the dataloader
        train_dataset = train_arr[i]
        train_dataloader = DataLoader(train_dataset, batch_size = batch_size, num_workers=0)
        test_dataset = test_arr[i]
        test_dataloader = DataLoader(test_dataset, batch_size = batch_size, num_workers=0)

        # Reinitialize the model
        model.reset()
        model.to(device)

        # Initialize the loss function
        criterion = nn.MSELoss()

        # Reinitialize the optimizer
        optimizer = optim.Adam(model.parameters(), lr = lr)

        # Train the model
        train_mlp(model, criterion, optimizer, train_dataloader, num_epochs)

        # Test the model
        pearson = eval_mlp(model, test_dataloader)
        print(pearson)
        cv_pearson += pearson
    return cv_pearson / cv

In [None]:
# Training process of the default config
hidden_layers_size = [16, 8, 4]
lr = 0.001
batch_size = 60
num_epochs = 10

mlpr = MLP(len(best_features), hidden_layers_size=hidden_layers_size, dropout = 0.3)

train_eval_cv_torch(mlpr, lr, default_cv, train_arr, test_arr, batch_size, num_epochs)

#### Sixth Iteration: Change this into a classification problem

In [None]:
original_features = [f for f in train_df.columns if "X" in f]

X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_classification(train_df, original_features)

In [None]:
best_params_xgboost_classification = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    objective_xgboost_classification
)

In [None]:
best_params_lightgbm_classification = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    objective_lightgbm_classification
)

#### Seventh Iteration: Search for the best way to train

In [None]:
def search_training_scheme(model, train_df, cv = default_cv, features = None):
    folds_trial = [
        # level 1
        [[0, 1, 2, 3]], 
        [[0, 1]], [[1, 2]], [[2, 3]],
        [[0]], [[1]], [[2]], [[3]],
        [[0, 1], [1, 2], [2, 3]],
        [[0, 1], [2, 3]],
        [[0], [1], [2], [3]],
        # level 2
        [[0, 1, 2, 3], [0, 1]],
        [[0, 1, 2, 3], [1, 2]],
        [[0, 1, 2, 3], [2, 3]],
        [[0, 1, 2, 3], [0, 1], [2, 3]],
        [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3]],
        [[0, 1, 2, 3], [0], [1], [2], [3]],
        [[0, 1], [2, 3], [0], [1], [2], [3]],
        [[0, 1], [1, 2], [2, 3], [0], [1], [2], [3]],
        # level 3
        [[0, 1, 2, 3], [0, 1], [0]],
        [[0, 1, 2, 3], [2, 3], [3]],
        [[0, 1, 2, 3], [0, 1], [2, 3], [0], [1], [2], [3]],
        [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3], [0], [1], [2], [3]],
    ]

    if features is not None:
        train_df = train_df[features + ["timestamp", "label"]]

    for folds in folds_trial:
        print(f"Current folds list is {folds}")
        model_lst = [deepcopy(model)] * len(folds)
        cv_pearson = []
        for i in range(cv):
            train_month = list(range(3 + i, 7 + i))
            test_month = list(map(lambda x: x % 12 if x > 12 else x, list(range(8 + i, 12 + i))))
            test = train_df[train_df["timestamp"].dt.month.isin(test_month)].reset_index().drop("index", axis = 1)
            X_test, Y_test = test.drop(["timestamp", "label"], axis = 1), test["label"]
            Y_pred = np.zeros(Y_test.shape[0])
            for j in range(len(folds)):
                fold = folds[j]
                model = model_lst[j]
                train_month_curr = [train_month[f] for f in fold]
                train_curr = train_df[train_df["timestamp"].dt.month.isin(train_month_curr)].reset_index().drop("index", axis = 1)
                X_train, Y_train = train_curr.drop(["timestamp", "label"], axis = 1), train_curr["label"]
                model.fit(X_train, Y_train)
                Y_pred += model.predict(X_test)
            Y_pred /= len(folds)
            cv_pearson.append(pearson_score(Y_test, Y_pred))
            print(f"Finish fold {i} with score: {pearson_score(Y_test, Y_pred)}")
        print(f"Finish trial with mean score: {np.mean(np.array(cv_pearson))}")
        print(f"Finish trial with std score: {np.std(np.array(cv_pearson))}")
        print()

In [None]:
train_added_df = pd.concat([train_df, popular_features_train], axis = 1)

params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost_popular_feature = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_xgboost_popular_feature:
    params[p] = best_params_xgboost_popular_feature[p]

xgbr = XGBRegressor(**params)
search_training_scheme(xgbr, train_added_df)
# Notable
# [[0, 1, 2, 3]]
# [[0, 1, 2, 3], [1, 2]]
# [[0, 1, 2, 3], [0, 1], [2, 3]]
# [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3]]
# [[0, 1, 2, 3], [0], [1], [2], [3]] 
# [[0, 1, 2, 3], [0, 1], [2, 3], [0], [1], [2], [3]]
# [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3], [0], [1], [2], [3]]

In [None]:
train_added_df = pd.concat([train_df, popular_features_train], axis = 1)

params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state
}
best_params_lightgbm_popular_feature = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_lightgbm_popular_feature:
    params[p] = best_params_lightgbm_popular_feature[p]

lgbr = LGBMRegressor(**params)
search_training_scheme(lgbr, train_added_df)
# [[0, 1, 2, 3]]
# [[0, 1, 2, 3], [0, 1]]
# [[0, 1, 2, 3], [0, 1], [2, 3]]
# [[0, 1, 2, 3], [0], [1], [2], [3]]
# [[0, 1, 2, 3], [0, 1], [0]]
# [[0, 1, 2, 3], [0, 1], [2, 3], [0], [1], [2], [3]]

#### Eighth Iteration: rewrite the code for MLP training using MLX

Create the data for training + custom batch iteration

In [18]:
# Create the CV data, seems to be better with only anonymized features
# best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531', 
#                  'X385', 'X23', 'X284', 'X465', 'X331', 'X95', 'X285', 'X31', 'X169', 'X137', 
#                 'X379', 'X186', 'X852', 'X302', 'X868', 'X89', 'X219', 'X855', 'X540', 'X301'] 
                #  'X198', 'X373', 'X524', 'X291', 'X444', 'X279', 'X300', 'X181', 'X367', 'X538', 
                #  'X288', 'X226', 'X857', 'X860', 'X205', 'X298', 'X272', 'X472', 'X28', 'X754']
                # ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                # [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
# best_features = list(set(best_features))
# best_features = [col for col in train_df.columns if "X" in col]
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)
# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

In [19]:
# Extra code to "reduce" from float64 to float32
def float64_to_float32(data):
    if isinstance(data, pd.DataFrame):
        for col in data.columns:
            data[col] = data[col].astype("float32")
    elif isinstance(data, pd.Series):
        data = data.astype("float32")
    return data

# for i in range(default_cv):
#     X_train_arr[i] = float64_to_float32(X_train_arr[i])
#     X_test_arr[i] = float64_to_float32(X_test_arr[i])
#     Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
#     Y_test_arr[i] = float64_to_float32(Y_test_arr[i])

In [20]:
def normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, cv=default_cv):
    for i in range(cv):
        # Normalize forst
        scaler = StandardScaler()
        X_train_arr[i] = scaler.fit_transform(X_train_arr[i].values)
        X_test_arr[i] = scaler.transform(X_test_arr[i].values)

        # Convert to mlx format
        X_train_arr[i] = mx.array(X_train_arr[i])
        X_test_arr[i] = mx.array(X_test_arr[i])
        Y_train_arr[i] = mx.array(Y_train_arr[i].values)
        Y_test_arr[i] = mx.array(Y_test_arr[i].values)
        
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

Define the model

In [21]:
# Define the model
# We do not use the reset method this time so you have to create the model at each fold
class MLPMLX(nnmx.Module):
    def __init__(self, num_features, hidden_layers_size, dropout):
        super().__init__()

        # Initialize layers & batchnorm
        last_layer = num_features
        self.layers = []
        for current_layer in hidden_layers_size:
            self.layers.append(nnmx.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.layers.append(nnmx.Linear(last_layer, 1))

        # Initialize activation
        self.activation = nnmx.ReLU()

        # Initialize dropout
        self.dropout = nnmx.Dropout(p = dropout)

    def __call__(self, x):
        for inx, layer in enumerate(self.layers):
            x = layer(x)
            if inx != len(self.layers) - 1:
                x = self.activation(x)
                x = self.dropout(x)
        return x

Train model with CV and evaluate

In [22]:
# Custom function for batch iteration
def batch_iterate(batch_size, X, Y, shuffle = True):
    for i in range(0, Y.size, batch_size):
        X_curr = X[i: min(i + batch_size, Y.size), :]
        Y_curr = Y[i: min(i + batch_size, Y.size)]
        if shuffle:
            inx_lst = mx.random.permutation(batch_size)
            X_curr = X_curr[inx_lst, :]
            Y_curr = Y_curr[inx_lst]
        yield X_curr, Y_curr

In [23]:
# Separate function for train & eval step
def train_mlp_mlx(model, loss_and_grad_fn, optimizer, X_train, Y_train, batch_size, num_epochs):
    model.train()
    for _ in tqdm(range(num_epochs)):
        for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
            _, grads = loss_and_grad_fn(model, inputs, targets)
            # Update the optimizer state and model parameters in a single call
            optimizer.update(model, grads)
            # Force a graph evaluation
            mx.eval(model.parameters(), optimizer.state)

def eval_mlp_mlx(model, X_test, Y_test, batch_size):
    outputs_all = np.zeros(0)
    targets_all = np.zeros(0)
    model.eval()
    for (inputs, targets) in batch_iterate(batch_size, X_test, Y_test, shuffle=False):
        outputs = model(inputs).reshape(-1)
        # convert back to numpy
        outputs, targets = np.array(outputs), np.array(targets)
        # Load to overall Y_test, Y_pred to calculate pearson score later
        outputs_all = np.concatenate([outputs_all, outputs])
        targets_all = np.concatenate([targets_all, targets])
    return pearson_score(targets_all, outputs_all)

In [24]:
def train_eval_cv_mlx(num_features, hidden_layers_size, dropout, lr, cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size, num_epochs):
    cv_pearson = 0
    for _, (X_train, X_test, Y_train, Y_test) in enumerate(zip(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)):
        # initialize the model
        mx.random.seed(default_random_state)
        model = MLPMLX(num_features, hidden_layers_size, dropout)

        # Initialize the loss function
        def loss_fn(model, X, Y):
            Y_pred = model(X).reshape(-1)
            return mx.mean(nnmx.losses.mse_loss(Y_pred, Y))
        loss_and_grad_fn = nnmx.value_and_grad(model, loss_fn)

        # Reinitialize the optimizer
        optimizer = optimmx.Adam(learning_rate = lr)

        # Train the model
        train_mlp_mlx(model, loss_and_grad_fn, optimizer, X_train, Y_train, batch_size, num_epochs)

        # Test the model
        pearson = eval_mlp_mlx(model, X_test, Y_test, batch_size)
        print(pearson)
        if pearson == -1:
            return pearson
        cv_pearson += pearson
    return cv_pearson / cv

Conduct training and evaluating process of the model

In [25]:
# # Training process of the default config
# num_features = len(best_features)
# hidden_layers_size = [8, 8, 8]
# dropout = 0.2
# lr = 0.001
# batch_size = 180
# num_epochs = 10

# train_eval_cv_mlx(num_features, hidden_layers_size, dropout, lr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size, num_epochs)

Conduct Bayesian Optimization on this

In [26]:
default_num_layers = 1

In [27]:
def objective_mlp_mlx(trial):
    # First initialize the parameters
    num_features = len(best_features)
    num_layers = default_num_layers
    log_2_hidden_layers_size = []
    for i in range(num_layers):
        if len(log_2_hidden_layers_size) == 0:
            log_2_hidden_layers_size.append(trial.suggest_int(f"log2_hidden_layer_{i}", 2, 6))
        else:
            log_2_hidden_layers_size.append(trial.suggest_int(f"log2_hidden_layer_{i}", 2, log_2_hidden_layers_size[-1]))
    hidden_layers_size = [2**l for l in log_2_hidden_layers_size]
    dropout = trial.suggest_float("dropout", 0.2, 0.7)
    lr = trial.suggest_float("lr", 0.0001, 0.01, log=True)
    batch_size = trial.suggest_categorical("batch_size", [30, 60, 120, 180, 360, 720])
    num_epochs = trial.suggest_categorical("num_epochs", [10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    
    # Conduct training based on those parameters
    return train_eval_cv_mlx(num_features, hidden_layers_size, dropout, lr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size, num_epochs)

In [28]:
def optimize_mlp_mlx(study_name, storage_name, objective_function=objective_mlp_mlx, n_trials = 100, n_jobs = 1):
    print("Conduct hyperparam opt for MLP")
    study = optuna.create_study(
        study_name = study_name,
        direction ='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

In [29]:
# Create the CV data, seems to be better with only anonymized features
best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531', 
                 'X385', 'X23', 'X284', 'X465', 'X331', 'X95', 'X285', 'X31', 'X169', 'X137',]
                # 'X379', 'X186', 'X852', 'X302', 'X868', 'X89', 'X219', 'X855', 'X540', 'X301',] 
                #  'X198', 'X373', 'X524', 'X291', 'X444', 'X279', 'X300', 'X181', 'X367', 'X538', 
                #  'X288', 'X226', 'X857', 'X860', 'X205', 'X298', 'X272', 'X472', 'X28', 'X754']
                # ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                # [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
best_features = list(set(best_features))
# best_features = [col for col in train_df.columns if "X" in col]
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_df, best_features)

# Convert to float32
for i in range(default_cv):
    X_train_arr[i] = float64_to_float32(X_train_arr[i])
    X_test_arr[i] = float64_to_float32(X_test_arr[i])
    Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
    Y_test_arr[i] = float64_to_float32(Y_test_arr[i])

# Convert to MLX
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

[3, 4, 5, 6] [8, 9, 10, 11]
[4, 5, 6, 7] [9, 10, 11, 12]
[5, 6, 7, 8] [10, 11, 12, 1]
[6, 7, 8, 9] [11, 12, 1, 2]


In [30]:
optimize_mlp_mlx(
    f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{default_num_layers}_common_truncated_{len(best_features)}_study",
    f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{default_num_layers}_common_truncated_{len(best_features)}_study"
)

[I 2025-07-08 12:47:28,373] A new study created in RDB with name: mlp_mlx_2_4_101_1_common_truncated_20_study


Conduct hyperparam opt for MLP


100%|██████████| 60/60 [00:17<00:00,  3.50it/s]


0.1354244937447434


100%|██████████| 60/60 [00:17<00:00,  3.47it/s]


0.15794843671552705


100%|██████████| 60/60 [00:18<00:00,  3.27it/s]


0.11990150297965195


100%|██████████| 60/60 [00:16<00:00,  3.64it/s]
[I 2025-07-08 12:48:38,226] Trial 0 finished with value: 0.12760028768572979 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.48533379343406985, 'lr': 0.00011401144576866207, 'batch_size': 360, 'num_epochs': 60}. Best is trial 0 with value: 0.12760028768572979.


0.09712671730299671


100%|██████████| 70/70 [00:36<00:00,  1.94it/s]


0.12654283329769164


100%|██████████| 70/70 [00:35<00:00,  1.97it/s]


0.13679583381196705


100%|██████████| 70/70 [00:35<00:00,  1.96it/s]


0.04961185550472564


100%|██████████| 70/70 [00:35<00:00,  1.96it/s]
[I 2025-07-08 12:51:02,262] Trial 1 finished with value: 0.09584544514565954 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.5426531643900891, 'lr': 0.0010857627761370686, 'batch_size': 180, 'num_epochs': 70}. Best is trial 0 with value: 0.12760028768572979.


0.07043125796825386


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]


0.13251954890938447


100%|██████████| 20/20 [00:15<00:00,  1.30it/s]


0.16303848445345395


100%|██████████| 20/20 [00:15<00:00,  1.29it/s]


0.1328321987574123


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
[I 2025-07-08 12:52:05,129] Trial 2 finished with value: 0.13371565200910307 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6612346862836118, 'lr': 0.00015642024418779235, 'batch_size': 120, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.10647237591616154


100%|██████████| 50/50 [00:39<00:00,  1.28it/s]


0.08180821224960928


100%|██████████| 50/50 [00:39<00:00,  1.26it/s]


0.10020222571973178


100%|██████████| 50/50 [00:41<00:00,  1.21it/s]


0.09151585367619679


100%|██████████| 50/50 [00:38<00:00,  1.28it/s]
[I 2025-07-08 12:54:45,960] Trial 3 finished with value: 0.08211748190020414 and parameters: {'log2_hidden_layer_0': 2, 'dropout': 0.22640420054463312, 'lr': 0.00022727614012638906, 'batch_size': 120, 'num_epochs': 50}. Best is trial 2 with value: 0.13371565200910307.


0.054943635955278666


100%|██████████| 50/50 [00:06<00:00,  7.41it/s]


0.09794925651346865


100%|██████████| 50/50 [00:06<00:00,  7.39it/s]


0.11198483684818133


100%|██████████| 50/50 [00:06<00:00,  7.37it/s]


0.0726175050073553


100%|██████████| 50/50 [00:06<00:00,  7.44it/s]
[I 2025-07-08 12:55:13,267] Trial 4 finished with value: 0.08828709410878775 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.4648116993765072, 'lr': 0.003140803308225737, 'batch_size': 720, 'num_epochs': 50}. Best is trial 2 with value: 0.13371565200910307.


0.07059677806614571


100%|██████████| 10/10 [00:30<00:00,  3.06s/it]


0.07579310445248721


100%|██████████| 10/10 [00:31<00:00,  3.10s/it]


0.11655509044266833


100%|██████████| 10/10 [00:30<00:00,  3.08s/it]


0.04085231728782849


100%|██████████| 10/10 [00:30<00:00,  3.06s/it]
[I 2025-07-08 12:57:21,490] Trial 5 finished with value: 0.07571581673407579 and parameters: {'log2_hidden_layer_0': 6, 'dropout': 0.3924688730312222, 'lr': 0.0012686203981887116, 'batch_size': 30, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.06966275475331915


100%|██████████| 70/70 [00:09<00:00,  7.40it/s]


0.12459213059544945


100%|██████████| 70/70 [00:09<00:00,  7.32it/s]


0.1369229372897704


100%|██████████| 70/70 [00:09<00:00,  7.13it/s]


0.09062738627227766


100%|██████████| 70/70 [00:09<00:00,  7.11it/s]
[I 2025-07-08 12:58:00,476] Trial 6 finished with value: 0.11295463472946025 and parameters: {'log2_hidden_layer_0': 6, 'dropout': 0.5682456656339292, 'lr': 0.0002998670698625724, 'batch_size': 720, 'num_epochs': 70}. Best is trial 2 with value: 0.13371565200910307.


0.0996760847603435


100%|██████████| 50/50 [00:38<00:00,  1.29it/s]


0.10345023125448581


100%|██████████| 50/50 [00:38<00:00,  1.30it/s]


0.14585027371107356


100%|██████████| 50/50 [00:38<00:00,  1.29it/s]


0.07130139165247941


100%|██████████| 50/50 [00:39<00:00,  1.28it/s]
[I 2025-07-08 13:00:36,700] Trial 7 finished with value: 0.08857291627436752 and parameters: {'log2_hidden_layer_0': 6, 'dropout': 0.21165274619931057, 'lr': 0.0012478457141046418, 'batch_size': 120, 'num_epochs': 50}. Best is trial 2 with value: 0.13371565200910307.


0.03368976847943129


100%|██████████| 10/10 [00:08<00:00,  1.23it/s]


0.08223043647263979


100%|██████████| 10/10 [00:08<00:00,  1.23it/s]


0.10981231453942253


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


0.08952583780349228


100%|██████████| 10/10 [00:07<00:00,  1.27it/s]
[I 2025-07-08 13:01:10,033] Trial 8 finished with value: 0.08797435414886179 and parameters: {'log2_hidden_layer_0': 2, 'dropout': 0.642520594272805, 'lr': 0.000357143245968115, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.07032882777989259


100%|██████████| 80/80 [00:21<00:00,  3.71it/s]


0.0427030596599554


100%|██████████| 80/80 [00:22<00:00,  3.62it/s]


0.1153749929587024


100%|██████████| 80/80 [00:21<00:00,  3.67it/s]


0.055262951413567456


100%|██████████| 80/80 [00:21<00:00,  3.72it/s]
[I 2025-07-08 13:02:37,553] Trial 9 finished with value: 0.06846679918968351 and parameters: {'log2_hidden_layer_0': 6, 'dropout': 0.35131190089149067, 'lr': 0.0016540298012824451, 'batch_size': 360, 'num_epochs': 80}. Best is trial 2 with value: 0.13371565200910307.


0.060526192726508804


100%|██████████| 20/20 [00:31<00:00,  1.56s/it]


0.05703333439716173


100%|██████████| 20/20 [00:30<00:00,  1.52s/it]


0.17176934902354848


100%|██████████| 20/20 [00:30<00:00,  1.54s/it]


0.02387997650567019


100%|██████████| 20/20 [00:30<00:00,  1.52s/it]
[I 2025-07-08 13:04:42,875] Trial 10 finished with value: 0.06414438381222973 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6926149132172468, 'lr': 0.0070899951046823765, 'batch_size': 60, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.0038948753225385335


100%|██████████| 60/60 [00:15<00:00,  3.79it/s]


0.13532862688667924


100%|██████████| 60/60 [00:15<00:00,  3.80it/s]


0.15788726870200426


100%|██████████| 60/60 [00:15<00:00,  3.86it/s]


0.12140716346179681


100%|██████████| 60/60 [00:15<00:00,  3.93it/s]
[I 2025-07-08 13:05:45,829] Trial 11 finished with value: 0.12822353598885952 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.5624231307674092, 'lr': 0.00010547614465309742, 'batch_size': 360, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.09827108490495774


100%|██████████| 20/20 [00:05<00:00,  3.91it/s]


0.11879358087506876


100%|██████████| 20/20 [00:05<00:00,  3.86it/s]


0.12921615502083894


100%|██████████| 20/20 [00:05<00:00,  3.90it/s]


0.09495908600828067


100%|██████████| 20/20 [00:05<00:00,  3.85it/s]
[I 2025-07-08 13:06:06,935] Trial 12 finished with value: 0.10063068430296998 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.6108716449880092, 'lr': 0.00010640347180264444, 'batch_size': 360, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.059553915307691566


100%|██████████| 60/60 [02:58<00:00,  2.98s/it]


0.12360864973788237


100%|██████████| 60/60 [02:55<00:00,  2.93s/it]


0.13635626039770904


100%|██████████| 60/60 [02:56<00:00,  2.94s/it]


0.06963762475419139


100%|██████████| 60/60 [02:52<00:00,  2.87s/it]
[I 2025-07-08 13:17:55,094] Trial 13 finished with value: 0.10235460173855032 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.6913290970275929, 'lr': 0.0004648786307758226, 'batch_size': 30, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.07981587206441845


100%|██████████| 30/30 [00:14<00:00,  2.05it/s]


0.12510873333650563


100%|██████████| 30/30 [00:14<00:00,  2.06it/s]


0.13967908108061258


100%|██████████| 30/30 [00:14<00:00,  2.04it/s]


0.11182465355504795


100%|██████████| 30/30 [00:14<00:00,  2.06it/s]
[I 2025-07-08 13:18:54,477] Trial 14 finished with value: 0.11822078277985068 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.5418030881644968, 'lr': 0.00019921244619599797, 'batch_size': 180, 'num_epochs': 30}. Best is trial 2 with value: 0.13371565200910307.


0.09627066314723659


100%|██████████| 90/90 [02:11<00:00,  1.46s/it]


0.13900682866563419


100%|██████████| 90/90 [02:11<00:00,  1.46s/it]


0.1506326687490932


100%|██████████| 90/90 [02:13<00:00,  1.48s/it]


0.07075306987567664


100%|██████████| 90/90 [02:11<00:00,  1.46s/it]
[I 2025-07-08 13:27:44,886] Trial 15 finished with value: 0.10978834013429827 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.6160187840759288, 'lr': 0.0004611282767574715, 'batch_size': 60, 'num_epochs': 90}. Best is trial 2 with value: 0.13371565200910307.


0.07876079324678904


100%|██████████| 40/40 [00:29<00:00,  1.37it/s]


0.1350814142981342


100%|██████████| 40/40 [00:29<00:00,  1.38it/s]


0.15923191547478974


100%|██████████| 40/40 [00:29<00:00,  1.37it/s]


0.12835072353903443


100%|██████████| 40/40 [00:29<00:00,  1.38it/s]
[I 2025-07-08 13:29:42,636] Trial 16 finished with value: 0.13189130654229445 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.39970061968974435, 'lr': 0.00014763021858201237, 'batch_size': 120, 'num_epochs': 40}. Best is trial 2 with value: 0.13371565200910307.


0.10490117285721948


100%|██████████| 40/40 [00:29<00:00,  1.38it/s]


0.1216728848077518


100%|██████████| 40/40 [00:29<00:00,  1.38it/s]


0.12544864786790885


100%|██████████| 40/40 [00:29<00:00,  1.37it/s]


0.06889894366746709


100%|██████████| 40/40 [00:28<00:00,  1.38it/s]
[I 2025-07-08 13:31:40,128] Trial 17 finished with value: 0.10274877087095431 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.34034451512035274, 'lr': 0.0006608902651086694, 'batch_size': 120, 'num_epochs': 40}. Best is trial 2 with value: 0.13371565200910307.


0.09497460714068952


100%|██████████| 40/40 [00:29<00:00,  1.38it/s]


0.12491877429441146


100%|██████████| 40/40 [00:29<00:00,  1.37it/s]


0.13169895920494465


100%|██████████| 40/40 [00:29<00:00,  1.36it/s]


0.10203156194253041


100%|██████████| 40/40 [00:29<00:00,  1.38it/s]
[I 2025-07-08 13:33:37,955] Trial 18 finished with value: 0.11354034340819 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.41025583641707525, 'lr': 0.00017997715671745228, 'batch_size': 120, 'num_epochs': 40}. Best is trial 2 with value: 0.13371565200910307.


0.0955120781908735


100%|██████████| 40/40 [00:29<00:00,  1.37it/s]


0.0987740146721907


100%|██████████| 40/40 [00:29<00:00,  1.37it/s]


0.13376157818751228


100%|██████████| 40/40 [00:29<00:00,  1.36it/s]


0.08942110410900074


100%|██████████| 40/40 [00:29<00:00,  1.37it/s]
[I 2025-07-08 13:35:36,331] Trial 19 finished with value: 0.10443539517861883 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.27534033898794075, 'lr': 0.0006899733919241115, 'batch_size': 120, 'num_epochs': 40}. Best is trial 2 with value: 0.13371565200910307.


0.09578488374577158


100%|██████████| 100/100 [01:12<00:00,  1.38it/s]


0.12251391188244072


100%|██████████| 100/100 [01:12<00:00,  1.38it/s]


0.13484352550747628


100%|██████████| 100/100 [01:13<00:00,  1.37it/s]


0.07666744728991688


100%|██████████| 100/100 [01:12<00:00,  1.39it/s]
[I 2025-07-08 13:40:27,732] Trial 20 finished with value: 0.10799141567384357 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.4892620635779577, 'lr': 0.00017052214186931875, 'batch_size': 120, 'num_epochs': 100}. Best is trial 2 with value: 0.13371565200910307.


0.09794077801554038


100%|██████████| 20/20 [00:05<00:00,  4.00it/s]


0.12809537674573646


100%|██████████| 20/20 [00:04<00:00,  4.06it/s]


0.15311338517944598


100%|██████████| 20/20 [00:05<00:00,  3.96it/s]


0.12352402105235315


100%|██████████| 20/20 [00:04<00:00,  4.04it/s]
[I 2025-07-08 13:40:48,135] Trial 21 finished with value: 0.12419620317942803 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.5822138124688291, 'lr': 0.0001168390788948932, 'batch_size': 360, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.0920520297401766


100%|██████████| 60/60 [00:14<00:00,  4.03it/s]


0.13565537853320603


100%|██████████| 60/60 [00:14<00:00,  4.01it/s]


0.15748938242569344


100%|██████████| 60/60 [00:15<00:00,  3.98it/s]


0.12079621285951829


100%|██████████| 60/60 [00:14<00:00,  4.01it/s]
[I 2025-07-08 13:41:48,489] Trial 22 finished with value: 0.12765905145552453 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.5161547028800869, 'lr': 0.00010119759610130922, 'batch_size': 360, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.09669523200368037


100%|██████████| 30/30 [00:21<00:00,  1.37it/s]


0.11866564527081826


100%|██████████| 30/30 [00:21<00:00,  1.38it/s]


0.13021599704144185


100%|██████████| 30/30 [00:22<00:00,  1.36it/s]


0.10059616156178458


100%|██████████| 30/30 [00:21<00:00,  1.37it/s]
[I 2025-07-08 13:43:17,416] Trial 23 finished with value: 0.11148034552170732 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.4301781808479258, 'lr': 0.0002773118607905075, 'batch_size': 120, 'num_epochs': 30}. Best is trial 2 with value: 0.13371565200910307.


0.09644357821278463


100%|██████████| 90/90 [00:11<00:00,  7.92it/s]


0.13274645421602094


100%|██████████| 90/90 [00:11<00:00,  8.00it/s]


0.156051321085285


100%|██████████| 90/90 [00:11<00:00,  7.91it/s]


0.11943713500691558


100%|██████████| 90/90 [00:11<00:00,  7.98it/s]
[I 2025-07-08 13:44:02,929] Trial 24 finished with value: 0.12834113187525512 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6465513259692508, 'lr': 0.00015330185399132728, 'batch_size': 720, 'num_epochs': 90}. Best is trial 2 with value: 0.13371565200910307.


0.10512961719279905


100%|██████████| 90/90 [00:11<00:00,  7.98it/s]


0.13225136423649653


100%|██████████| 90/90 [00:11<00:00,  7.96it/s]


0.1560516466450809


100%|██████████| 90/90 [00:11<00:00,  7.91it/s]


0.11833274931315875


100%|██████████| 90/90 [00:11<00:00,  7.97it/s]
[I 2025-07-08 13:44:48,444] Trial 25 finished with value: 0.12806220484240322 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6625886285245972, 'lr': 0.00015769833767767283, 'batch_size': 720, 'num_epochs': 90}. Best is trial 2 with value: 0.13371565200910307.


0.10561305917487669


100%|██████████| 90/90 [00:11<00:00,  7.96it/s]


0.11575833152575894


100%|██████████| 90/90 [00:11<00:00,  8.01it/s]


0.12393800545522975


100%|██████████| 90/90 [00:11<00:00,  7.97it/s]


0.07687678680626193


100%|██████████| 90/90 [00:11<00:00,  8.03it/s]
[I 2025-07-08 13:45:33,748] Trial 26 finished with value: 0.10145748342510055 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.3183009364531869, 'lr': 0.00040435874307540465, 'batch_size': 720, 'num_epochs': 90}. Best is trial 2 with value: 0.13371565200910307.


0.08925680991315162


100%|██████████| 80/80 [00:09<00:00,  8.02it/s]


0.1386696755572118


100%|██████████| 80/80 [00:10<00:00,  7.96it/s]


0.1404402116416906


100%|██████████| 80/80 [00:10<00:00,  7.91it/s]


0.049583268318181384


100%|██████████| 80/80 [00:10<00:00,  7.96it/s]
[I 2025-07-08 13:46:14,215] Trial 27 finished with value: 0.10076619691874189 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6413669956338547, 'lr': 0.00281501098346779, 'batch_size': 720, 'num_epochs': 80}. Best is trial 2 with value: 0.13371565200910307.


0.07437163215788378


100%|██████████| 100/100 [04:48<00:00,  2.88s/it]


0.09940397017693207


100%|██████████| 100/100 [04:47<00:00,  2.88s/it]


0.1442361447409763


100%|██████████| 100/100 [04:50<00:00,  2.90s/it]


0.06636362284891006


100%|██████████| 100/100 [04:59<00:00,  2.99s/it]
[I 2025-07-08 14:05:44,392] Trial 28 finished with value: 0.09841164672350558 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.3748104178197583, 'lr': 0.00024492779076705967, 'batch_size': 30, 'num_epochs': 100}. Best is trial 2 with value: 0.13371565200910307.


0.08364284912720389


100%|██████████| 90/90 [00:47<00:00,  1.91it/s]


0.12679376008431956


100%|██████████| 90/90 [00:45<00:00,  1.97it/s]


0.15906997959010222


100%|██████████| 90/90 [00:45<00:00,  1.96it/s]


0.10465484933745929


100%|██████████| 90/90 [00:45<00:00,  1.98it/s]
[I 2025-07-08 14:08:49,687] Trial 29 finished with value: 0.12275669396880698 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.49086181614628965, 'lr': 0.00014171408734057927, 'batch_size': 180, 'num_epochs': 90}. Best is trial 2 with value: 0.13371565200910307.


0.10050818686334685


100%|██████████| 40/40 [01:00<00:00,  1.51s/it]


-0.011874893287056685


100%|██████████| 40/40 [01:00<00:00,  1.51s/it]


-0.11243760152500237


100%|██████████| 40/40 [01:00<00:00,  1.52s/it]


0.024079603489844253


100%|██████████| 40/40 [01:00<00:00,  1.51s/it]
[I 2025-07-08 14:12:54,524] Trial 30 finished with value: -0.02148426767100166 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.45876978424712833, 'lr': 0.00906904035227868, 'batch_size': 60, 'num_epochs': 40}. Best is trial 2 with value: 0.13371565200910307.


0.01429582063820817


100%|██████████| 60/60 [00:15<00:00,  3.83it/s]


0.13363531699686806


100%|██████████| 60/60 [00:15<00:00,  3.82it/s]


0.1584983446565978


100%|██████████| 60/60 [00:15<00:00,  3.79it/s]


0.12446631282070401


100%|██████████| 60/60 [00:15<00:00,  3.84it/s]
[I 2025-07-08 14:13:57,863] Trial 31 finished with value: 0.13056185088808026 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.5986839523495175, 'lr': 0.00014481383816607505, 'batch_size': 360, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.10564742907815111


100%|██████████| 20/20 [00:02<00:00,  7.52it/s]


0.12495602115500704


100%|██████████| 20/20 [00:02<00:00,  7.53it/s]


0.15047304279859158


100%|██████████| 20/20 [00:02<00:00,  7.36it/s]


0.12348800515845945


100%|██████████| 20/20 [00:02<00:00,  7.37it/s]
[I 2025-07-08 14:14:08,903] Trial 32 finished with value: 0.12162151860136625 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6085094741222699, 'lr': 0.00013981083197038688, 'batch_size': 720, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.08756900529340694


100%|██████████| 60/60 [00:45<00:00,  1.31it/s]


0.12068583666442502


100%|██████████| 60/60 [00:45<00:00,  1.31it/s]


0.15237291368881584


100%|██████████| 60/60 [00:46<00:00,  1.30it/s]


0.09648544287039908


100%|██████████| 60/60 [00:45<00:00,  1.31it/s]
[I 2025-07-08 14:17:13,563] Trial 33 finished with value: 0.11894135631616193 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6598247769850475, 'lr': 0.00020873349055787067, 'batch_size': 120, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.10622123204100781


100%|██████████| 70/70 [00:18<00:00,  3.81it/s]


0.12220232028665999


100%|██████████| 70/70 [00:18<00:00,  3.82it/s]


0.1320921094583992


100%|██████████| 70/70 [00:18<00:00,  3.77it/s]


0.06994526693881749


100%|██████████| 70/70 [00:18<00:00,  3.82it/s]
[I 2025-07-08 14:18:27,666] Trial 34 finished with value: 0.10457033743607504 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.5321799620159445, 'lr': 0.0006094867566869692, 'batch_size': 360, 'num_epochs': 70}. Best is trial 2 with value: 0.13371565200910307.


0.09404165306042352


100%|██████████| 90/90 [00:45<00:00,  1.97it/s]


0.12410843139503305


100%|██████████| 90/90 [00:45<00:00,  1.98it/s]


0.1396670628049336


100%|██████████| 90/90 [00:45<00:00,  1.96it/s]


0.06322657564920962


100%|██████████| 90/90 [00:45<00:00,  1.97it/s]
[I 2025-07-08 14:21:31,170] Trial 35 finished with value: 0.10546676858078262 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.5907173121903424, 'lr': 0.0003200231721150614, 'batch_size': 180, 'num_epochs': 90}. Best is trial 2 with value: 0.13371565200910307.


0.09486500447395417


100%|██████████| 40/40 [00:05<00:00,  7.56it/s]


0.13368488337589945


100%|██████████| 40/40 [00:05<00:00,  7.45it/s]


0.15744026146643633


100%|██████████| 40/40 [00:05<00:00,  7.41it/s]


0.12111965957843775


100%|██████████| 40/40 [00:05<00:00,  7.44it/s]
[I 2025-07-08 14:21:52,895] Trial 36 finished with value: 0.12807817359033188 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6996006891037465, 'lr': 0.0002283094032864045, 'batch_size': 720, 'num_epochs': 40}. Best is trial 2 with value: 0.13371565200910307.


0.10006788994055404


100%|██████████| 10/10 [00:07<00:00,  1.30it/s]


0.1301877844681106


100%|██████████| 10/10 [00:07<00:00,  1.30it/s]


0.1568292570100989


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


0.13038030424759062


100%|██████████| 10/10 [00:07<00:00,  1.30it/s]
[I 2025-07-08 14:22:25,144] Trial 37 finished with value: 0.12924284569416603 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6656823851551276, 'lr': 0.00013817991364382227, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.09957403705086394


100%|██████████| 10/10 [00:07<00:00,  1.30it/s]


0.09240331354116717


100%|██████████| 10/10 [00:07<00:00,  1.30it/s]


0.09500022245603226


100%|██████████| 10/10 [00:07<00:00,  1.30it/s]


0.05964259332057927


100%|██████████| 10/10 [00:07<00:00,  1.31it/s]
[I 2025-07-08 14:22:57,170] Trial 38 finished with value: 0.07331729418428087 and parameters: {'log2_hidden_layer_0': 2, 'dropout': 0.6754653817286873, 'lr': 0.000132008915924904, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.046223047419344784


100%|██████████| 10/10 [00:07<00:00,  1.31it/s]


0.12276863282807556


100%|██████████| 10/10 [00:07<00:00,  1.31it/s]


0.1306888541502155


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


0.11203169423921656


100%|██████████| 10/10 [00:07<00:00,  1.31it/s]
[I 2025-07-08 14:23:29,261] Trial 39 finished with value: 0.11292009285058274 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.289700702592649, 'lr': 0.0002624394802125138, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.08619119018482334


100%|██████████| 50/50 [00:38<00:00,  1.31it/s]


0.12594580182534235


100%|██████████| 50/50 [00:38<00:00,  1.31it/s]


0.15245988998158266


100%|██████████| 50/50 [00:38<00:00,  1.29it/s]


0.08302361999702633


100%|██████████| 50/50 [00:38<00:00,  1.30it/s]
[I 2025-07-08 14:26:04,295] Trial 40 finished with value: 0.11204637890780457 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.625066458099046, 'lr': 0.000903331103359946, 'batch_size': 120, 'num_epochs': 50}. Best is trial 2 with value: 0.13371565200910307.


0.08675620382726697


100%|██████████| 10/10 [00:07<00:00,  1.27it/s]


0.13142974299906504


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


0.16036644828714028


100%|██████████| 10/10 [00:07<00:00,  1.29it/s]


0.1318778312450848


100%|██████████| 10/10 [00:07<00:00,  1.29it/s]
[I 2025-07-08 14:26:36,820] Trial 41 finished with value: 0.13146661019841105 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6470235225841743, 'lr': 0.00017048516021811837, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.10219241826235406


100%|██████████| 10/10 [00:07<00:00,  1.29it/s]


0.1315972910279485


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


0.16106108331698885


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


0.1320126528821304


100%|██████████| 10/10 [00:07<00:00,  1.31it/s]
[I 2025-07-08 14:27:09,173] Trial 42 finished with value: 0.131505394467535 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.635019848757596, 'lr': 0.00017739000041033776, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.10135055064307225


100%|██████████| 10/10 [00:07<00:00,  1.29it/s]


0.13230320946722965


100%|██████████| 10/10 [00:07<00:00,  1.27it/s]


0.16205450669477423


100%|██████████| 10/10 [00:07<00:00,  1.25it/s]


0.13316850831318758


100%|██████████| 10/10 [00:07<00:00,  1.27it/s]
[I 2025-07-08 14:27:42,017] Trial 43 finished with value: 0.13134473097342342 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.5851864115945808, 'lr': 0.00019668752552389737, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.09785269941850215


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


0.11751966825931706


100%|██████████| 10/10 [00:07<00:00,  1.32it/s]


0.1461724298392277


100%|██████████| 10/10 [00:07<00:00,  1.29it/s]


0.11459692878974381


100%|██████████| 10/10 [00:07<00:00,  1.30it/s]
[I 2025-07-08 14:28:14,145] Trial 44 finished with value: 0.12002482321683762 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.6322104820837163, 'lr': 0.0003486562478104974, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.10181026597906188


100%|██████████| 10/10 [00:07<00:00,  1.27it/s]


0.1228154720009866


100%|██████████| 10/10 [00:07<00:00,  1.29it/s]


0.1408876471750822


100%|██████████| 10/10 [00:07<00:00,  1.27it/s]


0.11293969473146141


100%|██████████| 10/10 [00:07<00:00,  1.29it/s]
[I 2025-07-08 14:28:46,769] Trial 45 finished with value: 0.1147324251838388 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.5654527325431701, 'lr': 0.00019915789067294115, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.08228688682782498


100%|██████████| 10/10 [00:07<00:00,  1.27it/s]


0.10247305806303222


100%|██████████| 10/10 [00:07<00:00,  1.30it/s]


0.13199638000035638


100%|██████████| 10/10 [00:07<00:00,  1.27it/s]


0.059233898758244696


100%|██████████| 10/10 [00:07<00:00,  1.29it/s]
[I 2025-07-08 14:29:19,326] Trial 46 finished with value: 0.0908037128855991 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.43931875922752084, 'lr': 0.0019313066192562605, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.06951151472076313


100%|██████████| 10/10 [00:07<00:00,  1.27it/s]


0.07087168272442612


100%|██████████| 10/10 [00:07<00:00,  1.29it/s]


0.116955569414986


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


0.04058562602204814


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]
[I 2025-07-08 14:29:51,967] Trial 47 finished with value: 0.07046645323037214 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.5779702259829143, 'lr': 0.005120343690285784, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.05345293476002826


100%|██████████| 10/10 [00:30<00:00,  3.08s/it]


0.1260901805145974


100%|██████████| 10/10 [00:30<00:00,  3.06s/it]


0.1478254264096473


100%|██████████| 10/10 [00:30<00:00,  3.06s/it]


0.12173709415904484


100%|██████████| 10/10 [00:30<00:00,  3.07s/it]
[I 2025-07-08 14:31:59,849] Trial 48 finished with value: 0.12393360345486365 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.5177046461262229, 'lr': 0.00018453708291305253, 'batch_size': 30, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.1000817127361651


100%|██████████| 20/20 [00:15<00:00,  1.28it/s]


0.1167671145703395


100%|██████████| 20/20 [00:15<00:00,  1.28it/s]


0.1276722378855821


100%|██████████| 20/20 [00:15<00:00,  1.27it/s]


0.10539072145840672


100%|██████████| 20/20 [00:15<00:00,  1.28it/s]
[I 2025-07-08 14:33:03,831] Trial 49 finished with value: 0.11094424291536936 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.3897073294193937, 'lr': 0.00028567395590886125, 'batch_size': 120, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.09394689774714914


100%|██████████| 70/70 [00:54<00:00,  1.28it/s]


0.07007156073810941


100%|██████████| 70/70 [00:55<00:00,  1.27it/s]


0.12447910289046898


100%|██████████| 70/70 [00:55<00:00,  1.27it/s]


0.047602856336522475


100%|██████████| 70/70 [00:53<00:00,  1.30it/s]
[I 2025-07-08 14:36:44,367] Trial 50 finished with value: 0.07781313432028764 and parameters: {'log2_hidden_layer_0': 2, 'dropout': 0.687421846597532, 'lr': 0.0005310547185663014, 'batch_size': 120, 'num_epochs': 70}. Best is trial 2 with value: 0.13371565200910307.


0.0690990173160497


100%|██████████| 60/60 [00:16<00:00,  3.74it/s]


0.13479304315466226


100%|██████████| 60/60 [00:16<00:00,  3.73it/s]


0.15825560722622434


100%|██████████| 60/60 [00:16<00:00,  3.72it/s]


0.12779704853275822


100%|██████████| 60/60 [00:16<00:00,  3.70it/s]
[I 2025-07-08 14:37:49,338] Trial 51 finished with value: 0.1311621185205162 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.599817296539226, 'lr': 0.00011971887233370113, 'batch_size': 360, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.10380277516842011


100%|██████████| 80/80 [02:04<00:00,  1.56s/it]


0.11723022681523437


100%|██████████| 80/80 [02:04<00:00,  1.55s/it]


0.15277571046553604


100%|██████████| 80/80 [02:05<00:00,  1.56s/it]


0.09073886373063164


100%|██████████| 80/80 [02:03<00:00,  1.54s/it]
[I 2025-07-08 14:46:08,766] Trial 52 finished with value: 0.11583795126099611 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6218311397563053, 'lr': 0.00012192215962417914, 'batch_size': 60, 'num_epochs': 80}. Best is trial 2 with value: 0.13371565200910307.


0.10260700403258241


100%|██████████| 50/50 [00:38<00:00,  1.28it/s]


0.13169139760048781


100%|██████████| 50/50 [00:38<00:00,  1.29it/s]


0.15775398094263476


100%|██████████| 50/50 [00:39<00:00,  1.27it/s]


0.12360876628955596


100%|██████████| 50/50 [00:38<00:00,  1.29it/s]
[I 2025-07-08 14:48:46,080] Trial 53 finished with value: 0.12993726731340013 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.5546529106570303, 'lr': 0.00010369628992252433, 'batch_size': 120, 'num_epochs': 50}. Best is trial 2 with value: 0.13371565200910307.


0.10669492442092199


100%|██████████| 30/30 [00:07<00:00,  3.78it/s]


0.13179151274857873


100%|██████████| 30/30 [00:07<00:00,  3.80it/s]


0.15988610186719024


100%|██████████| 30/30 [00:07<00:00,  3.77it/s]


0.12872584430374923


100%|██████████| 30/30 [00:07<00:00,  3.82it/s]
[I 2025-07-08 14:49:18,218] Trial 54 finished with value: 0.13016821884489477 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6500568573540949, 'lr': 0.00017172319030216913, 'batch_size': 360, 'num_epochs': 30}. Best is trial 2 with value: 0.13371565200910307.


0.10026941646006089


100%|██████████| 20/20 [00:15<00:00,  1.29it/s]


0.128696775235558


100%|██████████| 20/20 [00:15<00:00,  1.27it/s]


0.16218323753490527


100%|██████████| 20/20 [00:18<00:00,  1.10it/s]


0.12786765479159312


100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
[I 2025-07-08 14:50:25,661] Trial 55 finished with value: 0.13032994306251114 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.600113408318019, 'lr': 0.00022024185186880706, 'batch_size': 120, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.10257210468798818


100%|██████████| 60/60 [00:53<00:00,  1.12it/s]


0.12781087475301736


100%|██████████| 60/60 [00:46<00:00,  1.28it/s]


0.1569780184085816


100%|██████████| 60/60 [00:46<00:00,  1.28it/s]


0.11888224696776334


100%|██████████| 60/60 [00:51<00:00,  1.18it/s]
[I 2025-07-08 14:53:44,966] Trial 56 finished with value: 0.12788194867281627 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6751466603525904, 'lr': 0.00011406806499386118, 'batch_size': 120, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.10785665456190277


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]


0.12792083353211117


100%|██████████| 10/10 [00:05<00:00,  1.82it/s]


0.1575655706379337


100%|██████████| 10/10 [00:06<00:00,  1.51it/s]


0.1276681541499048


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]
[I 2025-07-08 14:54:10,229] Trial 57 finished with value: 0.1269584924205644 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.41793076104118876, 'lr': 0.0003866619248464587, 'batch_size': 180, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.09467941136230795


100%|██████████| 40/40 [00:11<00:00,  3.37it/s]


0.12337139008544447


100%|██████████| 40/40 [00:12<00:00,  3.31it/s]


0.1392475067708556


100%|██████████| 40/40 [00:13<00:00,  2.87it/s]


0.11322508941096344


100%|██████████| 40/40 [00:12<00:00,  3.27it/s]
[I 2025-07-08 14:55:00,931] Trial 58 finished with value: 0.1179351923212921 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.640608648687016, 'lr': 0.00018907820226998805, 'batch_size': 360, 'num_epochs': 40}. Best is trial 2 with value: 0.13371565200910307.


0.09589678301790487


100%|██████████| 10/10 [00:36<00:00,  3.65s/it]


0.11726763300483142


100%|██████████| 10/10 [00:34<00:00,  3.43s/it]


0.14592376929159584


100%|██████████| 10/10 [00:36<00:00,  3.70s/it]


0.10587842463893894


100%|██████████| 10/10 [00:46<00:00,  4.70s/it]
[I 2025-07-08 14:57:44,365] Trial 59 finished with value: 0.11688319503471195 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.46908559388520316, 'lr': 0.00016643349364551014, 'batch_size': 30, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.09846295320348164


100%|██████████| 100/100 [02:11<00:00,  1.32s/it]


0.13283466340852199


100%|██████████| 100/100 [01:27<00:00,  1.14it/s]


0.15751406299884477


100%|██████████| 100/100 [01:18<00:00,  1.28it/s]


0.10862827410797947


100%|██████████| 100/100 [01:25<00:00,  1.17it/s]
[I 2025-07-08 15:04:08,685] Trial 60 finished with value: 0.12504483811741404 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.35439377662958366, 'lr': 0.00012251157062988545, 'batch_size': 120, 'num_epochs': 100}. Best is trial 2 with value: 0.13371565200910307.


0.10120235195430995


100%|██████████| 60/60 [00:16<00:00,  3.60it/s]


0.13337593851091795


100%|██████████| 60/60 [00:16<00:00,  3.73it/s]


0.15840900186876725


100%|██████████| 60/60 [00:16<00:00,  3.70it/s]


0.12347422952310086


100%|██████████| 60/60 [00:17<00:00,  3.37it/s]
[I 2025-07-08 15:05:15,946] Trial 61 finished with value: 0.12973459994029912 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.595437005106563, 'lr': 0.0001503738248100814, 'batch_size': 360, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.10367922985841047


100%|██████████| 60/60 [00:17<00:00,  3.52it/s]


0.12413698630540729


100%|██████████| 60/60 [00:15<00:00,  3.78it/s]


0.159250346784831


100%|██████████| 60/60 [00:16<00:00,  3.75it/s]


0.10895156051206659


100%|██████████| 60/60 [00:16<00:00,  3.64it/s]
[I 2025-07-08 15:06:21,869] Trial 62 finished with value: 0.12435828730474444 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6163474119389707, 'lr': 0.0002490821835166894, 'batch_size': 360, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.10509425561667284


100%|██████████| 60/60 [00:17<00:00,  3.37it/s]


0.13460295345642428


100%|██████████| 60/60 [00:16<00:00,  3.71it/s]


0.15828444959187313


100%|██████████| 60/60 [00:17<00:00,  3.36it/s]


0.12711327956619467


100%|██████████| 60/60 [00:18<00:00,  3.29it/s]
[I 2025-07-08 15:07:32,465] Trial 63 finished with value: 0.13023600933642232 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.5773086525552491, 'lr': 0.00012733863069967346, 'batch_size': 360, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.10094335473119724


100%|██████████| 60/60 [00:19<00:00,  3.06it/s]


0.13549213661952716


100%|██████████| 60/60 [00:18<00:00,  3.29it/s]


0.15755769110590537


100%|██████████| 60/60 [00:17<00:00,  3.45it/s]


0.12116793621555683


100%|██████████| 60/60 [00:16<00:00,  3.57it/s]
[I 2025-07-08 15:08:44,981] Trial 64 finished with value: 0.1278791594749057 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.546040263357854, 'lr': 0.00010068842680770796, 'batch_size': 360, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.09729887395863338


100%|██████████| 20/20 [00:15<00:00,  1.28it/s]


0.12800689721534494


100%|██████████| 20/20 [00:16<00:00,  1.22it/s]


0.13712054108469338


100%|██████████| 20/20 [00:15<00:00,  1.26it/s]


0.11544061392018222


100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
[I 2025-07-08 15:09:51,382] Trial 65 finished with value: 0.12067447633116396 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.6087613978221642, 'lr': 0.00015620157678051773, 'batch_size': 120, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.10212985310443531


100%|██████████| 40/40 [01:10<00:00,  1.76s/it]


0.11876887150754989


100%|██████████| 40/40 [01:11<00:00,  1.78s/it]


0.1484948449512405


100%|██████████| 40/40 [01:11<00:00,  1.79s/it]


0.09673262748839218


100%|██████████| 40/40 [01:11<00:00,  1.79s/it]
[I 2025-07-08 15:14:39,124] Trial 66 finished with value: 0.11754003625707707 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6527393145213072, 'lr': 0.00021705535217685597, 'batch_size': 60, 'num_epochs': 40}. Best is trial 2 with value: 0.13371565200910307.


0.10616380108112573


100%|██████████| 80/80 [00:23<00:00,  3.44it/s]


0.13069554284653598


100%|██████████| 80/80 [00:24<00:00,  3.28it/s]


0.15681985745654378


100%|██████████| 80/80 [00:22<00:00,  3.48it/s]


0.10996269538506086


100%|██████████| 80/80 [00:23<00:00,  3.43it/s]
[I 2025-07-08 15:16:13,577] Trial 67 finished with value: 0.12561093806855428 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6760973537269692, 'lr': 0.00016950720066340966, 'batch_size': 360, 'num_epochs': 80}. Best is trial 2 with value: 0.13371565200910307.


0.10496565658607644


100%|██████████| 60/60 [00:52<00:00,  1.13it/s]


0.11749781850404513


100%|██████████| 60/60 [01:01<00:00,  1.02s/it]


0.13562677209575882


100%|██████████| 60/60 [01:02<00:00,  1.04s/it]


0.07567743965455576


100%|██████████| 60/60 [01:02<00:00,  1.04s/it]
[I 2025-07-08 15:20:14,152] Trial 68 finished with value: 0.10681545106106825 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.6340655089967363, 'lr': 0.00029985660506859, 'batch_size': 120, 'num_epochs': 60}. Best is trial 2 with value: 0.13371565200910307.


0.0984597739899133


100%|██████████| 70/70 [01:06<00:00,  1.05it/s]


0.12165240993754974


100%|██████████| 70/70 [01:08<00:00,  1.02it/s]


0.1375364933571128


100%|██████████| 70/70 [01:12<00:00,  1.04s/it]


0.10045154682432962


100%|██████████| 70/70 [01:13<00:00,  1.06s/it]
[I 2025-07-08 15:24:57,584] Trial 69 finished with value: 0.11432794182529352 and parameters: {'log2_hidden_layer_0': 3, 'dropout': 0.5273888059171326, 'lr': 0.00014156845769122805, 'batch_size': 120, 'num_epochs': 70}. Best is trial 2 with value: 0.13371565200910307.


0.0976713171821819


100%|██████████| 30/30 [00:20<00:00,  1.44it/s]


0.13125352745521812


100%|██████████| 30/30 [00:20<00:00,  1.44it/s]


0.16366590225804473


100%|██████████| 30/30 [00:21<00:00,  1.43it/s]


0.12218363262622435


100%|██████████| 30/30 [00:20<00:00,  1.45it/s]
[I 2025-07-08 15:26:22,037] Trial 70 finished with value: 0.12951609265059008 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.5895144945979736, 'lr': 0.00019726743968479182, 'batch_size': 180, 'num_epochs': 30}. Best is trial 2 with value: 0.13371565200910307.


0.10096130826287318


100%|██████████| 20/20 [00:21<00:00,  1.07s/it]


0.12838885217743196


100%|██████████| 20/20 [00:21<00:00,  1.06s/it]


0.15933137786060378


100%|██████████| 20/20 [00:21<00:00,  1.08s/it]


0.12681429024506072


100%|██████████| 20/20 [00:21<00:00,  1.06s/it]
[I 2025-07-08 15:27:49,186] Trial 71 finished with value: 0.12986313253154988 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.594563695279104, 'lr': 0.00022473024564800863, 'batch_size': 120, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.1049180098431031


100%|██████████| 20/20 [00:21<00:00,  1.07s/it]


0.12702392308798502


100%|██████████| 20/20 [00:21<00:00,  1.06s/it]


0.15889889682522124


100%|██████████| 20/20 [00:21<00:00,  1.08s/it]


0.1253025979569769


100%|██████████| 20/20 [00:20<00:00,  1.05s/it]
[I 2025-07-08 15:29:15,810] Trial 72 finished with value: 0.12933172511311544 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6079891095464981, 'lr': 0.00023729149100236362, 'batch_size': 120, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.10610148258227865


100%|██████████| 20/20 [00:21<00:00,  1.05s/it]


0.1349799889070742


100%|██████████| 20/20 [00:21<00:00,  1.06s/it]


0.15876772097389358


100%|██████████| 20/20 [00:21<00:00,  1.08s/it]


0.1287703288589505


100%|██████████| 20/20 [00:20<00:00,  1.05s/it]
[I 2025-07-08 15:30:42,167] Trial 73 finished with value: 0.13029130130268748 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.5571445226201693, 'lr': 0.0001148672535816629, 'batch_size': 120, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.09864716647083167


100%|██████████| 20/20 [00:21<00:00,  1.07s/it]


0.13364397447646556


100%|██████████| 20/20 [00:21<00:00,  1.07s/it]


0.1625897786063801


100%|██████████| 20/20 [00:21<00:00,  1.07s/it]


0.1339488848432667


100%|██████████| 20/20 [00:20<00:00,  1.04s/it]
[I 2025-07-08 15:32:08,603] Trial 74 finished with value: 0.13298969829700807 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6266821039913443, 'lr': 0.00014610880910106093, 'batch_size': 120, 'num_epochs': 20}. Best is trial 2 with value: 0.13371565200910307.


0.10177615526191995


100%|██████████| 10/10 [00:03<00:00,  2.93it/s]


0.11939261945079482


100%|██████████| 10/10 [00:03<00:00,  2.92it/s]


0.14842990569569864


100%|██████████| 10/10 [00:03<00:00,  2.85it/s]


0.11240341641806668


100%|██████████| 10/10 [00:03<00:00,  2.87it/s]
[I 2025-07-08 15:32:22,972] Trial 75 finished with value: 0.11391170025800892 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6652058914516523, 'lr': 0.00013048434447137774, 'batch_size': 360, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.07542085946747554


100%|██████████| 40/40 [00:40<00:00,  1.01s/it]


0.12710881446982333


100%|██████████| 40/40 [00:41<00:00,  1.03s/it]


0.15919861360404788


100%|██████████| 40/40 [00:41<00:00,  1.04s/it]


0.12117293925398569


100%|██████████| 40/40 [00:40<00:00,  1.02s/it]
[I 2025-07-08 15:35:08,431] Trial 76 finished with value: 0.12837184260799075 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6346498866005631, 'lr': 0.00015688612720118162, 'batch_size': 120, 'num_epochs': 40}. Best is trial 2 with value: 0.13371565200910307.


0.10600700310410613


100%|██████████| 50/50 [00:51<00:00,  1.02s/it]


0.12114600782631907


100%|██████████| 50/50 [00:49<00:00,  1.01it/s]


0.15720949061422046


100%|██████████| 50/50 [00:51<00:00,  1.04s/it]


0.10843360304495862


100%|██████████| 50/50 [00:50<00:00,  1.01s/it]
[I 2025-07-08 15:38:33,072] Trial 77 finished with value: 0.12265666454003568 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6269444947521061, 'lr': 0.00017889910231869187, 'batch_size': 120, 'num_epochs': 50}. Best is trial 2 with value: 0.13371565200910307.


0.10383755667464456


100%|██████████| 100/100 [06:07<00:00,  3.67s/it]


0.09379497126734952


100%|██████████| 100/100 [06:25<00:00,  3.85s/it]


0.15253599363960008


100%|██████████| 100/100 [06:28<00:00,  3.89s/it]


0.0665718607181914


100%|██████████| 100/100 [06:33<00:00,  3.93s/it]
[I 2025-07-08 16:04:13,339] Trial 78 finished with value: 0.09310092166010413 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6861744353058394, 'lr': 0.0009459221505201601, 'batch_size': 30, 'num_epochs': 100}. Best is trial 2 with value: 0.13371565200910307.


0.05950086101527545


100%|██████████| 10/10 [00:10<00:00,  1.05s/it]


0.1299623398320896


100%|██████████| 10/10 [00:10<00:00,  1.04s/it]


0.1305455583607906


100%|██████████| 10/10 [00:10<00:00,  1.06s/it]


0.11896505877935072


100%|██████████| 10/10 [00:10<00:00,  1.04s/it]
[I 2025-07-08 16:04:56,921] Trial 79 finished with value: 0.11628973400195507 and parameters: {'log2_hidden_layer_0': 5, 'dropout': 0.39387874194769823, 'lr': 0.00014225854121123566, 'batch_size': 120, 'num_epochs': 10}. Best is trial 2 with value: 0.13371565200910307.


0.08568597903558933


100%|██████████| 20/20 [00:40<00:00,  2.02s/it]


0.13141584902562586


100%|██████████| 20/20 [00:36<00:00,  1.82s/it]


0.16392214356704873


100%|██████████| 20/20 [00:35<00:00,  1.77s/it]


0.13705952408288613


100%|██████████| 20/20 [00:30<00:00,  1.54s/it]
[I 2025-07-08 16:07:22,679] Trial 80 finished with value: 0.13573289974055264 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6481374864980073, 'lr': 0.00011089472820919897, 'batch_size': 60, 'num_epochs': 20}. Best is trial 80 with value: 0.13573289974055264.


0.1105340822866499


100%|██████████| 20/20 [00:31<00:00,  1.56s/it]


0.13111147646032123


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


0.16378114173661545


100%|██████████| 20/20 [00:34<00:00,  1.75s/it]


0.13680498321521117


100%|██████████| 20/20 [00:32<00:00,  1.63s/it]
[I 2025-07-08 16:09:36,017] Trial 81 finished with value: 0.13554167101682704 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6529710849048741, 'lr': 0.00011253742273971315, 'batch_size': 60, 'num_epochs': 20}. Best is trial 80 with value: 0.13573289974055264.


0.11046908265516035


100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


0.1306282484255283


100%|██████████| 20/20 [00:31<00:00,  1.60s/it]


0.16367651031424915


100%|██████████| 20/20 [00:32<00:00,  1.60s/it]


0.13582330514842952


100%|██████████| 20/20 [00:31<00:00,  1.55s/it]
[I 2025-07-08 16:11:47,094] Trial 82 finished with value: 0.13510878376465565 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.649869808719525, 'lr': 0.00011802397033884457, 'batch_size': 60, 'num_epochs': 20}. Best is trial 80 with value: 0.13573289974055264.


0.11030707117041556


100%|██████████| 20/20 [00:31<00:00,  1.55s/it]


0.1314870929360196


100%|██████████| 20/20 [00:30<00:00,  1.55s/it]


0.16383458520469132


100%|██████████| 20/20 [00:31<00:00,  1.57s/it]


0.13705520971167506


100%|██████████| 20/20 [00:31<00:00,  1.59s/it]
[I 2025-07-08 16:13:54,929] Trial 83 finished with value: 0.13578952147540216 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6484428133617176, 'lr': 0.00011088807292761746, 'batch_size': 60, 'num_epochs': 20}. Best is trial 83 with value: 0.13578952147540216.


0.11078119804922264


100%|██████████| 20/20 [00:31<00:00,  1.56s/it]


0.1323311465850472


100%|██████████| 20/20 [00:30<00:00,  1.54s/it]


0.16440662917548385


100%|██████████| 20/20 [00:31<00:00,  1.56s/it]


0.13843117840845845


100%|██████████| 20/20 [00:31<00:00,  1.57s/it]
[I 2025-07-08 16:16:02,134] Trial 84 finished with value: 0.1363927058009744 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6564983391745214, 'lr': 0.00010302965349528251, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.11040186903490817


100%|██████████| 20/20 [00:31<00:00,  1.60s/it]


0.1314952354932313


100%|██████████| 20/20 [00:36<00:00,  1.80s/it]


0.16410448872947606


100%|██████████| 20/20 [00:37<00:00,  1.87s/it]


0.1377615949816473


100%|██████████| 20/20 [00:33<00:00,  1.67s/it]
[I 2025-07-08 16:18:24,151] Trial 85 finished with value: 0.13597616422144174 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6595728307115691, 'lr': 0.00010799408778210877, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.11054333768141225


100%|██████████| 20/20 [00:34<00:00,  1.74s/it]


0.13145884674108896


100%|██████████| 20/20 [00:37<00:00,  1.86s/it]


0.16397227512233348


100%|██████████| 20/20 [00:35<00:00,  1.79s/it]


0.13756375602448812


100%|██████████| 20/20 [00:35<00:00,  1.76s/it]
[I 2025-07-08 16:20:50,345] Trial 86 finished with value: 0.13589191496523917 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6590067468771843, 'lr': 0.0001092385581219941, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.11057278197304606


100%|██████████| 20/20 [00:34<00:00,  1.70s/it]


0.12005965001473755


100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


0.13333068974506296


100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


0.09862917711162665


100%|██████████| 20/20 [00:32<00:00,  1.64s/it]
[I 2025-07-08 16:23:06,231] Trial 87 finished with value: 0.11438997074971025 and parameters: {'log2_hidden_layer_0': 6, 'dropout': 0.65856821238203, 'lr': 0.0001079450721433277, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.10554036612741381


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


0.131754143945742


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


0.16425836546756417


100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


0.13815723557666015


100%|██████████| 20/20 [00:33<00:00,  1.69s/it]
[I 2025-07-08 16:25:20,384] Trial 88 finished with value: 0.13601485187826284 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6993734711167882, 'lr': 0.00010158738525449899, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.10988966252308512


100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


0.1317331477401004


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


0.16427958992841238


100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


0.13789794379070125


100%|██████████| 20/20 [00:35<00:00,  1.77s/it]
[I 2025-07-08 16:27:37,665] Trial 89 finished with value: 0.13597887526600722 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6987731127759808, 'lr': 0.00010254890412084444, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.11000481960481484


100%|██████████| 20/20 [00:33<00:00,  1.68s/it]


0.13159617376634794


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


0.1642976859288331


100%|██████████| 20/20 [00:30<00:00,  1.52s/it]


0.1379531609237338


100%|██████████| 20/20 [00:31<00:00,  1.55s/it]
[I 2025-07-08 16:29:47,784] Trial 90 finished with value: 0.1359516885996195 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6993640050247971, 'lr': 0.00010286025318453452, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.10995973377956322


100%|██████████| 20/20 [00:34<00:00,  1.73s/it]


0.13190503623290942


100%|██████████| 20/20 [00:33<00:00,  1.68s/it]


0.16416339812531427


100%|██████████| 20/20 [00:32<00:00,  1.62s/it]


0.1382336592979158


100%|██████████| 20/20 [00:33<00:00,  1.65s/it]
[I 2025-07-08 16:32:04,201] Trial 91 finished with value: 0.136097349473497 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6963767840381105, 'lr': 0.00010166769719790664, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.11008730423784846


100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


0.13188883008143398


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


0.16418891967804586


100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


0.13809889759564023


100%|██████████| 20/20 [00:32<00:00,  1.64s/it]
[I 2025-07-08 16:34:18,054] Trial 92 finished with value: 0.13613624483417489 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6904999880324129, 'lr': 0.00010131464460118073, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.11036833198157943


100%|██████████| 20/20 [00:31<00:00,  1.56s/it]


0.13200434348500747


100%|██████████| 20/20 [00:33<00:00,  1.68s/it]


0.16419692694183888


100%|██████████| 20/20 [00:35<00:00,  1.79s/it]


0.13834348829420134


100%|██████████| 20/20 [00:36<00:00,  1.85s/it]
[I 2025-07-08 16:36:38,627] Trial 93 finished with value: 0.1362171851533347 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6918959052495159, 'lr': 0.0001010769262936787, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.11032398189229112


100%|██████████| 20/20 [00:32<00:00,  1.62s/it]


0.13182465480939856


100%|██████████| 20/20 [00:36<00:00,  1.82s/it]


0.16421235207258095


100%|██████████| 20/20 [00:43<00:00,  2.15s/it]


0.13831863066366315


100%|██████████| 20/20 [00:41<00:00,  2.09s/it]
[I 2025-07-08 16:39:15,539] Trial 94 finished with value: 0.13607741370519613 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6986318426808387, 'lr': 0.00010120128631771748, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.10995401727514188


100%|██████████| 20/20 [00:41<00:00,  2.05s/it]


0.1318299810261119


100%|██████████| 20/20 [00:41<00:00,  2.05s/it]


0.16420005348109257


100%|██████████| 20/20 [00:39<00:00,  1.98s/it]


0.1383784920668416


100%|██████████| 20/20 [00:39<00:00,  1.99s/it]
[I 2025-07-08 16:42:00,350] Trial 95 finished with value: 0.13608540701942123 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6998840989109383, 'lr': 0.00010068586548952529, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.10993310150363889


100%|██████████| 20/20 [00:39<00:00,  1.99s/it]


0.1289749597748769


100%|██████████| 20/20 [00:39<00:00,  1.96s/it]


0.16413093647448065


100%|██████████| 20/20 [00:39<00:00,  1.97s/it]


0.13368606855263893


100%|██████████| 20/20 [00:40<00:00,  2.01s/it]
[I 2025-07-08 16:44:42,286] Trial 96 finished with value: 0.13447015213879454 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6965611207006063, 'lr': 0.00012922363915374702, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.11108864375318166


100%|██████████| 20/20 [00:40<00:00,  2.03s/it]


0.13172274218046362


100%|██████████| 20/20 [00:40<00:00,  2.02s/it]


0.16391969791207311


100%|██████████| 20/20 [00:39<00:00,  1.99s/it]


0.13770000205022204


100%|██████████| 20/20 [00:38<00:00,  1.94s/it]
[I 2025-07-08 16:47:25,158] Trial 97 finished with value: 0.13589359835002016 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6822658876936356, 'lr': 0.00010507293369832689, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.11023195125732184


100%|██████████| 20/20 [00:39<00:00,  1.96s/it]


0.1319674038790983


100%|██████████| 20/20 [00:39<00:00,  1.96s/it]


0.16427290160045205


100%|██████████| 20/20 [00:39<00:00,  1.97s/it]


0.13841333665351888


100%|██████████| 20/20 [00:39<00:00,  1.95s/it]
[I 2025-07-08 16:50:05,139] Trial 98 finished with value: 0.1361774938535733 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6983469572106482, 'lr': 0.00010052293343784851, 'batch_size': 60, 'num_epochs': 20}. Best is trial 84 with value: 0.1363927058009744.


0.11005633328122401


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


0.13242163480736827


100%|██████████| 20/20 [00:29<00:00,  1.47s/it]


0.16454169535873153


100%|██████████| 20/20 [00:29<00:00,  1.47s/it]


0.1385079986964512


100%|██████████| 20/20 [00:29<00:00,  1.46s/it]
[I 2025-07-08 16:52:08,009] Trial 99 finished with value: 0.13647459350718286 and parameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6743230512784762, 'lr': 0.00010024548511849526, 'batch_size': 60, 'num_epochs': 20}. Best is trial 99 with value: 0.13647459350718286.


0.11042704516618043
Best hyperparameters: {'log2_hidden_layer_0': 4, 'dropout': 0.6743230512784762, 'lr': 0.00010024548511849526, 'batch_size': 60, 'num_epochs': 20}
Best Pearson score: 0.13647459350718286


{'log2_hidden_layer_0': 4,
 'dropout': 0.6743230512784762,
 'lr': 0.00010024548511849526,
 'batch_size': 60,
 'num_epochs': 20}

#### Nineth Iteration: AE + MLP instead of GBDT feature selection + MLP (train together)

Define the model

In [None]:
# Define gaussian noise for autoencoder
class GaussianNoise(nnmx.Module):
    def __init__(self, mean: float = 0.0, stddev: float = 0.01):
        super().__init__()
        self.mean = mean
        self.stddev = stddev

    def __call__(self, x, training = True):
        if training:
            x += mx.random.normal(loc=self.mean, scale=self.stddev, shape=x.shape)
        return x

In [None]:
# Define the model
# We do not use the reset method this time so you have to create the model at each fold
class AEMLX(nnmx.Module):
    def __init__(self, num_features, hidden_layers_size, latent_size, dropout):
        super().__init__()

        # Initialize layers for encoder
        last_layer = num_features
        self.encoder_layers = []
        for current_layer in hidden_layers_size:
            self.encoder_layers.append(nnmx.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.encoder_layers.append(nnmx.Linear(last_layer, latent_size))

        # Initialize layers for decoder
        last_layer = latent_size
        self.decoder_layers = []
        for current_layer in hidden_layers_size[::-1]:
            self.decoder_layers.append(nnmx.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.decoder_layers.append(nnmx.Linear(last_layer, num_features))

        # Initialize activation
        self.activation = nnmx.ReLU()

        # Initialze gaussian noise to apply upon training
        self.gaussian_noise = GaussianNoise()

        # Initialize dropout
        self.dropout = nnmx.Dropout(p = dropout)

    def __call__(self, x, training = True):
        if training:
            x = self.gaussian_noise(x)
        for inx, layer in enumerate(self.encoder_layers):
            x = layer(x)
            x = self.activation(x)
        for inx, layer in enumerate(self.decoder_layers):
            if inx == len(self.decoder_layers) - 1:
                x = layer(x)
            else:
                x = layer(x)
                x = self.activation(x)
        return x

    def get_latent(self, x):
        for inx, layer in enumerate(self.encoder_layers):
            x = layer(x)
            x = self.activation(x)
        return x

Train model with CV and evaluate

In [None]:
# Separate function for train & eval step
def train_aemlp_mlx(ae_model, ae_loss_and_grad_fn, ae_optimizer, ae_num_epochs,
                     mlp_model, mlp_loss_and_grad_fn, mlp_optimizer, mlp_num_epochs,
                     X_train, Y_train, batch_size):
    # Train ae first
    ae_model.train()
    for _ in tqdm(range(ae_num_epochs)):
        for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
            # get gradients for ae, output is the inputs itself
            _, ae_grads = ae_loss_and_grad_fn(ae_model, inputs, inputs)

            # Update the optimizer state and model parameters in a single call
            ae_optimizer.update(ae_model, ae_grads)

            # Force a graph evaluation
            mx.eval(ae_model.parameters(), ae_optimizer.state)

    # Train mlp later
    mlp_model.train()
    for _ in tqdm(range(mlp_num_epochs)):
        for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
            # get the latent representation for X_train
            latent_inputs = ae_model.get_latent(inputs)
            used_inputs = mx.concatenate([inputs, latent_inputs], axis=1)
            # get gradients for mlp
            _, mlp_grads = mlp_loss_and_grad_fn(mlp_model, used_inputs, targets)

            # Update the optimizer state and model parameters in a single call
            mlp_optimizer.update(mlp_model, mlp_grads)

            # Force a graph evaluation
            mx.eval(mlp_model.parameters(), mlp_optimizer.state)

    # # Train ae and mlp together
    # ae_model.train()
    # mlp_model.train()
    # for _ in tqdm(range(ae_num_epochs)):
    #     for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
    #         # get gradients for ae, output is the inputs itself
    #         _, ae_grads = ae_loss_and_grad_fn(ae_model, inputs, inputs)

    #         # Update the optimizer state and model parameters in a single call
    #         ae_optimizer.update(ae_model, ae_grads)

    #         # Force a graph evaluation
    #         mx.eval(ae_model.parameters(), ae_optimizer.state)

    #         # get gradients for mlp
    #         latent_inputs = ae_model.get_latent(inputs)
    #         used_inputs = mx.concatenate([inputs, latent_inputs], axis=1)
    #         _, mlp_grads = mlp_loss_and_grad_fn(mlp_model, used_inputs, targets)

    #         # Update the optimizer state and model parameters in a single call
    #         mlp_optimizer.update(mlp_model, mlp_grads)

    #         # Force a graph evaluation
    #         mx.eval(mlp_model.parameters(), mlp_optimizer.state)

def eval_aemlp_mlx(ae_model, mlp_model, X_test, Y_test, batch_size):
    outputs_all = np.zeros(0)
    targets_all = np.zeros(0)
    ae_model.eval()
    mlp_model.eval()
    for (inputs, targets) in batch_iterate(batch_size, X_test, Y_test, shuffle=False):
        latent_inputs = ae_model.get_latent(inputs)
        used_inputs = mx.concatenate([inputs, latent_inputs], axis=1)
        outputs = mlp_model(used_inputs).reshape(-1)
        # convert back to numpy
        outputs, targets = np.array(outputs), np.array(targets)
        # Load to overall Y_test, Y_pred to calculate pearson score later
        outputs_all = np.concatenate([outputs_all, outputs])
        targets_all = np.concatenate([targets_all, targets])
    return pearson_score(targets_all, outputs_all)

In [None]:
def train_eval_cv_mlx_aemlp(num_features, ae_hidden_layers_size, ae_latent_size, ae_dropout, ae_lr, ae_num_epochs,
                            mlp_hidden_layers_size, mlp_dropout, mlp_lr, mlp_num_epochs,
                            cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size):
    cv_pearson = 0
    for _, (X_train, X_test, Y_train, Y_test) in enumerate(zip(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)):
        # initialize the model
        mx.random.seed(default_random_state)
        ae_model = AEMLX(num_features, ae_hidden_layers_size, ae_latent_size, ae_dropout)

        mx.random.seed(default_random_state)
        mlp_model = MLPMLX(ae_latent_size + num_features, mlp_hidden_layers_size, mlp_dropout)

        # Initialize the loss function (both use same loss function)
        def loss_fn(model, X, Y):
            Y_pred = model(X).reshape(-1)
            Y = Y.reshape(-1)
            return mx.mean(nnmx.losses.mse_loss(Y_pred, Y))
        ae_loss_and_grad_fn = nnmx.value_and_grad(ae_model, loss_fn)
        mlp_loss_and_grad_fn = nnmx.value_and_grad(mlp_model, loss_fn)

        # Reinitialize the optimizer
        ae_optimizer = optimmx.Adam(learning_rate = ae_lr)
        mlp_optimizer = optimmx.Adam(learning_rate = mlp_lr)

        # Train the model
        train_aemlp_mlx(ae_model, ae_loss_and_grad_fn, ae_optimizer, ae_num_epochs,
                        mlp_model, mlp_loss_and_grad_fn, mlp_optimizer, mlp_num_epochs,
                        X_train, Y_train, batch_size)

        # Test the model
        pearson = eval_aemlp_mlx(ae_model, mlp_model, X_test, Y_test, batch_size)
        print(pearson)
        if pearson == -1:
            return pearson
        cv_pearson += pearson
    return cv_pearson / cv

Conduct training and evaluating process of the model

In [None]:
# # Create the CV data, seems to be better with only anonymized features
# best_features = [col for col in train_df.columns if "X" in col] + \
#                 ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
#                 [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)
# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)
# for i in range(default_cv):
#     X_train_arr[i] = float64_to_float32(X_train_arr[i])
#     X_test_arr[i] = float64_to_float32(X_test_arr[i])
#     Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
#     Y_test_arr[i] = float64_to_float32(Y_test_arr[i])
# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

In [None]:
# # # Training process of the default config
# num_features = len(best_features)
# ae_hidden_layers_size = [64]
# ae_latent_size = 16
# mlp_hidden_layers_size = [4, 2]
# lr = 0.0005
# dropout = 0.5
# batch_size = 180
# num_epochs = 30

# train_eval_cv_mlx_aemlp(num_features, ae_hidden_layers_size, ae_latent_size, 
#                         mlp_hidden_layers_size, dropout, 
#                         lr, default_cv,
#                         X_train_arr, X_test_arr, Y_train_arr, Y_test_arr,
#                         batch_size, num_epochs)

Optimize with bayesian optimization

In [None]:
ae_default_num_layers = 2
mlp_default_num_layers = 2

In [None]:
def objective_aemlp_mlx(trial):
    # First initialize the parameters
    num_features = len(best_features)

    # initialize ae layers
    ae_num_layers = ae_default_num_layers
    ae_log_2_hidden_layers_size = []
    for i in range(ae_num_layers):
        if len(ae_log_2_hidden_layers_size) == 0:
            ae_log_2_hidden_layers_size.append(trial.suggest_int(f"ae_log2_hidden_layer_{i}", 3, int(math.ceil(math.log2(num_features)))))
        else:
            ae_log_2_hidden_layers_size.append(trial.suggest_int(f"ae_log2_hidden_layer_{i}", 3, ae_log_2_hidden_layers_size[-1]))
    ae_hidden_layers_size = [2**i for i in ae_log_2_hidden_layers_size]
    ae_latent_size = 2**trial.suggest_int("ae_log2_latent_size", 3, ae_log_2_hidden_layers_size[-1])
    ae_dropout = trial.suggest_float("ae_dropout", 0.2, 0.7)
    ae_lr = trial.suggest_float("ae_lr", 0.0001, 0.01, log=True)
    ae_num_epochs = trial.suggest_categorical("num_epochs", [20, 40, 60, 80, 100])

    # initialize mlp layers
    mlp_num_layers = mlp_default_num_layers
    mlp_log_2_hidden_layers_size = []
    for i in range(mlp_num_layers):
        if len(mlp_log_2_hidden_layers_size) == 0:
            mlp_log_2_hidden_layers_size.append(trial.suggest_int(f"mlp_log2_hidden_layer_{i}", 2, int(math.ceil(math.log2(ae_latent_size + num_features)))))
        else:
            mlp_log_2_hidden_layers_size.append(trial.suggest_int(f"mlp_log2_hidden_layer_{i}", 2, mlp_log_2_hidden_layers_size[-1]))
    mlp_hidden_layers_size = [2**i for i in mlp_log_2_hidden_layers_size]
    mlp_dropout = trial.suggest_float("mlp_dropout", 0.2, 0.7)
    mlp_lr = trial.suggest_float("mlp_lr", 0.0001, 0.01, log=True)
    mlp_num_epochs = trial.suggest_categorical("mlp_num_epochs", [10, 20, 30, 40, 50])
    # mlp_num_epochs = ae_num_epochs

    # batch size
    batch_size = trial.suggest_categorical("batch_size", [30, 60, 120, 180, 360, 720, 1440])
    
    # Conduct training based on those parameters
    cv_pearson = train_eval_cv_mlx_aemlp(num_features, ae_hidden_layers_size, ae_latent_size, ae_dropout, ae_lr, ae_num_epochs,
                                         mlp_hidden_layers_size, mlp_dropout, mlp_lr, mlp_num_epochs,
                                         default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr,
                                         batch_size)
    
    return cv_pearson

In [None]:
def optimize_aemlp_mlx(study_name, storage_name, objective_function=objective_aemlp_mlx, n_trials = 100, n_jobs = 1):
    print("Conduct hyperparam opt for AE-MLP")
    study = optuna.create_study(
        study_name = study_name,
        direction ='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

In [None]:
best_features = [col for col in train_df.columns if "X" in col]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_df, best_features)
for i in range(default_cv):
    X_train_arr[i] = float64_to_float32(X_train_arr[i])
    X_test_arr[i] = float64_to_float32(X_test_arr[i])
    Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
    Y_test_arr[i] = float64_to_float32(Y_test_arr[i])
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

In [None]:
optimize_aemlp_mlx(
    f"aemlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{ae_default_num_layers}_{mlp_default_num_layers}_study",
    f"aemlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{ae_default_num_layers}_{mlp_default_num_layers}_study"
)