In [1]:
import pickle
import random
from copy import deepcopy
from tqdm import tqdm
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
import optuna
from optuna.samplers import RandomSampler, TPESampler, GPSampler
import warnings
warnings.filterwarnings("ignore")
# import multiprocessing
# max_n_jobs = multiprocessing.cpu_count()
import shap
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader, Sampler
import torch.nn as nn
import torch.optim as optim
import mlx.core as mx
import mlx.nn as nnmx
import mlx.optimizers as optimmx

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set the device
device = "mps" if torch.backends.mps.is_available() else "cpu"

In [3]:
feature_version = 2
# 1 for pc feature, 
# 2 for label correlation feature # seems to work most consistently
# 3 for best features based on combination rank
# 4 for including time features (in case we want to reverse engineer the masked timestamp)
# 5 for increasing number of correlation features + only use those that are in the same cluster

In [4]:
default_random_state = 101
random.seed(default_random_state)
np.random.seed(default_random_state)
torch.manual_seed(default_random_state)
torch.mps.manual_seed(default_random_state)
mx.random.seed(default_random_state)

#### Import train data and popular features

In [5]:
train_df = pd.read_parquet(f"data/cleaned/cleaned_train_{feature_version}.parquet")
train_df.head()

Unnamed: 0,X473,X205,X198,X444,X466,X445,X472,X26,X29,X217,...,normalized_buy_volume,normalized_sell_volume,liquidity_adjusted_imbalance,pressure_spread_interaction,trade_direction_ratio,net_buy_volume,bid_skew,ask_skew,timestamp,label
0,-0.201346,-1.978504,-1.700689,-0.142546,-0.163476,-0.128331,-0.126241,1.406392,1.474789,-0.981975,...,11.542564,5.339347,0.063569,-0.230493,0.79681,131.421,0.644635,0.355365,2023-03-01 00:00:00,0.562539
1,-0.186231,-1.830295,-1.669471,-0.135499,-0.159388,-0.12479,-0.115015,1.003783,1.312735,-0.94019,...,13.626484,137.821061,0.01161,-0.549445,0.620251,203.896,0.942921,0.057079,2023-03-01 00:01:00,0.533686
2,-0.182398,-1.80354,-1.662645,-0.133705,-0.158627,-0.123891,-0.112303,0.760801,1.219124,-0.933071,...,360.242073,2.263386,0.015877,0.530818,0.538664,22.858,0.007283,0.992717,2023-03-01 00:02:00,0.546505
3,-0.177415,-1.714013,-1.620037,-0.133251,-0.158334,-0.123658,-0.109113,0.955549,1.353001,-0.891216,...,69.011716,5.946089,0.025702,0.45478,0.728757,210.779,0.187976,0.812024,2023-03-01 00:03:00,0.357703
4,-0.174164,-1.68417,-1.600188,-0.128862,-0.156668,-0.121464,-0.106383,0.90546,1.36188,-0.878711,...,3.623647,12.867864,0.081042,-0.533689,0.689066,54.004,0.887255,0.112745,2023-03-01 00:04:00,0.362452


In [6]:
popular_features_train = pd.read_parquet("data/cleaned/popular_features_train.parquet")
popular_features_train.head()

Unnamed: 0,volume,bid_qty,ask_qty,buy_qty,sell_qty
0,221.389,15.283,8.425,176.405,44.984
1,847.796,38.59,2.336,525.846,321.95
2,295.596,0.442,60.25,159.227,136.369
3,460.705,4.865,21.016,335.742,124.963
4,142.818,27.158,3.451,98.411,44.407


#### Implement some helper function

In [7]:
# First need to split into some fold
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])

default_cv = 4
default_cv_type = "full"
# NOTE: default_cv must set to 1 instead of 3 based on consistency with LB score contains 49% of test data
# NOTE: 3 cv with gap is slightly better or almost equal

def create_cv(train_df, features=None, cv=default_cv):
    if features is not None:
        train_df = train_df[features + ["timestamp", "label"]]
    X_train_arr = []
    X_test_arr = []
    Y_train_arr = []
    Y_test_arr = []
    for i in range(cv):
        train_month = list(range(3 + i, 7 + i))
        # train_month = [3, 4, 5, 6, 7, 8]
        test_month = list(map(lambda x: x % 12 if x > 12 else x, list(range(8 + i, 12 + i))))
        print(train_month, test_month)
        # test_month = [9, 10, 11, 12, 1, 2] # try to make a gap to see if there is any differences in cv-lb correlation
        # print(train_month, test_month)
        train = train_df[train_df["timestamp"].dt.month.isin(train_month)].reset_index().drop("index", axis = 1)
        test = train_df[train_df["timestamp"].dt.month.isin(test_month)].reset_index().drop("index", axis = 1)
        X_train_arr.append(train.drop(["timestamp", "label"], axis = 1))
        X_test_arr.append(test.drop(["timestamp", "label"], axis = 1))
        Y_train_arr.append(train["label"])
        Y_test_arr.append(test["label"])  
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

# def create_cv_random_test(train_df, features=None, test_cv=10):
#     # randomize so that we have 1 train, but try it on 10 different test 
#     if features is not None:
#         train_df = train_df[features + ["timestamp", "label"]]
#     X_train_arr = []
#     X_test_arr = []
#     Y_train_arr = []
#     Y_test_arr = []

#     # Create train data
#     train_month = [3, 4, 5, 6, 7, 8]
#     train = train_df[train_df["timestamp"].dt.month.isin(train_month)] 
#     X_train_arr.append(train.drop(["timestamp", "label"], axis = 1))
#     Y_train_arr.append(train["label"])

#     test_month = [9, 10, 11, 12, 1, 2]
#     test = train_df[train_df["timestamp"].dt.month.isin(test_month)]
#     # Create test data
#     for _ in range(test_cv):
#         random_test = test.sample(frac = 0.5, random_state = default_random_state)
#         X_test_arr.append(random_test.drop(["timestamp", "label"], axis = 1))
#         Y_test_arr.append(random_test["label"])

#     return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr 

# class [-1, 0, 1] -> [0, 1, 2] => < -0.2 => neg, > 0.2 => pos, else => neutral
def create_classification_class(label):
    if label < -0.4: return 0
    elif label < 0: return 1
    elif label < 0.4: return 2
    return 3

def create_cv_classification(train_df, features=None, cv=default_cv):
    if features is not None:
        train_df = train_df[features + ["timestamp", "label"]]
    X_train_arr = []
    X_test_arr = []
    Y_train_arr = []
    Y_test_arr = []
    for i in range(cv):
        train_month = list(range(3 + i, 7 + i))
        # train_month = [3, 4, 5, 6, 7, 8]
        test_month = list(map(lambda x: x % 12 if x > 12 else x, list(range(8 + i, 12 + i))))
        print(train_month, test_month)
        # test_month = [9, 10, 11, 12, 1, 2] # try to make a gap to see if there is any differences in cv-lb correlation
        # print(train_month, test_month)
        train = train_df[train_df["timestamp"].dt.month.isin(train_month)].reset_index().drop("index", axis = 1)
        test = train_df[train_df["timestamp"].dt.month.isin(test_month)].reset_index().drop("index", axis = 1)
        X_train_arr.append(train.drop(["timestamp", "label"], axis = 1))
        X_test_arr.append(test.drop(["timestamp", "label"], axis = 1))
        Y_train_arr.append(train["label"].apply(lambda x: create_classification_class(x)))
        Y_test_arr.append(test["label"].apply(lambda x: create_classification_class(x)))  
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

In [8]:
def pearson_score(Y_test, Y_pred):
    if isinstance(Y_test, pd.Series) or isinstance(Y_test, pd.DataFrame):
        Y_test = Y_test.values
    if isinstance(Y_pred, pd.Series) or isinstance(Y_pred, pd.DataFrame):
        Y_pred = Y_pred.values
    Y_test = np.ravel(Y_test)
    Y_pred = np.ravel(Y_pred)
    pearson = np.corrcoef(Y_test, Y_pred)[0, 1]
    if np.isnan(pearson):
        if np.std(Y_pred) == 0:
            print(Y_pred)
            print("Error: zero variance prediction")
        elif np.isnan(Y_pred).any():
            print("Error: nan prediction")
        return -1
    else:
        return pearson

In [9]:
# Make function specifically for cross validation
def train_eval_cv(model, cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, scoring_function=pearson_score):
    cv_score = 0

    for i in range(cv):
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        cv_score += scoring_function(Y_test, Y_pred)
    
    return cv_score / cv

def train_eval_cv_random_test(model, cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, scoring_function=pearson_score, test_cv = 10):
    cv_score = 0

    for i in range(cv):
        curr_cv_score = 0

        # Conduct fitting
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        model.fit(X_train, Y_train)
        
        # sampling and testing
        len_test = X_test.shape[0]
        for seed in tqdm(range(test_cv)):
            np.random.seed(seed)
            test_index = np.random.choice(len_test, size = len_test // 2, replace = False) 
            X_test_sample = X_test.loc[test_index, :]
            Y_test_sample = Y_test[test_index]
            Y_pred_sample = model.predict(X_test_sample)
            curr_cv_score += scoring_function(Y_test_sample, Y_pred_sample)
        
        cv_score += curr_cv_score / test_cv
    
    np.random.seed(default_random_state)
    return cv_score

In [10]:
default_n_trees = 2000
# Finetuning XGBoost
def objective_xgboost(trial):
    params = {
        "n_estimators": default_n_trees,
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log = True), # 0.001 - 0.1 -> 0.01 - 0.05 
        "verbosity": 0,
        "subsample": trial.suggest_float("subsample", 0.05, 1.0), # 1.0 -> 0.2
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.05, 1), 
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1), 
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "enable_categorical": True,
        "random_state": default_random_state
    }

    xgbr = XGBRegressor(**params)
    cv_pearson = train_eval_cv(xgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, pearson_score)
    return cv_pearson

def objective_lightgbm(trial):
    params = {
        "n_estimators": default_n_trees,
        "verbosity": -1,
        "max_depth": trial.suggest_int("max_depth", 2, 10), # 1 - 10 => 1 - 5
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 -> 0.005 - 0.02
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_state": default_random_state
    }

    lgbr = LGBMRegressor(**params)
    cv_pearson = train_eval_cv(lgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, pearson_score)
    return cv_pearson

def objective_catboost(trial):
    params = {
        "iterations": default_n_trees,
        "verbose": False,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 => 0.01 - 0.1
        "depth": trial.suggest_int("depth", 1, 10), #  1 - 10 => 5 - 15
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 600),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_seed": default_random_state
    }

    cbr = CatBoostRegressor(**params)
    cv_pearson = train_eval_cv(cbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, pearson_score)
    return cv_pearson

In [11]:
# Finetuning XGBoost
def objective_xgboost_classification(trial):
    params = {
        "n_estimators": default_n_trees,
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log = True), # 0.001 - 0.1 -> 0.01 - 0.05 
        "verbosity": 0,
        "subsample": trial.suggest_float("subsample", 0.05, 1.0), # 1.0 -> 0.2
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.05, 1), 
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1), 
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "enable_categorical": True,
        "random_state": default_random_state
    }

    xgbr = XGBClassifier(**params)
    cv_acc = train_eval_cv(xgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, accuracy_score)
    return cv_acc

def objective_lightgbm_classification(trial):
    params = {
        "n_estimators": default_n_trees,
        "verbosity": -1,
        "max_depth": trial.suggest_int("max_depth", 2, 10), # 1 - 10 => 1 - 5
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 -> 0.005 - 0.02
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_state": default_random_state
    }

    lgbr = LGBMClassifier(**params)
    cv_acc = train_eval_cv(lgbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, accuracy_score)
    return cv_acc

def objective_catboost_classification(trial):
    params = {
        "iterations": default_n_trees,
        "verbose": False,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True), # 0.001 - 0.1 => 0.01 - 0.1
        "depth": trial.suggest_int("depth", 1, 10), #  1 - 10 => 5 - 15
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 600),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
        "random_seed": default_random_state
    }

    cbr = CatBoostRegressor(**params)
    cv_acc = train_eval_cv(cbr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, accuracy_score)
    return cv_acc

In [12]:
default_n_trials = 100
default_n_jobs = 2

def optimize_xgboost(study_name, storage_name, objective_function=objective_xgboost, n_trials = default_n_trials, n_jobs = default_n_jobs):
    print("Conduct hyperparam opt for XGBoost")
    study = optuna.create_study(
        study_name = study_name,
        direction ='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

def optimize_lightgbm(study_name, storage_name, objective_function=objective_lightgbm, n_trials = default_n_trials, n_jobs = default_n_jobs):
    print("Conduct hyperparam opt for LightGBM")
    study = optuna.create_study(
        study_name = study_name,
        direction='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

def optimize_catboost(study_name, storage_name, objective_function=objective_catboost, n_trials = default_n_trials, n_jobs = default_n_jobs):
    print("Conduct hyperparam opt for CatBoost")
    study = optuna.create_study(
        study_name = study_name,
        direction='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

#### First iteration: training with all features from the collection, no popular features

In [None]:
original_features = [f for f in train_df.columns if "X" in f]

X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_df, original_features)

In [None]:
best_params_xgboost = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study"
)

In [None]:
best_params_lightgbm = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study"
)

In [None]:
# best_params_catboost = optimize_catboost(
#     f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study",
#     f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study"
# )
# # Need to take down as catboost might not work well in this situation

Analyze params - cv relationship

In [13]:
def get_study_df(filename):
    study = optuna.load_study(
        study_name = filename,
        storage = f"sqlite:///{filename}.db"
    )
    study_df = []
    for trial in study.trials:
        trial_dict = trial.params
        trial_dict["value"] = trial.value
        study_df.append(trial_dict)

    return pd.DataFrame(study_df)

In [14]:
def params_value_viz(study_df):
    nrows = (study_df.shape[1] - 1) // 3 + ((study_df.shape[1] - 1) % 3 > 0)
    fig, ax = plt.subplots(nrows = nrows, ncols = 3, figsize = (14, 5 * nrows))
    for inx, var in enumerate(study_df.columns):
        x, y = inx // 3, inx % 3
        if var != "value":
            sns.regplot(study_df, x = var, y = "value", ax = ax[x][y], lowess=True, line_kws={'color': 'green'}, ci = 95)
    plt.show()

In [None]:
study_df_xgboost = get_study_df(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")   
params_value_viz(study_df_xgboost)

In [None]:
study_df_lightgbm = get_study_df(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
params_value_viz(study_df_lightgbm)

In [None]:
# study_df_catboost = get_study_df(f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
# params_value_viz(study_df_catboost)

Analyze feature importance + CV performance

In [15]:
def get_best_params_from_file(filename):
    study = optuna.load_study(
        study_name = filename,
        storage = f"sqlite:///{filename}.db"
    )
    return study.best_params

In [16]:
def get_shap_values(model, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, sample_size=10000):
    mean_abs_shap_all = np.zeros(X_train_arr[0].shape[1])
    for i in range(default_cv):
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        model.fit(X_train, Y_train)
        X_test_sample = X_test.sample(sample_size, random_state = default_random_state)
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        mean_abs_shap = np.mean(np.abs(shap_values), axis = 0)
        mean_abs_shap_all += mean_abs_shap
    mean_abs_shap_all /= default_cv
    return mean_abs_shap_all

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
for p in best_params_xgboost:
    params[p] = best_params_xgboost[p]

xgboost_feature_importances = {}

xgbr = XGBRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    xgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, xgbr.predict(X_test)))
    features = xgbr.feature_names_in_.tolist()
    # features_i = xgbr.feature_importances_.tolist()
    features_i = get_shap_values(xgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]

# print(feature_importances)
plt.hist(xgboost_feature_importances.values())
# Seems like only COD features are important (can try to only use 4-8 hours if 4-13 hours does not work well)

In [None]:
print([f for f in xgboost_feature_importances if xgboost_feature_importances[f] > 0.01])

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state
}
best_params_lightgbm = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
for p in best_params_lightgbm:
    params[p] = best_params_lightgbm[p]

lightgbm_feature_importances = {}

lgbr = LGBMRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    lgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, lgbr.predict(X_test)))
    features = lgbr.feature_names_in_.tolist()
    # features_i = lgbr.feature_importances_.tolist()
    features_i = get_shap_values(lgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

plt.hist(lightgbm_feature_importances.values())
# seems to pick up time features not as good as past 4 hours features

In [None]:
print([f for f in lightgbm_feature_importances if lightgbm_feature_importances[f] >= 0.01])

In [None]:
# params = {
#     "iterations": default_n_trees,
#     "verbose": False,
#     "random_seed": default_random_state
# }
# best_params_catboost = get_best_params_from_file(f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_study")
# for p in best_params_catboost:
#     params[p] = best_params_catboost[p]

# catboost_feature_importances = {}

# cbr = CatBoostRegressor(**params)
# cv_rmse = 0

# for i in range(default_cv):
#     X_train, X_test = X_train_arr[i], X_test_arr[i]
#     Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
#     cbr.fit(X_train, Y_train)
#     print(pearson_score(Y_test, cbr.predict(X_test)))
#     features = cbr.feature_names_
#     # features_i = cbr.feature_importances_.tolist()
#     features_i = get_shap_values(cbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
#     for inx, feat in enumerate(features):
#         catboost_feature_importances[feat] = catboost_feature_importances.get(feat, 0) + features_i[inx]

# plt.hist(catboost_feature_importances.values())
# # can pick up a combination of both past cod and tss, not good at picking up ph, temp

In [None]:
# print([f for f in catboost_feature_importances if catboost_feature_importances[f] >= 0.02])

Get top 20 important features in all of them

In [None]:
xgboost_feature_importances_df = pd.DataFrame(
    {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
)
xgboost_feature_importances_df["importance"] /= default_cv
# xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
lightgbm_feature_importances_df = pd.DataFrame(
    {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
)
lightgbm_feature_importances_df["importance"] /= default_cv
# lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# catboost_feature_importances_df = pd.DataFrame(
#     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# )
# catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
feature_importances_df = xgboost_feature_importances_df.merge(
    lightgbm_feature_importances_df,
    on="var",
    how="inner",
    suffixes=("_xgboost", "_lightgbm")
)
# feature_importances_df = feature_importances_df.merge(
#     catboost_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("", "_catboost")
# )
# feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])

feature_importances_df["importance"] = 1/2 * (feature_importances_df["importance_xgboost"] + feature_importances_df["importance_lightgbm"])
feature_importances_df = feature_importances_df.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
feature_importances_df[:50]

In [None]:
feature_importances_df.to_csv("feature_importances_df.csv", index = False)

In [None]:
feature_importances_df = pd.read_csv("feature_importances_df.csv")
feature_importances_df

In [None]:
print(feature_importances_df.loc[:49, "var"].tolist())

#### Second Iteration: adding popular feature in addition to original features correlated to label

In [None]:
train_added_df = pd.concat([train_df, popular_features_train], axis = 1)

X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df)

In [None]:
best_params_xgboost_popular_feature = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study"
)

In [None]:
best_params_lightgbm_popular_feature = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study"
)

Check for feature importance

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost_popular_feature = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_xgboost_popular_feature:
    params[p] = best_params_xgboost_popular_feature[p]

xgboost_feature_importances = {}

xgbr = XGBRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    xgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, xgbr.predict(X_test)))
    features = xgbr.feature_names_in_.tolist()
    # features_i = xgbr.feature_importances_.tolist()
    features_i = get_shap_values(xgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]

# print(feature_importances)
plt.hist(xgboost_feature_importances.values())
# Seems like only COD features are important (can try to only use 4-8 hours if 4-13 hours does not work well)

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state
}
best_params_lightgbm_popular_feature = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_lightgbm_popular_feature:
    params[p] = best_params_lightgbm_popular_feature[p]

lightgbm_feature_importances = {}

lgbr = LGBMRegressor(**params)
for i in range(default_cv):
    X_train, X_test = X_train_arr[i], X_test_arr[i]
    Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
    lgbr.fit(X_train, Y_train)
    print(pearson_score(Y_test, lgbr.predict(X_test)))
    features = lgbr.feature_names_in_.tolist()
    # features_i = lgbr.feature_importances_.tolist()
    features_i = get_shap_values(lgbr, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

plt.hist(lightgbm_feature_importances.values())
# seems to pick up time features not as good as past 4 hours features

In [None]:
xgboost_feature_importances_df = pd.DataFrame(
    {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
)
xgboost_feature_importances_df["importance"] /= default_cv
# xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
lightgbm_feature_importances_df = pd.DataFrame(
    {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
)
lightgbm_feature_importances_df["importance"] /= default_cv
# lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# catboost_feature_importances_df = pd.DataFrame(
#     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# )
# catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
feature_importances_df = xgboost_feature_importances_df.merge(
    lightgbm_feature_importances_df,
    on="var",
    how="inner",
    suffixes=("_xgboost", "_lightgbm")
)
# feature_importances_df = feature_importances_df.merge(
#     catboost_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("", "_catboost")
# )
# feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])
feature_importances_df["importance"] = 1/2 * (feature_importances_df["importance_xgboost"] + feature_importances_df["importance_lightgbm"])
feature_importances_df = feature_importances_df.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
feature_importances_df

In [None]:
feature_importances_df.to_csv("feature_importances_df.csv", index = False)

In [None]:
feature_importances_df = pd.read_csv("feature_importances_df.csv")
feature_importances_df

In [None]:
feature_importances_df[~feature_importances_df["var"].str.contains("X")]

In [None]:
print(feature_importances_df.loc[:29, "var"].tolist())

#### Third iteration: a more truncated version from the first collection

In [None]:
best_xgboost_score = optuna.load_study(
    study_name = "xgboost_2_4_101_1000_popular_feature_study",
    storage = f"sqlite:///xgboost_2_4_101_1000_popular_feature_study.db"
).best_value
best_lightgbm_score = optuna.load_study(
    study_name = "lightgbm_2_4_101_1000_popular_feature_study",
    storage = f"sqlite:///lightgbm_2_4_101_1000_popular_feature_study.db"
).best_value
feature_importances_df["weighted_importance"] = (best_xgboost_score * feature_importances_df["importance_xgboost"] + best_lightgbm_score * feature_importances_df["importance_lightgbm"]) / (best_xgboost_score + best_lightgbm_score)
feature_importances_df = feature_importances_df.sort_values("weighted_importance", ascending=False, ignore_index=True)
feature_importances_df

In [None]:
print(feature_importances_df.loc[:49, "var"].tolist())

XGBoost

In [None]:
# xgboost_importance_threshold = 0.011
# xgboost_best_features = [
#     f for f in xgboost_feature_importances if xgboost_feature_importances[f] > xgboost_importance_threshold
# ] + ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"]
# print(len(xgboost_best_features))
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)

# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, xgboost_best_features)

In [None]:
# best_xgboost_params_truncated = optimize_xgboost(
#     f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_truncated_study",
#     f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_truncated_study"
# ) # much worse than using all features  

LightGBM

In [None]:
# lightgbm_importance_threshold = 20
# lightgbm_best_features = [
#     f for f in lightgbm_feature_importances if lightgbm_feature_importances[f] > lightgbm_importance_threshold
# ] + ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"]
# print(len(lightgbm_best_features))
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)

# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, lightgbm_best_features)

In [None]:
# best_lightgbm_params_truncated = optimize_lightgbm(
#     f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_truncated_study",
#     f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_truncated_study"
# )
# # also much worse 

#### Fourth Iteration: a common truncated version using good features across all models + popular features

In [None]:
best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531', 
                  'X385', 'X23', 'X284', 'X465', 'X331', 'X95', 'X285', 'X31', 'X169', 'X137'] + \
                ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
best_features = list(set(best_features))
train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

XGBoost

In [None]:
best_xgboost_params_common_truncated = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study"
) 

LightGBM

In [None]:
best_lightgbm_params_common_truncated = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study"
)

Catboost

In [None]:
best_catboost_params_common_truncated = optimize_catboost(
    f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study",
    f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study"
)

Analyze model performance and feature importance across train and test

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
for p in best_params_xgboost:
    params[p] = best_params_xgboost[p]

xgbr_arr = []

for i in tqdm(range(default_cv)):
    xgbr = XGBRegressor(**params)
    xgbr.fit(X_train_arr[i], Y_train_arr[i])
    xgbr_arr.append(xgbr)

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state,
}
best_params_lightgbm = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
for p in best_params_lightgbm:
    params[p] = best_params_lightgbm[p]

lgbr_arr = []

for i in tqdm(range(default_cv)):
    lgbr = LGBMRegressor(**params)
    lgbr.fit(X_train_arr[i], Y_train_arr[i])
    lgbr_arr.append(lgbr)

In [None]:
xgboost_feature_importances = {}
lightgbm_feature_importances = {}

for i in tqdm(range(default_cv)):
    features = xgbr_arr[i].feature_names_in_.tolist()
    features_i = get_shap_values(xgbr_arr[i], X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]
    features = lgbr_arr[i].feature_names_in_.tolist()
    features_i = get_shap_values(xgbr_arr[i], X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)
    for inx, feat in enumerate(features):
        lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

xgboost_feature_importances_df = pd.DataFrame(
    {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
)
xgboost_feature_importances_df["importance"] /= default_cv
# xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
lightgbm_feature_importances_df = pd.DataFrame(
    {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
)
lightgbm_feature_importances_df["importance"] /= default_cv
# lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# catboost_feature_importances_df = pd.DataFrame(
#     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# )
# catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
feature_importances_df_common_truncated = xgboost_feature_importances_df.merge(
    lightgbm_feature_importances_df,
    on="var",
    how="inner",
    suffixes=("_xgboost", "_lightgbm")
)
# feature_importances_df = feature_importances_df.merge(
#     catboost_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("", "_catboost")
# )
# feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])
feature_importances_df_common_truncated["importance"] = 1/2 * (feature_importances_df_common_truncated["importance_xgboost"] + feature_importances_df_common_truncated["importance_lightgbm"])
feature_importances_df_common_truncated = feature_importances_df_common_truncated.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
feature_importances_df_common_truncated

In [None]:
best_xgboost_score = optuna.load_study(
    study_name = "xgboost_2_4_101_1000_common_truncated_20_study",
    storage = f"sqlite:///xgboost_2_4_101_1000_common_truncated_20_study.db"
).best_value
best_lightgbm_score = optuna.load_study(
    study_name = "lightgbm_2_4_101_1000_common_truncated_20_study",
    storage = f"sqlite:///lightgbm_2_4_101_1000_common_truncated_20_study.db"
).best_value
feature_importances_df_common_truncated["weighted_importance"] = (best_xgboost_score * feature_importances_df_common_truncated["importance_xgboost"] + best_lightgbm_score * feature_importances_df_common_truncated["importance_lightgbm"]) / (best_xgboost_score + best_lightgbm_score)
feature_importances_df_common_truncated = feature_importances_df_common_truncated.sort_values("weighted_importance", ascending=False, ignore_index=True)
feature_importances_df_common_truncated

#### Fifth Iteration Instead of using GBDT, can we use MLP on these features

Convert from normal CV to torch type CV

In [None]:
# Create the CV data, seems to be better with only anonymized features
best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531',
                 'X385', 'X23', 'X465', 'X284', 'X331', 'X95', 'X169', 'X285', 'X137', 'X31']
                # ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                # [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
best_features = list(set(best_features))
train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

In [None]:
# Extra code to "reduce" from float64 to float32
def float64_to_float32(data):
    if isinstance(data, pd.DataFrame):
        for col in data.columns:
            data[col] = data[col].astype("float32")
    elif isinstance(data, pd.Series):
        data = data.astype("float32")
    return data

for i in range(default_cv):
    X_train_arr[i] = float64_to_float32(X_train_arr[i])
    X_test_arr[i] = float64_to_float32(X_test_arr[i])
    Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
    Y_test_arr[i] = float64_to_float32(Y_test_arr[i])

In [None]:
def normal_cv_to_torch_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, cv=default_cv):
    train_arr = []
    test_arr = []
    for i in range(cv):
        # First shuffle the data
        X_train, Y_train = X_train_arr[i], Y_train_arr[i]
        X_train["label"] = Y_train
        # Instead of shuffle the training data when create the dataloader, try to shuffle beforehand
        # X_train = X_train.sample(frac = 1, random_state = default_random_state)
        # not shuffle, keep it by date
        Y_train = X_train["label"]
        X_train = X_train.drop("label", axis = 1)

        # Then normalize
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train.values)

        # Create train dataset
        X_train, Y_train = torch.from_numpy(X_train), torch.from_numpy(Y_train.values)
        train_dataset = TensorDataset(X_train, Y_train)
        train_arr.append(train_dataset)

        # Normalize X_test
        X_test = scaler.transform(X_test_arr[i].values)

        # Create test dataset
        X_test, Y_test = torch.from_numpy(X_test), torch.from_numpy(Y_test_arr[i].values)
        test_dataset = TensorDataset(X_test, Y_test)
        test_arr.append(test_dataset)
        
    return train_arr, test_arr

In [None]:
train_arr, test_arr = normal_cv_to_torch_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

Define the model

In [None]:
# Define the model
class MLP(nn.Module):
    def __init__(self, num_features, hidden_layers_size, dropout):
        super(MLP, self).__init__()

        # Initialize layers & batchnorm
        last_layer = num_features
        self.layers = nn.ModuleList()
        for current_layer in hidden_layers_size:
            self.layers.append(nn.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.layers.append(nn.Linear(last_layer, 1))

        # Initialize activation
        self.activation = nn.ReLU()

        # Initialze dropout
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, x):
        for inx, layer in enumerate(self.layers):
            if inx == len(self.layers) - 1:
                x = layer(x)
            else:
                x = layer(x)
                x = self.activation(x)
                x = self.dropout(x)
        return x

    def reset(self):
        for layer in self.layers:
            layer.reset_parameters()

Train model with CV and evaluate

In [None]:
# Separate function for train & eval step
def train_mlp(model, criterion, optimizer, train_dataloader, num_epochs):
    model.train()
    for _ in tqdm(range(num_epochs)):
        for (inputs, targets) in train_dataloader:
            # Load to device
            inputs, targets= inputs.to(device), targets.to(device)
            # Forward step
            outputs = model(inputs)
            # get error
            error = criterion(outputs, targets)
            # Zero out the past gradient
            optimizer.zero_grad()
            # Backprop
            error.backward()
            # Gradient Descent
            optimizer.step()

def eval_mlp(model, test_dataloader):
    outputs_all = np.zeros(0)
    targets_all = np.zeros(0)
    model.eval()
    with torch.no_grad():
        for _, (inputs, targets) in enumerate(test_dataloader):
            # Load to device
            inputs = inputs.to(device)
            # Forward step
            outputs = model(inputs).detach().cpu().numpy().flatten()
            # Load to overall Y_test, Y_pred to calculate pearson score later
            outputs_all = np.concatenate([outputs_all, outputs])
            targets_all = np.concatenate([targets_all, targets])
    return pearson_score(targets_all, outputs_all)

In [None]:
def train_eval_cv_torch(model, lr, cv, train_arr, test_arr, batch_size, num_epochs):
    cv_pearson = 0
    for i in range(cv):
        # Get the dataloader
        train_dataset = train_arr[i]
        train_dataloader = DataLoader(train_dataset, batch_size = batch_size, num_workers=0)
        test_dataset = test_arr[i]
        test_dataloader = DataLoader(test_dataset, batch_size = batch_size, num_workers=0)

        # Reinitialize the model
        model.reset()
        model.to(device)

        # Initialize the loss function
        criterion = nn.MSELoss()

        # Reinitialize the optimizer
        optimizer = optim.Adam(model.parameters(), lr = lr)

        # Train the model
        train_mlp(model, criterion, optimizer, train_dataloader, num_epochs)

        # Test the model
        pearson = eval_mlp(model, test_dataloader)
        print(pearson)
        cv_pearson += pearson
    return cv_pearson / cv

In [None]:
# Training process of the default config
hidden_layers_size = [16, 8, 4]
lr = 0.001
batch_size = 60
num_epochs = 10

mlpr = MLP(len(best_features), hidden_layers_size=hidden_layers_size, dropout = 0.3)

train_eval_cv_torch(mlpr, lr, default_cv, train_arr, test_arr, batch_size, num_epochs)

#### Sixth Iteration: Change this into a classification problem

In [None]:
original_features = [f for f in train_df.columns if "X" in f]

X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_classification(train_df, original_features)

In [None]:
best_params_xgboost_classification = optimize_xgboost(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    objective_xgboost_classification
)

In [None]:
best_params_lightgbm_classification = optimize_lightgbm(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_classification_study",
    objective_lightgbm_classification
)

#### Seventh Iteration: Search for the best way to train

In [None]:
def search_training_scheme(model, train_df, cv = default_cv, features = None):
    folds_trial = [
        # level 1
        [[0, 1, 2, 3]], 
        [[0, 1]], [[1, 2]], [[2, 3]],
        [[0]], [[1]], [[2]], [[3]],
        [[0, 1], [1, 2], [2, 3]],
        [[0, 1], [2, 3]],
        [[0], [1], [2], [3]],
        # level 2
        [[0, 1, 2, 3], [0, 1]],
        [[0, 1, 2, 3], [1, 2]],
        [[0, 1, 2, 3], [2, 3]],
        [[0, 1, 2, 3], [0, 1], [2, 3]],
        [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3]],
        [[0, 1, 2, 3], [0], [1], [2], [3]],
        [[0, 1], [2, 3], [0], [1], [2], [3]],
        [[0, 1], [1, 2], [2, 3], [0], [1], [2], [3]],
        # level 3
        [[0, 1, 2, 3], [0, 1], [0]],
        [[0, 1, 2, 3], [2, 3], [3]],
        [[0, 1, 2, 3], [0, 1], [2, 3], [0], [1], [2], [3]],
        [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3], [0], [1], [2], [3]],
    ]

    if features is not None:
        train_df = train_df[features + ["timestamp", "label"]]

    for folds in folds_trial:
        print(f"Current folds list is {folds}")
        model_lst = [deepcopy(model)] * len(folds)
        cv_pearson = []
        for i in range(cv):
            train_month = list(range(3 + i, 7 + i))
            test_month = list(map(lambda x: x % 12 if x > 12 else x, list(range(8 + i, 12 + i))))
            test = train_df[train_df["timestamp"].dt.month.isin(test_month)].reset_index().drop("index", axis = 1)
            X_test, Y_test = test.drop(["timestamp", "label"], axis = 1), test["label"]
            Y_pred = np.zeros(Y_test.shape[0])
            for j in range(len(folds)):
                fold = folds[j]
                model = model_lst[j]
                train_month_curr = [train_month[f] for f in fold]
                train_curr = train_df[train_df["timestamp"].dt.month.isin(train_month_curr)].reset_index().drop("index", axis = 1)
                X_train, Y_train = train_curr.drop(["timestamp", "label"], axis = 1), train_curr["label"]
                model.fit(X_train, Y_train)
                Y_pred += model.predict(X_test)
            Y_pred /= len(folds)
            cv_pearson.append(pearson_score(Y_test, Y_pred))
            print(f"Finish fold {i} with score: {pearson_score(Y_test, Y_pred)}")
        print(f"Finish trial with mean score: {np.mean(np.array(cv_pearson))}")
        print(f"Finish trial with std score: {np.std(np.array(cv_pearson))}")
        print()

In [None]:
train_added_df = pd.concat([train_df, popular_features_train], axis = 1)

params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost_popular_feature = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_xgboost_popular_feature:
    params[p] = best_params_xgboost_popular_feature[p]

xgbr = XGBRegressor(**params)
search_training_scheme(xgbr, train_added_df)
# Notable
# [[0, 1, 2, 3]]
# [[0, 1, 2, 3], [1, 2]]
# [[0, 1, 2, 3], [0, 1], [2, 3]]
# [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3]]
# [[0, 1, 2, 3], [0], [1], [2], [3]] 
# [[0, 1, 2, 3], [0, 1], [2, 3], [0], [1], [2], [3]]
# [[0, 1, 2, 3], [0, 1], [1, 2], [2, 3], [0], [1], [2], [3]]

In [None]:
train_added_df = pd.concat([train_df, popular_features_train], axis = 1)

params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state
}
best_params_lightgbm_popular_feature = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_popular_feature_study")
for p in best_params_lightgbm_popular_feature:
    params[p] = best_params_lightgbm_popular_feature[p]

lgbr = LGBMRegressor(**params)
search_training_scheme(lgbr, train_added_df)
# [[0, 1, 2, 3]]
# [[0, 1, 2, 3], [0, 1]]
# [[0, 1, 2, 3], [0, 1], [2, 3]]
# [[0, 1, 2, 3], [0], [1], [2], [3]]
# [[0, 1, 2, 3], [0, 1], [0]]
# [[0, 1, 2, 3], [0, 1], [2, 3], [0], [1], [2], [3]]

#### Eighth Iteration: rewrite the code for MLP training using MLX

Create the data for training + custom batch iteration

In [17]:
# Create the CV data, seems to be better with only anonymized features
# best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531', 
#                  'X385', 'X23', 'X284', 'X465', 'X331', 'X95', 'X285', 'X31', 'X169', 'X137', 
#                 'X379', 'X186', 'X852', 'X302', 'X868', 'X89', 'X219', 'X855', 'X540', 'X301'] 
                #  'X198', 'X373', 'X524', 'X291', 'X444', 'X279', 'X300', 'X181', 'X367', 'X538', 
                #  'X288', 'X226', 'X857', 'X860', 'X205', 'X298', 'X272', 'X472', 'X28', 'X754']
                # ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                # [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
# best_features = list(set(best_features))
# best_features = [col for col in train_df.columns if "X" in col]
# train_added_df = pd.concat([train_df, popular_features_train], axis=1)
# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

In [18]:
# Extra code to "reduce" from float64 to float32
def float64_to_float32(data):
    if isinstance(data, pd.DataFrame):
        for col in data.columns:
            data[col] = data[col].astype("float32")
    elif isinstance(data, pd.Series):
        data = data.astype("float32")
    return data

# for i in range(default_cv):
#     X_train_arr[i] = float64_to_float32(X_train_arr[i])
#     X_test_arr[i] = float64_to_float32(X_test_arr[i])
#     Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
#     Y_test_arr[i] = float64_to_float32(Y_test_arr[i])

In [19]:
def normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, cv=default_cv):
    for i in range(cv):
        # Normalize forst
        scaler = StandardScaler()
        X_train_arr[i] = scaler.fit_transform(X_train_arr[i].values)
        X_test_arr[i] = scaler.transform(X_test_arr[i].values)

        # Convert to mlx format
        X_train_arr[i] = mx.array(X_train_arr[i])
        X_test_arr[i] = mx.array(X_test_arr[i])
        Y_train_arr[i] = mx.array(Y_train_arr[i].values)
        Y_test_arr[i] = mx.array(Y_test_arr[i].values)
        
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

# X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

Define the model

In [20]:
# Define the model
# We do not use the reset method this time so you have to create the model at each fold
class MLPMLX(nnmx.Module):
    def __init__(self, num_features, hidden_layers_size, dropout):
        super().__init__()

        # Initialize layers & batchnorm
        last_layer = num_features
        self.layers = []
        for current_layer in hidden_layers_size:
            self.layers.append(nnmx.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.layers.append(nnmx.Linear(last_layer, 1))

        # Initialize activation
        self.activation = nnmx.ReLU()

        # Initialze dropout
        self.dropout = nnmx.Dropout(p = dropout)

    def __call__(self, x):
        for inx, layer in enumerate(self.layers):
            if inx == len(self.layers) - 1:
                x = layer(x)
            else:
                x = layer(x)
                x = self.activation(x)
                x = self.dropout(x)
        return x

Train model with CV and evaluate

In [21]:
# Custom function for batch iteration
def batch_iterate(batch_size, X, Y, shuffle = True):
    for i in range(0, Y.size, batch_size):
        X_curr = X[i: min(i + batch_size, Y.size), :]
        Y_curr = Y[i: min(i + batch_size, Y.size)]
        if shuffle:
            inx_lst = mx.random.permutation(batch_size)
            X_curr = X_curr[inx_lst, :]
            Y_curr = Y_curr[inx_lst]
        yield X_curr, Y_curr

In [22]:
# Separate function for train & eval step
def train_mlp_mlx(model, loss_and_grad_fn, optimizer, X_train, Y_train, batch_size, num_epochs):
    model.train()
    for _ in tqdm(range(num_epochs)):
        for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
            _, grads = loss_and_grad_fn(model, inputs, targets)
            # Update the optimizer state and model parameters in a single call
            optimizer.update(model, grads)
            # Force a graph evaluation
            mx.eval(model.parameters(), optimizer.state)

def eval_mlp_mlx(model, X_test, Y_test, batch_size):
    outputs_all = np.zeros(0)
    targets_all = np.zeros(0)
    model.eval()
    for (inputs, targets) in batch_iterate(batch_size, X_test, Y_test, shuffle=False):
        outputs = model(inputs).reshape(-1)
        # convert back to numpy
        outputs, targets = np.array(outputs), np.array(targets)
        # Load to overall Y_test, Y_pred to calculate pearson score later
        outputs_all = np.concatenate([outputs_all, outputs])
        targets_all = np.concatenate([targets_all, targets])
    return pearson_score(targets_all, outputs_all)

In [23]:
def train_eval_cv_mlx(num_features, hidden_layers_size, dropout, lr, cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size, num_epochs):
    cv_pearson = 0
    for _, (X_train, X_test, Y_train, Y_test) in enumerate(zip(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)):
        # initialize the model
        mx.random.seed(default_random_state)
        model = MLPMLX(num_features, hidden_layers_size, dropout)

        # Initialize the loss function
        def loss_fn(model, X, Y):
            Y_pred = model(X).reshape(-1)
            return mx.mean(nnmx.losses.mse_loss(Y_pred, Y))
        loss_and_grad_fn = nnmx.value_and_grad(model, loss_fn)

        # Reinitialize the optimizer
        optimizer = optimmx.Adam(learning_rate = lr)

        # Train the model
        train_mlp_mlx(model, loss_and_grad_fn, optimizer, X_train, Y_train, batch_size, num_epochs)

        # Test the model
        pearson = eval_mlp_mlx(model, X_test, Y_test, batch_size)
        print(pearson)
        if pearson == -1:
            return pearson
        cv_pearson += pearson
    return cv_pearson / cv

Conduct training and evaluating process of the model

In [24]:
# # Training process of the default config
# num_features = len(best_features)
# hidden_layers_size = [8, 8, 8]
# dropout = 0.2
# lr = 0.001
# batch_size = 180
# num_epochs = 10

# train_eval_cv_mlx(num_features, hidden_layers_size, dropout, lr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size, num_epochs)

Conduct Bayesian Optimization on this

In [25]:
default_num_layers = 2

In [26]:
def objective_mlp_mlx(trial):
    # First initialize the parameters
    num_features = len(best_features)
    num_layers = default_num_layers
    log_2_hidden_layers_size = []
    for i in range(num_layers):
        if len(log_2_hidden_layers_size) == 0:
            log_2_hidden_layers_size.append(trial.suggest_int(f"log2_hidden_layer_{i}", 2, 6))
        else:
            log_2_hidden_layers_size.append(trial.suggest_int(f"log2_hidden_layer_{i}", 2, log_2_hidden_layers_size[-1]))
    hidden_layers_size = [2**l for l in log_2_hidden_layers_size]
    dropout = trial.suggest_float("dropout", 0.2, 0.7)
    lr = trial.suggest_float("lr", 0.0001, 0.01, log=True)
    batch_size = trial.suggest_categorical("batch_size", [30, 60, 120, 180, 360, 720])
    num_epochs = trial.suggest_categorical("num_epochs", [10, 20, 30, 40, 50])
    
    # Conduct training based on those parameters
    return train_eval_cv_mlx(num_features, hidden_layers_size, dropout, lr, default_cv, X_train_arr, X_test_arr, Y_train_arr, Y_test_arr, batch_size, num_epochs)

In [27]:
def optimize_mlp_mlx(study_name, storage_name, objective_function=objective_mlp_mlx, n_trials = 100, n_jobs = 1):
    print("Conduct hyperparam opt for MLP")
    study = optuna.create_study(
        study_name = study_name,
        direction ='maximize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = TPESampler(seed = 101, n_startup_trials=10),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=n_jobs)
    print('Best hyperparameters:', study.best_params)
    print('Best Pearson score:', study.best_value)
    return study.best_params

In [28]:
# Create the CV data, seems to be better with only anonymized features
best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531', 
                 'X385', 'X23', 'X284', 'X465', 'X331', 'X95', 'X285', 'X31', 'X169', 'X137',]
                # 'X379', 'X186', 'X852', 'X302', 'X868', 'X89', 'X219', 'X855', 'X540', 'X301',] 
                #  'X198', 'X373', 'X524', 'X291', 'X444', 'X279', 'X300', 'X181', 'X367', 'X538', 
                #  'X288', 'X226', 'X857', 'X860', 'X205', 'X298', 'X272', 'X472', 'X28', 'X754']
                # ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                # [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
best_features = list(set(best_features))
# best_features = [col for col in train_df.columns if "X" in col]
train_added_df = pd.concat([train_df, popular_features_train], axis=1)
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv(train_added_df, best_features)

# Convert to float32
for i in range(default_cv):
    X_train_arr[i] = float64_to_float32(X_train_arr[i])
    X_test_arr[i] = float64_to_float32(X_test_arr[i])
    Y_train_arr[i] = float64_to_float32(Y_train_arr[i])
    Y_test_arr[i] = float64_to_float32(Y_test_arr[i])

# Convert to MLX
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = normal_cv_to_mlx_cv(X_train_arr, X_test_arr, Y_train_arr, Y_test_arr)

[3, 4, 5, 6] [8, 9, 10, 11]
[4, 5, 6, 7] [9, 10, 11, 12]
[5, 6, 7, 8] [10, 11, 12, 1]
[6, 7, 8, 9] [11, 12, 1, 2]


In [None]:
optimize_mlp_mlx(
    f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{default_num_layers}_common_truncated_{len(best_features)}_study",
    f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{default_num_layers}_common_truncated_{len(best_features)}_study"
)

[I 2025-07-03 18:58:23,329] Using an existing study with name 'mlp_mlx_2_4_101_2_common_truncated_20_study' instead of creating a new one.


Conduct hyperparam opt for MLP


100%|██████████| 10/10 [00:01<00:00,  6.09it/s]


0.11162511798178269


100%|██████████| 10/10 [00:01<00:00,  6.14it/s]


0.12526485043391805


100%|██████████| 10/10 [00:01<00:00,  6.16it/s]


0.12706050126441645


100%|██████████| 10/10 [00:01<00:00,  6.40it/s]
[I 2025-07-03 18:58:30,149] Trial 103 finished with value: 0.12615410501187177 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.3213794354607382, 'lr': 0.0008879169839240599, 'batch_size': 720, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.14066595036736987


100%|██████████| 10/10 [00:01<00:00,  6.42it/s]


0.11067548445530152


100%|██████████| 10/10 [00:01<00:00,  6.27it/s]


0.12551159186148358


100%|██████████| 10/10 [00:01<00:00,  6.22it/s]


0.1278969916350272


100%|██████████| 10/10 [00:01<00:00,  5.84it/s]
[I 2025-07-03 18:58:36,955] Trial 104 finished with value: 0.12626034722820048 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.30638174607845275, 'lr': 0.0007960035679812472, 'batch_size': 720, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.14095732096098962


100%|██████████| 10/10 [00:01<00:00,  6.26it/s]


0.11202821362870666


100%|██████████| 10/10 [00:01<00:00,  6.44it/s]


0.12457702013608407


100%|██████████| 10/10 [00:01<00:00,  6.42it/s]


0.12652015797788446


100%|██████████| 10/10 [00:01<00:00,  6.43it/s]
[I 2025-07-03 18:58:43,542] Trial 105 finished with value: 0.125352849390945 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.258741786251158, 'lr': 0.0008003005582561433, 'batch_size': 720, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.13828600582110478


100%|██████████| 10/10 [00:03<00:00,  3.19it/s]


0.1105973714084766


100%|██████████| 10/10 [00:03<00:00,  3.24it/s]


0.12471887075971488


100%|██████████| 10/10 [00:03<00:00,  3.15it/s]


0.1200168533888543


100%|██████████| 10/10 [00:03<00:00,  3.24it/s]
[I 2025-07-03 18:58:56,575] Trial 106 finished with value: 0.12535702322614672 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.43154341184579237, 'lr': 0.0006498228820818782, 'batch_size': 360, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.14609499734754117


100%|██████████| 10/10 [00:06<00:00,  1.64it/s]


0.11177840941858021


100%|██████████| 10/10 [00:06<00:00,  1.57it/s]


0.12775636661647793


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.12933478882385346


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]
[I 2025-07-03 18:59:22,821] Trial 107 finished with value: 0.12635974138712847 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.28296460032248993, 'lr': 0.0004814121952192467, 'batch_size': 180, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.13656940068960222


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.11858825178432846


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


0.1270893996150844


100%|██████████| 10/10 [00:06<00:00,  1.57it/s]


0.10827312873219509


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]
[I 2025-07-03 18:59:48,928] Trial 108 finished with value: 0.11380005286306735 and parameters: {'log2_hidden_layer_0': 2, 'log2_hidden_layer_1': 2, 'dropout': 0.2283629357380477, 'lr': 0.0005021021615336415, 'batch_size': 180, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.1012494313206615


100%|██████████| 30/30 [00:18<00:00,  1.61it/s]


0.1359546408391863


100%|██████████| 30/30 [00:18<00:00,  1.61it/s]


0.1467930071339389


100%|██████████| 30/30 [00:18<00:00,  1.60it/s]


0.10744853972295289


100%|██████████| 30/30 [00:18<00:00,  1.62it/s]
[I 2025-07-03 19:01:04,599] Trial 109 finished with value: 0.1237290736251391 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 3, 'dropout': 0.27170423969703583, 'lr': 0.00042289532808261756, 'batch_size': 180, 'num_epochs': 30}. Best is trial 33 with value: 0.12910364491215776.


0.10472010680447834


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.1166702124717527


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.1277252410637023


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.1263553943711715


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]
[I 2025-07-03 19:01:30,695] Trial 110 finished with value: 0.12644783321190625 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.30183783544291853, 'lr': 0.0005937806065629142, 'batch_size': 180, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.13504048494099852


100%|██████████| 40/40 [00:24<00:00,  1.61it/s]


0.13061733713133367


100%|██████████| 40/40 [00:24<00:00,  1.61it/s]


0.1313634981406186


100%|██████████| 40/40 [00:25<00:00,  1.60it/s]


0.1067203970011954


100%|██████████| 40/40 [00:24<00:00,  1.61it/s]
[I 2025-07-03 19:03:11,389] Trial 111 finished with value: 0.11923787632710312 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.27972397683715544, 'lr': 0.0005527295535764759, 'batch_size': 180, 'num_epochs': 40}. Best is trial 33 with value: 0.12910364491215776.


0.1082502730352648


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


0.11584040948149121


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.12753204089536516


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.12617590361220335


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]
[I 2025-07-03 19:03:37,282] Trial 112 finished with value: 0.12617551278610056 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.29350247469073054, 'lr': 0.0005966820083961509, 'batch_size': 180, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.1351536971553426


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


0.12162592305644994


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


0.12703196553928814


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.12324896013273169


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]
[I 2025-07-03 19:04:03,340] Trial 113 finished with value: 0.1260077023462088 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.28827017736893695, 'lr': 0.0007525623569887625, 'batch_size': 180, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.13212396065636547


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


0.1115303953322806


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.12756581210077447


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.12812111066136156


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]
[I 2025-07-03 19:04:29,348] Trial 114 finished with value: 0.125512725092226 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.2498014205486528, 'lr': 0.0005089357099607268, 'batch_size': 180, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.1348335822744874


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


0.11239217616861329


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.12705381164825585


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


0.12963926649656513


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]
[I 2025-07-03 19:04:55,357] Trial 115 finished with value: 0.1265812415057963 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.3038882685904237, 'lr': 0.00047430315210669337, 'batch_size': 180, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.13723971170975088


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.10871596217500319


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.1249651136608981


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.1250309316703131


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]
[I 2025-07-03 19:05:21,318] Trial 116 finished with value: 0.1248692271933575 and parameters: {'log2_hidden_layer_0': 3, 'log2_hidden_layer_1': 2, 'dropout': 0.32707214001001245, 'lr': 0.0003524601997862773, 'batch_size': 180, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.14076490126721555


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.11870384620847699


100%|██████████| 10/10 [00:06<00:00,  1.64it/s]


0.14698982940811242


100%|██████████| 10/10 [00:06<00:00,  1.67it/s]


0.11864096413822804


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]
[I 2025-07-03 19:05:46,583] Trial 117 finished with value: 0.12822363975683893 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.3308609644189722, 'lr': 0.00039181265253887917, 'batch_size': 180, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.12855991927253826


100%|██████████| 20/20 [00:11<00:00,  1.68it/s]


0.11903157277375725


100%|██████████| 20/20 [00:11<00:00,  1.68it/s]


0.14375339569104312


100%|██████████| 20/20 [00:12<00:00,  1.65it/s]


0.09817091000363495


100%|██████████| 20/20 [00:12<00:00,  1.61it/s]
[I 2025-07-03 19:06:35,976] Trial 118 finished with value: 0.11938629673937678 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.2862023017767736, 'lr': 0.0004708990807863045, 'batch_size': 180, 'num_epochs': 20}. Best is trial 33 with value: 0.12910364491215776.


0.11658930848907181


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


0.1317814930894874


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]


0.14850938602592914


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]


0.12028682000579322


100%|██████████| 10/10 [00:05<00:00,  1.70it/s]
[I 2025-07-03 19:07:01,362] Trial 119 finished with value: 0.12435835291916006 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 4, 'dropout': 0.30459681421090384, 'lr': 0.00040708793362585673, 'batch_size': 180, 'num_epochs': 10}. Best is trial 33 with value: 0.12910364491215776.


0.09685571255543052


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]


0.12245406937161797


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.1465194466000718


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.13393878557478958


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]
[I 2025-07-03 19:07:26,997] Trial 120 finished with value: 0.13249689924331817 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5103560161329487, 'lr': 0.00028522350535275116, 'batch_size': 180, 'num_epochs': 10}. Best is trial 120 with value: 0.13249689924331817.


0.12707529542679333


100%|██████████| 50/50 [00:29<00:00,  1.68it/s]


0.11744617103101361


100%|██████████| 50/50 [00:30<00:00,  1.63it/s]


0.14262808293215362


100%|██████████| 50/50 [00:31<00:00,  1.60it/s]


0.10082309058062869


100%|██████████| 50/50 [00:30<00:00,  1.62it/s]
[I 2025-07-03 19:09:30,588] Trial 121 finished with value: 0.11865126736211261 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.512195272543047, 'lr': 0.0002400624761339491, 'batch_size': 180, 'num_epochs': 50}. Best is trial 120 with value: 0.13249689924331817.


0.11370772490465449


100%|██████████| 10/10 [00:06<00:00,  1.64it/s]


0.11477762978798801


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


0.13948354366612445


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]


0.12173283112634353


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]
[I 2025-07-03 19:09:56,089] Trial 122 finished with value: 0.12533434150777645 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.311619133062661, 'lr': 0.0002912542558892033, 'batch_size': 180, 'num_epochs': 10}. Best is trial 120 with value: 0.13249689924331817.


0.12534336145064984


100%|██████████| 10/10 [00:05<00:00,  1.69it/s]


0.12223746573497285


100%|██████████| 10/10 [00:06<00:00,  1.64it/s]


0.14780579617881456


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


0.1233513618994491


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]
[I 2025-07-03 19:10:21,608] Trial 123 finished with value: 0.1304786232303066 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.4918175678751657, 'lr': 0.00032541959424006576, 'batch_size': 180, 'num_epochs': 10}. Best is trial 120 with value: 0.13249689924331817.


0.12851986910798988


100%|██████████| 10/10 [00:06<00:00,  1.65it/s]


0.12356241311388279


100%|██████████| 10/10 [00:05<00:00,  1.69it/s]


0.14915679139870744


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]


0.11635332807764272


100%|██████████| 10/10 [00:05<00:00,  1.69it/s]
[I 2025-07-03 19:10:46,411] Trial 124 finished with value: 0.12981207011851537 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.4791705501229536, 'lr': 0.0003722944959756091, 'batch_size': 180, 'num_epochs': 10}. Best is trial 120 with value: 0.13249689924331817.


0.1301757478838285


100%|██████████| 10/10 [00:05<00:00,  1.67it/s]


0.12375275842187286


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]


0.14912049249782988


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.11539883091254735


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]
[I 2025-07-03 19:11:11,927] Trial 125 finished with value: 0.12944697943590153 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.4782027638471424, 'lr': 0.0003525735143756651, 'batch_size': 180, 'num_epochs': 10}. Best is trial 120 with value: 0.13249689924331817.


0.1295158359113561


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


0.12424762142697633


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


0.14967872308412883


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.13658710369184446


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]
[I 2025-07-03 19:11:37,726] Trial 126 finished with value: 0.13456006656930575 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5290109662764044, 'lr': 0.0003770996700328954, 'batch_size': 180, 'num_epochs': 10}. Best is trial 126 with value: 0.13456006656930575.


0.12772681807427336


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


0.12427325961903238


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]


0.14923719755286377


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]


0.13711075595236993


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]
[I 2025-07-03 19:12:03,389] Trial 127 finished with value: 0.1345180193067368 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5327667817507495, 'lr': 0.0003762618661789324, 'batch_size': 180, 'num_epochs': 10}. Best is trial 126 with value: 0.13456006656930575.


0.12745086410268106


100%|██████████| 10/10 [00:06<00:00,  1.64it/s]


0.1245066840174861


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


0.14813093962360113


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.13781981500137813


100%|██████████| 10/10 [00:06<00:00,  1.66it/s]
[I 2025-07-03 19:12:28,963] Trial 128 finished with value: 0.1341682003593426 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5335952125730387, 'lr': 0.00031005056153064715, 'batch_size': 180, 'num_epochs': 10}. Best is trial 126 with value: 0.13456006656930575.


0.12621536279490503


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]


0.12419545355451513


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


0.14831610827070746


100%|██████████| 10/10 [00:06<00:00,  1.65it/s]


0.13594435459828333


100%|██████████| 10/10 [00:06<00:00,  1.63it/s]
[I 2025-07-03 19:12:54,600] Trial 129 finished with value: 0.13407124907877582 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5178911892003206, 'lr': 0.0003143360343518444, 'batch_size': 180, 'num_epochs': 10}. Best is trial 126 with value: 0.13456006656930575.


0.12782907989159745


100%|██████████| 10/10 [00:06<00:00,  1.64it/s]


0.12464139205535954


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]


0.14829423317373913


100%|██████████| 10/10 [00:06<00:00,  1.66it/s]


0.13774366383517006


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]
[I 2025-07-03 19:13:19,578] Trial 130 finished with value: 0.1342386482980236 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.533212846118693, 'lr': 0.0003120268443449961, 'batch_size': 180, 'num_epochs': 10}. Best is trial 126 with value: 0.13456006656930575.


0.12627530412782567


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]


0.12544400885033463


100%|██████████| 10/10 [00:06<00:00,  1.66it/s]


0.14808157570800376


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]


0.13812753262638833


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]
[I 2025-07-03 19:13:44,426] Trial 131 finished with value: 0.13458356506895813 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5423738392064327, 'lr': 0.000325110697900159, 'batch_size': 180, 'num_epochs': 10}. Best is trial 131 with value: 0.13458356506895813.


0.12668114309110579


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]


0.12445572996042494


100%|██████████| 10/10 [00:05<00:00,  1.69it/s]


0.14851409994994302


100%|██████████| 10/10 [00:06<00:00,  1.66it/s]


0.13821140069692536


100%|██████████| 10/10 [00:05<00:00,  1.69it/s]
[I 2025-07-03 19:14:09,163] Trial 132 finished with value: 0.13412278396574495 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5361239368866518, 'lr': 0.0003067571796475689, 'batch_size': 180, 'num_epochs': 10}. Best is trial 131 with value: 0.13458356506895813.


0.12530990525568658


100%|██████████| 10/10 [00:05<00:00,  1.67it/s]


0.1265419196628794


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]


0.14814465441317104


100%|██████████| 10/10 [00:06<00:00,  1.66it/s]


0.13747884711464795


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]
[I 2025-07-03 19:14:34,021] Trial 133 finished with value: 0.13485496938684774 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5529116959865439, 'lr': 0.000318306978640413, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12725445635669252


100%|██████████| 10/10 [00:05<00:00,  1.69it/s]


0.1264147531091373


100%|██████████| 10/10 [00:05<00:00,  1.67it/s]


0.1480834717857444


100%|██████████| 10/10 [00:05<00:00,  1.67it/s]


0.1376833703761802


100%|██████████| 10/10 [00:05<00:00,  1.68it/s]
[I 2025-07-03 19:14:58,825] Trial 134 finished with value: 0.1348309066514331 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5544753242814541, 'lr': 0.00033421206947488035, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12714203133467053


100%|██████████| 10/10 [00:05<00:00,  1.67it/s]


0.12582861597206527


100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


0.14796364953781008


100%|██████████| 10/10 [00:06<00:00,  1.53it/s]


0.13818975127465127


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]
[I 2025-07-03 19:15:24,905] Trial 135 finished with value: 0.1347985975302991 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5549666512987825, 'lr': 0.0003070206910146805, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12721237333666974


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.12393974817390752


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.1466182360558971


100%|██████████| 10/10 [00:06<00:00,  1.57it/s]


0.13680159806096878


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]
[I 2025-07-03 19:15:51,139] Trial 136 finished with value: 0.1331130738304375 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5545676675031711, 'lr': 0.00025671697702026996, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12509271303097658


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.1243645004799624


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.1471716690520706


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.13616218067201324


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]
[I 2025-07-03 19:16:17,405] Trial 137 finished with value: 0.13327411355896357 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5501537346142693, 'lr': 0.00025309391029951195, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12539810403180798


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


0.12448421528477203


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.1460351900638283


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.13740324105398527


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]
[I 2025-07-03 19:16:43,648] Trial 138 finished with value: 0.13305403197259397 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5568278369432818, 'lr': 0.0002504619605107871, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12429348148779022


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.12416846180933458


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.14570819919440967


100%|██████████| 10/10 [00:06<00:00,  1.57it/s]


0.1370715779978037


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]
[I 2025-07-03 19:17:10,062] Trial 139 finished with value: 0.1328399670202301 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5575819384104589, 'lr': 0.0002419432989039061, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12441162907937238


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.12388015243378446


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


0.1461917918958131


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.1362831200012334


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]
[I 2025-07-03 19:17:36,312] Trial 140 finished with value: 0.13266617458577298 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5624621374117386, 'lr': 0.0002402119764255689, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12430963401226094


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.12308268632283871


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.14408868385623466


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.13728139311289922


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]
[I 2025-07-03 19:18:02,647] Trial 141 finished with value: 0.1322320833723296 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5743591675630596, 'lr': 0.00021123011737736938, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12447557019734584


100%|██████████| 10/10 [00:06<00:00,  1.57it/s]


0.12364380443318053


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.14710350797959892


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.13667412044693003


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]
[I 2025-07-03 19:18:29,026] Trial 142 finished with value: 0.1328062064625307 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.5545937177751636, 'lr': 0.00024021033274569843, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12380339299041329


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.12409181667282103


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


0.14748338108297732


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


0.13670405833207883


100%|██████████| 10/10 [00:06<00:00,  1.56it/s]
[I 2025-07-03 19:18:55,516] Trial 143 finished with value: 0.1333195702924733 and parameters: {'log2_hidden_layer_0': 4, 'log2_hidden_layer_1': 2, 'dropout': 0.552223884900867, 'lr': 0.0002607499115637671, 'batch_size': 180, 'num_epochs': 10}. Best is trial 133 with value: 0.13485496938684774.


0.12499902508201602


  0%|          | 0/10 [00:00<?, ?it/s]

#### Nineth Iteration: AE + MLP instead of GBDT feature selection + MLP