In [2]:
import sys
import os

from bayes_opt import BayesianOptimization, SequentialDomainReductionTransformer
from bayes_opt.event import Events
from bayes_opt.logger import JSONLogger
from catboost import CatBoostRegressor, Pool
from joblib import delayed, Parallel
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore")

# Make the project-local dataset loader importable.
# The membership test must look at the exact directory that gets appended:
# checking `module_path` (the repo root) instead would never match and would
# add a duplicate sys.path entry every time this cell is re-run.
module_path = os.path.abspath(os.path.join('../../../'))
loader_path = module_path + "/scripts/yk4r2-dataset-loader/"
if loader_path not in sys.path:
    sys.path.append(loader_path)

from dataset_utils import DataSetsLoader, realized_volatility, RMSPE, unique_counter

In [3]:
# Aggregations applied to every order-book feature.
agg_functions = [np.sum, np.mean, np.std]

# Column -> list of aggregation callables for the order-book data.
# The two log-return columns additionally get realized volatility.
book_features = dict(
    wap1=agg_functions,
    wap2=agg_functions,
    log_return1=[*agg_functions, realized_volatility],
    log_return2=[*agg_functions, realized_volatility],
    wap_balance=agg_functions,
    price_spread1=agg_functions,
    price_spread2=agg_functions,
    bid_spread=agg_functions,
    ask_spread=agg_functions,
    total_volume=agg_functions,
    volume_imbalance=agg_functions,
)

# Column -> list of aggregation callables for the trade data.
trade_features = dict(
    log_return=[realized_volatility],
    seconds_in_bucket=[unique_counter],
    size=[np.sum],
    order_count=[np.mean],
)

In [4]:
%%time
# Build the project dataset loader from the data directory and the two
# feature -> aggregation mappings defined above.
data = DataSetsLoader('../../../data/', book_features, trade_features)
# Load/compute the datasets. Later cells read `data.train` and `data.test`,
# so this call presumably populates those attributes — TODO confirm in
# dataset_utils. The captured output shows it running under joblib.
data.get_datasets()

Our training set has 428932 rows


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed:  1.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


CPU times: user 7.36 s, sys: 3.78 s, total: 11.1 s
Wall time: 1min 53s


In [5]:
# Baseline CatBoost settings. Any key that also appears in `pbounds` is
# overridden by the value proposed by the Bayesian search for that evaluation.
optimized_params = {
    'iterations': 1000,
    'depth': 8,
    'learning_rate': 1,
    'loss_function': 'RMSE',
    'use_best_model': True,
    'random_seed': 29,
    'bagging_temperature': 0.95,
    'task_type': "GPU",
    'verbose': 0,
}

In [1]:
def get_model(**params):
    """Return a new, unfitted CatBoostRegressor built from the given keyword arguments."""
    model = CatBoostRegressor(**params)
    return model

def foldwise_prediction(fold, trn_ind, val_ind, params, oof_predictions):
    """Train on one CV fold and store its validation predictions.

    Reads the notebook globals `x`, `y` and `optimized_params`.

    Parameters
    ----------
    fold : int
        Fold number (unused; kept for interface compatibility).
    trn_ind, val_ind : array-like of int
        Positional row indices of the training and validation parts.
    params : dict
        Hyperparameters proposed by the search; they override the matching
        entries of `optimized_params` for this fit only.
    oof_predictions : numpy.ndarray
        Out-of-fold prediction buffer; positions `val_ind` are overwritten.

    Returns
    -------
    numpy.ndarray
        The same `oof_predictions` array, updated in place.
    """
    x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
    y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]

    # Weighting each sample by 1 / target^2 scales its squared error to a
    # squared relative error.
    train_pool = Pool(x_train, y_train, weight=1 / np.square(y_train))
    val_pool = Pool(x_val, y_val, weight=1 / np.square(y_val))

    # Merge into a fresh dict: updating `optimized_params` itself would leak the
    # last-tried search point into the shared baseline used by later calls.
    fold_params = {**optimized_params, **params}
    # The optimiser proposes floats; tree depth must be an integer.
    fold_params['depth'] = int(round(fold_params['depth']))

    model = get_model(**fold_params)
    model.fit(train_pool, eval_set=val_pool)

    oof_predictions[val_ind] = model.predict(val_pool)
    return oof_predictions

def validate(params):
    """Run 5-fold cross-validation and return out-of-fold predictions.

    Reads the notebook global `x`. The KFold split is shuffled with a fixed
    random_state, so every call uses the same folds.

    Parameters
    ----------
    params : dict
        Hyperparameters forwarded to `foldwise_prediction` for each fold.

    Returns
    -------
    numpy.ndarray
        One prediction per row of `x`, each produced by the model that did
        not see that row during training.
    """
    oof_predictions = np.zeros(x.shape[0])
    kfold = KFold(n_splits=5, random_state=66, shuffle=True)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
        oof_predictions = foldwise_prediction(fold, trn_ind, val_ind, params, oof_predictions)
    # Heartbeat emitted once per evaluated hyperparameter set.
    print('Work in progress.')
    return oof_predictions

def evaluate_model(**params):
    """Objective for the Bayesian search: negated summed RMSPE of the out-of-fold predictions.

    The score is negated because the optimiser maximises its target.
    """
    predictions = validate(params)
    return -np.sum(RMSPE(y.values, predictions))

In [42]:
# Stage 1 search space: (lower, upper) bound per hyperparameter.
pbounds = dict(
    learning_rate=(1e-4, 1e-1),
    depth=(6, 12),
    l2_leaf_reg=(1e-3, 1e2),
    random_strength=(1e-2, 10),
    bagging_temperature=(0, 10),
)

In [11]:
# Feature matrices: drop identifier/target columns and store stock_id as int.
x = (
    data.train
    .drop(columns=['row_id', 'target', 'time_id'])
    .astype({'stock_id': int})
)
y = data.train['target']
x_test = (
    data.test
    .drop(columns=['row_id', 'time_id'])
    .astype({'stock_id': int})
)

In [50]:
# Stage 1 of the hyperparameter search over the bounds in `pbounds`.
# Domain reduction is disabled; the transformer line is kept for reference.
# bounds_transformer = SequentialDomainReductionTransformer()
optimizer = BayesianOptimization(evaluate_model, pbounds, random_state=42)#, bounds_transformer=bounds_transformer)

# Log every optimisation step as JSON.
# NOTE(review): the `bayesian_search_logs/` directory presumably has to exist
# before the first step is written — confirm.
logger = JSONLogger(path="bayesian_search_logs/logs.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

# `evaluate_model` returns a negated error, so maximising it minimises the error.
# See the bayes_opt documentation for init_points / n_iter / kappa.
optimizer.maximize(init_points=2, n_iter=100, kappa=10)

# Best target/parameter pair found by this run.
optimizer.max
# Every evaluated point is also logged to ./bayesian_search_logs/logs.json

It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is working!
It is work

{'target': -61656.37255996844,
 'params': {'bagging_temperature': 0.0,
  'depth': 7.831029203164766,
  'l2_leaf_reg': 66.82288764247305,
  'learning_rate': 0.1,
  'random_strength': 4.46219815406808}}

In [51]:
# Re-display the best target/parameter pair held by the current `optimizer`.
optimizer.max

{'target': -61656.37255996844,
 'params': {'bagging_temperature': 0.0,
  'depth': 7.831029203164766,
  'l2_leaf_reg': 66.82288764247305,
  'learning_rate': 0.1,
  'random_strength': 4.46219815406808}}

In [19]:
# Baseline after the first search: values copied from that run's optimum,
# with depth rounded from 7.83 to 8.
optimized_params = dict(
    iterations=1000,
    bagging_temperature=0.0,
    depth=8,
    use_best_model=True,
    random_seed=29,
    l2_leaf_reg=66.82288764247305,
    learning_rate=0.1,
    random_strength=4.46219815406808,
    task_type='GPU',
    verbose=0,
)

In [20]:
# Stage 2 search space: (lower, upper) bound per hyperparameter.
pbounds = dict(
    depth=(8, 14),
    l2_leaf_reg=(4e1, 1e2),
    random_strength=(4.5, 10),
    min_data_in_leaf=(1, 1000),
)

In [35]:
def get_model(**params):
    """Return a new, unfitted CatBoostRegressor built from the given keyword arguments."""
    model = CatBoostRegressor(**params)
    return model

def foldwise_prediction(fold, trn_ind, val_ind, params, oof_predictions):
    """Train on one CV fold and store its validation predictions.

    Reads the notebook globals `x`, `y` and `optimized_params`.

    Parameters
    ----------
    fold : int
        Fold number (unused; kept for interface compatibility).
    trn_ind, val_ind : array-like of int
        Positional row indices of the training and validation parts.
    params : dict
        Hyperparameters proposed by the search; they override the matching
        entries of `optimized_params` for this fit only.
    oof_predictions : numpy.ndarray
        Out-of-fold prediction buffer; positions `val_ind` are overwritten.

    Returns
    -------
    numpy.ndarray
        The same `oof_predictions` array, updated in place.
    """
    x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
    y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]

    # Weighting each sample by 1 / target^2 scales its squared error to a
    # squared relative error.
    train_pool = Pool(x_train, y_train, weight=1 / np.square(y_train))
    val_pool = Pool(x_val, y_val, weight=1 / np.square(y_val))

    # Merge into a fresh dict: updating `optimized_params` itself would leak the
    # last-tried search point into the shared baseline used by later calls.
    fold_params = {**optimized_params, **params}

    # The optimiser proposes floats; these settings must be integers.
    # Only cast what is actually configured, so a search stage that does not
    # tune (or preset) one of them does not fail with a KeyError.
    for name in ('depth', 'min_data_in_leaf', 'border_count'):
        if name in fold_params:
            fold_params[name] = int(round(fold_params[name]))

    model = get_model(**fold_params)
    model.fit(train_pool, eval_set=val_pool)

    oof_predictions[val_ind] = model.predict(val_pool)
    return oof_predictions

def validate(params):
    """Run 5-fold cross-validation and return out-of-fold predictions.

    Reads the notebook global `x`. The KFold split is shuffled with a fixed
    random_state, so every call uses the same folds.

    Parameters
    ----------
    params : dict
        Hyperparameters forwarded to `foldwise_prediction` for each fold.

    Returns
    -------
    numpy.ndarray
        One prediction per row of `x`, each produced by the model that did
        not see that row during training.
    """
    oof_predictions = np.zeros(x.shape[0])
    kfold = KFold(n_splits=5, random_state=66, shuffle=True)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
        oof_predictions = foldwise_prediction(fold, trn_ind, val_ind, params, oof_predictions)
    # Heartbeat emitted once per evaluated hyperparameter set.
    print('Work in progress.')
    return oof_predictions

def evaluate_model(**params):
    """Objective for the Bayesian search: negated summed RMSPE of the out-of-fold predictions.

    The score is negated because the optimiser maximises its target.
    """
    predictions = validate(params)
    return -np.sum(RMSPE(y.values, predictions))

In [25]:
# Stage 2 of the hyperparameter search over the current `pbounds`.
# Domain reduction is disabled; the transformer line is kept for reference.
# bounds_transformer = SequentialDomainReductionTransformer()
optimizer = BayesianOptimization(evaluate_model, pbounds, random_state=42)#, bounds_transformer=bounds_transformer)

# Log every optimisation step of this stage to its own JSON file.
logger = JSONLogger(path="bayesian_search_logs/logs_2.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

# `evaluate_model` returns a negated error, so maximising it minimises the error.
# See the bayes_opt documentation for init_points / n_iter / kappa.
optimizer.maximize(init_points=4, n_iter=50, kappa=5)

# Best target/parameter pair found by this run.
optimizer.max
# Every evaluated point is also logged to ./bayesian_search_logs/logs_2.json

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.


{'target': -61264.19407447548,
 'params': {'depth': 8.910879015694857,
  'l2_leaf_reg': 87.33605827474537,
  'min_data_in_leaf': 748.4768070719027,
  'random_strength': 4.602399651075166}}

In [28]:
# Baseline after the second search: values copied from that run's optimum,
# with depth rounded from 8.91 to 9 and min_data_in_leaf set to 750 (found: 748.48).
optimized_params = dict(
    iterations=1000,
    bagging_temperature=0.0,
    depth=9,
    use_best_model=True,
    random_seed=29,
    l2_leaf_reg=87.33605827474537,
    learning_rate=0.1,
    random_strength=4.602399651075166,
    min_data_in_leaf=750,
    task_type='GPU',
    verbose=0,
)

In [29]:
# Stage 3 search space: (lower, upper) bound per hyperparameter.
pbounds = dict(
    od_pval=(1e-10, 1e-2),
    border_count=(32, 254),
)

In [36]:
# Stage 3 of the hyperparameter search over the current `pbounds`.
# Domain reduction is disabled; the transformer line is kept for reference.
# bounds_transformer = SequentialDomainReductionTransformer()
optimizer = BayesianOptimization(evaluate_model, pbounds, random_state=42)#, bounds_transformer=bounds_transformer)

# Log every optimisation step of this stage to its own JSON file.
logger = JSONLogger(path="bayesian_search_logs/logs_3.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

# `evaluate_model` returns a negated error, so maximising it minimises the error.
# See the bayes_opt documentation for init_points / n_iter / kappa.
optimizer.maximize(init_points=4, n_iter=50, kappa=5)

# Best target/parameter pair found by this run.
optimizer.max
# Every evaluated point is also logged to ./bayesian_search_logs/logs_3.json

Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.
Work in progress.


{'target': -61227.30182455127,
 'params': {'border_count': 192.570851493023, 'od_pval': 0.003640704772242277}}

In [50]:
# Final configuration after the third search: border_count rounded from
# 192.57 to 193, od_pval taken as found.
# The original literal listed 'random_seed' twice (29, then 42). Python keeps
# the last value for a repeated key, so the effective seed was already 42;
# the shadowed entry is removed to make that explicit.
optimized_params = {
    'iterations': 1000,
    'bagging_temperature': 0.0,
    'depth': 9,
    'use_best_model': True,
    'random_seed': 42,
    'l2_leaf_reg': 87.33605827474537,
    'learning_rate': 0.1,
    'random_strength': 4.602399651075166,
    'min_data_in_leaf': 750,
    'border_count': 193,
    'od_pval': 0.003640704772242277,
    'task_type': 'GPU',
    'verbose': 0,
}