In [None]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from optuna.samplers import RandomSampler
import warnings
warnings.filterwarnings("ignore")
import multiprocessing
# Report the CPU count seen by multiprocessing as the upper bound for n_jobs.
max_n_jobs = multiprocessing.cpu_count()
print(f"Maximum n_jobs you can use: {max_n_jobs}")

In [None]:
# feature_version picks which cleaned parquet files are loaded
# (data/cleaned/cleaned_train_<feature_version>.parquet and the matching test file)
# and is part of the Optuna study names.
# NOTE(review): the legend below originally followed default_cv; it presumably
# describes feature_version -- confirm.
#   1 for pc feature
#   2 for label correlation feature
#   3 for best features based on combination rank
feature_version = 2
# default_cv is only used in this notebook to build the Optuna study names.
default_cv = 1

Train the models with their best hyperparameters

In [None]:
# Extra feature columns that are concatenated column-wise onto the cleaned training frame.
popular_features_train = pd.read_parquet("data/cleaned/popular_features_train.parquet")

In [None]:
# Cleaned training frame for the selected feature version, widened column-wise
# with the popular features (pd.concat with axis=1 aligns rows on the index).
train_path = f"data/cleaned/cleaned_train_{feature_version}.parquet"
train_df = pd.concat([pd.read_parquet(train_path), popular_features_train], axis=1)
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])
# Disabled filter that keeps only rows whose timestamp month is 12, 1 or 2:
#train_df = train_df[train_df["timestamp"].dt.month.isin([12, 1, 2])].reset_index().drop("index", axis = 1)

# Target is the "label" column; features are everything except timestamp and label.
Y_train = train_df["label"]
X_train = train_df.drop(columns=["timestamp", "label"])

In [None]:
# Feature list used for both training and prediction:
#   1. an explicit selection of X-columns,
#   2. five named quantity columns,
#   3. every other column of train_df whose name contains no "X"
#      (timestamp and label excluded).
# Groups 2 and 3 overlap, so the combined list is de-duplicated below.
best_features = ['X862', 'X598', 'X863', 'X533', 'X379', 'X856', 'X28', 'X284', 'X466', 'X95',
                 'X331', 'X465', 'X852', 'X861', 'X198', 'X169', 'X285', 'X23', 'X531', 'X444',
                 'X754', 'X445', 'X758', 'X137', 'X279', 'X540', 'X31', 'X218', 'X291', 'X511'] + \
                ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
# De-duplicate while keeping first-seen order. list(set(...)) would order the
# columns by string hash, which is randomised per interpreter run, so the
# feature order fed to the models would change between kernel restarts even
# though random_state is fixed.
best_features = list(dict.fromkeys(best_features))
X_train = X_train[best_features]

In [None]:
# Number of boosting rounds given to every model (n_estimators / iterations);
# also embedded in the Optuna study names.
default_n_trees = 1000
# Seed given to every model (random_state / random_seed);
# also embedded in the Optuna study names.
default_random_state = 101

In [None]:
def get_best_params_from_file(filename):
    """Return the best parameters stored in a saved Optuna study.

    The study is looked up by the name ``filename`` inside the SQLite
    storage ``sqlite:///{filename}.db``, so the study name and the database
    file name (without the ``.db`` suffix) are the same string.

    Parameters
    ----------
    filename : str
        Study name, also used as the database file stem.

    Returns
    -------
    The ``best_params`` attribute of the loaded study (callers in this
    notebook use it as a mapping of parameter name to value).
    """
    storage_url = f"sqlite:///{filename}.db"
    loaded_study = optuna.load_study(study_name=filename, storage=storage_url)
    return loaded_study.best_params

In [None]:
# Tuned XGBoost parameters from the stored Optuna study for this
# feature version / cv / seed / tree count.
best_params_xgboost = get_best_params_from_file(
    f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_study"
)

# Fixed settings first; any key also present in the tuned set is overridden by it.
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state,
    **best_params_xgboost,
}

xgbr = XGBRegressor(**params)
xgbr.fit(X_train, Y_train)

In [None]:
# Fixed LightGBM settings; the tuned values loaded below take precedence
# over any key they share with these.
params = dict(
    n_estimators=default_n_trees,
    verbosity=-1,
    random_state=default_random_state,
)
best_params_lightgbm = get_best_params_from_file(
    f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_study"
)
params.update(best_params_lightgbm)

lgbr = LGBMRegressor(**params)
lgbr.fit(X_train, Y_train)

In [None]:
# Tuned CatBoost parameters from the stored Optuna study.
best_params_catboost = get_best_params_from_file(
    f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_study"
)

# Fixed settings first; tuned values override any key they share.
params = {
    "iterations": default_n_trees,
    "verbose": False,
    "random_seed": default_random_state,
    **best_params_catboost,
}

# Created empty here and not filled anywhere in this notebook.
catboost_feature_importances = {}

cbr = CatBoostRegressor(**params)
cbr.fit(X_train, Y_train)

Making predictions & submission

In [None]:
# Extra feature columns that are concatenated column-wise onto the cleaned test frame.
popular_features_test = pd.read_parquet("data/cleaned/popular_features_test.parquet")

In [None]:
# Build the test matrix the same way as the training matrix: cleaned frame
# plus popular features, label column removed, columns in best_features order.
test_path = f"data/cleaned/cleaned_test_{feature_version}.parquet"
X_test = pd.concat([pd.read_parquet(test_path), popular_features_test], axis=1)
X_test = X_test.drop(columns=["label"])[best_features]

# Predictions come from the LightGBM model only.
Y_pred = lgbr.predict(X_test)

In [None]:
# Ids are the test index shifted by one; predictions are limited to [-0.4, 0.4].
submission = pd.DataFrame(
    {
        "id": X_test.index + 1,
        "prediction": np.clip(Y_pred, -0.4, 0.4),
    }
)
submission.head()

In [None]:
# Write the submission table to submission.csv without the index column.
submission.to_csv('submission.csv', index=False)