In [1]:
import numpy as np
import pandas as pd
import polars as pl
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from optuna.samplers import RandomSampler
import warnings
warnings.filterwarnings("ignore")
import multiprocessing
# Report the CPU count seen by multiprocessing as an upper bound for n_jobs.
# max_n_jobs is informational only; no later cell in this notebook reads it.
max_n_jobs = multiprocessing.cpu_count()
print(f"Maximum n_jobs you can use: {max_n_jobs}")

Maximum n_jobs you can use: 12


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Which cleaned feature set to load from data/cleaned/:
#   1 -> pc feature
#   2 -> label correlation feature
#   3 -> best features based on combination rank
feature_version = 2

# Presumably the CV fold count used during tuning (TODO confirm); in this
# notebook it is only used to build the Optuna study names.
default_cv = 4

Train the best model

In [3]:
# Extra "popular" feature columns for the training rows. They are appended
# column-wise to the cleaned training frame in the next cell, so they are
# presumably row-aligned with it (TODO confirm).
popular_features_train = pd.read_parquet("data/cleaned/popular_features_train.parquet")

In [4]:
# Build the training frame: the cleaned features for the selected
# feature_version with the "popular" feature columns appended side by side
# (pd.concat with axis=1 aligns rows on the index).
train_path = f"data/cleaned/cleaned_train_{feature_version}.parquet"
train_df = pd.concat([pd.read_parquet(train_path), popular_features_train], axis = 1)
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])
# Disabled experiment: keep only the December-February rows.
#train_df = train_df[train_df["timestamp"].dt.month.isin([12, 1, 2])].reset_index().drop("index", axis = 1)

# Target column, then the feature matrix (every column except timestamp/label).
Y_train = train_df["label"]
X_train = train_df.drop(columns=["timestamp", "label"])

In [None]:
# Candidate feature subset: a hand-picked list of anonymised "X" columns, the
# five volume/quantity columns, and every train_df column whose name does not
# contain "X" (excluding the timestamp and the target).
best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531', 
                 'X385', 'X23', 'X284', 'X465', 'X331', 'X95', 'X285', 'X31', 'X169', 'X137', 
                 'X379', 'X186', 'X852', 'X302', 'X868', 'X89', 'X219', 'X855', 'X540', 'X301'] + \
                ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
# De-duplicate while keeping first-seen order. list(set(...)) would yield a
# different column order on every interpreter run (string hashes are
# randomised per process), which makes the resulting feature matrix layout
# non-reproducible.
best_features = list(dict.fromkeys(best_features))
# NOTE: the training loops further down rebuild X_train from train_df, so this
# selection only affects code that reads X_train before those loops run.
X_train = X_train[best_features]

In [6]:
# Seed passed to both regressors; it is also part of the Optuna study names.
default_random_state = 101
# Number of boosting rounds (n_estimators) for both regressors; also part of
# the Optuna study names.
default_n_trees = 1000

In [7]:
def get_best_params_from_file(filename):
    """Return the best hyper-parameters stored in an Optuna study.

    The study is looked up by name ``filename`` in the SQLite storage
    ``sqlite:///{filename}.db``, i.e. the study name and the database file
    stem are the same string.

    Args:
        filename: Study name, which is also the database file name without
            the ``.db`` suffix.

    Returns:
        The ``best_params`` attribute of the loaded study.
    """
    storage_url = f"sqlite:///{filename}.db"
    return optuna.load_study(study_name=filename, storage=storage_url).best_params

In [8]:
# Calendar months in the order used for windowing (March first, February last).
months = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2]

# Non-overlapping month windows of width 12, 6 and 3: stepping the start
# offset by the window width yields the full span, its two halves and its
# four quarters.
training_timeframe = [
    months[start: start + width]
    for width in [12, 6, 3]
    for start in range(0, 13 - width, width)
]
training_timeframe

[[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2],
 [3, 4, 5, 6, 7, 8],
 [9, 10, 11, 12, 1, 2],
 [3, 4, 5],
 [6, 7, 8],
 [9, 10, 11],
 [12, 1, 2]]

In [None]:
# Base XGBoost settings; the tuned values loaded from the Optuna study are
# merged on top (adding new keys and overriding any that collide).
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_study")
params.update(best_params_xgboost)

xgbr_arr = []

# Fit one XGBoost model per month window in training_timeframe; models are
# appended in window order so they line up with lgbr_arr at prediction time.
for window_months in training_timeframe:
    # Boolean-mask selection already returns a new frame, so train_df is never
    # mutated and a deepcopy of the full frame per iteration is unnecessary.
    in_window = train_df["timestamp"].dt.month.isin(window_months)
    window_df = train_df[in_window].reset_index(drop=True)
    # Features are every column except timestamp/label. X_train and Y_train
    # are rebound at notebook level here, replacing any earlier selection.
    X_train = window_df.drop(columns=["timestamp", "label"])
    Y_train = window_df["label"]
    xgbr = XGBRegressor(**params)
    xgbr.fit(X_train, Y_train)
    xgbr_arr.append(xgbr)

In [None]:
# Base LightGBM settings; the tuned values loaded from the Optuna study are
# merged on top (adding new keys and overriding any that collide).
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state,
}
best_params_lightgbm = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_study")
params.update(best_params_lightgbm)

lgbr_arr = []

# Fit one LightGBM model per month window in training_timeframe; models are
# appended in window order so they line up with xgbr_arr at prediction time.
for window_months in training_timeframe:
    # Boolean-mask selection already returns a new frame, so train_df is never
    # mutated and a deepcopy of the full frame per iteration is unnecessary.
    in_window = train_df["timestamp"].dt.month.isin(window_months)
    window_df = train_df[in_window].reset_index(drop=True)
    # Features are every column except timestamp/label. X_train and Y_train
    # are rebound at notebook level here, replacing any earlier selection.
    X_train = window_df.drop(columns=["timestamp", "label"])
    Y_train = window_df["label"]
    lgbr = LGBMRegressor(**params)
    lgbr.fit(X_train, Y_train)
    lgbr_arr.append(lgbr)

In [11]:
# NOTE: CatBoost training is disabled. Everything in this cell is commented
# out, and the prediction step averages only the XGBoost and LightGBM models.
# If re-enabled as written, it would fit a single model on whatever
# X_train / Y_train currently hold.
# params = {
#     "iterations": default_n_trees,
#     "verbose": False,
#     "random_seed": default_random_state
# }
# best_params_catboost = get_best_params_from_file(f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_study")
# for p in best_params_catboost:
#     params[p] = best_params_catboost[p]

# catboost_feature_importances = {}

# cbr = CatBoostRegressor(**params)
# cbr.fit(X_train, Y_train)

Making predictions & submission

In [12]:
# Extra "popular" feature columns for the test rows. They are appended
# column-wise to the cleaned test frame in the next cell, so they are
# presumably row-aligned with it (TODO confirm).
popular_features_test = pd.read_parquet("data/cleaned/popular_features_test.parquet")

In [13]:
# Assemble the test feature matrix: cleaned test features plus the "popular"
# feature columns, minus the "label" column that the test file also carries.
test_path = f"data/cleaned/cleaned_test_{feature_version}.parquet"
X_test = pd.concat([pd.read_parquet(test_path), popular_features_test], axis = 1).drop(columns=["label"])
# X_test = X_test[best_features]

# Ensemble: for each month window, average the XGBoost and LightGBM
# predictions, then average those per-window blends across all windows.
n_windows = len(training_timeframe)
Y_pred = np.zeros(X_test.shape[0])
for i in range(n_windows):
    xgb_pred = xgbr_arr[i].predict(X_test)
    lgb_pred = lgbr_arr[i].predict(X_test)
    Y_pred += 1/2 * (xgb_pred + lgb_pred)
Y_pred /= n_windows

In [14]:
# Two-column submission table: "id" is the X_test index shifted by one,
# "prediction" is the ensemble output computed above.
submission_columns = {"id": X_test.index + 1, "prediction": Y_pred}
submission = pd.DataFrame(submission_columns)
submission.head()

Unnamed: 0,id,prediction
0,1,-0.282272
1,2,-0.083539
2,3,-0.119298
3,4,-0.645113
4,5,0.126825


In [15]:
# Write the submission (columns: id, prediction) to submission.csv in the
# current working directory, without the DataFrame index.
submission.to_csv('submission.csv', index=False)