In [1]:
import numpy as np
import pandas as pd
import polars as pl
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from optuna.samplers import RandomSampler
import warnings
warnings.filterwarnings("ignore")
import multiprocessing
max_n_jobs = multiprocessing.cpu_count()
print(f"Maximum n_jobs you can use: {max_n_jobs}")
import shap
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


Maximum n_jobs you can use: 12


In [2]:
feature_version = 2
default_cv = 4
# 1 for pc feature, 
# 2 for label correlation feature
# 3 for best features based on combination rank

Train the best model

In [3]:
popular_features_train = pd.read_parquet("data/cleaned/popular_features_train.parquet")

In [4]:
train_df = pd.read_parquet(f"data/cleaned/cleaned_train_{feature_version}.parquet")
train_df = pd.concat([train_df, popular_features_train], axis = 1)
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])
#train_df = train_df[train_df["timestamp"].dt.month.isin([12, 1, 2])].reset_index().drop("index", axis = 1)
X_train = train_df.drop(columns=["timestamp", "label"])
Y_train = train_df["label"]

In [5]:
best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531', 
                 'X385', 'X23', 'X284', 'X465', 'X331', 'X95', 'X285', 'X31', 'X169', 'X137'] + \
                ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"] + \
                [col for col in train_df.columns.tolist() if "X" not in col and col not in ["timestamp", "label"]]
best_features = list(set(best_features))

In [6]:
default_n_trees = 1000
default_random_state = 101

In [7]:
def get_best_params_from_file(filename):
    study = optuna.load_study(
        study_name = filename,
        storage = f"sqlite:///{filename}.db"
    )
    return study.best_params

In [8]:
months = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2]
training_timeframe = [] 
for window_size in [12, 6, 3]:
    for i in range(13 - window_size):
        training_timeframe.append(months[i: i + window_size])
training_timeframe

[[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2],
 [3, 4, 5, 6, 7, 8],
 [4, 5, 6, 7, 8, 9],
 [5, 6, 7, 8, 9, 10],
 [6, 7, 8, 9, 10, 11],
 [7, 8, 9, 10, 11, 12],
 [8, 9, 10, 11, 12, 1],
 [9, 10, 11, 12, 1, 2],
 [3, 4, 5],
 [4, 5, 6],
 [5, 6, 7],
 [6, 7, 8],
 [7, 8, 9],
 [8, 9, 10],
 [9, 10, 11],
 [10, 11, 12],
 [11, 12, 1],
 [12, 1, 2]]

In [9]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
for p in best_params_xgboost:
    params[p] = best_params_xgboost[p]

xgbr_arr = []

for i in tqdm(range(len(training_timeframe))):
    temp = deepcopy(train_df)
    temp = temp[temp["timestamp"].dt.month.isin(training_timeframe[i])].reset_index().drop("index", axis = 1)
    X_train = temp.drop(columns=["timestamp", "label"])
    X_train = X_train[best_features]
    Y_train = temp["label"]
    xgbr = XGBRegressor(**params)
    xgbr.fit(X_train, Y_train)
    xgbr_arr.append(xgbr)

100%|██████████| 18/18 [01:01<00:00,  3.41s/it]


In [10]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state,
}
best_params_lightgbm = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
for p in best_params_lightgbm:
    params[p] = best_params_lightgbm[p]

lgbr_arr = []

for i in tqdm(range(len(training_timeframe))):
    temp = deepcopy(train_df)
    temp = temp[temp["timestamp"].dt.month.isin(training_timeframe[i])].reset_index().drop("index", axis = 1)
    X_train = temp.drop(columns=["timestamp", "label"])
    X_train = X_train[best_features]
    Y_train = temp["label"]
    lgbr = LGBMRegressor(**params)
    lgbr.fit(X_train, Y_train)
    lgbr_arr.append(lgbr)

100%|██████████| 18/18 [01:34<00:00,  5.24s/it]


In [11]:
# params = {
#     "iterations": default_n_trees,
#     "verbose": False,
#     "random_seed": default_random_state
# }
# best_params_catboost = get_best_params_from_file(f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_study")
# for p in best_params_catboost:
#     params[p] = best_params_catboost[p]

# catboost_feature_importances = {}

# cbr = CatBoostRegressor(**params)
# cbr.fit(X_train, Y_train)

Making prediction & submission

In [12]:
popular_features_test = pd.read_parquet("data/cleaned/popular_features_test.parquet")

In [13]:
X_test = pd.read_parquet(f"data/cleaned/cleaned_test_{feature_version}.parquet")
X_test = pd.concat([X_test, popular_features_test], axis = 1)
X_test = X_test.drop(columns=["label"])
X_test = X_test[best_features]
Y_pred = np.zeros(X_test.shape[0])
for i in range(len(training_timeframe)):
    Y_pred += 1/2 * (xgbr_arr[i].predict(X_test) + lgbr_arr[i].predict(X_test))
Y_pred /= len(training_timeframe)

In [14]:
submission = pd.DataFrame({
    "id": X_test.index + 1,
    "prediction": Y_pred
})
submission.head()

Unnamed: 0,id,prediction
0,1,0.033701
1,2,-0.027094
2,3,-0.038597
3,4,-0.152391
4,5,0.058571


In [15]:
submission.to_csv('submission.csv', index=False)

Analysis of contribution to submission

In [16]:
def get_shap_values(model, X_test):
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    mean_abs_shap = np.mean(np.abs(shap_values), axis = 0)
    return mean_abs_shap

In [None]:
xgboost_feature_importances = {}
lightgbm_feature_importances = {}

# only consider the fold with whole dataset to be comparable with cv scheme
features = xgbr_arr[0].feature_names_in_.tolist()
features_i = get_shap_values(xgbr_arr[i], X_test)
for inx, feat in enumerate(features):
    xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]
features = lgbr_arr[0].feature_names_in_.tolist()
features_i = get_shap_values(lgbr_arr[i], X_test)
for inx, feat in enumerate(features):
    lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

xgboost_feature_importances_df = pd.DataFrame(
    {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
)
#xgboost_feature_importances_df["importance"] /= len(training_timeframe)
# xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
lightgbm_feature_importances_df = pd.DataFrame(
    {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
)
#lightgbm_feature_importances_df["importance"] /= len(training_timeframe)
# lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# catboost_feature_importances_df = pd.DataFrame(
#     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# )
# catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
feature_importances_df = xgboost_feature_importances_df.merge(
    lightgbm_feature_importances_df,
    on="var",
    how="inner",
    suffixes=("_xgboost", "_lightgbm")
)
# feature_importances_df = feature_importances_df.merge(
#     catboost_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("", "_catboost")
# )
# feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])
feature_importances_df["importance"] = 1/2 * (feature_importances_df["importance_xgboost"] + feature_importances_df["importance_lightgbm"])
feature_importances_df = feature_importances_df.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
feature_importances_df

In [23]:
best_xgboost_score = optuna.load_study(
    study_name = "xgboost_2_4_101_1000_common_truncated_20_study",
    storage = f"sqlite:///xgboost_2_4_101_1000_common_truncated_20_study.db"
).best_value
best_lightgbm_score = optuna.load_study(
    study_name = "lightgbm_2_4_101_1000_common_truncated_20_study",
    storage = f"sqlite:///lightgbm_2_4_101_1000_common_truncated_20_study.db"
).best_value
feature_importances_df["weighted_importance"] = (best_xgboost_score * feature_importances_df["importance_xgboost"] + best_lightgbm_score * feature_importances_df["importance_lightgbm"]) / (best_xgboost_score + best_lightgbm_score)
feature_importances_df = feature_importances_df.sort_values("weighted_importance", ascending=False, ignore_index=True)
feature_importances_df

Unnamed: 0,var,importance_xgboost,importance_lightgbm,importance,weighted_importance
0,X598,0.076992,0.060874,0.068933,0.069127
1,X285,0.058618,0.069572,0.064095,0.063963
2,X385,0.046733,0.057317,0.052025,0.051897
3,X863,0.046855,0.049898,0.048377,0.04834
4,X23,0.038693,0.055906,0.0473,0.047093
5,X284,0.047948,0.03764,0.042794,0.042918
6,X95,0.036736,0.049176,0.042956,0.042806
7,X31,0.033952,0.045246,0.039599,0.039463
8,X331,0.035199,0.041542,0.03837,0.038294
9,X861,0.026407,0.050334,0.038371,0.038083
