In [1]:
import pickle
import numpy as np
import pandas as pd
import polars as pl
import random
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from optuna.samplers import RandomSampler
import warnings
warnings.filterwarnings("ignore")
import multiprocessing
max_n_jobs = multiprocessing.cpu_count()
print(f"Maximum n_jobs you can use: {max_n_jobs}")
import shap
from tqdm import tqdm
import mlx.core as mx
import mlx.nn as nnmx
import mlx.optimizers as optimmx

  from .autonotebook import tqdm as notebook_tqdm


Maximum n_jobs you can use: 12


In [2]:
feature_version = 2
default_cv = 4
# 1 for pc feature, 
# 2 for label correlation feature
# 3 for best features based on combination rank

In [3]:
default_n_trees = 1000
default_random_state = 101
default_num_layers = 2
random.seed(default_random_state)
np.random.seed(default_random_state)
mx.random.seed(default_random_state)

### Load train data

In [4]:
popular_features_train = pd.read_parquet("data/cleaned/popular_features_train.parquet")

In [5]:
train_df = pd.read_parquet(f"data/cleaned/cleaned_train_{feature_version}.parquet")
train_df = pd.concat([train_df, popular_features_train], axis = 1)
train_df["__index_level_0__"] = pd.to_datetime(train_df["__index_level_0__"])
# train_df = train_df[train_df["__index_level_0__"].dt.month.isin([12, 1, 2])].reset_index().drop("index", axis = 1)
# X_train = train_df.drop(columns=["__index_level_0__", "label"])
# Y_train = train_df["label"]

### Train best model

#### Get best features and training folds

In [6]:
best_features = ['X757', 'X758', 'X759', 'X508', 'X614', 'X752', 'X331', 'X445', 'X465', 'X385', 
                 'X466', 'X95', 'X23', 'X219', 'X31', 'X373', 'X379', 'X284', 'X750', 'X652', 
                 'X279', 'X89', 'X169', 'X753', 'X226', 'X28', 'X444', 'X272', 'X271', 'X218']
best_features = list(set(best_features))

In [7]:
months = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2]
training_timeframe = [] 
# [12, 9, 8, 6, 4, 3] is currently best one
window_sizes = [12, 9, 8, 6, 4, 3]
for window_size in window_sizes:
    for i in range(13 - window_size):
        training_timeframe.append(months[i: i + window_size])
training_timeframe

[[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2],
 [3, 4, 5, 6, 7, 8, 9, 10, 11],
 [4, 5, 6, 7, 8, 9, 10, 11, 12],
 [5, 6, 7, 8, 9, 10, 11, 12, 1],
 [6, 7, 8, 9, 10, 11, 12, 1, 2],
 [3, 4, 5, 6, 7, 8, 9, 10],
 [4, 5, 6, 7, 8, 9, 10, 11],
 [5, 6, 7, 8, 9, 10, 11, 12],
 [6, 7, 8, 9, 10, 11, 12, 1],
 [7, 8, 9, 10, 11, 12, 1, 2],
 [3, 4, 5, 6, 7, 8],
 [4, 5, 6, 7, 8, 9],
 [5, 6, 7, 8, 9, 10],
 [6, 7, 8, 9, 10, 11],
 [7, 8, 9, 10, 11, 12],
 [8, 9, 10, 11, 12, 1],
 [9, 10, 11, 12, 1, 2],
 [3, 4, 5, 6],
 [4, 5, 6, 7],
 [5, 6, 7, 8],
 [6, 7, 8, 9],
 [7, 8, 9, 10],
 [8, 9, 10, 11],
 [9, 10, 11, 12],
 [10, 11, 12, 1],
 [11, 12, 1, 2],
 [3, 4, 5],
 [4, 5, 6],
 [5, 6, 7],
 [6, 7, 8],
 [7, 8, 9],
 [8, 9, 10],
 [9, 10, 11],
 [10, 11, 12],
 [11, 12, 1],
 [12, 1, 2]]

#### Utils function

In [8]:
def get_best_params_from_file(filename):
    study = optuna.load_study(
        study_name = filename,
        storage = f"sqlite:///{filename}.db"
    )
    return study.best_params

#### XGBoost

In [9]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": default_random_state
}
best_params_xgboost = get_best_params_from_file(f"xgboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
for p in best_params_xgboost:
    params[p] = best_params_xgboost[p]

xgbr_arr = []

for i in tqdm(range(len(training_timeframe))):
    temp = deepcopy(train_df)
    temp = temp[temp["__index_level_0__"].dt.month.isin(training_timeframe[i])].reset_index().drop("index", axis = 1)
    X_train = temp.drop(columns=["__index_level_0__", "label"])
    X_train = X_train[best_features]
    Y_train = temp["label"]
    xgbr = XGBRegressor(**params)
    xgbr.fit(X_train, Y_train)
    xgbr_arr.append(xgbr)

100%|██████████| 36/36 [01:17<00:00,  2.16s/it]


#### LightGBM

In [None]:
params = {
    "n_estimators": default_n_trees,
    "verbosity": -1,
    "random_state": default_random_state,
}
best_params_lightgbm = get_best_params_from_file(f"lightgbm_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
for p in best_params_lightgbm:
    params[p] = best_params_lightgbm[p]

lgbr_arr = []

for i in tqdm(range(len(training_timeframe))):
    temp = deepcopy(train_df)
    temp = temp[temp["__index_level_0__"].dt.month.isin(training_timeframe[i])].reset_index().drop("index", axis = 1)
    X_train = temp.drop(columns=["__index_level_0__", "label"])
    X_train = X_train[best_features]
    Y_train = temp["label"]
    lgbr = LGBMRegressor(**params)
    lgbr.fit(X_train, Y_train)
    lgbr_arr.append(lgbr)

#### CatBoost

In [None]:
# params = {
#     "iterations": default_n_trees,
#     "verbose": False,
#     "random_seed": default_random_state
# }
# best_params_catboost = get_best_params_from_file(f"catboost_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_20_study")
# for p in best_params_catboost:
#     params[p] = best_params_catboost[p]

# cbr_arr = []

# for i in tqdm(range(len(training_timeframe))):
#     temp = deepcopy(train_df)
#     temp = temp[temp["__index_level_0__"].dt.month.isin(training_timeframe[i])].reset_index().drop("index", axis = 1)
#     X_train = temp.drop(columns=["__index_level_0__", "label"])
#     X_train = X_train[best_features]
#     Y_train = temp["label"]
#     cbr = CatBoostRegressor(**params)
#     cbr.fit(X_train, Y_train)
#     cbr_arr.append(xgbr)

#### MLP

Data preprocessing

In [10]:
# Extra code to "reduce" from float64 to float32
def float64_to_float32(data):
    if isinstance(data, pd.DataFrame):
        for col in data.columns:
            data[col] = data[col].astype("float32")
    elif isinstance(data, pd.Series):
        data = data.astype("float32")
    return data

In [11]:
def normal_cv_to_mlx_cv(X_train, Y_train = None):
    # Normalize forst
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.values)

    # Convert to mlx format
    X_train = mx.array(X_train)
    if Y_train is not None:
        Y_train = mx.array(Y_train.values)
        return X_train, Y_train
    else:
        return X_train

Define model

In [12]:
# Define the model
# We do not use the reset method this time so you have to create the model at each fold
class MLPMLX(nnmx.Module):
    def __init__(self, num_features, hidden_layers_size, dropout):
        super().__init__()

        # Initialize layers & batchnorm
        last_layer = num_features
        self.layers = []
        for current_layer in hidden_layers_size:
            self.layers.append(nnmx.Linear(last_layer, current_layer))
            last_layer = current_layer
        self.layers.append(nnmx.Linear(last_layer, 1))

        # Initialize activation
        self.activation = nnmx.ReLU()

        # Initialze dropout
        self.dropout = nnmx.Dropout(p = dropout)

    def __call__(self, x):
        for inx, layer in enumerate(self.layers):
            if inx == len(self.layers) - 1:
                x = layer(x)
            else:
                x = layer(x)
                x = self.activation(x)
                x = self.dropout(x)
        return x

Define batch

In [13]:
# Custom function for batch iteration
def batch_iterate(batch_size, X, Y = None, shuffle = True):
    if Y is not None:
        for i in range(0, Y.size, batch_size):
            X_curr = X[i: min(i + batch_size, Y.size), :]
            Y_curr = Y[i: min(i + batch_size, Y.size)]
            if shuffle:
                inx_lst = mx.random.permutation(batch_size)
                X_curr = X_curr[inx_lst, :]
                Y_curr = Y_curr[inx_lst]
            yield X_curr, Y_curr
    else:
        for i in range(0, X.shape[0], batch_size):
            X_curr = X[i: min(i + batch_size, X.shape[0]), :]
            yield X_curr

Define training function

In [14]:
# Separate function for train & eval step
def train_mlp_mlx(model, loss_and_grad_fn, optimizer, X_train, Y_train, batch_size, num_epochs):
    model.train()
    for _ in tqdm(range(num_epochs)):
        for (inputs, targets) in batch_iterate(batch_size, X_train, Y_train):
            _, grads = loss_and_grad_fn(model, inputs, targets)
            # Update the optimizer state and model parameters in a single call
            optimizer.update(model, grads)
            # Force a graph evaluation
            mx.eval(model.parameters(), optimizer.state)

In [15]:
def train_mlp_mlx_lst(num_features, hidden_layers_size, dropout, lr, train_df, batch_size, num_epochs, best_features, training_timeframe):
    mlp_mlx_arr = []
    for _, timeframe in enumerate(training_timeframe):
        # Get the data
        temp = deepcopy(train_df)
        temp = temp[temp["__index_level_0__"].dt.month.isin(timeframe)].reset_index().drop("index", axis = 1)
        X_train = temp.drop(columns=["__index_level_0__", "label"])
        X_train = X_train[best_features]
        Y_train = temp["label"]

        # Preprocess the data
        # convert to float 32
        X_train = float64_to_float32(X_train)
        Y_train = float64_to_float32(Y_train)
        # convert to mlx
        X_train, Y_train = normal_cv_to_mlx_cv(X_train, Y_train)
        
        # initialize the model
        mx.random.seed(default_random_state)
        model = MLPMLX(num_features, hidden_layers_size, dropout)

        # Initialize the loss function
        def loss_fn(model, X, Y):
            Y_pred = model(X).reshape(-1)
            return mx.mean(nnmx.losses.mse_loss(Y_pred, Y))
        loss_and_grad_fn = nnmx.value_and_grad(model, loss_fn)

        # Reinitialize the optimizer
        optimizer = optimmx.Adam(learning_rate = lr)

        # Train the model
        train_mlp_mlx(model, loss_and_grad_fn, optimizer, X_train, Y_train, batch_size, num_epochs)

        # Add the model to model list
        mlp_mlx_arr.append(model)
    return mlp_mlx_arr

Define function for making prediction

In [16]:
def predict_mlp_mlx(model, X_test, batch_size):
    # Preprocess the data
    # convert to float 32
    X_test = float64_to_float32(X_test)
    # convert to mlx
    X_test = normal_cv_to_mlx_cv(X_test)
    
    outputs_all = np.zeros(0)
    model.eval()
    for inputs in batch_iterate(batch_size, X_test):
        outputs = model(inputs).reshape(-1)
        # convert back to numpy
        outputs = np.array(outputs)
        # Load to overall Y_test, Y_pred to calculate pearson score later
        outputs_all = np.concatenate([outputs_all, outputs])
    return outputs_all

Train our model list based on data

In [17]:
best_features = ['X757', 'X758', 'X759', 'X508', 'X614', 'X752', 'X331', 'X445', 'X465', 'X385', 
                 'X466', 'X95', 'X23', 'X219', 'X31', 'X373', 'X379', 'X284', 'X750', 'X652', 
                 'X279', 'X89', 'X169', 'X753', 'X226', 'X28', 'X444', 'X272', 'X271', 'X218']
best_features = list(set(best_features))

In [None]:
# Get parameters for training
best_params_mlp_1 = get_best_params_from_file(f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_1_common_truncated_{len(best_features)}_study")
num_features = len(best_features)
log_2_hidden_layers_size_1 = []
for i in range(1):
    log_2_hidden_layers_size_1.append(best_params_mlp_1[f"log2_hidden_layer_{i}"])
hidden_layers_size_1 = [2**l for l in log_2_hidden_layers_size_1]
dropout_1 = best_params_mlp_1["dropout"]
lr_1 = best_params_mlp_1["lr"]
batch_size_1 = best_params_mlp_1["batch_size"]
num_epochs_1 = best_params_mlp_1["num_epochs"]

# Conduct training to get model list
mlp_mlx_arr_1 = train_mlp_mlx_lst(num_features, hidden_layers_size_1, dropout_1, lr_1, train_df, batch_size_1, num_epochs_1, best_features, training_timeframe)

In [18]:
# Get parameters for training
best_params_mlp_2 = get_best_params_from_file(f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_2_common_truncated_{len(best_features)}_study")
num_features = len(best_features)
log_2_hidden_layers_size_2 = []
for i in range(2):
    log_2_hidden_layers_size_2.append(best_params_mlp_2[f"log2_hidden_layer_{i}"])
hidden_layers_size_2 = [2**l for l in log_2_hidden_layers_size_2]
dropout_2 = best_params_mlp_2["dropout"]
lr_2 = best_params_mlp_2["lr"]
batch_size_2 = best_params_mlp_2["batch_size"]
num_epochs_2 = best_params_mlp_2["num_epochs"]

# Conduct training to get model list
mlp_mlx_arr_2 = train_mlp_mlx_lst(num_features, hidden_layers_size_2, dropout_2, lr_2, train_df, batch_size_2, num_epochs_2, best_features, training_timeframe)

100%|██████████| 20/20 [00:13<00:00,  1.52it/s]
100%|██████████| 20/20 [00:10<00:00,  1.95it/s]
100%|██████████| 20/20 [00:10<00:00,  1.97it/s]
100%|██████████| 20/20 [00:10<00:00,  1.95it/s]
100%|██████████| 20/20 [00:10<00:00,  1.99it/s]
100%|██████████| 20/20 [00:08<00:00,  2.26it/s]
100%|██████████| 20/20 [00:12<00:00,  1.66it/s]
100%|██████████| 20/20 [00:17<00:00,  1.12it/s]
100%|██████████| 20/20 [00:18<00:00,  1.10it/s]
100%|██████████| 20/20 [00:18<00:00,  1.09it/s]
100%|██████████| 20/20 [00:12<00:00,  1.54it/s]
100%|██████████| 20/20 [00:06<00:00,  3.16it/s]
100%|██████████| 20/20 [00:06<00:00,  3.05it/s]
100%|██████████| 20/20 [00:06<00:00,  3.08it/s]
100%|██████████| 20/20 [00:06<00:00,  3.08it/s]
100%|██████████| 20/20 [00:06<00:00,  3.02it/s]
100%|██████████| 20/20 [00:06<00:00,  3.11it/s]
100%|██████████| 20/20 [00:04<00:00,  4.76it/s]
100%|██████████| 20/20 [10:48<00:00, 32.40s/it] 
100%|██████████| 20/20 [00:04<00:00,  4.17it/s]
100%|██████████| 20/20 [00:10<00:00,  1

### Saving model list

In [19]:
# # Since training of this model is expensive, try to save them somewhere
# with open(f"xgbr_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_model.pkl", "wb") as f:
#     pickle.dump(xgbr_arr, f)
# with open(f"lgbr_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_model.pkl", "wb") as f:
#     pickle.dump(lgbr_arr, f)
# with open(f"cbr_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_model.pkl", "wb") as f:
#     pickle.dump(cbr_arr, f)
# with open(f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_{default_num_layers}_common_truncated_{len(best_features)}_model.pkl", "wb") as f:
#     pickle.dump(mlp_mlx_arr, f)

### Making prediction & submission

In [20]:
popular_features_test = pd.read_parquet("data/cleaned/popular_features_test.parquet")

In [21]:
# # import mlp_mlx model if needed
# best_features = ['X862', 'X598', 'X863', 'X856', 'X612', 'X466', 'X533', 'X861', 'X445', 'X531', 
#                      'X385', 'X23', 'X284', 'X465', 'X331', 'X95', 'X285', 'X31', 'X169', 'X137']
# with open(f"xgbr_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_model.pkl", "rb") as f:
#     xgbr_arr = pickle.load(f)
# with open(f"lgbr_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_model.pkl", "rb") as f:
#     lgbr_arr = pickle.load(f)
# with open(f"cbr_{feature_version}_{default_cv}_{default_random_state}_{default_n_trees}_common_truncated_{len(best_features)}_model.pkl", "rb") as f:
#     cbr_arr = pickle.load(f)
# with open(f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_1_common_truncated_{len(best_features)}_model.pkl", "rb") as f:
#     mlp_mlx_arr_1 = pickle.load(f)
# with open(f"mlp_mlx_{feature_version}_{default_cv}_{default_random_state}_2_common_truncated_{len(best_features)}_model.pkl", "rb") as f:
#     mlp_mlx_arr_2 = pickle.load(f)

In [None]:
X_test = pd.read_parquet(f"data/cleaned/cleaned_test_{feature_version}.parquet")
X_test = pd.concat([X_test, popular_features_test], axis = 1)
X_test = X_test.drop(columns=["label"])

# get X_test for gbdt
best_features_gbdt = ['X757', 'X758', 'X759', 'X508', 'X614', 'X752', 'X331', 'X445', 'X465', 'X385', 
                      'X466', 'X95', 'X23', 'X219', 'X31', 'X373', 'X379', 'X284', 'X750', 'X652', 
                      'X279', 'X89', 'X169', 'X753', 'X226', 'X28', 'X444', 'X272', 'X271', 'X218']
best_features_gbdt = list(set(best_features_gbdt))
X_test_gbdt = X_test[best_features_gbdt]

# get X_test for mlp
best_features_mlp = ['X757', 'X758', 'X759', 'X508', 'X614', 'X752', 'X331', 'X445', 'X465', 'X385', 
                     'X466', 'X95', 'X23', 'X219', 'X31', 'X373', 'X379', 'X284', 'X750', 'X652', 
                     'X279', 'X89', 'X169', 'X753', 'X226', 'X28', 'X444', 'X272', 'X271', 'X218']
best_features_mlp = list(set(best_features_mlp))
X_test_mlp = X_test[best_features_mlp]

# conduct prediction
Y_pred = np.zeros(X_test.shape[0])
n_models = 2
for i in tqdm(range(len(training_timeframe))):
    Y_pred += (
        xgbr_arr[i].predict(X_test_gbdt) + 
        # lgbr_arr[i].predict(X_test_gbdt) +
        predict_mlp_mlx(mlp_mlx_arr_2[i], X_test_mlp, batch_size_2)
    ) / n_models
Y_pred /= len(training_timeframe)

# Not really better than above
# Y_pred = np.zeros(X_test.shape[0])
# n_models = 2
# Y_pred_dict = {w: np.zeros(X_test.shape[0]) for w in window_sizes} # timeframe size - prediction
# for i in tqdm(range(len(training_timeframe))):
#     Y_pred_dict[len(training_timeframe[i])] += (
#         xgbr_arr[i].predict(X_test_gbdt) + 
#         # lgbr_arr[i].predict(X_test_gbdt) +
#         predict_mlp_mlx(mlp_mlx_arr_2[i], X_test_mlp, batch_size_2)
#     ) / n_models
# for w in Y_pred_dict:
#     Y_pred += Y_pred_dict[w] / (12 - w + 1)
# Y_pred /= len(window_sizes)

100%|██████████| 36/36 [00:20<00:00,  1.80it/s]


In [23]:
submission = pd.DataFrame({
    "id": X_test.index + 1,
    "prediction": Y_pred
})
submission.head()

Unnamed: 0,id,prediction
0,1,-0.001865
1,2,0.063624
2,3,-0.076649
3,4,-0.096325
4,5,-0.058924


In [24]:
submission.to_csv('submission.csv', index=False)

Analysis of contribution to submission

In [None]:
# def get_shap_values(model, X_test):
#     explainer = shap.TreeExplainer(model)
#     shap_values = explainer.shap_values(X_test)
#     mean_abs_shap = np.mean(np.abs(shap_values), axis = 0)
#     return mean_abs_shap

In [None]:
# xgboost_feature_importances = {}
# lightgbm_feature_importances = {}

# # only consider the fold with whole dataset to be comparable with cv scheme
# features = xgbr_arr[0].feature_names_in_.tolist()
# features_i = get_shap_values(xgbr_arr[i], X_test)
# for inx, feat in enumerate(features):
#     xgboost_feature_importances[feat] = xgboost_feature_importances.get(feat, 0) + features_i[inx]
# features = lgbr_arr[0].feature_names_in_.tolist()
# features_i = get_shap_values(lgbr_arr[i], X_test)
# for inx, feat in enumerate(features):
#     lightgbm_feature_importances[feat] = lightgbm_feature_importances.get(feat, 0) + features_i[inx]

# xgboost_feature_importances_df = pd.DataFrame(
#     {"var": xgboost_feature_importances.keys(), "importance": xgboost_feature_importances.values()}
# )
# #xgboost_feature_importances_df["importance"] /= len(training_timeframe)
# # xgboost_feature_importances_df["rank_importance"] = xgboost_feature_importances_df["importance"].rank(ascending=False)
# lightgbm_feature_importances_df = pd.DataFrame(
#     {"var": lightgbm_feature_importances.keys(), "importance": lightgbm_feature_importances.values()}
# )
# #lightgbm_feature_importances_df["importance"] /= len(training_timeframe)
# # lightgbm_feature_importances_df["rank_importance"] = lightgbm_feature_importances_df["importance"].rank(ascending=False)
# # catboost_feature_importances_df = pd.DataFrame(
# #     {"var": catboost_feature_importances.keys(), "importance_catboost": catboost_feature_importances.values()}
# # )
# # catboost_feature_importances_df["rank_importance"] = catboost_feature_importances_df["importance_catboost"].rank(ascending=False)
# feature_importances_df = xgboost_feature_importances_df.merge(
#     lightgbm_feature_importances_df,
#     on="var",
#     how="inner",
#     suffixes=("_xgboost", "_lightgbm")
# )
# # feature_importances_df = feature_importances_df.merge(
# #     catboost_feature_importances_df,
# #     on="var",
# #     how="inner",
# #     suffixes=("", "_catboost")
# # )
# # feature_importances_df = feature_importances_df[["var", "rank_importance_xgboost", "rank_importance_lightgbm", "rank_importance_catboost"]]
# # feature_importances_df["rank"] = 1/3 * (feature_importances_df["rank_importance_xgboost"] + feature_importances_df["rank_importance_lightgbm"] + feature_importances_df["rank_importance_catboost"])
# feature_importances_df["importance"] = 1/2 * (feature_importances_df["importance_xgboost"] + feature_importances_df["importance_lightgbm"])
# feature_importances_df = feature_importances_df.sort_values(by="importance", ascending=False).reset_index().drop("index", axis = 1)
# feature_importances_df

In [None]:
# best_xgboost_score = optuna.load_study(
#     study_name = "xgboost_2_4_101_1000_common_truncated_20_study",
#     storage = f"sqlite:///xgboost_2_4_101_1000_common_truncated_20_study.db"
# ).best_value
# best_lightgbm_score = optuna.load_study(
#     study_name = "lightgbm_2_4_101_1000_common_truncated_20_study",
#     storage = f"sqlite:///lightgbm_2_4_101_1000_common_truncated_20_study.db"
# ).best_value
# feature_importances_df["weighted_importance"] = (best_xgboost_score * feature_importances_df["importance_xgboost"] + best_lightgbm_score * feature_importances_df["importance_lightgbm"]) / (best_xgboost_score + best_lightgbm_score)
# feature_importances_df = feature_importances_df.sort_values("weighted_importance", ascending=False, ignore_index=True)
# feature_importances_df

### Training model on test shuffled data

In [None]:
# shuffled_popular_features_test = pd.read_parquet("data/cleaned/shuffled_popular_features_test.parquet")

# temporal_shuffled_popular_features_test = pd.read_parquet("data/cleaned/temporal_shuffled_popular_features_test.parquet")

# temporal_shuffled_X_test = pd.read_parquet(f"data/cleaned/temporal_shuffled_cleaned_test_{feature_version}.parquet")
# temporal_shuffled_X_test = temporal_shuffled_X_test.drop(columns=["label", "submission_inx"])

# shuffled_X_test = pd.read_parquet(f"data/cleaned/shuffled_cleaned_test_{feature_version}.parquet")
# shuffled_X_test = pd.concat([shuffled_X_test, shuffled_popular_features_test, temporal_shuffled_popular_features_test, temporal_shuffled_X_test], axis = 1)
# shuffled_submission_inx = shuffled_X_test["submission_inx"]
# shuffled_X_test = shuffled_X_test.drop(columns=["label", "submission_inx"])