In [1]:
import os
import glob
from joblib import Parallel, delayed, dump, load
import pandas as pd
from pandas.core.common import flatten
from collections import OrderedDict
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, QuantileTransformer, LabelEncoder
from sklearn.cluster import KMeans
import scipy as sc
from sklearn.model_selection import GroupKFold
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import warnings
warnings.filterwarnings("ignore")
pd.set_option("max_columns", 300)

# Parameters

In [2]:
CONFIG = {
    "root_dir": "../../input/optiver-realized-volatility-prediction/",
    "ckpt_path": "../../ckpts/",
    "kfold_seed": 42,
    "n_splits": 5,
    "n_clusters": 7,
    "shuffle": True,
    "shuffle_seed": 1997,
    "pretrained": True
}

# Read Train Test

In [3]:
def read_train_test():
    
    train = pd.read_csv("../../input/optiver-realized-volatility-prediction/train.csv")
    test = pd.read_csv("../../input/optiver-realized-volatility-prediction/test.csv")
    
    # Create a key to merge with book and trade data
    train["row_id"] = train["stock_id"].astype(str) + "-" + train["time_id"].astype(str)
    test["row_id"] = test["stock_id"].astype(str) + "-" + test["time_id"].astype(str)
    
    print("Our training set has {} rows".format(train.shape[0]))
    
    return train, test

# Define basic metric function

In [4]:
def activity_counts(df):
    activity_counts_ = df.groupby(["time_id"])["seconds_in_bucket"].agg("count").reset_index()
    activity_counts_ = activity_counts_.rename(columns={"seconds_in_bucket": "activity_counts"})
    return activity_counts_


def calc_wap(df, pos=1):
    wap = (df["bid_price{}".format(pos)] * df["ask_size{}".format(pos)] + df["ask_price{}".format(pos)] * df[
        "bid_size{}".format(pos)]) / (df["bid_size{}".format(pos)] + df["ask_size{}".format(pos)])
    return wap


def calc_wap2(df, pos=1):
    wap = (df["bid_price{}".format(pos)] * df["bid_size{}".format(pos)] + df["ask_price{}".format(pos)] * df[
        "ask_size{}".format(pos)]) / (df["bid_size{}".format(pos)] + df["ask_size{}".format(pos)])
    return wap


def wp(df):
    wp_ = (df["bid_price1"] * df["bid_size1"] + df["ask_price1"] * df["ask_size1"] + df["bid_price2"] * df[
        "bid_size2"] + df["ask_price2"] * df["ask_size2"]) / (
                  df["bid_size1"] + df["ask_size1"] + df["bid_size2"] + df["ask_size2"])
    return wp_


def maximum_drawdown(series, window=600):
    # window for 10 minutes, use min_periods=1 if you want to allow the expanding window
    roll_max = series.rolling(window, min_periods=1).max()
    second_drawdown = series / roll_max - 1.0
    max_drawdown = second_drawdown.rolling(window, min_periods=1).min()

    return max_drawdown


def log_return(series):
    return np.log(series).diff().fillna(0)


def rolling_log_return(series, rolling=60):
    return np.log(series.rolling(rolling)).diff().fillna(0)


def realized_volatility(series):
    return np.sqrt(np.sum(series ** 2))


def diff(series):
    return series.diff().fillna(0)


def time_diff(series):
    return series.diff().fillna(series)


def order_flow_imbalance(df, pos=1):
    df["bid_price{}_diff".format(pos)] = df.groupby(["time_id"])["bid_price{}".format(pos)].apply(diff)
    df["bid_size{}_diff".format(pos)] = df.groupby(["time_id"])["bid_price{}".format(pos)].apply(diff)
    df["bid_order_flow{}".format(pos)] = df["bid_size{}".format(pos)].copy(deep=True)
    df["bid_order_flow{}".format(pos)].loc[df["bid_price{}_diff".format(pos)] < 0] *= -1
    df["bid_order_flow{}".format(pos)].loc[df["bid_price{}_diff".format(pos)] == 0] = \
        df["bid_size{}_diff".format(pos)].loc[df["bid_price{}_diff".format(pos)] == 0]

    df["ask_price{}_diff".format(pos)] = df.groupby(["time_id"])["ask_price{}".format(pos)].apply(diff)
    df["ask_size{}_diff".format(pos)] = df.groupby(["time_id"])["ask_price{}".format(pos)].apply(diff)
    df["ask_order_flow{}".format(pos)] = df["ask_size{}".format(pos)].copy(deep=True)
    df["ask_order_flow{}".format(pos)].loc[df["ask_price{}_diff".format(pos)] < 0] *= -1
    df["ask_order_flow{}".format(pos)].loc[df["ask_price{}_diff".format(pos)] == 0] = \
        df["ask_size{}_diff".format(pos)].loc[df["ask_price{}_diff".format(pos)] == 0]

    order_flow_imbalance_ = df["bid_order_flow{}".format(pos)] - df["ask_order_flow{}".format(pos)]

    df.drop(["bid_price{}_diff".format(pos), "bid_size{}_diff".format(pos), "bid_order_flow{}".format(pos),
             "ask_price{}_diff".format(pos), "ask_size{}_diff".format(pos), "ask_order_flow{}".format(pos)], axis=1,
            inplace=True)

    return order_flow_imbalance_ + 1e-8


def order_book_slope(df):

    df["mid_point"] = (df["bid_price1"] + df["ask_price1"]) / 2
    best_mid_point_ = df.groupby(["time_id"])["mid_point"].agg("max").reset_index()
    best_mid_point_ = best_mid_point_.rename(columns={"mid_point": "best_mid_point"})
    df = df.merge(best_mid_point_, how="left", on="time_id")

    best_mid_point = df["best_mid_point"].copy()
    df.drop(["mid_point", "best_mid_point"], axis=1, inplace=True)

    def ratio(series):
        ratio_ = series / series.shift()
        return ratio_

    bid_price1_ratio = df.groupby(["time_id"])["bid_price1"].apply(ratio)
    bid_price1_mid_point_ratio = df["bid_price1"] / best_mid_point
    bid_price1_ratio = abs(bid_price1_ratio.fillna(bid_price1_mid_point_ratio) - 1)

    bid_size1_ratio = df.groupby(["time_id"])["bid_size1"].apply(ratio) - 1
    bid_size1_ratio = bid_size1_ratio.fillna(df["bid_size1"])
    df["DE"] = (bid_size1_ratio / bid_price1_ratio).replace([np.inf, -np.inf], np.nan).fillna(0)

    ask_price1_ratio = df.groupby(["time_id"])["ask_price1"].apply(ratio)
    ask_price1_mid_point_ratio = df["ask_price1"] / best_mid_point
    ask_price1_ratio = abs(ask_price1_ratio.fillna(ask_price1_mid_point_ratio) - 1)

    ask_size1_ratio = df.groupby(["time_id"])["ask_size1"].apply(ratio) - 1
    ask_size1_ratio = ask_size1_ratio.fillna(df["ask_size1"])
    df["SE"] = (ask_size1_ratio / ask_price1_ratio).replace([np.inf, -np.inf], np.nan).fillna(0)

    df["order_book_slope"] = (df["DE"] + df["SE"]) / 2
    order_book_slope_ = df.groupby(["time_id"])["order_book_slope"].agg("mean").reset_index()
    df.drop(["order_book_slope", "DE", "SE"], axis=1, inplace=True)

    return order_book_slope_


def ldispersion(df):
    LDispersion = 1 / 2 * (
            df["bid_size1"] / (df["bid_size1"] + df["bid_size2"]) * abs(df["bid_price1"] - df["bid_price2"]) + df[
        "ask_size1"] / (df["ask_size1"] + df["ask_size2"]) * abs(df["ask_price1"] - df["ask_price2"]))
    return LDispersion


def depth_imbalance(df, pos=1):
    depth_imbalance_ = (df["bid_size{}".format(pos)] - df["ask_size{}".format(pos)]) / (
            df["bid_size{}".format(pos)] + df["ask_size{}".format(pos)])

    return depth_imbalance_


def height_imbalance(df, pos=1):
    height_imbalance_ = (df["bid_price{}".format(pos)] - df["ask_price{}".format(pos)]) / (
            df["bid_price{}".format(pos)] + df["ask_price{}".format(pos)])

    return height_imbalance_


def pressure_imbalance(df):
    mid_price = (df["bid_price1"] + df["ask_price1"]) / 2

    weight_buy = mid_price / (mid_price - df["bid_price1"]) + mid_price / (mid_price - df["bid_price2"])
    pressure_buy = df["bid_size1"] * (mid_price / (mid_price - df["bid_price1"])) / weight_buy + df["bid_size2"] * (
            mid_price / (mid_price - df["bid_price2"])) / weight_buy

    weight_sell = mid_price / (df["ask_price1"] - mid_price) + mid_price / (df["ask_price2"] - mid_price)
    pressure_sell = df["ask_size1"] * (mid_price / (df["ask_price1"] - mid_price)) / weight_sell + df["ask_size2"] * (
            mid_price / (df["ask_price2"] - mid_price)) / weight_sell

    pressure_imbalance_ = np.log(pressure_buy) - np.log(pressure_sell)

    return pressure_imbalance_


def relative_spread(df, pos=1):
    relative_spread_ = 2 * (df["ask_price{}".format(pos)] - df["bid_price{}".format(pos)]) / (
            df["ask_price{}".format(pos)] + df["bid_price{}".format(pos)])

    return relative_spread_


def count_unique(series):
    return len(np.unique(series))

# Processor

In [5]:
# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    df = pd.read_parquet(file_path)

    # float 64 to float 32
    float_cols = df.select_dtypes(include=[np.float64]).columns
    df[float_cols] = df[float_cols].astype(np.float32)

    # int 64 to int 32
    int_cols = df.select_dtypes(include=[np.int64]).columns
    df[int_cols] = df[int_cols].astype(np.int32)
    
    # rebase seconds_in_bucket
    df["seconds_in_bucket"] = df["seconds_in_bucket"] - df["seconds_in_bucket"].min()

    # Calculate seconds gap
    df["seconds_gap"] = df.groupby(["time_id"])["seconds_in_bucket"].apply(time_diff)

    # Calculate Wap
    df["wap1"] = calc_wap(df, pos=1)
    df["wap2"] = calc_wap(df, pos=2)

    # Calculate wap balance
    df["wap_balance"] = abs(df["wap1"] - df["wap2"])

    # Calculate log returns
    df["log_return1"] = df.groupby(["time_id"])["wap1"].apply(log_return)
    df["log_return2"] = df.groupby(["time_id"])["wap2"].apply(log_return)

    # Calculate spread
    df["bid_ask_spread1"] = df["ask_price1"] / df["bid_price1"] - 1
    df["bid_ask_spread2"] = df["ask_price2"] / df["bid_price2"] - 1

    # order flow imbalance
    df["order_flow_imbalance1"] = order_flow_imbalance(df, 1)
    df["order_flow_imbalance2"] = order_flow_imbalance(df, 2)

    # order book slope
    order_slope_ = order_book_slope(df)
    df = df.merge(order_slope_, how="left", on="time_id")

    # depth imbalance
    df["depth_imbalance1"] = depth_imbalance(df, pos=1)
    df["depth_imbalance2"] = depth_imbalance(df, pos=2)

    # height imbalance
    df["height_imbalance1"] = height_imbalance(df, pos=1)
    df["height_imbalance2"] = height_imbalance(df, pos=2)

    # pressure imbalance
    df["pressure_imbalance"] = pressure_imbalance(df)

    # total volume
    df["total_volume"] = (df["ask_size1"] + df["ask_size2"]) + (df["bid_size1"] + df["bid_size2"])

    # Dict for aggregations
    create_feature_dict = {
        "wap1": [np.sum, np.std],
        "wap2": [np.sum, np.std],
        "log_return1": [realized_volatility],
        "log_return2": [realized_volatility],
        "wap_balance": [np.sum, np.max, np.min, np.std],
        "bid_ask_spread1": [np.sum, np.max, np.min, np.std],
        "bid_ask_spread2": [np.sum, np.max, np.min, np.std],
        "order_flow_imbalance1": [np.sum, np.max, np.min, np.std],
        "order_flow_imbalance2": [np.sum, np.max, np.min, np.std],
        "order_book_slope": [np.mean, np.max],
        "depth_imbalance1": [np.sum, np.max, np.std],
        "depth_imbalance2": [np.sum, np.max, np.std],
        "height_imbalance1": [np.sum, np.max, np.std],
        "height_imbalance2": [np.sum, np.max, np.std],
        "pressure_imbalance": [np.sum, np.max, np.std],
        "total_volume": [np.sum],
        "seconds_gap": [np.mean]
    }
    create_feature_dict_time = {
        "log_return1": [realized_volatility],
        "log_return2": [realized_volatility],
        "wap_balance": [np.sum, np.max, np.min, np.std],
        "bid_ask_spread1": [np.sum, np.max, np.min, np.std],
        "bid_ask_spread2": [np.sum, np.max, np.min, np.std],
        "order_flow_imbalance1": [np.sum, np.max, np.min, np.std],
        "order_flow_imbalance2": [np.sum, np.max, np.min, np.std],
        "total_volume": [np.sum],
        "seconds_gap": [np.mean]
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(feature_dict, seconds_in_bucket, add_suffix=False):
        # Group by the window
        df_feature_ = df[df["seconds_in_bucket"] >= seconds_in_bucket].groupby(["time_id"]).agg(
            feature_dict).reset_index()
        # Rename columns joining suffix
        df_feature_.columns = ["_".join(col) for col in df_feature_.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature_ = df_feature_.add_suffix("_" + str(seconds_in_bucket))
        return df_feature_

    # Get the stats for different windows
    windows = [0, 150, 300, 450]
    add_suffixes = [False, True, True, True]
    df_feature = None

    for window, add_suffix in zip(windows, add_suffixes):
        if df_feature is None:
            df_feature = get_stats_window(feature_dict=create_feature_dict, seconds_in_bucket=window,
                                          add_suffix=add_suffix)
        else:
            new_df_feature = get_stats_window(feature_dict=create_feature_dict_time, seconds_in_bucket=window,
                                              add_suffix=add_suffix)
            df_feature = df_feature.merge(new_df_feature, how="left", left_on="time_id_",
                                          right_on="time_id__{}".format(window))

            # Drop unnecesary time_ids
            df_feature.drop(["time_id__{}".format(window)], axis=1, inplace=True)

    # Create row_id so we can merge
    stock_id = file_path.split("=")[1]
    df_feature["row_id"] = df_feature["time_id_"].apply(lambda x: f"{stock_id}-{x}")
    df_feature.drop(["time_id_"], axis=1, inplace=True)

    return df_feature

In [6]:
# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    df = pd.read_parquet(file_path)

    # float 64 to float 32
    float_cols = df.select_dtypes(include=[np.float64]).columns
    df[float_cols] = df[float_cols].astype(np.float32)

    # int 64 to int 32
    int_cols = df.select_dtypes(include=[np.int64]).columns
    df[int_cols] = df[int_cols].astype(np.int32)
    
    # rebase seconds_in_bucket
    df["seconds_in_bucket"] = df["seconds_in_bucket"] - df["seconds_in_bucket"].min()

    # Calculate seconds gap
    df["seconds_gap"] = df.groupby(["time_id"])["seconds_in_bucket"].apply(time_diff)

    # Calculate log return
    df["price_log_return"] = df.groupby("time_id")["price"].apply(log_return)

    # Calculate volumes
    df["volumes"] = df["price"] * df["size"]

    # Dict for aggregations
    create_feature_dict = {
        "price_log_return": [realized_volatility],
        "volumes": [np.sum, np.max, np.std],
        "order_count": [np.sum],
        "seconds_gap": [np.mean]
    }
    create_feature_dict_time = {
        "price_log_return": [realized_volatility],
        "volumes": [np.sum, np.max, np.std],
        "order_count": [np.sum],
        "seconds_gap": [np.mean]
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(feature_dict, seconds_in_bucket, add_suffix=False):
        # Group by the window
        df_feature_ = df[df["seconds_in_bucket"] >= seconds_in_bucket].groupby(["time_id"]).agg(
            feature_dict).reset_index()
        # Rename columns joining suffix
        df_feature_.columns = ["_".join(col) for col in df_feature_.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature_ = df_feature_.add_suffix("_" + str(seconds_in_bucket))
        return df_feature_

    # Get the stats for different windows
    windows = [0, 150, 300, 450]
    add_suffixes = [False, True, True, True]
    df_feature = None

    for window, add_suffix in zip(windows, add_suffixes):
        if df_feature is None:
            df_feature = get_stats_window(feature_dict=create_feature_dict, seconds_in_bucket=window,
                                          add_suffix=add_suffix)
        else:
            new_df_feature = get_stats_window(feature_dict=create_feature_dict_time, seconds_in_bucket=window,
                                              add_suffix=add_suffix)
            df_feature = df_feature.merge(new_df_feature, how="left", left_on="time_id_",
                                          right_on="time_id__{}".format(window))

            # Drop unnecesary time_ids
            df_feature.drop(["time_id__{}".format(window)], axis=1, inplace=True)

    def tendency(price, vol):
        df_diff = np.diff(price)
        val = (df_diff / price[1:]) * 100
        power = np.sum(val * vol[1:])
        return (power)

    lis = []
    for n_time_id in df["time_id"].unique():
        
        df_id = df[df["time_id"] == n_time_id]
        
        tendencyV = tendency(df_id["price"].values, df_id["size"].values)
        energy = np.mean(df_id["price"].values ** 2)

        lis.append(
            {
                "time_id": n_time_id,
                "tendency": tendencyV,
                "energy": energy,
            }
        )

    df_lr = pd.DataFrame(lis)
    df_feature = df_feature.merge(df_lr, how="left", left_on="time_id_", right_on="time_id")

    # Create row_id so we can merge
    df_feature = df_feature.add_prefix("trade_")
    stock_id = file_path.split("=")[1]
    df_feature["row_id"] = df_feature["trade_time_id_"].apply(lambda x: f"{stock_id}-{x}")
    df_feature.drop(["trade_time_id_", "trade_time_id"], axis=1, inplace=True)
    return df_feature

In [7]:
# Funtion to make preprocessing function in parallel (for each stock id)
def preprocessor(list_stock_ids, is_train = True):
    
    # Parrallel for loop
    def for_joblib(stock_id):
        # Train
        if is_train:
            file_path_book = CONFIG["root_dir"] + "book_train.parquet/stock_id=" + str(stock_id)
            file_path_trade = CONFIG["root_dir"] + "trade_train.parquet/stock_id=" + str(stock_id)
        # Test
        else:
            file_path_book = CONFIG["root_dir"] + "book_test.parquet/stock_id=" + str(stock_id)
            file_path_trade = CONFIG["root_dir"] + "trade_test.parquet/stock_id=" + str(stock_id)
    
        # Preprocess book and trade data and merge them
        df_tmp = pd.merge(book_preprocessor(file_path_book), trade_preprocessor(file_path_trade), on = "row_id", how = "left")
        
        # Return the merge dataframe
        return df_tmp
    
    # Use parallel api to call paralle for loop
    df = Parallel(n_jobs = -1, verbose = 1)(delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    
    # Concatenate all the dataframes that return from Parallel
    df = pd.concat(df, ignore_index = True)
    
    return df

In [8]:
# Read train and test
train, test = read_train_test()

# Get unique stock ids 
train_stock_ids = train["stock_id"].unique()

# Preprocess them using Parallel and our single stock id functions
train_ = preprocessor(train_stock_ids, is_train=True)
train = train.merge(train_, on=["row_id"], how="left")

# Get unique stock ids 
test_stock_ids = test["stock_id"].unique()

# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train=False)
test = test.merge(test_, on=["row_id"], how="left")

Our training set has 428932 rows


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed:  4.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


# Log Transformation

In [9]:
# abs log columns
abs_log_columns = [column for column in train.columns if 
                       "order_flow_imbalance" in column or 
                       "order_book_slope" in column or 
                       "depth_imbalance" in column or 
                       "pressure_imbalance" in column or
                       "total_volume" in column or
                       "seconds_gap" in column or
                       "trade_volumes" in column or
                       "trade_order_count" in column or
                       "trade_seconds_gap" in column or
                       "trade_tendency" in column
                      ]

# apply abs + log1p
train[abs_log_columns] = (train[abs_log_columns].apply(np.abs)).apply(np.log1p)
test[abs_log_columns] = (test[abs_log_columns].apply(np.abs)).apply(np.log1p)

# Fill inf with nan

In [10]:
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

# Agg features inside fold

In [11]:
# Process agg by kmeans
def get_kmeans_idx(n_clusters=7):
    train_p = pd.read_csv("../../input/optiver-realized-volatility-prediction/train.csv")
    train_p = train_p.pivot(index="time_id", columns="stock_id", values="target")

    corr = train_p.corr()

    ids = corr.index

    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(corr.values)

    kmeans_clusters = []
    for n in range(n_clusters):
        kmeans_clusters.append ([(x - 1) for x in ((ids + 1)*(kmeans.labels_ == n)) if x > 0])
        
    return kmeans_clusters
    

def agg_stat_features_by_clusters(df, n_clusters=7, function=np.nanmean, post_fix="_cluster_mean"):

    kmeans_clusters = get_kmeans_idx(n_clusters=n_clusters)

    clusters = []
    agg_columns = [
        "time_id",
        "stock_id",
        "log_return1_realized_volatility",
        "log_return2_realized_volatility",
        "order_flow_imbalance1_sum",
        "order_flow_imbalance2_sum",
        "order_book_slope_mean",
        "depth_imbalance1_std",
        "depth_imbalance2_std",
        "height_imbalance1_sum",
        "height_imbalance2_sum",
        "pressure_imbalance_std",
        "total_volume_sum",
        "seconds_gap_mean",
        "trade_price_log_return_realized_volatility",
        "trade_volumes_sum",
        "trade_order_count_sum",
        "trade_seconds_gap_mean",
        "trade_tendency",
        "trade_energy"
    ]

    for cluster_idx, ind in enumerate(kmeans_clusters):
        cluster_df = df.loc[df["stock_id"].isin(ind), agg_columns].groupby(["time_id"]).agg(function)
        cluster_df.loc[:, "stock_id"] = str(cluster_idx) + post_fix
        clusters.append(cluster_df)

    clusters_df = pd.concat(clusters).reset_index()
    # multi index (column, c1)
    clusters_df = clusters_df.pivot(index="time_id", columns="stock_id")
    # ravel multi index to list of tuple [(target, c1), ...]
    clusters_df.columns = ["_".join(x) for x in clusters_df.columns.ravel()]
    clusters_df.reset_index(inplace=True)

    postfixes = [
        "0" + post_fix,
        "1" + post_fix,
        "3" + post_fix,
        "4" + post_fix,
        "6" + post_fix,
    ]
    merge_columns = []
    for column in agg_columns:
        if column == "time_id":
            merge_columns.append(column)
        elif column == "stock_id":
            continue
        else:
            for postfix in postfixes:
                merge_columns.append(column + "_" + postfix)
                
    not_exist_columns = [column for column in merge_columns if column not in clusters_df.columns]
    clusters_df[not_exist_columns] = 0
    
    df = pd.merge(df, clusters_df[merge_columns], how="left", on="time_id")

    return df


# Function to get group stats for the time_id
def agg_stat_features_by_market(df, operations=None, operations_names=None):
    def percentile(n):
        def percentile_(x):
            return np.percentile(x, n)

        percentile_.__name__ = "percentile_%s" % n
        return percentile_

    if operations is None:
        operations = [
            np.nanmean,
        ]
        operations_names = [
            "mean",
        ]

    # Get realized volatility columns
    vol_cols = [
        "log_return1_realized_volatility",
        "log_return1_realized_volatility_150",
        "log_return1_realized_volatility_300",
        "log_return1_realized_volatility_450",
    ]

    # Group by the stock id
    df_stock_id = df.groupby(["stock_id"])[vol_cols].agg(operations).reset_index()
    # Rename columns joining suffix
    df_stock_id.columns = ["_".join(col) for col in df_stock_id.columns]
    df_stock_id = df_stock_id.add_suffix("_" + "stock")

    # Group by the stock id
    df_time_id = df.groupby(["time_id"])[vol_cols].agg(operations).reset_index()
    # Rename columns joining suffix
    df_time_id.columns = ["_".join(col) for col in df_time_id.columns]
    df_time_id = df_time_id.add_suffix("_" + "time")

    # Merge with original dataframe
    df = df.merge(df_stock_id, how="left", left_on=["stock_id"], right_on=["stock_id__stock"])
    df.drop("stock_id__stock", axis=1, inplace=True)

    df = df.merge(df_time_id, how="left", left_on=["time_id"], right_on=["time_id__time"])
    df.drop("time_id__time", axis=1, inplace=True)

    return df


# Loss functions

In [12]:
# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))


class RMSPE(Metric):
    def __init__(self):
        self._name = "rmspe"
        self._maximize = False

    def __call__(self, y_true, y_score):
        
        return np.sqrt(np.mean(np.square((y_true - y_score) / y_true)))
    


def RMSPELoss(y_pred, y_true):
    return torch.sqrt(torch.mean( ((y_true - y_pred) / y_true) ** 2 )).clone()

In [13]:
def train_and_evaluate(train, test):
    
    # label encoder
    cat_columns = ["stock_id"]
    label_encoder = LabelEncoder()
    label_encoder = label_encoder.fit(train[cat_columns].values)
    train[cat_columns] = label_encoder.transform(train[cat_columns].values)
    dump(label_encoder, os.path.join(CONFIG["ckpt_path"], "label_encoder.bin"), compress=True)
    test[cat_columns] = label_encoder.transform(test[cat_columns].values)
    cat_dims = [len(label_encoder.classes_)]
    
    # scaler
    # scaler = QuantileTransformer(n_quantiles=2000, random_state=2021)
    scaler = StandardScaler()
    
    # Split features and target
    x = train.drop(["row_id", "target"], axis=1)
    y = train["target"]
    
    # x_test with train feature
    x_test = test.drop("row_id", axis=1)
    x_test = agg_stat_features_by_market(x_test)
    x_test = agg_stat_features_by_clusters(x_test, n_clusters=CONFIG["n_clusters"], function=np.nanmean, post_fix="_cluster_mean")
    x_test = agg_stat_features_by_clusters(x_test, n_clusters=CONFIG["n_clusters"], function=np.nanmax, post_fix="_cluster_max")
    x_test = agg_stat_features_by_clusters(x_test, n_clusters=CONFIG["n_clusters"], function=np.nanmin, post_fix="_cluster_min")
    x_test = agg_stat_features_by_clusters(x_test, n_clusters=CONFIG["n_clusters"], function=np.nanstd, post_fix="_cluster_std")

    # define normalize columns
    except_columns = ["stock_id", "time_id", "target", "row_id"]
    normalized_columns = [column for column in x_test.columns if column not in except_columns]
    x_test.drop("time_id", axis=1, inplace=True)
    
    # Process categorical features and get params dict
    cat_idxs = [i for i, f in enumerate(x_test.columns.tolist()) if f in cat_columns]
    
    params = dict(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=1,
        n_d=16,
        n_a=16,
        n_steps=2,
        gamma=2,
        n_independent=2,
        n_shared=2,
        lambda_sparse=0,
        optimizer_fn=Adam,
        optimizer_params=dict(lr = (2e-2)),
        mask_type="entmax",
        scheduler_params=dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False),
        scheduler_fn=CosineAnnealingWarmRestarts,
        seed=42,
        verbose=10
    )
    
    # Create out of folds array
    oof_predictions = np.zeros(x.shape[0])
    
    # Create test array to store predictions
    test_predictions = np.zeros(x_test.shape[0])
    
    # Statistics
    feature_importances = pd.DataFrame()
    feature_importances["feature"] = x_test.columns.tolist()
    stats = pd.DataFrame()
    explain_matrices = []
    masks_ =[]
    
    # Create a KFold object
    kfold = GroupKFold(n_splits=CONFIG["n_splits"])
    
    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x, groups=x["time_id"])):
        print(f"Training fold {fold + 1}")
        x_train = x.iloc[trn_ind]
        x_train = agg_stat_features_by_market(x_train)
        x_train = agg_stat_features_by_clusters(x_train, n_clusters=CONFIG["n_clusters"], function=np.nanmean, post_fix="_cluster_mean")
        x_train = agg_stat_features_by_clusters(x_train, n_clusters=CONFIG["n_clusters"], function=np.nanmax, post_fix="_cluster_max")
        x_train = agg_stat_features_by_clusters(x_train, n_clusters=CONFIG["n_clusters"], function=np.nanmin, post_fix="_cluster_min")
        x_train = agg_stat_features_by_clusters(x_train, n_clusters=CONFIG["n_clusters"], function=np.nanstd, post_fix="_cluster_std")
        x_train.drop("time_id", axis=1, inplace=True)
        
        scaler = scaler.fit(x_train[normalized_columns])
        dump(scaler, os.path.join(CONFIG["ckpt_path"], "tabnet_std_scaler_fold_{}.bin".format(fold + 1)), compress=True)
        x_train[normalized_columns] = scaler.transform(x_train[normalized_columns])
        x_train = x_train.fillna(0)
        
        x_val = x.iloc[val_ind]
        x_val = agg_stat_features_by_market(x_val)
        x_val = agg_stat_features_by_clusters(x_val, n_clusters=CONFIG["n_clusters"], function=np.nanmean, post_fix="_cluster_mean")
        x_val = agg_stat_features_by_clusters(x_val, n_clusters=CONFIG["n_clusters"], function=np.nanmax, post_fix="_cluster_max")
        x_val = agg_stat_features_by_clusters(x_val, n_clusters=CONFIG["n_clusters"], function=np.nanmin, post_fix="_cluster_min")
        x_val = agg_stat_features_by_clusters(x_val, n_clusters=CONFIG["n_clusters"], function=np.nanstd, post_fix="_cluster_std")
        x_val.drop("time_id", axis=1, inplace=True)
        
        x_val[normalized_columns] = scaler.transform(x_val[normalized_columns])
        x_val = x_val.fillna(0)
        
        y_train, y_val = y.iloc[trn_ind].values.reshape(-1, 1), y.iloc[val_ind].values.reshape(-1, 1)
        
        # Train
        clf =  TabNetRegressor(**params)
        
        if CONFIG["pretrained"] and os.path.exists(os.path.join(CONFIG["ckpt_path"], "tabnet_fold{}.zip".format(fold + 1))):
            clf.load_model(os.path.join(CONFIG["ckpt_path"], "tabnet_fold{}.zip".format(fold + 1)))
        else:
            clf.fit(
                  x_train.values, y_train,
                  eval_set=[(x_val.values, y_val)],
                  max_epochs=200,
                  patience=50,
                  batch_size=1024*20, 
                  virtual_batch_size=128*20,
                  num_workers=0,
                  drop_last=False,
                  eval_metric=[RMSPE],
                  loss_fn=RMSPELoss
              )

            # save model
            saved_filepath = clf.save_model(os.path.join(CONFIG["ckpt_path"], "tabnet_fold{}".format(fold + 1)))
        
        # save statistics
        explain_matrix, masks = clf.explain(x_val.values)
        explain_matrices.append(explain_matrix)
        masks_.append(masks[0])
        masks_.append(masks[1])
        
        # save oof and test predictions
        oof_predictions[val_ind] = clf.predict(x_val.values).flatten()
        x_test_ = x_test.copy()
        
        x_test_[normalized_columns] = scaler.transform(x_test_[normalized_columns])
        x_test_ = x_test_.fillna(0)
        
        test_predictions += clf.predict(x_test_.values).flatten() / CONFIG["n_splits"]
        
    rmspe_score = rmspe(y, oof_predictions)
    print("Our out of folds RMSPE is {}".format(rmspe_score))
    
    # Return test predictions
    return test_predictions, stats, feature_importances, explain_matrices, masks_

In [14]:
# Traing and evaluate
if CONFIG["shuffle"]:
    print("shuffling dataset")
    train = train.sample(frac=1, random_state=CONFIG["shuffle_seed"]).reset_index(drop=True)
test_predictions, stats, feature_importances, explain_matrices, masks_ = train_and_evaluate(train, test)

# Save test predictions
# test["target"] = test_predictions
# test[["row_id", "target"]].to_csv("submission.csv",index = False)

shuffling dataset
Training fold 1
Device used : cuda
epoch 0  | loss: 145.1339| val_0_rmspe: 45.0331 |  0:00:09s
epoch 10 | loss: 1.27266 | val_0_rmspe: 1.16858 |  0:01:33s
epoch 20 | loss: 1.10778 | val_0_rmspe: 1.75913 |  0:02:57s
epoch 30 | loss: 0.45416 | val_0_rmspe: 0.44331 |  0:04:19s
epoch 40 | loss: 0.39822 | val_0_rmspe: 0.3414  |  0:05:42s
epoch 50 | loss: 0.32769 | val_0_rmspe: 0.30663 |  0:07:04s
epoch 60 | loss: 0.26401 | val_0_rmspe: 0.26592 |  0:08:27s
epoch 70 | loss: 0.24352 | val_0_rmspe: 0.24514 |  0:09:49s
epoch 80 | loss: 0.22456 | val_0_rmspe: 0.23949 |  0:11:12s
epoch 90 | loss: 0.21725 | val_0_rmspe: 0.22892 |  0:12:34s
epoch 100| loss: 0.21938 | val_0_rmspe: 0.23077 |  0:13:57s
epoch 110| loss: 0.20914 | val_0_rmspe: 0.22704 |  0:15:20s
epoch 120| loss: 0.20693 | val_0_rmspe: 0.23098 |  0:16:42s
epoch 130| loss: 0.20607 | val_0_rmspe: 0.22915 |  0:18:05s
epoch 140| loss: 0.20276 | val_0_rmspe: 0.22801 |  0:19:27s
epoch 150| loss: 0.19982 | val_0_rmspe: 0.22939

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\jioni\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-2398305526da>", line 5, in <module>
    test_predictions, stats, feature_importances, explain_matrices, masks_ = train_and_evaluate(train, test)
  File "<ipython-input-13-faaa7f63fd3e>", line 107, in train_and_evaluate
    clf.fit(
  File "C:\Users\jioni\anaconda3\lib\site-packages\pytorch_tabnet\abstract_model.py", line 223, in fit
    self._train_epoch(train_dataloader)
  File "C:\Users\jioni\anaconda3\lib\site-packages\pytorch_tabnet\abstract_model.py", line 431, in _train_epoch
    for batch_idx, (X, y) in enumerate(train_loader):
  File "C:\Users\jioni\anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 521, in __next__
    data = self._next_data()
  File "C:\Users\jioni\anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 561, in _ne

TypeError: object of type 'NoneType' has no len()