In [1]:
import gc
import os
import time
import datetime
import warnings
from itertools import combinations
from warnings import simplefilter
import lightgbm as lgb
import catboost as cbt

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error 
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
class CFG:
    """
    Configuration class for parameters and CV strategy for tuning and training
    Please use caps lock capital letters while filling in parameters
    """
    
    # Data preparation
    version_nb         = 3
    is_gpu             = True
    # device             = torch.device('cuda' if torch.cuda.is_available() and gpu_switch else 'cpu')
    state              = 42
    num_workers        = 4

    # BEFORE SUBMIT, CHECK SETTINGS
    is_test_mode       = False
    test_mode_frac     = 10
    is_offline         = True
    testing_days       = 2

    target             = 'target'    
    path               = '/kaggle/input/optiver-trading-at-the-close'
    train_path         = f'{path}/train.csv'
    test_path          = f'{path}/example_test_files/test.csv'
    model_path         = f'' if not is_offline else f'{path}/'
    
    TRAINING           = True
    INFERENCE          = True
    TUNING             = False
    
    methods            = ['LGBM',]
    # methods            = ['CBT']

    plt_path = f'fig/turning'

# Data Loading and Preprocessing 






In [3]:
# 📂 Read the dataset from a CSV file using Pandas
df = pd.read_csv(CFG.train_path)
if CFG.is_test_mode:
    df = df[df['stock_id'] < 10]

# 🧹 Remove rows with missing values in the "target" column
df = df.dropna(subset=["target"])

# 🔁 Reset the index of the DataFrame and apply the changes in place
df.reset_index(drop=True, inplace=True)

# 📏 Get the shape of the DataFrame (number of rows and columns)
df_shape = df.shape
print(df_shape)

(5237892, 17)


# Memory Optimization

In [4]:
def reduce_mem_usage(df, verbose=0):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
               
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)
    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")
    return df


 # Parallel Triplet Imbalance Calculation

In [5]:
from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

def calculate_triplet_imbalance_numba(price, df):
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]
    features_array = compute_triplet_imbalance(df_values, comb_indices)
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)
    return features


# Neighbors Classes

In [6]:
from typing import Dict, List, Optional, Tuple
from sklearn.preprocessing import minmax_scale
from sklearn.neighbors import NearestNeighbors
N_NEIGHBORS_MAX = 80 if not CFG.is_test_mode else 5

class Neighbors:
    def __init__(self, 
                 name: str, 
                 pivot: pd.DataFrame, 
                 p: float, 
                 metric: str = 'minkowski', 
                 metric_params: Optional[Dict] = None, 
                 exclude_self: bool = False):
        self.name = name
        self.exclude_self = exclude_self
        self.p = p
        self.metric = metric
        
        if metric == 'random':
            n_queries = len(pivot)
            self.neighbors = np.random.randint(n_queries, size=(n_queries, N_NEIGHBORS_MAX))
        else:
            nn = NearestNeighbors(
                n_neighbors=N_NEIGHBORS_MAX, 
                p=p, 
                metric=metric, 
                metric_params=metric_params
            )
            nn.fit(pivot)
            _, self.neighbors = nn.kneighbors(pivot, return_distance=True)

        self.columns = self.index = self.feature_values = self.feature_col = None

    def rearrange_feature_values(self, df: pd.DataFrame, feature_col: str) -> None:
        raise NotImplementedError()

    def make_nn_feature(self, n=5, agg=np.mean) -> pd.DataFrame:
        assert self.feature_values is not None, "should call rearrange_feature_values beforehand"

        start = 1 if self.exclude_self else 0

        pivot_aggs = pd.DataFrame(
            agg(self.feature_values[start:n,:,:], axis=0), 
            columns=self.columns, 
            index=self.index
        )

        dst = pivot_aggs.unstack().reset_index()
        dst.columns = ['stock_id', 'time_id', f'{self.feature_col}_nn{n}_{self.name}_{agg.__name__}']
        return dst


class TimeIdNeighbors(Neighbors):
    def rearrange_feature_values(self, df: pd.DataFrame, feature_col: str) -> None:
        feature_pivot = df.pivot('time_id', 'stock_id', feature_col)
        feature_pivot = feature_pivot.fillna(feature_pivot.mean())
        feature_pivot.head()

        feature_values = np.zeros((N_NEIGHBORS_MAX, *feature_pivot.shape))

        for i in range(N_NEIGHBORS_MAX):
            feature_values[i, :, :] += feature_pivot.values[self.neighbors[:, i], :]

        self.columns = list(feature_pivot.columns)
        self.index = list(feature_pivot.index)
        self.feature_values = feature_values
        self.feature_col = feature_col
        
    def __repr__(self) -> str:
        return f"time-id NN (name={self.name}, metric={self.metric}, p={self.p})"


class StockIdNeighbors(Neighbors):
    def rearrange_feature_values(self, df: pd.DataFrame, feature_col: str) -> None:
        """stock-id based nearest neighbor features"""
        feature_pivot = df.pivot(index='time_id', columns='stock_id', values=feature_col)
        feature_pivot = feature_pivot.fillna(feature_pivot.mean())

        feature_values = np.zeros((N_NEIGHBORS_MAX, *feature_pivot.shape))

        for i in range(N_NEIGHBORS_MAX):
            feature_values[i, :, :] += feature_pivot.values[:, self.neighbors[:, i]]

        self.columns = list(feature_pivot.columns)
        self.index = list(feature_pivot.index)
        self.feature_values = feature_values
        self.feature_col = feature_col
        
    def __repr__(self) -> str:
        return f"stock-id NN (name={self.name}, metric={self.metric}, p={self.p})"

In [7]:
# pivot = df_train.pivot(index='time_id', columns='stock_id', values='target')
# pivot = pivot.fillna(pivot.mean())
# pivot = pd.DataFrame(minmax_scale(pivot))

# stock_id_neighbors = []
# stock_id_neighbors.append(StockIdNeighbors(
#     name='stock_price_l1', 
#     pivot=minmax_scale(pivot.T), 
#     p=1, # manhattan_distance (l1)
#     exclude_self=True
# ))

In [8]:
# stock_ids = np.array(sorted(df_train['stock_id'].unique()))
# # for neighbor in stock_id_neighbors:
# print(neighbor)
# display(
#     pd.DataFrame(
#         stock_ids[stock_id_neighbors[0].neighbors[:,:5]], 
#         index=pd.Index(stock_ids, name='stock_id'), 
#         columns=[f'top_{i+1}' for i in range(5)]
#     )
# )

In [9]:
def make_nearest_neighbor_feature(df: pd.DataFrame) -> pd.DataFrame:
    df2 = df.copy()
    feature_cols_stock = {
        'imbalance_size': [np.mean, np.min, np.max, np.std],
        'reference_price': [np.mean, np.min, np.max, np.std],
        'matched_size': [np.mean],
        'far_price': [np.mean],
        'near_price': [np.mean],
        'bid_price': [np.mean],
        'ask_price': [np.mean],
        'wap': [np.mean],
    }

    pivot = df_train.pivot(index='time_id', columns='stock_id', values='target')
    pivot = pivot.fillna(pivot.mean())
    pivot = pd.DataFrame(minmax_scale(pivot))
    
    stock_id_neighbors = []
    stock_id_neighbors.append(StockIdNeighbors(
        name='stock_price_l1', 
        pivot=minmax_scale(pivot.T), 
        p=1, # manhattan_distance (l1)
        exclude_self=True
    ))

    stock_ids = np.array(sorted(df_train['stock_id'].unique()))
    # for neighbor in stock_id_neighbors:
    # print(neighbor)
    display(
        pd.DataFrame(
            stock_ids[stock_id_neighbors[0].neighbors[:,:5]], 
            index=pd.Index(stock_ids, name='stock_id'), 
            columns=[f'top_{i+1}' for i in range(5)]
        )
    )

    stock_id_neighbor_sizes = [10, 20, 40]
    
    ndf: Optional[pd.DataFrame] = None
    
    def _add_ndf(ndf: Optional[pd.DataFrame], dst: pd.DataFrame) -> pd.DataFrame:
        if ndf is None:
            return dst
        else:
            ndf[dst.columns[-1]] = dst[dst.columns[-1]].astype(np.float32)
            return ndf
    
    # neighbor stock_id
    for feature_col in feature_cols_stock.keys():
        try:
            if feature_col not in df2.columns:
                print(f"column {feature_col} is skipped")
                continue
        
            if not stock_id_neighbors:
                continue
        
            for nn in stock_id_neighbors:
                nn.rearrange_feature_values(df2, feature_col)
        
            for agg in feature_cols_stock[feature_col]:
                for n in stock_id_neighbor_sizes:
                    try:
                        for nn in stock_id_neighbors:
                            dst = nn.make_nn_feature(n, agg)
                            ndf = _add_ndf(ndf, dst)
                    except Exception as e:
                        print('stock-id nn', e)
                        pass
        except Exception as e:
            print(e)
    
    if ndf is not None:
        df2 = pd.merge(df2, ndf, on=['time_id', 'stock_id'], how='left')

    return df2

# Feature Generation Functions 

In [10]:
def imbalance_features(df):
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    df["size_imbalance_bid"] = df.eval("imbalance_size / bid_size")
    df["size_imbalance_ask"] = df.eval("imbalance_size / ask_size")
    df["matched_size_bid_ask"] = df.eval("matched_size / (bid_size+ask_size)")

    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
   
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    # Calculate various statistical aggregation features
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        

    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)

            if window == 1:
                continue

            date_ids = df['date_id'].unique()
            agg_func = ['mean', 'sum']
            moving_li = []
            
            for date in date_ids:
                moving_li.append(
                    df[df['date_id'] == date].groupby('stock_id')[col].rolling(window).agg(agg_func).reset_index().set_index('level_1')[agg_func]
                )
            df[[f'{col}_moving_average_{window}', f'{col}_moving_sum_{window}']] = pd.concat(moving_li)[agg_func]
    
    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'market_urgency', 'imbalance_momentum', 'size_imbalance']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    # --- add
    # Calculate diff prices
    for c in combinations(prices, 2):
        df[f'{c[0]}_{c[1]}_diff'] = df.eval(f'({c[0]} - {c[1]})')

    return df.replace([np.inf, -np.inf], 0)

def other_features(df):
    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = df["seconds_in_bucket"] % 60  
    df["minute"] = df["seconds_in_bucket"] // 60  
    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    for key, value in global_weight_feats.items():
        df[f"global_{key}"] = df["weight_label"].map(value.to_dict())

    return df

def generate_all_features(df, feature_name=None):
    # Select relevant columns for feature generation
    cols = [c for c in df.columns if c not in ["row_id", "time_id", "target"]]
    df = df[cols]
    
    # Generate imbalance features
    df = imbalance_features(df)
    df = other_features(df)
    df = make_nearest_neighbor_feature(df)
    gc.collect()  

    if not feature_name:
        feature_name = [i for i in df.columns if i not in ["row_id", "target", "time_id", "date_id", 'ask_price_bid_price_diff']]
    
    return df[feature_name]

In [11]:
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
# weights = {int(k):v for k,v in enumerate(weights)}
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
weight_label = pd.Series(le.fit_transform(weights), name='weight_label')
weight_label.value_counts()

weight_label
2    60
1    60
3    40
0    20
4    10
6     5
5     5
Name: count, dtype: int64

## Data Splitting

In [12]:
df_train = df.copy()
del df

In [13]:
global_stock_id_feats = {
    "median_size": df_train.groupby("stock_id")["bid_size"].median() + df_train.groupby("stock_id")["ask_size"].median(),
    "std_size": df_train.groupby("stock_id")["bid_size"].std() + df_train.groupby("stock_id")["ask_size"].std(),
    "ptp_size": df_train.groupby("stock_id")["bid_size"].max() - df_train.groupby("stock_id")["bid_size"].min(),
    "median_price": df_train.groupby("stock_id")["bid_price"].median() + df_train.groupby("stock_id")["ask_price"].median(),
    "std_price": df_train.groupby("stock_id")["bid_price"].std() + df_train.groupby("stock_id")["ask_price"].std(),
    "ptp_price": df_train.groupby("stock_id")["bid_price"].max() - df_train.groupby("stock_id")["ask_price"].min(),
}

df_train = pd.merge(df_train, weight_label, left_on='stock_id', right_index=True)
global_weight_feats = {
    "median_size": df_train.groupby("weight_label")["bid_size"].median() + df_train.groupby("weight_label")["ask_size"].median(),
    "std_size": df_train.groupby("weight_label")["bid_size"].std() + df_train.groupby("weight_label")["ask_size"].std(),
    "ptp_size": df_train.groupby("weight_label")["bid_size"].max() - df_train.groupby("weight_label")["bid_size"].min(),
    "median_price": df_train.groupby("weight_label")["bid_price"].median() + df_train.groupby("weight_label")["ask_price"].median(),
    "std_price": df_train.groupby("weight_label")["bid_price"].std() + df_train.groupby("weight_label")["ask_price"].std(),
    "ptp_price": df_train.groupby("weight_label")["bid_price"].max() - df_train.groupby("weight_label")["ask_price"].min(),
}
if CFG.TRAINING:
    df_train_feats = generate_all_features(df_train)
    df_train_feats = reduce_mem_usage(df_train_feats)

    feature_name = list(df_train_feats.columns)
    print(f'Feature length = {len(feature_name)}')

Unnamed: 0_level_0,top_1,top_2,top_3,top_4,top_5
stock_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,37,110,2,66
1,1,79,137,37,3
2,2,142,153,70,174
3,3,137,79,37,109
4,4,9,137,131,99
...,...,...,...,...,...
195,195,148,37,3,123
196,196,137,79,3,37
197,197,137,9,131,99
198,198,12,5,15,115


'time_id'
'time_id'
'time_id'
'time_id'
'time_id'
'time_id'
'time_id'
'time_id'
Feature length = 166


In [14]:
# with pd.option_context('display.max_columns', 200):
#     display(
#         pd.concat(
#             [df_train_feats[df_train_feats['stock_id'] == 0].head(15),
#             df_train_feats[df_train_feats['stock_id'] == 0].tail(5)]
#         )
#     )

In [15]:
# df_train_feats.columns[df_train_feats.head(1000).T.duplicated(keep=False)]

## LightGBM

In [16]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import gc

if CFG.TRAINING and 'LGBM' in CFG.methods:
    lgb_params = {
        "objective": "mae",
        "n_estimators": 6000 if not CFG.is_test_mode else 500,
        "num_leaves": 256,
        "subsample": 0.6,
        "colsample_bytree": 0.8,
        "learning_rate": 0.00871,
        'max_depth': 11,
        "n_jobs": 4,
        "device": 'gpu' if CFG.is_gpu else 'cpu',
        "verbosity": -1,
        "importance_type": "gain",
        'seed': CFG.state,
    }
    feature_name = list(df_train_feats.columns)
    print(f"Feature length = {len(feature_name)}")
    
    num_folds = 5
    fold_size = 480 // num_folds
    gap = 5
    
    lgb_models = {}
    scores = {}
    
    model_save_path = f'{CFG.model_path}lgb_model' 
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
    
    for cluster in df_train_feats['weight_label'].unique():
        print(f'Cluster {cluster}')
        cluster_train = df_train_feats[df_train_feats['weight_label'] == cluster]
        cluster_train_target = df_train['target'][df_train['weight_label'] == cluster]

        date_ids = df_train.loc[df_train['weight_label'] == cluster, 'date_id'].values
        
        for i in range(num_folds):
            start = i * fold_size
            end = start + fold_size
            if i < num_folds - 1:  # No need to purge after the last fold
                purged_start = end - 2
                purged_end = end + gap + 2
                train_indices = (date_ids >= start) & (date_ids < purged_start) | (date_ids > purged_end)
            else:
                train_indices = (date_ids >= start) & (date_ids < end)
            
            test_indices = (date_ids >= end) & (date_ids < end + fold_size)
            
            df_fold_train = cluster_train[train_indices]
            df_fold_train_target = cluster_train_target[train_indices]
            df_fold_valid = cluster_train[test_indices]
            df_fold_valid_target = cluster_train_target[test_indices]
        
            print(f"Fold {i+1} Model Training")
    
            models = []
            tmp_scores = []

            # Train a LightGBM model for the current fold
            lgb_model = lgb.LGBMRegressor(**lgb_params)
            lgb_model.fit(
                df_fold_train[feature_name],
                df_fold_train_target,
                eval_set=[(df_fold_valid[feature_name], df_fold_valid_target)],
                callbacks=[
                    lgb.callback.early_stopping(stopping_rounds=100),
                    lgb.callback.log_evaluation(period=100),
                ],
            )
        
            models.append(lgb_model)
            # Save the model to a file
            model_filename = os.path.join(model_save_path, f'lgb_cluster{cluster}_cv{i+1}.txt')
            lgb_model.booster_.save_model(model_filename, importance_type='gain')
            print(f"Model for fold {i+1} cluster {cluster} saved to {model_filename}")
        
            # Evaluate model performance on the validation set
            fold_predictions = lgb_model.predict(df_fold_valid[feature_name])
            fold_score = mean_absolute_error(fold_predictions, df_fold_valid_target)
            tmp_scores.append(fold_score)
            print(f"Fold {i+1} Cluster {cluster} MAE: {fold_score}")

        # Free up memory by deleting fold specific variables
        # del df_fold_train, df_fold_train_target, df_fold_valid, df_fold_valid_target
        # gc.collect()
    
        # Calculate the average best iteration from all regular folds
        average_best_iteration = int(np.mean([model.best_iteration_ for model in models]))
        
        # Update the lgb_params with the average best iteration
        final_model_params = lgb_params.copy()
        final_model_params['n_estimators'] = average_best_iteration
        
        print(f"Cluster{cluster} Training final model with average best iteration: {average_best_iteration}")
        
        # Train the final model on the entire dataset
        final_model = lgb.LGBMRegressor(**final_model_params)
        final_model.fit(
            cluster_train[feature_name],
            cluster_train_target,
            callbacks=[
                lgb.callback.log_evaluation(period=100),
            ],
        )
        
        # Append the final model to the list of models
        models.append(final_model)
        lgb_models[f'Cluster{i}'] = models
        scores[f'Cluster{i}'] = tmp_scores
        
        # Save the final model to a file
        final_model_filename = os.path.join(model_save_path, f'lgb_cluster{cluster}_fin.txt')
        final_model.booster_.save_model(final_model_filename, importance_type='gain')
        print(f"Final model saved to {final_model_filename}")
        
        # Now 'models' holds the trained models for each fold and 'scores' holds the validation scores
        print(f"Average MAE across all folds: {np.mean(tmp_scores)}")
    
    # os.makedirs(f'{model_save_path}/scores', exist_ok=True)
    # scores.insert(0, lgb_params)
    # scores.insert(1, len(feature_name))
    # scores.insert(2, feature_name)
    # with open(f'{model_save_path}/scores/lgbm{datetime.datetime.now()}_{len(feature_name)}.txt', 'w') as f:
    #     print(*scores, file=f, sep='\n')

Feature length = 166
Cluster 2
Fold 1 Model Training
Training until validation scores don't improve for 100 rounds


KeyboardInterrupt: 

## Catboost

In [12]:
if CFG.TRAINING and 'CBT' in CFG.methods:
    cbt_params = {
        'task_type'           : 'GPU' if CFG.is_gpu else 'CPU',
        'objective'           : 'MAE',
        'eval_metric'         : 'MAE',
        'bagging_temperature' : 0.5,
    #     'colsample_bylevel'   : 0.7,
        'iterations'          : 500 if not CFG.is_test_mode else 100,
        'early_stopping_rounds' : 50 if not CFG.is_test_mode else 10,
        'learning_rate'       : 0.065,
        'max_depth'           : 7,
        'l2_leaf_reg'         : 1.5,
        'min_data_in_leaf'    : 1000,
        'random_strength'     : 0.65, 
        'verbose'             : 0,
        'use_best_model'      : True,
        'random_seed'         : CFG.state,
    }
    feature_name = list(df_train_feats.columns)
    print(f"Feature length = {len(feature_name)}")
    
    num_folds = 5
    fold_size = 480 // num_folds
    gap = 5
    
    cbt_models = []
    scores = []
    
    model_save_path = f'{CFG.model_path}cbt_model' 
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
    
    date_ids = df_train['date_id'].values
    
    for i in range(num_folds):
        start = i * fold_size
        end = start + fold_size
        if i < num_folds - 1:  # No need to purge after the last fold
            purged_start = end - 2
            purged_end = end + gap + 2
            train_indices = (date_ids >= start) & (date_ids < purged_start) | (date_ids > purged_end)
        else:
            train_indices = (date_ids >= start) & (date_ids < end)
        
        test_indices = (date_ids >= end) & (date_ids < end + fold_size)
        
        df_fold_train = df_train_feats[train_indices]
        df_fold_train_target = df_train['target'][train_indices]
        df_fold_valid = df_train_feats[test_indices]
        df_fold_valid_target = df_train['target'][test_indices]
        cbt_train = cbt.Pool(df_fold_train, df_fold_train_target)
        cbt_valid = cbt.Pool(df_fold_valid, df_fold_valid_target)
    
        print(f"Fold {i+1} Model Training")
        
        # Train a LightGBM model for the current fold
        cbt_model = cbt.CatBoostRegressor(**cbt_params)
        cbt_model.fit(
            cbt_train,
            eval_set=[cbt_valid],
        )
    
        cbt_models.append(cbt_model)
        # Save the model to a file
        model_filename = os.path.join(model_save_path, f'cbt_cv{i+1}.cbm')
        cbt_model.save_model(model_filename)
        print(f"Model for fold {i+1} saved to {model_filename}")
    
        # Evaluate model performance on the validation set
        fold_predictions = cbt_model.predict(df_fold_valid[feature_name])
        fold_score = mean_absolute_error(fold_predictions, df_fold_valid_target)
        scores.append(fold_score)
        print(f"Fold {i+1} MAE: {fold_score}")
    
        # Free up memory by deleting fold specific variables
        del df_fold_train, df_fold_train_target, df_fold_valid, df_fold_valid_target
        gc.collect()
    
    # Calculate the average best iteration from all regular folds
    average_best_iteration = int(np.mean([model.get_best_iteration() for model in cbt_models]))
    
    # Update the lgb_params with the average best iteration
    final_model_params = cbt_params.copy()
    final_model_params['iterations'] = average_best_iteration
    final_model_params['use_best_model'] = False
    
    print(f"Training final model with average best iteration: {average_best_iteration}")
    
    # Train the final model on the entire dataset
    final_model = cbt.CatBoostRegressor(**final_model_params)
    final_model.fit(
        df_train_feats[feature_name],
        df_train['target'],
    )
    
    # Append the final model to the list of models
    cbt_models.append(final_model)
    
    # Save the final model to a file
    final_model_filename = os.path.join(model_save_path, 'cbt_fin.cbm')
    final_model.save_model(final_model_filename)
    print(f"Final model saved to {final_model_filename}")
    
    # Now 'models' holds the trained models for each fold and 'scores' holds the validation scores
    print(f"Average MAE across all folds: {np.mean(scores)}")

    os.makedirs(f'{model_save_path}/scores', exist_ok=True)
    scores.insert(0, cbt_params)
    scores.insert(1, len(feature_name))
    scores.insert(2, feature_name)
    np.savetxt(f'{model_save_path}/scores/cbt{datetime.datetime.now()}_{len(feature_name)}.txt', scores)

## XGBoost

In [13]:
import xgboost as xgb

if CFG.TRAINING and 'XGB' in CFG.methods:
    params = {
        'task_type'           : 'GPU' if CFG.is_gpu else 'CPU',
        'objective'           : 'MAE',
        'eval_metric'         : 'MAE',
        'bagging_temperature' : 0.5,
    #     'colsample_bylevel'   : 0.7,
        'iterations'          : 500 if not CFG.is_test_mode else 100,
        'early_stopping_rounds' : 50 if not CFG.is_test_mode else 10,
        'learning_rate'       : 0.065,
        'max_depth'           : 7,
        'l2_leaf_reg'         : 1.5,
        'min_data_in_leaf'    : 1000,
        'random_strength'     : 0.65, 
        'verbose'             : 0,
        'use_best_model'      : True,
        'random_seed'         : CFG.state,
    }
    feature_name = list(df_train_feats.columns)
    print(f"Feature length = {len(feature_name)}")
    
    num_folds = 5
    fold_size = 480 // num_folds
    gap = 5
    
    xgb_models = []
    scores = []
    
    model_save_path = f'{CFG.model_path}xgb_model' 
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
    
    date_ids = df_train['date_id'].values
    
    for i in range(num_folds):
        start = i * fold_size
        end = start + fold_size
        if i < num_folds - 1:  # No need to purge after the last fold
            purged_start = end - 2
            purged_end = end + gap + 2
            train_indices = (date_ids >= start) & (date_ids < purged_start) | (date_ids > purged_end)
        else:
            train_indices = (date_ids >= start) & (date_ids < end)
        
        test_indices = (date_ids >= end) & (date_ids < end + fold_size)
        
        df_fold_train = df_train_feats[train_indices]
        df_fold_train_target = df_train['target'][train_indices]
        df_fold_valid = df_train_feats[test_indices]
        df_fold_valid_target = df_train['target'][test_indices]
    
        print(f"Fold {i+1} Model Training")
        
        # Train a LightGBM model for the current fold
        model = xgb.XGBoostRegressor(**params)
        model.fit(
        )
        
        xgb_models.append(model)
        # Save the model to a file
        model_filename = os.path.join(model_save_path, f'xgb_cv{i+1}.txt')
        model.save_model(model_filename)
        print(f"Model for fold {i+1} saved to {model_filename}")
    
        # Evaluate model performance on the validation set
        fold_predictions = model.predict(df_fold_valid[feature_name])
        fold_score = mean_absolute_error(fold_predictions, df_fold_valid_target)
        scores.append(fold_score)
        print(f"Fold {i+1} MAE: {fold_score}")
    
        # Free up memory by deleting fold specific variables
        del df_fold_train, df_fold_train_target, df_fold_valid, df_fold_valid_target
        gc.collect()
    
    # Calculate the average best iteration from all regular folds
    average_best_iteration = int(np.mean([model.get_best_iteration() for model in xgb_models]))
    
    # Update the lgb_params with the average best iteration
    final_model_params = params.copy()
    final_model_params['iterations'] = average_best_iteration
    final_model_params['use_best_model'] = False
    
    print(f"Training final model with average best iteration: {average_best_iteration}")
    
    # Train the final model on the entire dataset
    final_model = xgb.XGBoostRegressor(**final_model_params)
    final_model.fit(
        df_train_feats[feature_name],
        df_train['target'],
    )
    
    # Append the final model to the list of models
    xgb_models.append(final_model)
    
    # Save the final model to a file
    final_model_filename = os.path.join(model_save_path, 'xgb_fin.cbm')
    final_model.save_model(final_model_filename)
    print(f"Final model saved to {final_model_filename}")
    
    # Now 'models' holds the trained models for each fold and 'scores' holds the validation scores
    print(f"Average MAE across all folds: {np.mean(scores)}")

    os.makedirs(f'{model_save_path}/scores', exist_ok=True)
    scores.insert(0, params)
    scores.insert(1, len(feature_name))
    scores.insert(2, feature_name)
    np.savetxt(f'{model_save_path}/scores/xgb{datetime.datetime.now()}_{len(feature_name)}.txt', scores)

In [14]:
def zero_sum(prices, volumes):
    std_error = np.sqrt(volumes)
    step = np.sum(prices) / np.sum(std_error)
    out = prices - std_error * step
    return out

if CFG.INFERENCE:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()
    counter = 0
    y_min, y_max = -64, 64
    qps, predictions = [], []
    cache = pd.DataFrame()
    
    lgb_models = [
#         lgb.Booster(model_file='/kaggle/input/optiver-lgbm/lgb_model/lgb_cv1.txt'),
#         lgb.Booster(model_file='/kaggle/input/optiver-lgbm/lgb_model/lgb_cv2.txt'),
        lgb.Booster(model_file='/kaggle/input/optiver-lgbm/lgb_model/lgb_cv3.txt'),
        lgb.Booster(model_file='/kaggle/input/optiver-lgbm/lgb_model/lgb_cv4.txt'),
        lgb.Booster(model_file='/kaggle/input/optiver-lgbm/lgb_model/lgb_cv5.txt'),
        lgb.Booster(model_file='/kaggle/input/optiver-lgbm/lgb_model/lgb_fin.txt'),
    ]
    cbt_models = [
#         cbt.CatBoostRegressor().load_model('/kaggle/input/optiver-catboost/cbt_model/cbt_cv1.cbm'),
#         cbt.CatBoostRegressor().load_model('/kaggle/input/optiver-catboost/cbt_model/cbt_cv2.cbm'),
#         cbt.CatBoostRegressor().load_model('/kaggle/input/optiver-catboost/cbt_model/cbt_cv3.cbm'),
#         cbt.CatBoostRegressor().load_model('/kaggle/input/optiver-catboost/cbt_model/cbt_cv4.cbm'),
        cbt.CatBoostRegressor().load_model('/kaggle/input/optiver-catboost/cbt_model/cbt_cv5.cbm'),
        cbt.CatBoostRegressor().load_model('/kaggle/input/optiver-catboost/cbt_model/cbt_fin.cbm'),
    ]
    # Weights for each fold model
    lgb_model_weights = [0.1, 0.2, 0.3, 0.4]
    cbt_model_weights = [0.4, 0.6]
#     lgb_model_weights = [1/len(lgb_models)] * len(lgb_models)
#     cbt_model_weights = [1/len(cbt_models)] * len(cbt_models)
    
    for (test, revealed_targets, sample_prediction) in iter_test:
        now_time = time.time()
        cache = pd.concat([cache, test], ignore_index=True, axis=0)
        if counter > 0:
            cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
        feat = generate_all_features(cache)[-len(test):]

        if test.currently_scored.iloc[0] == False:
            sample_prediction['target'] = 0
            env.predict(sample_prediction)
            counter += 1
            qps.append(time.time() - now_time)
            if counter % 10 == 0:
                print(counter, 'qps:', np.mean(qps))
            continue
                    
        feat = feat.drop(columns=['currently_scored'])

        # Generate predictions for each model and calculate the weighted average
        lgb_predictions = np.zeros(len(test))
        for model, weight in zip(lgb_models, lgb_model_weights):
            lgb_predictions += weight * model.predict(feat)

        lgb_predictions = zero_sum(lgb_predictions, test['bid_size']+test['ask_size'])
        clipped_predictions = np.clip(lgb_predictions, y_min, y_max)
        sample_prediction['target'] = clipped_predictions
        # CatBoost
        cbt_predictions = np.zeros(len(test))
        for model, weight in zip(cbt_models, cbt_model_weights):
            cbt_predictions += weight * model.predict(feat)

        cbt_predictions = zero_sum(cbt_predictions, test['bid_size']+test['ask_size'])
        clipped_predictions = np.clip(cbt_predictions, y_min, y_max)
        sample_prediction['target'] = 0.4 * clipped_predictions + 0.6 * sample_prediction['target']

        env.predict(sample_prediction)
        counter += 1
        qps.append(time.time() - now_time)
        if counter % 10 == 0:
            print(counter, 'qps:', np.mean(qps))

    time_cost = 1.146 * np.mean(qps)
    print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")