In [1]:
import gc  
import os  
import time  
import warnings 
from itertools import combinations  
from warnings import simplefilter 
import joblib  
import playground.optivarfuncs as of
import lightgbm as lgb  
import numpy as np  
import pandas as pd  
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold, TimeSeriesSplit  
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
is_offline = False 
is_train = True  
is_infer = True 
max_lookback = np.nan 
split_day = 435  
import polars as pl
# set the max columns to none
pd.set_option('display.max_columns', None)

# Functions

## Settings and helper Functions
There are 480 dates, 5 days a week or 96 weeks

In [2]:
class CONFIG:    
    #take last 3 months worth? or roughly 12*5=60.  So we want from (480-60) to 480
    # start_date=420

    #take last 1 months worth? or roughly 4*5=20.  So we want from (480-20) to 480
    start_date=460

    #just a week for testing?
    start_date=475
    
    #take last 1 months worth? or roughly 4*5=20.  So we want from (480-20) to 480
    doTrainModel= True #if true, #need train and test sets
    runOnKaggle=False #if true, then concat all datasets before calculating features for Kaggle data

    use_subset_of_data=False

from gc import collect;
collect()
# Tracking kernel memory usage:-  
from os import path, walk, getpid;
from psutil import Process;
def GetMemUsage():
    """
    This function defines the memory usage across the kernel. 
    Source-
    https://stackoverflow.com/questions/61366458/how-to-find-memory-usage-of-kaggle-notebook
    """;
    
    pid = getpid();
    py = Process(pid);
    memory_use = py.memory_info()[0] / 2. ** 30;
    return f"RAM usage = {memory_use :.4} GB";

def cleanup(df):
    #delete df object
    try:
        del df
        df=None
    except:
        pass
    collect()
    return GetMemUsage()


#logging
import logging
# set up logging to file - see previous section for more details
logging.basicConfig(level=logging.INFO,
                    filename='logg.log',
                    filemode='w')
# define a Handler which writes INFO messages or higher to the sys.stderr
console = logging.StreamHandler()
# add the handler to the root logger
logging.getLogger().addHandler(console)
logger=logging.getLogger()

#use following to enable and disable
# logger.disabled = True

def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
               
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)
    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")
    return df


## Parallel Triplet Imbalance Calculation function

In [3]:
from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

def calculate_triplet_imbalance_numba(price, df):
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]
    features_array = compute_triplet_imbalance(df_values, comb_indices)
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)
    return features


## Feature Generation Functions 

In [32]:
from tqdm import tqdm
# from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

In [5]:
#used for backfilling
from dataclasses import dataclass

@dataclass
class LastValueAllStocks:
    numberstocks = 200
    date_id = 0 #initialise to 0
    all_near_prices = {i:0 for i in range(numberstocks)}
    all_far_prices = {i:0 for i in range(numberstocks)}

    def set_initial_prices(self,df):
        #Set the initial near_and far price for the first entry for each stock in dataframe

        #first get the very first entry for all stocks
        earliest_date=df.date_id.min()
        earliest_seconds_in_bucket=df[df.date_id==earliest_date].seconds_in_bucket.min()
        indices=df.loc[((df.date_id==earliest_date) & (df.seconds_in_bucket==earliest_seconds_in_bucket)),:].index
 
        #then loop through and set the near and far prices for each stock in dftmp
        for index in indices:
            stk=df[df.index==index].stock_id.values[0]       
            df.loc[df.index==index,['near_price']]=self.all_near_prices[stk]
            df.loc[df.index==index,['far_price']]=self.all_far_prices[stk]
        return df

    def save_last_prices(self,df):
        #save the final values for near_and far price for each stock in df

        #first get the very last entry for all stocks
        latest_date=df.date_id.max()
        latest_seconds_in_bucket=df[df.date_id==latest_date].seconds_in_bucket.max()
        indices=df.loc[((df.date_id==latest_date) & (df.seconds_in_bucket==latest_seconds_in_bucket)),:].index

        #then loop through and set the near and far prices for each stock in dftmp
        for index in indices:
            stk=df[df.index==index].stock_id.values[0] 
            self.all_near_prices[stk]= df.loc[df.index==index,['near_price']].values[0][0] 
            self.all_far_prices[stk]= df.loc[df.index==index,['far_price']].values[0][0]
        
class HandleNaNs:
    def __init__(self):
        self.last_value_all_stocks=LastValueAllStocks()

    def fill_nans(self,df):

        #sort to get stock ids all together, should be chunks of 200 rows
        df.sort_values(by=['stock_id','date_id','seconds_in_bucket'],inplace=True)

        df=self.last_value_all_stocks.set_initial_prices(df)
        
        #then do a forward interpolation
        df.far_price=df.far_price.interpolate(method='linear')
        df.near_price=df.near_price.interpolate(method='linear')

        self.last_value_all_stocks.save_last_prices(df)
        df.sort_values(by=['date_id','seconds_in_bucket','stock_id'],inplace=True)
        return df

In [6]:
def imbalance_features(df):
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
    
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
   
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    
    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']
    
    # Calculate various statistical aggregation features
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        

    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1,3,5,10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'weighted_wap','price_spread']:
        for window in [1,3,5,10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
    
    #V4 feature
    for window in [3,5,10]:
        df[f'price_change_diff_{window}'] = df[f'bid_price_diff_{window}'] - df[f'ask_price_diff_{window}']
        df[f'size_change_diff_{window}'] = df[f'bid_size_diff_{window}'] - df[f'ask_size_diff_{window}']

    #V5 - rolling diff
    # Convert from pandas to Polars
    pl_df = pl.from_pandas(df)

    #Define the windows and columns for which you want to calculate the rolling statistics
    windows = [3, 5, 10]
    columns = ['ask_price', 'bid_price', 'ask_size', 'bid_size']

    # prepare the operations for each column and window
    group = ["stock_id"]
    expressions = []

    # Loop over each window and column to create the rolling mean and std expressions
    for window in windows:
        for col in columns:
            rolling_mean_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_mean(window)
                .over(group)
                .alias(f'rolling_diff_{col}_{window}')
            )

            rolling_std_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_std(window)
                .over(group)
                .alias(f'rolling_std_diff_{col}_{window}')
            )

            expressions.append(rolling_mean_expr)
            expressions.append(rolling_std_expr)

    # Run the operations using Polars' lazy API
    lazy_df = pl_df.lazy().with_columns(expressions)

    # Execute the lazy expressions and overwrite the pl_df variable
    pl_df = lazy_df.collect()

    # Convert back to pandas if necessary
    df = pl_df.to_pandas()
    gc.collect()
    
    df['mid_price*volume'] = df['mid_price_movement'] * df['volume']
    df['harmonic_imbalance'] = df.eval('2 / ((1 / bid_size) + (1 / ask_size))')
    
    for col in df.columns:
        df[col] = df[col].replace([np.inf, -np.inf], 0)

    return df

def other_features(df):
    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = df["seconds_in_bucket"] % 60  
    df["minute"] = df["seconds_in_bucket"] // 60  
    df['time_to_market_close'] = 540 - df['seconds_in_bucket']
    
    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df

class gen_all_features():
    def __init__(self,df=None):
        #infer near and far prices
        self.hn=HandleNaNs()
        
    def generate_all_features(self,df):
        #infer near and far prices

        df=self.hn.fill_nans(df)
        
        # Select relevant columns for feature generation
        cols = [c for c in df.columns if c not in ["row_id", "time_id"]]
        df = df[cols]
        
        # Generate imbalance features
        df = imbalance_features(df)
        gc.collect() 
        df = other_features(df)
        gc.collect()  
        feature_name = [i for i in df.columns if i not in ["row_id", "time_id"]]
        
        return df[feature_name]



    

# Data Loading and Preprocessing 

In [7]:
def getdata():
    if(CONFIG.runOnKaggle==True):
        df = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
    else:
        df = pd.read_csv("./data/train.csv")
    
    df = df.dropna(subset=["target"])  #drop all rows with NaN in target
    df.reset_index(drop=True, inplace=True)
    return df
    
df=getdata()  
df=reduce_mem_usage(df)
GetMemUsage()  

Memory usage of dataframe is 679.35 MB
Memory usage after optimization is: 304.71 MB
Decreased by 55.15%


'RAM usage = 1.222 GB'

In [8]:
if(CONFIG.use_subset_of_data):
    #just take the last 4 weeks
    df=df[df.date_id>CONFIG.start_date]
print(f'df shape={df.shape}')
GetMemUsage() 

df shape=(5237892, 17)


'RAM usage = 1.222 GB'

# Data Splitting

In [12]:
def cleanup_dataframes():
    #cleanup existing dataframes
    cleanup(df_train)
    cleanup(df_valid)
    # cleanup(df_test_feats)
    cleanup(y_train)
    cleanup(y_valid)
    # cleanup(y_test_feats)

def getDataSets(df):
    if ( CONFIG.doTrainModel == True):
        #just need a train and a valid set
        return of.get2_DatasetAndTarget(df, dep_var='target', val_size=0.05,copy=False, verbose = False) 
df_train, df_valid, y_train, y_valid = getDataSets(df)
GetMemUsage()

'RAM usage = 1.446 GB'

# Calculate features

In [13]:
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
weights = {int(k):v for k,v in enumerate(weights)}

In [14]:
%%time
def get_global_stock_id_feats(df2):
#first get the stats based on the training features
    global_stock_id_feats = {
            "median_size": df2.groupby("stock_id")["bid_size"].median() + df2.groupby("stock_id")["ask_size"].median(),
            "std_size": df2.groupby("stock_id")["bid_size"].std() + df2.groupby("stock_id")["ask_size"].std(),
            "ptp_size": df2.groupby("stock_id")["bid_size"].max() - df2.groupby("stock_id")["bid_size"].min(),
            "median_price": df2.groupby("stock_id")["bid_price"].median() + df2.groupby("stock_id")["ask_price"].median(),
            "std_price": df2.groupby("stock_id")["bid_price"].std() + df2.groupby("stock_id")["ask_price"].std(),
            "ptp_price": df2.groupby("stock_id")["bid_price"].max() - df2.groupby("stock_id")["ask_price"].min(),
        }
    return global_stock_id_feats

global_stock_id_feats=get_global_stock_id_feats(df_train)

CPU times: user 768 ms, sys: 15.4 ms, total: 783 ms
Wall time: 782 ms


In [15]:
# cleanup_dataframes()
gaf=gen_all_features()
df = gaf.generate_all_features(df)

print("Build df Finished.")
df=reduce_mem_usage(df)
GetMemUsage()

Build df Finished.


Memory usage of dataframe is 3771.41 MB
Memory usage after optimization is: 3122.03 MB
Decreased by 17.22%


'RAM usage = 4.215 GB'

In [16]:
df_train, df_valid, y_train, y_valid = getDataSets(df)
GetMemUsage()

'RAM usage = 9.593 GB'

In [17]:
df_train.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,volume,mid_price,liquidity_imbalance,matched_imbalance,size_imbalance,reference_price_far_price_imb,reference_price_near_price_imb,reference_price_ask_price_imb,reference_price_bid_price_imb,reference_price_wap_imb,far_price_near_price_imb,far_price_ask_price_imb,far_price_bid_price_imb,far_price_wap_imb,near_price_ask_price_imb,near_price_bid_price_imb,near_price_wap_imb,ask_price_bid_price_imb,ask_price_wap_imb,bid_price_wap_imb,ask_price_bid_price_wap_imb2,ask_price_bid_price_reference_price_imb2,ask_price_wap_reference_price_imb2,bid_price_wap_reference_price_imb2,matched_size_bid_size_ask_size_imb2,matched_size_bid_size_imbalance_size_imb2,matched_size_ask_size_imbalance_size_imb2,bid_size_ask_size_imbalance_size_imb2,stock_weights,weighted_wap,wap_momentum,imbalance_momentum,price_spread,spread_intensity,price_pressure,market_urgency,depth_pressure,spread_depth_ratio,mid_price_movement,micro_price,relative_spread,all_prices_mean,all_sizes_mean,all_prices_std,all_sizes_std,all_prices_skew,all_sizes_skew,all_prices_kurt,all_sizes_kurt,matched_size_shift_1,matched_size_ret_1,matched_size_shift_3,matched_size_ret_3,matched_size_shift_5,matched_size_ret_5,matched_size_shift_10,matched_size_ret_10,imbalance_size_shift_1,imbalance_size_ret_1,imbalance_size_shift_3,imbalance_size_ret_3,imbalance_size_shift_5,imbalance_size_ret_5,imbalance_size_shift_10,imbalance_size_ret_10,reference_price_shift_1,reference_price_ret_1,reference_price_shift_3,reference_price_ret_3,reference_price_shift_5,reference_price_ret_5,reference_price_shift_10,reference_price_ret_10,imbalance_buy_sell_flag_shift_1,imbalance_buy_sell_flag_ret_1,imbalance_buy_sell_flag_shift_3,imbalance_buy_sell_flag_ret_3,imbalance_buy_sell_flag_shift_5,imbalance_buy_sell_flag_ret_5,imbalance_buy_sell_flag_shift_10,imbalance_buy_sell_flag_ret_10,ask_price_diff_1,ask_price_diff_3,ask_price_diff_5,ask_price_diff_10,bid_price_diff_1,bid_price_diff_3,bid_price_diff_5,bid_price_diff_10,ask_size_diff_1,ask_size_diff_3,ask_size_diff_5,ask_size_diff_10,bid_size_diff_1,bid_size_diff_3,bid_size_diff_5,bid_size_diff_10,weighted_wap_diff_1,weighted_wap_diff_3,weighted_wap_diff_5,weighted_wap_diff_10,price_spread_diff_1,price_spread_diff_3,price_spread_diff_5,price_spread_diff_10,price_change_diff_3,size_change_diff_3,price_change_diff_5,size_change_diff_5,price_change_diff_10,size_change_diff_10,rolling_diff_ask_price_3,rolling_std_diff_ask_price_3,rolling_diff_bid_price_3,rolling_std_diff_bid_price_3,rolling_diff_ask_size_3,rolling_std_diff_ask_size_3,rolling_diff_bid_size_3,rolling_std_diff_bid_size_3,rolling_diff_ask_price_5,rolling_std_diff_ask_price_5,rolling_diff_bid_price_5,rolling_std_diff_bid_price_5,rolling_diff_ask_size_5,rolling_std_diff_ask_size_5,rolling_diff_bid_size_5,rolling_std_diff_bid_size_5,rolling_diff_ask_price_10,rolling_std_diff_ask_price_10,rolling_diff_bid_price_10,rolling_std_diff_bid_price_10,rolling_diff_ask_size_10,rolling_std_diff_ask_size_10,rolling_diff_bid_size_10,rolling_std_diff_bid_size_10,mid_price*volume,harmonic_imbalance,dow,seconds,minute,time_to_market_close,global_median_size,global_std_size,global_ptp_size,global_median_price,global_std_price,global_ptp_price
0,0,0,0,3180603.0,1,0.999812,13380277.0,0.0,0.0,0.999812,60651.5,1.000026,8493.030273,1.0,69144.53125,0.999919,0.75434,-0.61589,7.141326,1.0,1.0,-0.000107,0.0,-9.4e-05,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000107,1.3e-05,-9.400536e-05,0.139683,,0.139683,,255.37088,3.269177,3.215423,59.816772,0.004,0.004,,,0.000214,,680.587524,0.000161,-0.0,3.094687e-09,0,1.0,0.000214,0.666608,4157506.0,0.516353,6324881.0,-0.968246,1.695159,-1.875,2.775961,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,14899.660156,0,0,0,540,40761.296875,127343.101562,5898989.0,1.99973,0.003378,0.017414
1,1,0,0,166603.9,-1,0.999896,1642214.25,0.0,0.0,0.999896,3233.040039,1.00066,20605.089844,1.0,23838.128906,1.000278,-0.728751,-0.815787,0.156905,1.0,1.0,-0.000382,0.0,-5.2e-05,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000382,0.00033,-5.200776e-05,6.344985,12816.0,6.344985,1744.0,93.34568,9.032276,10.107005,8.404243,0.001,0.001,,,0.000764,,127.277512,-0.000557,0.0,3.204751e-08,0,1.0,0.000764,0.666742,458164.1,0.516456,792759.4,-0.968245,1.94999,-1.875,3.819817,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5589.119629,0,0,0,540,25487.480469,65035.03125,511677.7,1.999878,0.005619,0.02937
2,2,0,0,302879.9,-1,0.999561,1819368.0,0.0,0.0,0.999403,37956.0,1.000298,18995.0,1.0,56951.0,0.999851,0.332935,-0.714567,1.99821,1.0,1.0,-0.000369,7.9e-05,-0.00022,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000448,0.000149,-0.0002985892,0.499201,4.662142,0.678887,2.776772,93.95137,5.724238,5.341909,13.972041,0.002,0.002,,,0.000895,,271.084564,0.000298,-0.0,1.571567e-08,0,1.0,0.000895,0.666544,544799.8,0.516303,859536.8,-0.968244,1.869559,-1.875,3.507308,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,25319.107422,0,0,0,540,25736.900391,72926.539062,1069838.0,2.000176,0.005415,0.051622
3,3,0,0,11917680.0,-1,1.000171,18389746.0,0.0,0.0,0.999999,2324.899902,1.000214,479032.40625,1.0,481357.3125,1.000106,-0.99034,-0.213547,0.004853,1.0,1.0,-2.2e-05,8.6e-05,8.5e-05,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000107,0.000107,-5.066397e-07,239.46666,0.251127,0.250871,168.705887,37.571739,0.54317,0.565807,23.995132,0.006,0.006,,,0.000215,,2562.229492,-0.000213,0.0,4.466411e-10,0,1.0,0.000215,0.666731,7697196.0,0.516447,9008441.0,-0.968246,0.424923,-1.875,-3.574718,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,4627.342285,0,0,0,540,41473.5,94332.9375,1928848.0,1.999974,0.002912,0.018551
4,4,0,0,447550.0,-1,0.999532,17860614.0,0.0,0.0,0.999394,16485.539062,1.000016,434.100006,1.0,16919.638672,0.999705,0.948687,-0.951109,37.97636,1.0,1.0,-0.000242,6.9e-05,-0.000234,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000311,8e-06,-0.0003030921,0.02636,3.507559,0.034131,3.391793,1111.6521,40.395496,38.945301,26.855202,0.004,0.004,,,0.000622,,278.364655,0.00059,-0.0,3.67605e-08,0,1.0,0.000622,0.66649,4581271.0,0.516261,8855317.0,-0.968245,1.996737,-1.875,3.988887,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,845.924988,0,0,0,540,33656.980469,80714.015625,1604066.0,1.99983,0.003781,0.017379


# Reduce memory

In [18]:
df_train = reduce_mem_usage(df_train)
df_valid = reduce_mem_usage(df_valid)
GetMemUsage()

Memory usage of dataframe is 2977.05 MB
Memory usage after optimization is: 2977.05 MB
Decreased by 0.00%
Memory usage of dataframe is 164.96 MB
Memory usage after optimization is: 164.96 MB
Decreased by 0.00%


'RAM usage = 9.593 GB'

# rolling?

In [18]:
# df_train.head()

# df_train.reset_index(drop=True, inplace=True)

# df_train.loc[df_train.stock_id==0,:].loc[:,['matched_size','matched_size_shift_1','matched_size_shift_3','matched_size_shift_5','matched_size_shift_10']].head(20)
# test.loc[:,'bid_size']

# Save and Reload df

In [19]:
df_train.to_parquet("./tof_train.parquet")
df_valid.to_parquet("./tof_valid.parquet")

In [20]:
df_train=pd.read_parquet("./tof_train.parquet")
df_valid=pd.read_parquet("./tof_valid.parquet")
GetMemUsage()

'RAM usage = 17.39 GB'

# Split for the following clusters

In [21]:
#these come from months that were split into 4 clusters
# c0="13  14  17  23  45  54  68  72  75  84  90  106  128  133  138  140  157  158  167  175  186"
# c0=[int(a) for a in c0.split('  ')]
# c1="6  24  26  53  69  111  114  115  118  119  124  156  159  161  166  188  191  196  199"
# c1=[int(a) for a in c1.split('  ')]

# #verify no intersection
# print(f'intersection of c0 and c1={[a for a in c0 if a in c1]}')

# #find all other stocks
# allothers=[a for a in range(200) if a not in c0 and a not in c1]
# clusters=[c0,c1,allothers]
#run 13
# clusters=[[0, 1, 28, 35, 50, 120, 121, 138, 153, 155, 167, 171, 179, 181, 25, 41, 47, 80, 84, 85, 113, 117, 133, 151, 175, 178, 186, 7, 37, 43, 51, 60, 139, 148, 165, 189, 23, 195, 44, 86, 180, 68, 81, 90, 36, 191, 131],
# [2, 8, 40, 53, 55, 74, 77, 100, 101, 102, 114, 130, 150, 166, 177, 9, 18, 20, 29, 33, 52, 56, 59, 62, 99, 107, 111, 125, 149, 152, 196, 16, 39, 46, 135, 168, 170, 15, 63, 67, 79, 104, 145, 164, 173, 71, 93, 98, 119, 19, 136, 123, 161, 115, 82],
# [3, 14, 32, 45, 48, 54, 58, 70, 78, 89, 94, 110, 141, 157, 176, 184, 13, 105, 160, 83, 198],
# [4, 12, 22, 24, 27, 30, 31, 65, 96, 103, 158, 199, 5, 61, 64, 73, 75, 106, 112, 129, 154, 163, 169, 116, 192, 21, 128, 132, 134],
# [6, 17, 42, 57, 88, 109, 118, 122, 126, 147, 159, 182, 188, 193, 10, 34, 49, 66, 69, 91, 97, 137, 142, 185, 194, 26, 92, 146, 174, 108, 197, 127, 76, 124, 72, 140, 87, 183, 162],
# [11, 38, 95, 144, 172, 187, 190, 143, 156]]

# allothers=[]
# flattenclusters=[c for cluster in clusters for c in cluster]
# allothers=[a for a in range(200) if a not in flattenclusters]
# if allothers:
#     clusters.append(allothers)

## Last optuna run for lightgbm gave these params

lgb_params={'num_leaves': 244,
 'learning_rate': 0.01,
 'max_depth': 11,
 'reg_alpha': 0.23746308577367767,
 'reg_lambda': 2.702844990315888}

# Train Model

In [19]:
def get_mae(model, X_tst,y_tst):
    y_pred = model.predict(X_tst)
    return mean_absolute_error(y_pred, y_tst)

def evaluate_simple(model, X_train, X_val,X_tst, y_train, y_val,y_tst):   
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])
    return get_mae(model, X_tst,y_tst)
    
def average_target(av_target_train, X_train, X_val, X_tst, y_train, y_val, y_tst):   
    # model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
    return len(y_tst), mean_absolute_error([av_target_train]*len(y_tst), y_tst)

In [20]:
# lgb_params={'num_leaves': 244,
#  'learning_rate': 0.01,
#  'max_depth': 11,
#  'reg_alpha': 0.23746308577367767,
#  'reg_lambda': 2.702844990315888}

lgb_params = {
        "objective": "mae",
        "n_estimators": 6000,
        "num_leaves": 256,
        "subsample": 0.6,
        "colsample_bytree": 0.8,
#         "learning_rate": 0.00871,
        "learning_rate": 0.01,
        'max_depth': 11,
        "n_jobs": 4,
        "verbosity": -1,
        "importance_type": "gain",
#         "reg_alpha": 0.1,
        "reg_alpha": 0.2,
        "reg_lambda": 3.25
    }

In [21]:
%%time
import os
model_save_path = 'models' 
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

models=[]
# train 1 model on all the data, it will be the first model
model = lgb.LGBMRegressor(**lgb_params)
model.fit(df_train, y_train, eval_set=[(df_valid, y_valid)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])
models.append(model)

# Save the model to a file
model_filename = os.path.join(model_save_path, f'm_0.txt')
model.booster_.save_model(model_filename)
print(f"Model for all data saved to {model_filename}")


# for i,cluster in enumerate(clusters):
#     #split dataframe
#     t=df_train.loc[df_train.stock_id.isin(cluster),:]
#     v=df_valid.loc[df_valid.stock_id.isin(cluster),:]
  
#     # pull from y_train using t index
#     y_t = y_train.loc[t.index]
#     y_v = y_valid.loc[v.index]
    
#     # continue with the rest of your code
#     model = lgb.LGBMRegressor(**lgb_params)
#     model.fit(t, y_t, eval_set=[(v, y_v)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])
    
#     model_filename = os.path.join(model_save_path, f'm_{i+1}.txt')
#     model.booster_.save_model(model_filename)
#     print(f"Model for fold {i+1} saved to {model_filename}")
    
#     models.append(model)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 5.73563
[200]	valid_0's l1: 5.6754
[300]	valid_0's l1: 5.65089
[400]	valid_0's l1: 5.63677
[500]	valid_0's l1: 5.62798
[600]	valid_0's l1: 5.62175
[700]	valid_0's l1: 5.61768
[800]	valid_0's l1: 5.61449
[900]	valid_0's l1: 5.61217
[1000]	valid_0's l1: 5.61139
[1100]	valid_0's l1: 5.61095
[1200]	valid_0's l1: 5.61044
[1300]	valid_0's l1: 5.60989
[1400]	valid_0's l1: 5.60939
[1500]	valid_0's l1: 5.60933
[1600]	valid_0's l1: 5.60875
[1700]	valid_0's l1: 5.60896
Early stopping, best iteration is:
[1617]	valid_0's l1: 5.60872
Model for all data saved to models/m_0.txt
CPU times: user 1h 59min 22s, sys: 3.7 s, total: 1h 59min 25s
Wall time: 30min 28s


In [26]:
# %%time
# import os
# model_save_path = 'models' 
# if not os.path.exists(model_save_path):
#     os.makedirs(model_save_path)

# models=[]
# # train 1 model on all the data, it will be the first model
# model = lgb.LGBMRegressor(**lgb_params)
# model.fit(df_train, y_train, eval_set=[(df_valid, y_valid)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])
# models.append(model)

# # Save the model to a file
# model_filename = os.path.join(model_save_path, f'm_0.txt')
# model.booster_.save_model(model_filename)
# print(f"Model for all data saved to {model_filename}")


# for i,cluster in enumerate(clusters):
#     #split dataframe
#     t=df_train.loc[df_train.stock_id.isin(cluster),:]
#     v=df_valid.loc[df_valid.stock_id.isin(cluster),:]
  
#     # pull from y_train using t index
#     y_t = y_train.loc[t.index]
#     y_v = y_valid.loc[v.index]
    
#     # continue with the rest of your code
#     model = lgb.LGBMRegressor(**lgb_params)
#     model.fit(t, y_t, eval_set=[(v, y_v)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])
    
#     model_filename = os.path.join(model_save_path, f'm_{i+1}.txt')
#     model.booster_.save_model(model_filename)
#     print(f"Model for fold {i+1} saved to {model_filename}")
    
#     models.append(model)


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 5.89449
[200]	valid_0's l1: 5.87543
[300]	valid_0's l1: 5.86448
[400]	valid_0's l1: 5.85611
[500]	valid_0's l1: 5.85211
[600]	valid_0's l1: 5.84895
[700]	valid_0's l1: 5.84654
[800]	valid_0's l1: 5.84487
[900]	valid_0's l1: 5.84396
[1000]	valid_0's l1: 5.84329
[1100]	valid_0's l1: 5.84288
[1200]	valid_0's l1: 5.84232
[1300]	valid_0's l1: 5.84201
[1400]	valid_0's l1: 5.84224
Early stopping, best iteration is:
[1314]	valid_0's l1: 5.84193
Model for all data saved to models/m_0.txt
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 5.56599
[200]	valid_0's l1: 5.55264
[300]	valid_0's l1: 5.54495
[400]	valid_0's l1: 5.54208
[500]	valid_0's l1: 5.54106
[600]	valid_0's l1: 5.53978
[700]	valid_0's l1: 5.53952
Early stopping, best iteration is:
[685]	valid_0's l1: 5.53907
Model for fold 1 saved to models/m_1.txt
Training until validation scores don't improve for 100 rounds
[100]	vali

In [None]:
df.target

0         -3.029704
1         -5.519986
2         -8.389950
3         -4.010201
4         -7.349849
             ...   
5237887    2.310276
5237888   -8.220077
5237889    1.169443
5237890   -1.540184
5237891   -6.530285
Name: target, Length: 5237892, dtype: float32

# train all models again on full dataset

In [None]:
# # del y_train
# # del 
# #load the models

# #get number of models
# num_models=len(clusters)+1

# models=[]
# for i in range(num_models):
#     model_filename = os.path.join(model_save_path, f'm_{i}.txt')
#     model = lgb.Booster(model_file=model_filename)
#     models.append(model)

# # %%time
# # # Train a LightGBM model for the current fold
# # model = lgb.LGBMRegressor(**lgb_params)
# # model.fit(df_train, y_train, eval_set=[(df_valid, y_valid)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])

# # GetMemUsage()

# Lets see how they do

In [24]:
preds=mod.predict(df_valid)
mean_absolute_error(preds, y_valid)


5.60872247403068

In [23]:
res=df_valid['stock_id'].copy().to_frame()
#create a place for the results to go
res['final_res']=np.NaN

    
#do predictions
for i,mod in enumerate(models):
    res[f'res_{i}']=mod.predict(df_valid)

# chooses output from the model trained 
# on the cluster that stock_id is in
def weight_func(x):
    for i,cluster in enumerate(clusters):
        if x.stock_id in cluster:
            #in this case take the average of the first model and the one that was trained
            #for the cluster this stock_id is in
            return ((x['res_0']))
        
res1 = res.apply(weight_func,axis=1)a

NameError: name 'clusters' is not defined

In [34]:
#using all the models
mean_absolute_error(res1, y_valid)

5.850818017031716

In [36]:
#using just the first trained on all the data
mean_absolute_error(res.res_0, y_valid)

5.8419250185785625

In [None]:
#so the individual models idea is bad


#  Run with Kaggle

In [29]:
# cleanup_dataframes() 
global_stock_id_feats = get_global_stock_id_feats(df)
# cleanup(df)
# GetMemUsage()

# Mock API

In [36]:
from data.public_timeseries_testing_util import MockApi
def make_env():
    return MockApi()


# Some visualizations

In [None]:
# import seaborn as sns
# #average kurtosis
# df.columns
# # for func in ["mean", "std", "skew", "kurt"]:
# #         df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
# #         df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
# plt.figure(figsize = (20, 10), dpi = 300)

# sns.lineplot(data = train, x = 'date_id', y = 'target', hue = 'imbalance_buy_sell_flag', errorbar = None, palette = 'viridis')

# plt.title('Average Target Over Days', weight = 'bold', fontsize = 30)
# plt.show()

# Real Kaggle



In [37]:
# %%time
mock_api=True
if mock_api:
    env = make_env()
    iter_test = env.iter_test()
else:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()

cache_size=55000 #5 days worth of data
cache=None #used for Kaggle to calculate rolling features on 200 stocks

def getcache(test):
    #get all dates in orig dataframe
    dates=df.date_id.unique()
    
    #get tests current date
    date=test.iloc[-1].date_id
    print(f'creating cache, test starts at date={date}')

    if (date in dates):
        i=np.where(dates == date)[0]
        prevdate=i-1
        cache=df.loc[df['date_id']==dates[prevdate[0]],:][-cache_size:]
    else:
        cache=df[-cache_size:]
        
    #get rid of extra columns in cache
    dropcols=[c for c in cache.columns if c not in test.columns]
    cache.drop(columns=dropcols, inplace=True)

    return cache


def zero_sum(prices, volumes):
    
#    I got this idea from https://github.com/gotoConversion/goto_conversion/
    
    std_error = np.sqrt(volumes)
    step = np.sum(prices)/np.sum(std_error)
    out = prices-std_error*step
    
    return out

for (test, revealed_targets, sample_prediction) in iter_test:
    # test.drop(columns=['currently_scored'],inplace=True)
    
    #add to the cache
    if cache is None:
        cache=getcache(test)
                
    cache=pd.concat([cache,test])
    
    feat = gaf.generate_all_features(cache)
    # print(f'feat.near_price.isnull().sum()={feat.near_price.isnull().sum()}')
    # x=model.predict(feat[-len(test):])
    # print(type(x))
    # print(x.shape)

    #create a place for the results to go
    # res=test.stock_id.copy().to_frame();
    # res['final_res']=np.NaN
    
    # #do predictions
    # for i,mod in enumerate(models):
    #     res[f'res_{i}']=mod.predict(feat[-len(test):])
    
    # # chooses output from the model trained 
    # # on the cluster that stock_id is in
    # def weight_func(x):
    #     for i,cluster in enumerate(clusters):
    #         if x.stock_id in cluster:
    #             #in this case take the average of the first model and the one that was trained
    #             #for the cluster this stock_id is in
    #             return ((x['res_0']+x[f'res_{i}'])/2)

    # sample_prediction['target'] = res.apply(weight_func,axis=1)
    # sample_prediction['target'] = zero_sum(sample_prediction['target'], test.loc[:,'bid_size'] + test.loc[:,'ask_size'])
    # env.predict(sample_prediction)

    sample_prediction['target'] = model.predict(feat[-len(test):])
    sample_prediction['target'] = zero_sum(sample_prediction['target'], test.loc[:,'bid_size'] + test.loc[:,'ask_size'])
    env.predict(sample_prediction)

    #just save the last part of the cache
    cache=cache[-cache_size:]
    print('done 1')

sample_prediction['target']

creating cache, test starts at date=478
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
done 1
d

0     -1.395904
1      0.544403
2      0.785570
3     -1.152986
4     -1.342191
         ...   
195   -2.244198
196   -2.237867
197   -0.210165
198    1.737057
199   -3.381692
Name: target, Length: 200, dtype: float64

In [None]:
%debug


No traceback has been produced, nothing to debug.


In [None]:
# def zero_sum(prices, volumes):
#     std_error = np.sqrt(volumes)
#     step = np.sum(prices) / np.sum(std_error)
#     out = prices - std_error * step
#     return out

# if is_infer:
#     import optiver2023
#     env = optiver2023.make_env()
#     iter_test = env.iter_test()
#     counter = 0
#     y_min, y_max = -64, 64
#     qps, predictions = [], []
#     cache = pd.DataFrame()

#     # Weights for each fold model
#     model_weights = [1/len(models)] * len(models) 
    
#     for (test, revealed_targets, sample_prediction) in iter_test:
#         now_time = time.time()
#         cache = pd.concat([cache, test], ignore_index=True, axis=0)
#         if counter > 0:
#             cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
#         feat = generate_all_features(cache)[-len(test):]

#         # added after new API, reference: https://www.kaggle.com/competitions/optiver-trading-at-the-close/discussion/455690#2526672
#         if test.currently_scored.iloc[0]== False:
#             sample_prediction['target'] = 0
#             env.predict(sample_prediction)
#             counter += 1
#             qps.append(time.time() - now_time)
#             if counter % 10 == 0:
#                 print(counter, 'qps:', np.mean(qps))
#             continue

#         feat = feat.drop(columns = ["currently_scored"])    
#         # end of new codes for new API
        
#         # Generate predictions for each model and calculate the weighted average
#         lgb_predictions = np.zeros(len(test))
#         for model, weight in zip(models, model_weights):
#             lgb_predictions += weight * model.predict(feat)

#         lgb_predictions = zero_sum(lgb_predictions, test['bid_size'] + test['ask_size'])
#         clipped_predictions = np.clip(lgb_predictions, y_min, y_max)
#         sample_prediction['target'] = clipped_predictions
#         env.predict(sample_prediction)
#         counter += 1
#         qps.append(time.time() - now_time)
#         if counter % 10 == 0:
#             print(counter, 'qps:', np.mean(qps))

#     time_cost = 1.146 * np.mean(qps)
#     print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")