In [1]:
import gc  
import os  
import time  
import warnings 
from itertools import combinations  
from warnings import simplefilter 
import joblib  
import playground.optivarfuncs as of
import lightgbm as lgb  
import numpy as np  
import pandas as pd  
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold, TimeSeriesSplit  
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
is_offline = False 
is_train = True  
is_infer = True 
max_lookback = np.nan 
split_day = 435  
import polars as pl
# set the max columns to none
pd.set_option('display.max_columns', None)

# Functions

## Settings and helper Functions
There are 480 dates, 5 days a week or 96 weeks

In [2]:
class CONFIG:    
    #take last 3 months worth? or roughly 12*5=60.  So we want from (480-60) to 480
    # start_date=420

    #take last 1 months worth? or roughly 4*5=20.  So we want from (480-20) to 480
    start_date=460

    #just a week for testing?
    start_date=475
    
    #take last 1 months worth? or roughly 4*5=20.  So we want from (480-20) to 480
    doTrainModel= True #if true, #need train and test sets
    runOnKaggle=False #if true, then concat all datasets before calculating features for Kaggle data

    use_subset_of_data=True

In [3]:
from gc import collect;
collect()

0

In [4]:
# Tracking kernel memory usage:-  
from os import path, walk, getpid;
from psutil import Process;
def GetMemUsage():
    """
    This function defines the memory usage across the kernel. 
    Source-
    https://stackoverflow.com/questions/61366458/how-to-find-memory-usage-of-kaggle-notebook
    """;
    
    pid = getpid();
    py = Process(pid);
    memory_use = py.memory_info()[0] / 2. ** 30;
    return f"RAM usage = {memory_use :.4} GB";

def cleanup(df):
    try:
        del df
        df=None
    except:
        pass
    collect()
    return GetMemUsage()

GetMemUsage()

'RAM usage = 0.1951 GB'

In [5]:
#logging
import logging
# set up logging to file - see previous section for more details
logging.basicConfig(level=logging.INFO,
                    filename='logg.log',
                    filemode='w')
# define a Handler which writes INFO messages or higher to the sys.stderr
console = logging.StreamHandler()
# add the handler to the root logger
logging.getLogger().addHandler(console)
logger=logging.getLogger()

#use following to enable and disable
# logger.disabled = True

def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
               
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)
    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")
    return df


## Parallel Triplet Imbalance Calculation function

In [6]:
from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

def calculate_triplet_imbalance_numba(price, df):
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]
    features_array = compute_triplet_imbalance(df_values, comb_indices)
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)
    return features


## Feature Generation Functions 

In [7]:
from tqdm import tqdm
# from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

In [8]:
def imbalance_features(df):
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
    
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
   
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    
    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']
    
    # Calculate various statistical aggregation features
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        

    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1,3,5,10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'weighted_wap','price_spread']:
        for window in [1,3,5,10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
    
    #V4 feature
    for window in [3,5,10]:
        df[f'price_change_diff_{window}'] = df[f'bid_price_diff_{window}'] - df[f'ask_price_diff_{window}']
        df[f'size_change_diff_{window}'] = df[f'bid_size_diff_{window}'] - df[f'ask_size_diff_{window}']

    #V5 - rolling diff
    # Convert from pandas to Polars
    pl_df = pl.from_pandas(df)

    #Define the windows and columns for which you want to calculate the rolling statistics
    windows = [3, 5, 10]
    columns = ['ask_price', 'bid_price', 'ask_size', 'bid_size']

    # prepare the operations for each column and window
    group = ["stock_id"]
    expressions = []

    # Loop over each window and column to create the rolling mean and std expressions
    for window in windows:
        for col in columns:
            rolling_mean_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_mean(window)
                .over(group)
                .alias(f'rolling_diff_{col}_{window}')
            )

            rolling_std_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_std(window)
                .over(group)
                .alias(f'rolling_std_diff_{col}_{window}')
            )

            expressions.append(rolling_mean_expr)
            expressions.append(rolling_std_expr)

    # Run the operations using Polars' lazy API
    lazy_df = pl_df.lazy().with_columns(expressions)

    # Execute the lazy expressions and overwrite the pl_df variable
    pl_df = lazy_df.collect()

    # Convert back to pandas if necessary
    df = pl_df.to_pandas()
    gc.collect()
    
    df['mid_price*volume'] = df['mid_price_movement'] * df['volume']
    df['harmonic_imbalance'] = df.eval('2 / ((1 / bid_size) + (1 / ask_size))')
    
    for col in df.columns:
        df[col] = df[col].replace([np.inf, -np.inf], 0)

    return df

def other_features(df):
    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = df["seconds_in_bucket"] % 60  
    df["minute"] = df["seconds_in_bucket"] // 60  
    df['time_to_market_close'] = 540 - df['seconds_in_bucket']
    
    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df

class gen_all_features():
    def __init__(self,df=None):
        #infer near and far prices
        self.bs=of.bfs(['near_price','far_price'],df,None)
        
    def generate_all_features(self,df):
        #infer near and far prices
        self.bs.doautocol(df)
        df = df.progress_apply(self.bs.backfill, axis=1)
        
        # Select relevant columns for feature generation
        cols = [c for c in df.columns if c not in ["row_id", "time_id"]]
        df = df[cols]
        
        # Generate imbalance features
        df = imbalance_features(df)
        gc.collect() 
        df = other_features(df)
        gc.collect()  
        feature_name = [i for i in df.columns if i not in ["row_id", "time_id"]]
        
        return df[feature_name]



    

# Data Loading and Preprocessing 

In [9]:
def getdata():
    if(CONFIG.runOnKaggle==True):
        df = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
    else:
        df = pd.read_csv("./data/train.csv")
    
    df = df.dropna(subset=["target"])  #drop all rows with NaN in target
    df.reset_index(drop=True, inplace=True)
    return df
    
df=getdata()  
df=reduce_mem_usage(df)
GetMemUsage()  

Memory usage of dataframe is 679.35 MB
Memory usage after optimization is: 304.71 MB
Decreased by 55.15%


'RAM usage = 1.224 GB'

In [10]:
if(CONFIG.use_subset_of_data):
    #just take the last 4 weeks
    df=df[df.date_id>CONFIG.start_date]
print(f'df shape={df.shape}')
GetMemUsage() 

df shape=(55000, 17)


'RAM usage = 0.9381 GB'

# Data Splitting

In [11]:
def cleanup_dataframes():
    #cleanup existing dataframes
    cleanup(df_train)
    cleanup(df_valid)
    # cleanup(df_test_feats)
    cleanup(y_train)
    cleanup(y_valid)
    # cleanup(y_test_feats)

def getDataSets(df):
    if ( CONFIG.doTrainModel == True):
        #just need a train and a valid set
        return of.get2_DatasetAndTarget(df, dep_var='target', val_size=0.05,copy=False, verbose = False) 
df_train, df_valid, y_train, y_valid = getDataSets(df)
GetMemUsage()

'RAM usage = 0.9369 GB'

# Calculate features

In [12]:
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
weights = {int(k):v for k,v in enumerate(weights)}

In [13]:
%%time
def get_global_stock_id_feats(df2):
#first get the stats based on the training features
    global_stock_id_feats = {
            "median_size": df2.groupby("stock_id")["bid_size"].median() + df2.groupby("stock_id")["ask_size"].median(),
            "std_size": df2.groupby("stock_id")["bid_size"].std() + df2.groupby("stock_id")["ask_size"].std(),
            "ptp_size": df2.groupby("stock_id")["bid_size"].max() - df2.groupby("stock_id")["bid_size"].min(),
            "median_price": df2.groupby("stock_id")["bid_price"].median() + df2.groupby("stock_id")["ask_price"].median(),
            "std_price": df2.groupby("stock_id")["bid_price"].std() + df2.groupby("stock_id")["ask_price"].std(),
            "ptp_price": df2.groupby("stock_id")["bid_price"].max() - df2.groupby("stock_id")["ask_price"].min(),
        }
    return global_stock_id_feats

global_stock_id_feats=get_global_stock_id_feats(df_train)

CPU times: user 10.4 ms, sys: 332 µs, total: 10.7 ms
Wall time: 10.4 ms


In [14]:
# cleanup_dataframes()
gaf=gen_all_features()
df1 = gaf.generate_all_features(df)

print("Build df Finished.")
df1=reduce_mem_usage(df1)
GetMemUsage()

100%|██████████| 55000/55000 [00:04<00:00, 12523.77it/s]
Memory usage of dataframe is 68.82 MB
Memory usage after optimization is: 32.89 MB
Decreased by 52.21%


Build df Finished.


'RAM usage = 1.054 GB'

In [15]:
df_train, df_valid, y_train, y_valid = getDataSets(df1)
GetMemUsage()

'RAM usage = 1.054 GB'

In [16]:
df_train.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,syn_near_price,syn_far_price,volume,mid_price,liquidity_imbalance,matched_imbalance,size_imbalance,reference_price_far_price_imb,reference_price_near_price_imb,reference_price_ask_price_imb,reference_price_bid_price_imb,reference_price_wap_imb,far_price_near_price_imb,far_price_ask_price_imb,far_price_bid_price_imb,far_price_wap_imb,near_price_ask_price_imb,near_price_bid_price_imb,near_price_wap_imb,ask_price_bid_price_imb,ask_price_wap_imb,bid_price_wap_imb,ask_price_bid_price_wap_imb2,ask_price_bid_price_reference_price_imb2,ask_price_wap_reference_price_imb2,bid_price_wap_reference_price_imb2,matched_size_bid_size_ask_size_imb2,matched_size_bid_size_imbalance_size_imb2,matched_size_ask_size_imbalance_size_imb2,bid_size_ask_size_imbalance_size_imb2,stock_weights,weighted_wap,wap_momentum,imbalance_momentum,price_spread,spread_intensity,price_pressure,market_urgency,depth_pressure,spread_depth_ratio,mid_price_movement,micro_price,relative_spread,all_prices_mean,all_sizes_mean,all_prices_std,all_sizes_std,all_prices_skew,all_sizes_skew,all_prices_kurt,all_sizes_kurt,matched_size_shift_1,matched_size_ret_1,matched_size_shift_3,matched_size_ret_3,matched_size_shift_5,matched_size_ret_5,matched_size_shift_10,matched_size_ret_10,imbalance_size_shift_1,imbalance_size_ret_1,imbalance_size_shift_3,imbalance_size_ret_3,imbalance_size_shift_5,imbalance_size_ret_5,imbalance_size_shift_10,imbalance_size_ret_10,reference_price_shift_1,reference_price_ret_1,reference_price_shift_3,reference_price_ret_3,reference_price_shift_5,reference_price_ret_5,reference_price_shift_10,reference_price_ret_10,imbalance_buy_sell_flag_shift_1,imbalance_buy_sell_flag_ret_1,imbalance_buy_sell_flag_shift_3,imbalance_buy_sell_flag_ret_3,imbalance_buy_sell_flag_shift_5,imbalance_buy_sell_flag_ret_5,imbalance_buy_sell_flag_shift_10,imbalance_buy_sell_flag_ret_10,ask_price_diff_1,ask_price_diff_3,ask_price_diff_5,ask_price_diff_10,bid_price_diff_1,bid_price_diff_3,bid_price_diff_5,bid_price_diff_10,ask_size_diff_1,ask_size_diff_3,ask_size_diff_5,ask_size_diff_10,bid_size_diff_1,bid_size_diff_3,bid_size_diff_5,bid_size_diff_10,weighted_wap_diff_1,weighted_wap_diff_3,weighted_wap_diff_5,weighted_wap_diff_10,price_spread_diff_1,price_spread_diff_3,price_spread_diff_5,price_spread_diff_10,price_change_diff_3,size_change_diff_3,price_change_diff_5,size_change_diff_5,price_change_diff_10,size_change_diff_10,rolling_diff_ask_price_3,rolling_std_diff_ask_price_3,rolling_diff_bid_price_3,rolling_std_diff_bid_price_3,rolling_diff_ask_size_3,rolling_std_diff_ask_size_3,rolling_diff_bid_size_3,rolling_std_diff_bid_size_3,rolling_diff_ask_price_5,rolling_std_diff_ask_price_5,rolling_diff_bid_price_5,rolling_std_diff_bid_price_5,rolling_diff_ask_size_5,rolling_std_diff_ask_size_5,rolling_diff_bid_size_5,rolling_std_diff_bid_size_5,rolling_diff_ask_price_10,rolling_std_diff_ask_price_10,rolling_diff_bid_price_10,rolling_std_diff_bid_price_10,rolling_diff_ask_size_10,rolling_std_diff_ask_size_10,rolling_diff_bid_size_10,rolling_std_diff_bid_size_10,mid_price*volume,harmonic_imbalance,dow,seconds,minute,time_to_market_close,global_median_size,global_std_size,global_ptp_size,global_median_price,global_std_price,global_ptp_price
0,0,476,0,5936142.0,-1,1.000162,11735802.0,0.0,0.0,0.99999,1339.75,1.000333,45159.25,1.0,1,1,46499.0,1.000161,-0.942375,-0.328185,0.029667,1.0,1.0,-8.5e-05,8.6e-05,8.1e-05,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000171,0.000166,-5e-06,33.25,0.993763,1.055188,16.178572,266.7909,0.977229,0.984498,134.4375,0.004,0.004,,,0.000343,,2035.889526,-0.0003232018,0.0,7.375753e-09,0,1.0,0.000343,0.666748,4429610.5,0.51646,5611983.0,-0.968246,0.838516,-1.875,-1.37271,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,2602.297119,1,0,0,540,77741.273438,113330.742188,497793.34375,1.999808,0.00188,0.003515
1,1,476,0,340144.8,-1,0.998827,1538534.5,0.0,0.0,0.998827,41139.558594,1.000045,1576.640015,1.0,1,1,42716.199219,0.999436,0.926181,-0.637889,26.093184,1.0,1.0,-0.000609,0.0,-0.000587,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000609,2.2e-05,-0.000587,0.038313,,0.038313,,37.84845,4.007922,3.539582,7.557715,0.001,0.001,,,0.001218,,414.283234,0.001128052,-0.0,2.851287e-08,0,1.0,0.001218,0.666283,480348.875,0.516101,721466.5,-0.968241,1.753388,-1.875,3.035313,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,3036.893555,1,0,0,540,34196.175781,65239.136719,243251.40625,1.998478,0.004576,0.009339
2,2,476,0,960061.1,-1,1.000144,4989971.5,0.0,0.0,0.999991,586.590027,1.000656,41871.238281,1.0,1,1,42457.828125,1.000324,-0.972368,-0.677292,0.014009,1.0,1.0,-0.000256,7.6e-05,7.2e-05,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000332,0.000328,-5e-06,72.887421,3.346319,3.555464,16.0,119.8533,4.200122,4.388973,22.24047,0.002,0.002,,,0.000665,,638.44928,-0.0006466337,0.0,1.566281e-08,0,1.0,0.000665,0.666798,1498122.625,0.5165,2369655.5,-0.968245,1.802349,-1.875,3.232602,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1156.971558,1,0,0,540,29787.416016,50589.726562,91794.109375,2.002432,0.002628,0.004537
3,3,476,0,6379094.0,-1,1.000394,31743734.0,0.0,0.0,0.999754,10154.0,1.000246,10159.0,1.0,1,1,20313.0,1.0,-0.000246,-0.66534,0.999508,1.0,1.0,7.4e-05,0.00032,0.000197,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000246,0.000123,-0.000123,1.000242,0.300666,0.60126,1.601648,6346715.0,3.982553,3.982556,1273787.0,0.006,0.006,,,0.000492,,3138.746338,-1.211137e-07,0.0,2.422273e-08,0,1.0,0.000492,0.666732,9535785.0,0.516449,15106653.0,-0.968245,1.77765,-1.875,3.128522,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,10156.499023,1,0,0,540,51616.085938,103787.125,236193.328125,2.000993,0.001404,0.002409
4,4,476,0,3713347.0,-1,1.000209,14506131.0,0.0,0.0,0.999758,17740.0,1.000097,7098.399902,1.0,1,1,24838.400391,0.999928,0.428433,-0.592376,2.499155,1.0,1.0,5.6e-05,0.000225,0.000104,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.00017,4.9e-05,-0.000121,0.400985,0.330169,1.153563,0.863547,1361.486,2.920436,2.912051,347.2792,0.004,0.004,,,0.000339,,1258.94043,0.0001452523,-0.0,1.364948e-08,0,1.0,0.000339,0.666677,4561079.0,0.516406,6855736.5,-0.968245,1.644325,-1.875,2.55282,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,10139.591797,1,0,0,540,45138.894531,65744.742188,269987.75,2.000669,0.001705,0.003182


# Reduce memory

In [17]:
df_train = reduce_mem_usage(df_train)
df_valid = reduce_mem_usage(df_valid)
GetMemUsage()

Memory usage of dataframe is 26.48 MB
Memory usage after optimization is: 26.48 MB
Decreased by 0.00%
Memory usage of dataframe is 6.62 MB
Memory usage after optimization is: 6.62 MB
Decreased by 0.00%


'RAM usage = 1.054 GB'

# rolling?

In [18]:
# df_train.head()

# df_train.reset_index(drop=True, inplace=True)

# df_train.loc[df_train.stock_id==0,:].loc[:,['matched_size','matched_size_shift_1','matched_size_shift_3','matched_size_shift_5','matched_size_shift_10']].head(20)
# test.loc[:,'bid_size']

# Save and Reload df

In [19]:
df_train.to_parquet("./tof_train.parquet")
df_valid.to_parquet("./tof_valid.parquet")

In [20]:
df_train=pd.read_parquet("./tof_train.parquet")
df_valid=pd.read_parquet("./tof_valid.parquet")
GetMemUsage()

'RAM usage = 1.146 GB'

## Last optuna run for lightgbm gave these params

lgb_params={'num_leaves': 244,
 'learning_rate': 0.01,
 'max_depth': 11,
 'reg_alpha': 0.23746308577367767,
 'reg_lambda': 2.702844990315888}

# Train Model

In [21]:
def get_mae(model, X_tst,y_tst):
    y_pred = model.predict(X_tst)
    return mean_absolute_error(y_pred, y_tst)

def evaluate_simple(model, X_train, X_val,X_tst, y_train, y_val,y_tst):   
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])
    return get_mae(model, X_tst,y_tst)
    
def average_target(av_target_train, X_train, X_val, X_tst, y_train, y_val, y_tst):   
    # model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
    return len(y_tst), mean_absolute_error([av_target_train]*len(y_tst), y_tst)

In [22]:
lgb_params={'num_leaves': 244,
 'learning_rate': 0.01,
 'max_depth': 11,
 'reg_alpha': 0.23746308577367767,
 'reg_lambda': 2.702844990315888}

In [23]:
%%time
# Train a LightGBM model for the current fold
model = lgb.LGBMRegressor(**lgb_params)
model.fit(df_train, y_train, eval_set=[(df_valid, y_valid)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])

GetMemUsage()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 36525
[LightGBM] [Info] Number of data points in the train set: 44000, number of used features: 163
[LightGBM] [Info] Start training from score -0.198113
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 50.5477
Did not meet early stopping. Best iteration is:
[35]	valid_0's l2: 49.8724
CPU times: user 17.7 s, sys: 168 ms, total: 17.9 s
Wall time: 4.6 s


'RAM usage = 1.158 GB'

#  Run with Kaggle

In [24]:
# cleanup_dataframes() 
global_stock_id_feats = get_global_stock_id_feats(df)
# cleanup(df)
# GetMemUsage()

In [25]:

GetMemUsage()

'RAM usage = 1.158 GB'

# Figure out kurtosis

In [26]:
# def calculate_Kaggle_features_for_test(df,test):
#     # Define lists of price and size-related column names
#     prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
#     sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

#      # Calculate various statistical aggregation features
#     for func in ["mean", "std", "skew", "kurt"]:
#     df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
#     df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

IndentationError: expected an indented block (3421134065.py, line 8)

# Mock API

In [27]:
from data.public_timeseries_testing_util import MockApi
def make_env():
    return MockApi()


# Real Kaggle



In [31]:
mock_api=True
if mock_api:
    env = make_env()
    iter_test = env.iter_test()
else:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()

cache=None #used for Kaggle to calculate rolling features on 200 stocks

def getcache(test):
    #get all dates in orig dataframe
    dates=df.date_id.unique()
    
    #get tests current date
    date=test.iloc[-1].date_id
    print(f'creating cache, test starts at date={date}')

    if (date in dates):
        i=np.where(dates == date)[0]
        prevdate=i-1
        cache=df.loc[df['date_id']==prevdate[0],:][-4400:]
    else:
        cache=df[-4400:]
        
    #get rid of extra columns in cache
    dropcols=[c for c in cache.columns if c not in test.columns]
    cache.drop(columns=dropcols, inplace=True)

    return cache


def zero_sum(prices, volumes):
    
#    I got this idea from https://github.com/gotoConversion/goto_conversion/
    
    std_error = np.sqrt(volumes)
    step = np.sum(prices)/np.sum(std_error)
    out = prices-std_error*step
    
    return out

for (test, revealed_targets, sample_prediction) in iter_test:
    # test.drop(columns=['currently_scored'],inplace=True)
    
    #add to the cache
    if cache is None:
        cache=getcache(test)
        
    cache=pd.concat([cache,test])
    
    feat = gaf.generate_all_features(cache)
    sample_prediction['target'] = model.predict(feat[-len(test):])
    sample_prediction['target'] = zero_sum(sample_prediction['target'], test.loc[:,'bid_size'] + test.loc[:,'ask_size'])
    env.predict(sample_prediction)

    #just save the last part of the cache
    cache=cache[-4400:]

sample_prediction['target']

creating cache, test starts at date=478


100%|██████████| 200/200 [00:00<00:00, 11659.59it/s]
100%|██████████| 400/400 [00:00<00:00, 11467.21it/s]
100%|██████████| 600/600 [00:00<00:00, 11799.21it/s]
100%|██████████| 800/800 [00:00<00:00, 11604.67it/s]
100%|██████████| 1000/1000 [00:00<00:00, 12481.30it/s]
100%|██████████| 1200/1200 [00:00<00:00, 12645.44it/s]
100%|██████████| 1400/1400 [00:00<00:00, 12368.36it/s]
100%|██████████| 1600/1600 [00:00<00:00, 12265.73it/s]
100%|██████████| 1800/1800 [00:00<00:00, 12220.81it/s]
100%|██████████| 2000/2000 [00:00<00:00, 12647.92it/s]
100%|██████████| 2200/2200 [00:00<00:00, 12543.54it/s]
100%|██████████| 2400/2400 [00:00<00:00, 12531.35it/s]
100%|██████████| 2600/2600 [00:00<00:00, 12563.80it/s]
100%|██████████| 2800/2800 [00:00<00:00, 12516.23it/s]
100%|██████████| 3000/3000 [00:00<00:00, 12476.52it/s]
100%|██████████| 3200/3200 [00:00<00:00, 12610.04it/s]
100%|██████████| 3400/3400 [00:00<00:00, 12537.72it/s]
100%|██████████| 3600/3600 [00:00<00:00, 12604.22it/s]
100%|██████████| 3

0     -0.019533
1     -0.128127
2      0.392179
3      0.086051
4      0.145063
         ...   
195    0.052843
196   -0.115056
197   -0.197594
198    0.356272
199    0.094903
Name: target, Length: 200, dtype: float64

In [None]:
# def zero_sum(prices, volumes):
#     std_error = np.sqrt(volumes)
#     step = np.sum(prices) / np.sum(std_error)
#     out = prices - std_error * step
#     return out

# if is_infer:
#     import optiver2023
#     env = optiver2023.make_env()
#     iter_test = env.iter_test()
#     counter = 0
#     y_min, y_max = -64, 64
#     qps, predictions = [], []
#     cache = pd.DataFrame()

#     # Weights for each fold model
#     model_weights = [1/len(models)] * len(models) 
    
#     for (test, revealed_targets, sample_prediction) in iter_test:
#         now_time = time.time()
#         cache = pd.concat([cache, test], ignore_index=True, axis=0)
#         if counter > 0:
#             cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
#         feat = generate_all_features(cache)[-len(test):]

#         # added after new API, reference: https://www.kaggle.com/competitions/optiver-trading-at-the-close/discussion/455690#2526672
#         if test.currently_scored.iloc[0]== False:
#             sample_prediction['target'] = 0
#             env.predict(sample_prediction)
#             counter += 1
#             qps.append(time.time() - now_time)
#             if counter % 10 == 0:
#                 print(counter, 'qps:', np.mean(qps))
#             continue

#         feat = feat.drop(columns = ["currently_scored"])    
#         # end of new codes for new API
        
#         # Generate predictions for each model and calculate the weighted average
#         lgb_predictions = np.zeros(len(test))
#         for model, weight in zip(models, model_weights):
#             lgb_predictions += weight * model.predict(feat)

#         lgb_predictions = zero_sum(lgb_predictions, test['bid_size'] + test['ask_size'])
#         clipped_predictions = np.clip(lgb_predictions, y_min, y_max)
#         sample_prediction['target'] = clipped_predictions
#         env.predict(sample_prediction)
#         counter += 1
#         qps.append(time.time() - now_time)
#         if counter % 10 == 0:
#             print(counter, 'qps:', np.mean(qps))

#     time_cost = 1.146 * np.mean(qps)
#     print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")