In [71]:
import gc  
import os  
import time  
import warnings 
from itertools import combinations  
from warnings import simplefilter 
import joblib  
# import playground.optivarfuncs as of
import lightgbm as lgb  
import numpy as np  
import pandas as pd  
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold, TimeSeriesSplit  
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
# Run-mode flags and split settings.
# NOTE(review): none of these five names is referenced by any other cell visible
# in this notebook — presumably carried over from the notebook this one was
# derived from; confirm before relying on them.
is_offline = False 
is_train = True  
is_infer = True 
max_lookback = np.nan 
split_day = 435  
import polars as pl
from gc import collect;

In [72]:
import pandas as pd
import numpy as np
import itertools

#--------------------------------------get splits (no unit tests)
import numpy as np

def getsplitday( X, val_size, verbose = True):
    '''
    Find the date_id on which to split X so that no day straddles the boundary.

    The distinct ``date_id`` values are sorted ascending and the day found at
    position ``floor(n_days * (1 - val_size))`` is returned.  Rows whose
    ``date_id`` is strictly below the returned day form the first part; rows at
    or above it form the second part.

    X        : DataFrame exposing a ``date_id`` column
    val_size : fraction of the distinct days reserved for the second part
    verbose  : when True, print the candidate days and the chosen split day

    Raises IndexError when X contains no days at all.

    ex
    tstdf=of.getdf(numb_days= 10, drop_days= [2,3])
    getsplitday( tstdf, val_size=0.1)
    9
    '''
    # unique() reports days in order of first appearance; sort them so that the
    # positional cut agrees with the `<` / `>=` date comparisons made by callers
    days=np.sort(X.date_id.unique())
    index=int(np.floor(len(days)*(1-val_size)))
    # val_size <= 0 would point one past the last day and val_size > 1 would wrap
    # around to the end; keep the position inside the valid range instead
    index=max(0, min(index, len(days)-1))
    if (verbose):
        print(f"For days{days} and val_size={val_size}, split day={days[index]}")
    return days[index]

def splitDataset(X, val_size, copy=False,verbose=True):
    '''
    Cut X into a train part and a validation part along a day boundary, so
    that no single date_id ends up in both parts.

    X        : DataFrame with a ``date_id`` column
    val_size : fraction of days handed to getsplitday for the validation part
    copy     : when True, both returned frames are explicit ``.copy()``s
    verbose  : forwarded to getsplitday

    returns (X_train, X_val): rows before the split day, rows on/after it

    ex
    tstdf=of.getdf(numb_days= 10, drop_days= [2,3])
    X_train, X_val = splitDataset(tstdf, 0.2, verbose=False)
    '''
    # day on which the validation part starts
    boundary=getsplitday( X,val_size, verbose)

    before_boundary=X.eval(f"(date_id<{boundary})")
    parts=(X[before_boundary], X.loc[(X.date_id>=boundary)])
    if(copy==True):
        parts=tuple(part.copy() for part in parts)

    X_train, X_val = parts
    return  X_train, X_val

def splitTarget(X, dep_var='target',verbose=True):
    '''
    Separate the dependent variable from the features.

    The column named ``dep_var`` is removed from X *in place*; the same X object
    is handed back together with the removed column.  ``verbose`` is accepted
    but not used.

    ex
    X_train, y_train = splitTarget(X_train, dep_var)
    '''
    target_values = X[dep_var]
    # in-place removal: the caller's frame loses the column as well
    X.drop(labels=[dep_var], axis="columns", inplace=True)
    return X, target_values

def get2_DatasetAndTarget(X, dep_var='target', val_size=0.2,copy=False, verbose=True):
    '''
    Day-aligned train/validation split with the target column separated out.

    X is first cut by splitDataset, then splitTarget removes ``dep_var`` from
    each part.

    returns X_train, X_val, y_train, y_val

    ex
    tstdf=of.getdf(numb_days= 10, drop_days= [2,3])
    X_train, X_val, y_train, y_val = get2_DatasetAndTarget(tstdf, dep_var='target', val_size=0.2, verbose=False)
    '''
    train_part, val_part = splitDataset(X, val_size, copy,verbose)
    (X_train, y_train), (X_val, y_val) = [
        splitTarget(part, dep_var) for part in (train_part, val_part)
    ]

    if not verbose:
        return X_train, X_val, y_train, y_val

    # report absolute and relative sizes of the two parts
    print(f"len(X_train)={len(X_train)} and len(X_val)={len(X_val)}")
    tots=len(X_val)+len(X_train)
    print(f"train size={len(X_train)/tots}, val size={len(X_val)/tots}")
    return X_train, X_val, y_train, y_val

def get3_DatasetAndTarget(X, dep_var='target', val_size=0.1, test_size=0.2,copy=False,verbose = True):
    '''
    Day-aligned train/validation/test split with the target column separated out.

    The last ``val_size + test_size`` share of days is cut off first; that
    remainder is then cut again so that ``test_size`` of the original share
    goes to the test part.  splitTarget removes ``dep_var`` from each part.

    returns X_train, X_val, X_tst, y_train, y_val, y_tst

    ex
    tstdf=of.getdf(numb_days= 10, drop_days= [2,3])
    X_train, X_val, X_tst, y_train, y_val, y_tst = get3_DatasetAndTarget(tstdf, 'target', .1, .2, verbose=False)
    '''
    train_part, held_out = splitDataset(X, val_size+test_size,copy, verbose)
    val_part, tst_part = splitDataset(held_out, test_size/(val_size+test_size),copy, verbose)

    (X_train, y_train), (X_val, y_val), (X_tst, y_tst) = [
        splitTarget(part, dep_var) for part in (train_part, val_part, tst_part)
    ]

    if not verbose:
        return X_train, X_val, X_tst, y_train, y_val, y_tst

    # report absolute and relative sizes of the three parts
    print(f"len(X_train)={len(X_train)} and len(X_val)={len(X_val)}, len(X_tst)={len(X_tst)}")
    tots=len(X_val)+len(X_train)+len(X_tst)
    print(f"train size={len(X_train)/tots}, val size={len(X_val)/tots}, tst size={len(X_tst)/tots}")
    return X_train, X_val, X_tst, y_train, y_val, y_tst

# Functions

## Settings and helper Functions
There are 480 dates; at 5 trading days per week that is 96 weeks.

In [73]:
class CONFIG:
    """Notebook-wide switches, read as plain class attributes (never instantiated here)."""

    # if true, then concat all datasets before calculating features for Kaggle data
    # (also selects the Kaggle input paths for the CSV and the saved models)
    runOnKaggle = False

    # just a week for testing?
    # take last 1 months worth? or roughly 4*5=20.  So we want from (480-20) to 480
    start_date = 475

    # if true, need train and test sets
    doTrainModel = True

    use_subset_of_data = True

In [74]:
# Tracking kernel memory usage:-  
from os import path, walk, getpid;
from psutil import Process;
def GetMemUsage():
    """
    Report how much memory the current kernel process is using.

    Reads the first field of psutil's memory_info() for this process id,
    converts it from bytes to GiB and returns it as a formatted string.
    Source-
    https://stackoverflow.com/questions/61366458/how-to-find-memory-usage-of-kaggle-notebook
    """
    bytes_per_gib = 2. ** 30
    this_process = Process(getpid())
    memory_use = this_process.memory_info()[0] / bytes_per_gib
    return f"RAM usage = {memory_use :.4} GB"

def cleanup(df):
    """
    Run a garbage-collection pass and return the current memory-usage string.

    df : object the caller no longer needs.  Only this function's own reference
         to it is dropped here; a function cannot unbind the caller's variable,
         so the memory is reclaimed only once the caller has also released its
         reference (e.g. ``del df`` in the calling cell before calling this).

    returns the string produced by GetMemUsage().
    """
    # Release the local reference.  The previous try/`del`/bare-except wrapper
    # could never raise, so it has been removed rather than kept as a silent
    # catch-all.
    df = None
    collect()
    return GetMemUsage()

GetMemUsage()

'RAM usage = 3.642 GB'

In [75]:
#logging
import logging
# Send INFO-and-above records to logg.log.  basicConfig does nothing when the
# root logger already has handlers, so re-running this cell does not add a
# second file handler.
logging.basicConfig(level=logging.INFO,
                    filename='logg.log',
                    filemode='w')
logger=logging.getLogger()

# Also echo records to sys.stderr.  The handler is tagged with a name and reused
# on later runs of this cell: attaching a fresh StreamHandler on every run made
# each message print once per run (the repeated lines in the saved outputs).
console = next((h for h in logger.handlers if h.get_name() == "notebook_console"), None)
if console is None:
    console = logging.StreamHandler()
    console.set_name("notebook_console")
    logger.addHandler(console)

#use following to enable and disable
# logger.disabled = True

def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of df in place to smaller dtypes.

    * Columns whose dtype name starts with "int" are cast to the first of
      int8 / int16 / int32 whose range strictly contains the column's min and
      max; otherwise they are left as they are.
    * Every other real-valued numeric column is cast to float32 (float16 is
      deliberately never used).
    * Boolean, complex and non-numeric columns (object, datetime, category, ...)
      are left untouched.  They used to be either inflated to float32 (bool) or
      to make the function raise (datetime / unordered category).

    df      : DataFrame to shrink; modified in place
    verbose : when True, log memory usage before/after through the module-level
              ``logger``

    returns the same df object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype

        if str(col_type)[:3] == "int":
            # min/max are only needed for the integer ladder
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif (
            pd.api.types.is_numeric_dtype(col_type)
            and not pd.api.types.is_bool_dtype(col_type)
            and not pd.api.types.is_complex_dtype(col_type)
        ):
            # the original three range-dependent branches all ended in float32
            df[col] = df[col].astype(np.float32)

    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        # guard the percentage against a frame that reports zero bytes
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        logger.info(f"Decreased by {decrease:.2f}%")
    return df


## Parallel Triplet Imbalance Calculation function

In [76]:
from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """
    Compute one "triplet imbalance" column per column-index triple.

    For every row j and every triple (a, b, c) in comb_indices, the three values
    df_values[j, a], df_values[j, b], df_values[j, c] are ordered into
    min / mid / max and the feature is (max - mid) / (mid - min).  When mid
    equals min the feature is NaN instead of dividing by zero.

    df_values    : 2-D array indexed as [row, column]
    comb_indices : sequence of 3-tuples of column positions into df_values

    returns an array of shape (rows of df_values, len(comb_indices)).
    """
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    # every cell is written below, so an uninitialised buffer is fine
    imbalance_features = np.empty((num_rows, num_combinations))
    # prange: the combinations are distributed over numba's parallel workers
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            # the middle value is what remains after removing min and max from the sum
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            if mid_val == min_val:
                # denominator would be zero
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

def calculate_triplet_imbalance_numba(price, df):
    """
    Build the triplet-imbalance feature frame for the columns named in ``price``.

    price : list of column names present in df
    df    : DataFrame holding those columns

    returns a new DataFrame with one "{a}_{b}_{c}_imb2" column per 3-combination
    of ``price`` (default integer index, same number of rows as df).
    """
    # every unordered triple of column names, in itertools order
    triplets = list(combinations(price, 3))

    # translate names to their positions inside ``price`` (and so inside df[price])
    comb_indices = [tuple(price.index(name) for name in trio) for trio in triplets]
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in triplets]

    features_array = compute_triplet_imbalance(df[price].values, comb_indices)
    return pd.DataFrame(features_array, columns=columns)


## Feature Generation Functions 

In [77]:
from tqdm import tqdm
# from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

In [78]:
#used for backfilling
from dataclasses import dataclass

@dataclass
class LastValueAllStocks:
    """
    Remembers, per stock_id, the last near_price / far_price seen in the most
    recently processed frame so the next frame can be seeded with them.

    numberstocks : number of stock ids (0 .. numberstocks-1) that get an
                   initial remembered price of 0
    date_id      : initialised to 0; not read by any method of this class

    all_near_prices / all_far_prices are created per instance.  They used to be
    class-level dicts, so every instance shared (and overwrote) the same state.
    """
    numberstocks: int = 200
    date_id: int = 0 #initialise to 0

    def __post_init__(self):
        # one private {stock_id: price} mapping per instance
        self.all_near_prices = {i:0 for i in range(self.numberstocks)}
        self.all_far_prices = {i:0 for i in range(self.numberstocks)}

    @staticmethod
    def _edge_bucket_positions(df, earliest):
        """Row positions of the first (earliest=True) or last time bucket in df."""
        dates = df["date_id"]
        day = dates.min() if earliest else dates.max()
        on_day = (dates == day).to_numpy()
        day_seconds = df["seconds_in_bucket"].iloc[on_day.nonzero()[0]]
        second = day_seconds.min() if earliest else day_seconds.max()
        in_bucket = on_day & (df["seconds_in_bucket"] == second).to_numpy()
        # positions, not index labels: labels may repeat after a pd.concat
        return in_bucket.nonzero()[0]

    def set_initial_prices(self,df):
        """
        Overwrite near_price and far_price of the rows in df's earliest
        (date_id, seconds_in_bucket) bucket with the remembered value of each
        row's stock_id.  df is modified in place and returned.

        A stock_id with no remembered value receives NaN.
        """
        rows = self._edge_bucket_positions(df, earliest=True)
        if len(rows) == 0:
            return df

        stocks = df["stock_id"].iloc[rows]
        for column, remembered in (("near_price", self.all_near_prices),
                                   ("far_price", self.all_far_prices)):
            # positional write: only the rows of the earliest bucket are touched,
            # even when index labels are duplicated
            df.iloc[rows, df.columns.get_loc(column)] = stocks.map(remembered).to_numpy()
        return df

    def save_last_prices(self,df):
        """
        Remember near_price and far_price of every stock found in df's latest
        (date_id, seconds_in_bucket) bucket.
        """
        rows = self._edge_bucket_positions(df, earliest=False)
        last = df.iloc[rows]
        stocks = last["stock_id"].tolist()
        self.all_near_prices.update(zip(stocks, last["near_price"].tolist()))
        self.all_far_prices.update(zip(stocks, last["far_price"].tolist()))
        
class HandleNaNs:
    """Fills missing near_price / far_price values, carrying state between calls."""

    def __init__(self):
        self.last_value_all_stocks=LastValueAllStocks()

    def fill_nans(self,df):
        """
        Fill NaNs in far_price and near_price by linear interpolation.

        df is sorted in place (first by stock, then back to time order) and
        returned.  The interpolation runs over each whole column while the rows
        are in stock order; the tracker seeds the earliest bucket beforehand and
        records the latest bucket afterwards.
        """
        tracker = self.last_value_all_stocks

        # bring each stock's rows together, oldest first
        df.sort_values(by=['stock_id','date_id','seconds_in_bucket'],inplace=True)
        df = tracker.set_initial_prices(df)

        # forward interpolation, far_price first and then near_price
        for price_col in ('far_price', 'near_price'):
            df[price_col] = df[price_col].interpolate(method='linear')

        tracker.save_last_prices(df)

        # restore time-major ordering
        df.sort_values(by=['date_id','seconds_in_bucket','stock_id'],inplace=True)
        return df

In [79]:
def imbalance_features(df):
    """
    Add the order-book / auction imbalance features to df and return the result.

    Every column created before the polars round-trip is written into the frame
    that was passed in; the returned frame is a new pandas object produced by
    ``pl_df.to_pandas()``.  Reads the module-level ``weights`` mapping
    (stock_id -> weight) and calls ``calculate_triplet_imbalance_numba``.

    df : DataFrame with at least stock_id, the price columns listed in
         ``prices``, the size columns listed in ``sizes`` and
         imbalance_buy_sell_flag.

    returns a DataFrame with the input columns plus the generated features, in
    which +/-inf values have been replaced by 0.
    """
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    
    # Simple row-wise book features
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    # Normalised difference (a - b) / (a + b) for every pair of price columns
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Triplet imbalance for every 3-combination within each of the two column lists
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        # assigned via .values, i.e. by row position rather than by index label
        df[triplet_feature.columns] = triplet_feature.values
    
    # Per-stock weight looked up from the module-level `weights` mapping
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
   
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    # Sign (+1 / -1 / 0) of the mid_price change over 5 rows; NaN differences map to 0.
    # NOTE(review): unlike the other diffs in this function this one is NOT grouped
    # by stock_id, so it compares rows 5 positions apart in whatever order df is in
    # — confirm this is intended before changing it (a trained model may depend on it).
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    
    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']
    
    #TODO anyway to save the whole?
    # Calculate various statistical aggregation features
    # (row-wise across the price columns and across the size columns)
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        

    # Per-stock lagged value and percentage change over each window
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1,3,5,10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'weighted_wap','price_spread']:
        for window in [1,3,5,10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
    
    #V4 feature
    # bid-minus-ask difference of the per-stock diffs computed just above
    for window in [3,5,10]:
        df[f'price_change_diff_{window}'] = df[f'bid_price_diff_{window}'] - df[f'ask_price_diff_{window}']
        df[f'size_change_diff_{window}'] = df[f'bid_size_diff_{window}'] - df[f'ask_size_diff_{window}']

    #V5 - rolling diff
    # Convert from pandas to Polars
    pl_df = pl.from_pandas(df)

    #Define the windows and columns for which you want to calculate the rolling statistics
    windows = [3, 5, 10]
    columns = ['ask_price', 'bid_price', 'ask_size', 'bid_size']

    # prepare the operations for each column and window
    group = ["stock_id"]
    expressions = []

    # Loop over each window and column to create the rolling mean and std expressions
    # (each is a rolling statistic of the matching "{col}_diff_{window}" column,
    # evaluated separately per stock_id via .over(group))
    for window in windows:
        for col in columns:
            rolling_mean_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_mean(window)
                .over(group)
                .alias(f'rolling_diff_{col}_{window}')
            )

            rolling_std_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_std(window)
                .over(group)
                .alias(f'rolling_std_diff_{col}_{window}')
            )

            expressions.append(rolling_mean_expr)
            expressions.append(rolling_std_expr)

    # Run the operations using Polars' lazy API
    lazy_df = pl_df.lazy().with_columns(expressions)

    # Execute the lazy expressions and overwrite the pl_df variable
    pl_df = lazy_df.collect()

    # Convert back to pandas if necessary
    # (from here on `df` is a new frame, no longer the object that was passed in)
    df = pl_df.to_pandas()
    gc.collect()
    
    df['mid_price*volume'] = df['mid_price_movement'] * df['volume']
    df['harmonic_imbalance'] = df.eval('2 / ((1 / bid_size) + (1 / ask_size))')
    
    # Divisions above can produce +/-inf; replace those with 0 in every column
    for col in df.columns:
        df[col] = df[col].replace([np.inf, -np.inf], 0)

    return df

def other_features(df):
    """
    Add calendar/time features and the per-stock global statistics to df.

    df is modified in place and returned.  Reads the module-level
    ``global_stock_id_feats`` dict (feature name -> per-stock Series) and maps
    each entry onto stock_id as a "global_{name}" column.
    """
    bucket_seconds = df["seconds_in_bucket"]
    stock_ids = df["stock_id"]

    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = bucket_seconds % 60
    df["minute"] = bucket_seconds // 60
    df['time_to_market_close'] = 540 - bucket_seconds

    for key, value in global_stock_id_feats.items():
        lookup = value.to_dict()
        df[f"global_{key}"] = stock_ids.map(lookup)

    return df

class gen_all_features():
    """
    Runs the full feature pipeline: NaN filling for near/far price, imbalance
    features, then the time/global features.  Keeps one HandleNaNs instance so
    its remembered prices persist across calls.
    """

    def __init__(self,df=None):
        # `df` is accepted but not used
        self.hn=HandleNaNs()

    def generate_all_features(self,df):
        """Return the feature frame for df (without row_id / time_id)."""
        # infer near and far prices
        filled = self.hn.fill_nans(df)

        # row_id and time_id take no part in feature generation
        cols = [c for c in filled.columns if c not in ["row_id", "time_id"]]
        features = imbalance_features(filled[cols])
        gc.collect()

        features = other_features(features)
        gc.collect()

        feature_name = [i for i in features.columns if i not in ["row_id", "time_id"]]
        return features[feature_name]


# Data Loading and Preprocessing 

In [80]:
def getdata():
    """
    Load train.csv (Kaggle input path when CONFIG.runOnKaggle is true, the local
    ./data folder otherwise), drop the rows whose target is NaN and return the
    frame with a fresh 0..n-1 index.
    """
    csv_path = (
        "/kaggle/input/optiver-trading-at-the-close/train.csv"
        if CONFIG.runOnKaggle==True
        else "./data/train.csv"
    )
    return (
        pd.read_csv(csv_path)
        .dropna(subset=["target"])  # drop all rows with NaN in target
        .reset_index(drop=True)
    )
    
df=getdata()  
df=reduce_mem_usage(df)
GetMemUsage()  

Memory usage of dataframe is 679.35 MB
Memory usage of dataframe is 679.35 MB
Memory usage of dataframe is 679.35 MB
Memory usage of dataframe is 679.35 MB
Memory usage after optimization is: 304.71 MB
Memory usage after optimization is: 304.71 MB
Memory usage after optimization is: 304.71 MB
Memory usage after optimization is: 304.71 MB
Decreased by 55.15%
Decreased by 55.15%
Decreased by 55.15%
Decreased by 55.15%


'RAM usage = 4.505 GB'

# Calculate features

In [81]:
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
# Re-key the list above as a dict {position in the list: weight}; the name
# `weights` is rebound from the list to that dict.  imbalance_features maps
# stock_id through this dict, so the list position acts as the stock_id.
weights = {int(k):v for k,v in enumerate(weights)}

In [82]:
%%time
def get_global_stock_id_feats(df2):
    """
    Per-stock summary statistics computed over the whole of df2.

    returns a dict of Series indexed by stock_id:
      median_size  : median bid_size + median ask_size
      std_size     : std of bid_size + std of ask_size
      ptp_size     : max bid_size - min bid_size
      median_price : median bid_price + median ask_price
      std_price    : std of bid_price + std of ask_price
      ptp_price    : max bid_price - min ask_price
    """
    # group once and reuse the per-column group views
    per_stock = df2.groupby("stock_id")
    bid_size, ask_size = per_stock["bid_size"], per_stock["ask_size"]
    bid_price, ask_price = per_stock["bid_price"], per_stock["ask_price"]

    return {
        "median_size": bid_size.median() + ask_size.median(),
        "std_size": bid_size.std() + ask_size.std(),
        "ptp_size": bid_size.max() - bid_size.min(),
        "median_price": bid_price.median() + ask_price.median(),
        "std_price": bid_price.std() + ask_price.std(),
        "ptp_price": bid_price.max() - ask_price.min(),
    }



CPU times: user 81 µs, sys: 3 µs, total: 84 µs
Wall time: 169 µs


In [83]:
%%time
# cleanup_dataframes() 

#get these global feats on entire dataframe
global_stock_id_feats = get_global_stock_id_feats(df)
GetMemUsage()

CPU times: user 813 ms, sys: 24.1 ms, total: 837 ms
Wall time: 842 ms


'RAM usage = 4.505 GB'

In [84]:
%%time
gaf=gen_all_features()
df = gaf.generate_all_features(df)
print("Build df1 Finished.")

df=reduce_mem_usage(df)

GetMemUsage()

Build df1 Finished.


Memory usage of dataframe is 3771.41 MB
Memory usage of dataframe is 3771.41 MB
Memory usage of dataframe is 3771.41 MB
Memory usage of dataframe is 3771.41 MB
Memory usage after optimization is: 3122.03 MB
Memory usage after optimization is: 3122.03 MB
Memory usage after optimization is: 3122.03 MB
Memory usage after optimization is: 3122.03 MB
Decreased by 17.22%
Decreased by 17.22%
Decreased by 17.22%
Decreased by 17.22%


CPU times: user 56.2 s, sys: 4.07 s, total: 1min
Wall time: 50.4 s


'RAM usage = 3.883 GB'

## Clusters

In [85]:
clusters=[]
#run 12
#these come from months that were split into 4 clusters
# c0="13  14  17  23  45  54  68  72  75  84  90  106  128  133  138  140  157  158  167  175  186"
# c0=[int(a) for a in c0.split('  ')]
# c1="6  24  26  53  69  111  114  115  118  119  124  156  159  161  166  188  191  196  199"
# c1=[int(a) for a in c1.split('  ')]

# #verify no intersection
# print(f'intersection of c0 and c1={[a for a in c0 if a in c1]}')

# #find all other stocks
# allothers=[a for a in range(200) if a not in c0 and a not in c1]
# clusters=[c0,c1,allothers]

# #run 13
# clusters=[[0, 1, 28, 35, 50, 120, 121, 138, 153, 155, 167, 171, 179, 181, 25, 41, 47, 80, 84, 85, 113, 117, 133, 151, 175, 178, 186, 7, 37, 43, 51, 60, 139, 148, 165, 189, 23, 195, 44, 86, 180, 68, 81, 90, 36, 191, 131],
# [2, 8, 40, 53, 55, 74, 77, 100, 101, 102, 114, 130, 150, 166, 177, 9, 18, 20, 29, 33, 52, 56, 59, 62, 99, 107, 111, 125, 149, 152, 196, 16, 39, 46, 135, 168, 170, 15, 63, 67, 79, 104, 145, 164, 173, 71, 93, 98, 119, 19, 136, 123, 161, 115, 82],
# [3, 14, 32, 45, 48, 54, 58, 70, 78, 89, 94, 110, 141, 157, 176, 184, 13, 105, 160, 83, 198],
# [4, 12, 22, 24, 27, 30, 31, 65, 96, 103, 158, 199, 5, 61, 64, 73, 75, 106, 112, 129, 154, 163, 169, 116, 192, 21, 128, 132, 134],
# [6, 17, 42, 57, 88, 109, 118, 122, 126, 147, 159, 182, 188, 193, 10, 34, 49, 66, 69, 91, 97, 137, 142, 185, 194, 26, 92, 146, 174, 108, 197, 127, 76, 124, 72, 140, 87, 183, 162],
# [11, 38, 95, 144, 172, 187, 190, 143, 156]]

# allothers=[]
# flattenclusters=[c for cluster in clusters for c in cluster]
# allothers=[a for a in range(200) if a not in flattenclusters]
# if allothers:
#     clusters.append(allothers)

#get number of models
num_models=len(clusters)+1


# Load Models

In [86]:
#load the models
# Pick the directory holding the saved LightGBM model files: a Kaggle input
# dataset when running on Kaggle, a local ./models/ folder otherwise.
if(CONFIG.runOnKaggle==True):
    model_save_path = '../input/optivar-r13/' 
else:
    model_save_path = './models/' 
    # NOTE(review): creating the folder here only avoids a missing-directory
    # situation; the files m_0.txt ... are still expected to exist already when
    # the loop below opens them.
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
            
# One booster per expected model file m_{i}.txt, i = 0 .. num_models-1.
# After the loop, `model` still refers to the last booster that was loaded.
models=[]
for i in range(num_models):
    model_filename = os.path.join(model_save_path, f'm_{i}.txt')
    model = lgb.Booster(model_file=model_filename)
    models.append(model)

In [87]:
# Choose between the local mock of the competition API and the real one.
mock_api=True
if mock_api:
    from data.public_timeseries_testing_util import MockApi
    def make_env():
        return MockApi()
    env = make_env()
else:
    import optiver2023
    env = optiver2023.make_env()

cache_size=55000 #5 days worth of data

# Ask the environment for its test iterator exactly once.  It used to be
# requested in each branch above and then a second time here.
# NOTE(review): the real competition API presumably refuses a second
# iter_test() call — confirm against the optiver2023 module.
iter_test = env.iter_test()

cache=None #used for Kaggle to calculate rolling features on 200 stocks

def getcache(test):
    """
    Build the initial history cache that precedes the first test batch.

    Reads the module-level training frame ``df`` and ``cache_size``.

    * test date present in df with an earlier date available: the (up to
      cache_size) last rows of the closest earlier date
    * test date present in df but no earlier date exists: an empty frame.  The
      old ``dates[i-1]`` lookup wrapped around to the LAST date of df here.
    * test date not present in df: the last cache_size rows of df

    Only the columns that also exist in ``test`` are kept.  A new frame is
    returned; df itself is not modified.
    """
    #get all dates in orig dataframe
    dates=df.date_id.unique()

    #get tests current date
    date=test.iloc[-1].date_id
    print(f'creating cache, test starts at date={date}')

    if (date in dates):
        earlier=dates[dates < date]
        if len(earlier):
            cache=df.loc[df['date_id']==earlier.max(),:][-cache_size:]
        else:
            # nothing precedes the test date: start with no history
            cache=df.iloc[0:0]
    else:
        cache=df[-cache_size:]

    #get rid of extra columns in cache
    keepcols=[c for c in cache.columns if c in test.columns]
    return cache[keepcols].copy()


def zero_sum(prices, volumes):
    """
    Shift ``prices`` so that they sum to zero, moving each entry in proportion
    to the square root of its volume.

    I got this idea from https://github.com/gotoConversion/goto_conversion/

    returns prices - sqrt(volumes) * (sum(prices) / sum(sqrt(volumes)))
    """
    root_volume = np.sqrt(volumes)
    # amount to remove per unit of sqrt(volume)
    per_unit = np.sum(prices)/np.sum(root_volume)
    return prices-root_volume*per_unit

# Main inference loop: one iteration per batch delivered by the (mock) API.
for (test, revealed_targets, sample_prediction) in iter_test:
    print(f'shape test={test.shape}, columns={test.columns}')
    cols=test.columns
    if 'currently_scored' in cols:
        test.drop(columns=['currently_scored'],inplace=True)
    
    #add to the cache
    if cache is None:
        cache=getcache(test)
        
    cache=pd.concat([cache,test])
    
    # features are computed on history + current batch so lagged / rolling
    # features of the new rows can look back into the cache
    feat = gaf.generate_all_features(cache)
    
    # fill remaining NaNs of the newest rows with the column means of the whole frame
    feat[-len(test):]=feat[-len(test):].fillna(feat.mean())

    print(f'feat shape={feat[-len(test):].shape},unique(seconds_in_bucket)={feat[-len(test):].seconds_in_bucket.nunique()}, unique near_price={feat[-len(test):].near_price.nunique()}, unique far_price={feat[-len(test):].far_price.nunique()}')  
    # print(f'seconds_in_bucket)={feat[-len(test):].seconds_in_bucket}, unique near_price={feat[-len(test):].near_price.nunique()}, unique far_price={feat[-len(test):].far_price.nunique()}')  
    print(f'for stock {feat[-1:].stock_id.values[0]}, seconds_in_bucket={feat[-1:].seconds_in_bucket.values[0]}, near_price={feat[-1:].near_price.values[0]}, far_price={feat[-1:].far_price.values[0]}')  
   
    # #create a place for the results to go
    # res=test.stock_id.copy().to_frame();
    # res['final_res']=np.NaN
    
    # #do predictions
    # for i,mod in enumerate(models):
    #     res[f'res_{i}']=mod.predict(feat[-len(test):])
    
    # # chooses output from the model trained 
    # # on the cluster that stock_id is in
    # def weight_func(x):
    #     for i,cluster in enumerate(clusters):
    #         if x.stock_id in cluster:
    #             #in this case take the average of the first model and the one that was trained
    #             #for the cluster this stock_id is in
    #             return ((x['res_0']+x[f'res_{i}'])/2)
            
    # The newest len(test) rows, selected by POSITION.  The previous
    # `feat.loc[-len(test):, ...]` was a label slice, which on the 0..n-1 index
    # of `feat` selects every row and fed zero_sum the wrong volumes.
    latest = feat.iloc[-len(test):]

    # Use the first loaded booster explicitly instead of the `model` name left
    # over from the loading loop (with a single model these are the same).
    sample_prediction['target'] = models[0].predict(latest)

    # NOTE(review): this assumes the newest rows of `feat` are in the same row
    # order as sample_prediction — confirm.
    volumes = (latest['bid_size'] + latest['ask_size']).to_numpy()
    sample_prediction['target'] = zero_sum(sample_prediction['target'], volumes)
    # sample_prediction['target'] = zero_sum(sample_prediction['target'], test.loc[:,'bid_size'] + test.loc[:,'ask_size'])

    print(f'sample_prediction shape={sample_prediction.shape}, columns={sample_prediction.columns}')
    env.predict(sample_prediction)
       
    #just save the last part of the cache
    cache=cache[-cache_size:]

sample_prediction['target']

shape test=(200, 15), columns=Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'row_id'],
      dtype='object')
creating cache, test starts at date=478
feat shape=(200, 161),unique(seconds_in_bucket)=1, unique near_price=198, unique far_price=198
for stock 199, seconds_in_bucket=0, near_price=0.9960629940032959, far_price=0.9960629940032959
sample_prediction shape=(200, 2), columns=Index(['row_id', 'target'], dtype='object')
shape test=(200, 15), columns=Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'row_id'],
      dtype='object')
feat shape=(200, 161),unique(seconds_in_bucket)=1, unique near_price=199, unique far_

KeyboardInterrupt: 