In [1]:
import gc  
import os  
import time  
import warnings 
from itertools import combinations  
from warnings import simplefilter 
import joblib  
import playground.optivarfuncs as of
import lightgbm as lgb  
import numpy as np  
import pandas as pd  
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold, TimeSeriesSplit  
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
is_offline = False 
is_train = True  
is_infer = True 
max_lookback = np.nan 
split_day = 435  
import polars as pl
# set the max columns to none
pd.set_option('display.max_columns', None)

# Functions

## Settings and helper Functions
There are 480 dates, 5 days a week or 96 weeks

In [2]:
class CONFIG:    
    #take last 3 months worth? or roughly 12*5=60.  So we want from (480-60) to 480
    # start_date=420

    #take last 1 months worth? or roughly 4*5=20.  So we want from (480-20) to 480
    start_date=460

    #just a week for testing?
    start_date=470
    
    #take last 1 months worth? or roughly 4*5=20.  So we want from (480-20) to 480
    doTrainModel= True #if true, #need train and test sets
    runOnKaggle=False #if true, then concat all datasets before calculating features for Kaggle data

    use_subset_of_data=True

In [3]:
from gc import collect;
collect()

0

In [4]:
# Tracking kernel memory usage:-  
from os import path, walk, getpid;
from psutil import Process;
def GetMemUsage():
    """
    This function defines the memory usage across the kernel. 
    Source-
    https://stackoverflow.com/questions/61366458/how-to-find-memory-usage-of-kaggle-notebook
    """;
    
    pid = getpid();
    py = Process(pid);
    memory_use = py.memory_info()[0] / 2. ** 30;
    return f"RAM usage = {memory_use :.4} GB";

def cleanup(df):
    try:
        del df
        df=None
    except:
        pass
    collect()
    return GetMemUsage()

GetMemUsage()

'RAM usage = 0.1951 GB'

In [5]:
#logging
import logging
# set up logging to file - see previous section for more details
logging.basicConfig(level=logging.INFO,
                    filename='logg.log',
                    filemode='w')
# define a Handler which writes INFO messages or higher to the sys.stderr
console = logging.StreamHandler()
# add the handler to the root logger
logging.getLogger().addHandler(console)
logger=logging.getLogger()

#use following to enable and disable
# logger.disabled = True

def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
               
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)
    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")
    return df


## Parallel Triplet Imbalance Calculation function

In [6]:
from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

def calculate_triplet_imbalance_numba(price, df):
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]
    features_array = compute_triplet_imbalance(df_values, comb_indices)
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)
    return features


## Feature Generation Functions 

In [7]:
from tqdm import tqdm
# from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

In [8]:
def imbalance_features(df):
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
    
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
   
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    
    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']
    
    # Calculate various statistical aggregation features
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        

    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1,3,5,10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'weighted_wap','price_spread']:
        for window in [1,3,5,10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
    
    #V4 feature
    for window in [3,5,10]:
        df[f'price_change_diff_{window}'] = df[f'bid_price_diff_{window}'] - df[f'ask_price_diff_{window}']
        df[f'size_change_diff_{window}'] = df[f'bid_size_diff_{window}'] - df[f'ask_size_diff_{window}']

    #V5 - rolling diff
    # Convert from pandas to Polars
    pl_df = pl.from_pandas(df)

    #Define the windows and columns for which you want to calculate the rolling statistics
    windows = [3, 5, 10]
    columns = ['ask_price', 'bid_price', 'ask_size', 'bid_size']

    # prepare the operations for each column and window
    group = ["stock_id"]
    expressions = []

    # Loop over each window and column to create the rolling mean and std expressions
    for window in windows:
        for col in columns:
            rolling_mean_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_mean(window)
                .over(group)
                .alias(f'rolling_diff_{col}_{window}')
            )

            rolling_std_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_std(window)
                .over(group)
                .alias(f'rolling_std_diff_{col}_{window}')
            )

            expressions.append(rolling_mean_expr)
            expressions.append(rolling_std_expr)

    # Run the operations using Polars' lazy API
    lazy_df = pl_df.lazy().with_columns(expressions)

    # Execute the lazy expressions and overwrite the pl_df variable
    pl_df = lazy_df.collect()

    # Convert back to pandas if necessary
    df = pl_df.to_pandas()
    gc.collect()
    
    df['mid_price*volume'] = df['mid_price_movement'] * df['volume']
    df['harmonic_imbalance'] = df.eval('2 / ((1 / bid_size) + (1 / ask_size))')
    
    for col in df.columns:
        df[col] = df[col].replace([np.inf, -np.inf], 0)

    return df

def other_features(df):
    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = df["seconds_in_bucket"] % 60  
    df["minute"] = df["seconds_in_bucket"] // 60  
    df['time_to_market_close'] = 540 - df['seconds_in_bucket']
    
    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df

class gen_all_features():
    def __init__(self,df=None):
        #infer near and far prices
        self.bs=of.bfs(['near_price','far_price'],df,None)
        
    def generate_all_features(self,df):
        #infer near and far prices
        self.bs.doautocol(df)
        df = df.progress_apply(self.bs.backfill, axis=1)
        
        # Select relevant columns for feature generation
        cols = [c for c in df.columns if c not in ["row_id", "time_id"]]
        df = df[cols]
        
        # Generate imbalance features
        df = imbalance_features(df)
        gc.collect() 
        df = other_features(df)
        gc.collect()  
        feature_name = [i for i in df.columns if i not in ["row_id", "time_id"]]
        
        return df[feature_name]



    

# Data Loading and Preprocessing 

In [9]:
def getdata():
    if(CONFIG.runOnKaggle==True):
        df = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
    else:
        df = pd.read_csv("./data/train.csv")
    
    df = df.dropna(subset=["target"])  #drop all rows with NaN in target
    df.reset_index(drop=True, inplace=True)
    return df
    
df=getdata()  
df=reduce_mem_usage(df)
GetMemUsage()  

Memory usage of dataframe is 679.35 MB
Memory usage after optimization is: 304.71 MB
Decreased by 55.15%


'RAM usage = 0.8931 GB'

In [10]:
if(CONFIG.use_subset_of_data):
    #just take the last 4 weeks
    df=df[df.date_id>CONFIG.start_date]
print(f'df shape={df.shape}')
GetMemUsage() 

df shape=(110000, 17)


'RAM usage = 0.3404 GB'

# Data Splitting

In [11]:
def cleanup_dataframes():
    #cleanup existing dataframes
    cleanup(df_train)
    cleanup(df_valid)
    # cleanup(df_test_feats)
    cleanup(y_train)
    cleanup(y_valid)
    # cleanup(y_test_feats)

def getDataSets(df):
    if ( CONFIG.doTrainModel == True):
        #just need a train and a valid set
        return of.get2_DatasetAndTarget(df, dep_var='target', val_size=0.05,copy=False, verbose = False) 
df_train, df_valid, y_train, y_valid = getDataSets(df)
GetMemUsage()

'RAM usage = 0.3387 GB'

# Calculate features

In [12]:
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
weights = {int(k):v for k,v in enumerate(weights)}

In [13]:
%%time
def get_global_stock_id_feats(df2):
#first get the stats based on the training features
    global_stock_id_feats = {
            "median_size": df2.groupby("stock_id")["bid_size"].median() + df2.groupby("stock_id")["ask_size"].median(),
            "std_size": df2.groupby("stock_id")["bid_size"].std() + df2.groupby("stock_id")["ask_size"].std(),
            "ptp_size": df2.groupby("stock_id")["bid_size"].max() - df2.groupby("stock_id")["bid_size"].min(),
            "median_price": df2.groupby("stock_id")["bid_price"].median() + df2.groupby("stock_id")["ask_price"].median(),
            "std_price": df2.groupby("stock_id")["bid_price"].std() + df2.groupby("stock_id")["ask_price"].std(),
            "ptp_price": df2.groupby("stock_id")["bid_price"].max() - df2.groupby("stock_id")["ask_price"].min(),
        }
    return global_stock_id_feats

global_stock_id_feats=get_global_stock_id_feats(df_train)

CPU times: user 18 ms, sys: 0 ns, total: 18 ms
Wall time: 17.7 ms


In [14]:
# cleanup_dataframes()
gaf=gen_all_features()
df = gaf.generate_all_features(df)

print("Build df Finished.")
df=reduce_mem_usage(df)
GetMemUsage()

100%|██████████| 110000/110000 [00:08<00:00, 12610.85it/s]
Memory usage of dataframe is 137.63 MB
Memory usage after optimization is: 65.78 MB
Decreased by 52.21%


Build df Finished.


'RAM usage = 0.6396 GB'

In [15]:
df_train, df_valid, y_train, y_valid = getDataSets(df)
GetMemUsage()

'RAM usage = 0.7041 GB'

In [16]:
df_train.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,syn_near_price,syn_far_price,volume,mid_price,liquidity_imbalance,matched_imbalance,size_imbalance,reference_price_far_price_imb,reference_price_near_price_imb,reference_price_ask_price_imb,reference_price_bid_price_imb,reference_price_wap_imb,far_price_near_price_imb,far_price_ask_price_imb,far_price_bid_price_imb,far_price_wap_imb,near_price_ask_price_imb,near_price_bid_price_imb,near_price_wap_imb,ask_price_bid_price_imb,ask_price_wap_imb,bid_price_wap_imb,ask_price_bid_price_wap_imb2,ask_price_bid_price_reference_price_imb2,ask_price_wap_reference_price_imb2,bid_price_wap_reference_price_imb2,matched_size_bid_size_ask_size_imb2,matched_size_bid_size_imbalance_size_imb2,matched_size_ask_size_imbalance_size_imb2,bid_size_ask_size_imbalance_size_imb2,stock_weights,weighted_wap,wap_momentum,imbalance_momentum,price_spread,spread_intensity,price_pressure,market_urgency,depth_pressure,spread_depth_ratio,mid_price_movement,micro_price,relative_spread,all_prices_mean,all_sizes_mean,all_prices_std,all_sizes_std,all_prices_skew,all_sizes_skew,all_prices_kurt,all_sizes_kurt,matched_size_shift_1,matched_size_ret_1,matched_size_shift_3,matched_size_ret_3,matched_size_shift_5,matched_size_ret_5,matched_size_shift_10,matched_size_ret_10,imbalance_size_shift_1,imbalance_size_ret_1,imbalance_size_shift_3,imbalance_size_ret_3,imbalance_size_shift_5,imbalance_size_ret_5,imbalance_size_shift_10,imbalance_size_ret_10,reference_price_shift_1,reference_price_ret_1,reference_price_shift_3,reference_price_ret_3,reference_price_shift_5,reference_price_ret_5,reference_price_shift_10,reference_price_ret_10,imbalance_buy_sell_flag_shift_1,imbalance_buy_sell_flag_ret_1,imbalance_buy_sell_flag_shift_3,imbalance_buy_sell_flag_ret_3,imbalance_buy_sell_flag_shift_5,imbalance_buy_sell_flag_ret_5,imbalance_buy_sell_flag_shift_10,imbalance_buy_sell_flag_ret_10,ask_price_diff_1,ask_price_diff_3,ask_price_diff_5,ask_price_diff_10,bid_price_diff_1,bid_price_diff_3,bid_price_diff_5,bid_price_diff_10,ask_size_diff_1,ask_size_diff_3,ask_size_diff_5,ask_size_diff_10,bid_size_diff_1,bid_size_diff_3,bid_size_diff_5,bid_size_diff_10,weighted_wap_diff_1,weighted_wap_diff_3,weighted_wap_diff_5,weighted_wap_diff_10,price_spread_diff_1,price_spread_diff_3,price_spread_diff_5,price_spread_diff_10,price_change_diff_3,size_change_diff_3,price_change_diff_5,size_change_diff_5,price_change_diff_10,size_change_diff_10,rolling_diff_ask_price_3,rolling_std_diff_ask_price_3,rolling_diff_bid_price_3,rolling_std_diff_bid_price_3,rolling_diff_ask_size_3,rolling_std_diff_ask_size_3,rolling_diff_bid_size_3,rolling_std_diff_bid_size_3,rolling_diff_ask_price_5,rolling_std_diff_ask_price_5,rolling_diff_bid_price_5,rolling_std_diff_bid_price_5,rolling_diff_ask_size_5,rolling_std_diff_ask_size_5,rolling_diff_bid_size_5,rolling_std_diff_bid_size_5,rolling_diff_ask_price_10,rolling_std_diff_ask_price_10,rolling_diff_bid_price_10,rolling_std_diff_bid_price_10,rolling_diff_ask_size_10,rolling_std_diff_ask_size_10,rolling_diff_bid_size_10,rolling_std_diff_bid_size_10,mid_price*volume,harmonic_imbalance,dow,seconds,minute,time_to_market_close,global_median_size,global_std_size,global_ptp_size,global_median_price,global_std_price,global_ptp_price
0,0,471,0,1876093.0,1,0.999907,14207162.0,0.0,0.0,0.999907,17550.0,1.000078,14861.540039,1.0,1,1,32411.539062,0.999992,0.082948,-0.766702,1.1809,1.0,1.0,-8.5e-05,0.0,-4.6e-05,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,8.5e-05,3.9e-05,-4.6e-05,0.838462,,0.838462,,5277.970215,6.634804,6.62522,691.304016,0.004,0.004,,,0.000171,,320.710846,1.4e-05,-0.0,5.274236e-09,0,1.0,0.000171,0.666649,4028916.75,0.516384,6841906.0,-0.968246,1.905057,-1.875,3.646779,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,16094.269531,1,0,0,540,87288.53125,103943.195312,498374.125,1.998584,0.002744,0.006289
1,1,471,0,8743771.0,1,0.998976,2955807.25,0.0,0.0,0.998976,20936.300781,1.001862,38176.0,1.0,1,1,59112.300781,1.000419,-0.291643,0.494716,0.548415,1.0,1.0,-0.001442,0.0,-0.000512,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.001442,0.00093,-0.000512,1.818393,,1.818393,,169.239105,1.972136,1.983789,504.973724,0.001,0.001,,,0.002886,,25235.021484,-0.000842,0.0,4.882329e-08,0,0.999998,0.002886,0.666636,2939672.75,0.516375,4107940.5,-0.968228,1.406325,-1.875,1.462868,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,27042.230469,1,0,0,540,32263.470703,65763.054688,243262.375,1.998152,0.005741,0.01433
2,2,471,0,374828.0,-1,0.999851,22619190.0,0.0,0.0,0.999851,6750.810059,1.000487,22107.599609,1.0,1,1,28858.410156,1.000169,-0.532143,-0.967398,0.305362,1.0,1.0,-0.000318,0.0,-7.5e-05,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000318,0.000243,-7.5e-05,3.268,,3.268,,1471.471802,60.433964,63.065147,22.968367,0.002,0.002,,,0.000636,,238.383682,-0.000338,0.0,2.2038e-08,0,1.0,0.000636,0.666698,5755719.0,0.516422,11243599.0,-0.968245,1.998633,-1.875,3.99538,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,10343.204102,1,0,0,540,34268.851562,93512.054688,686719.875,2.001764,0.003074,0.006286
3,3,471,0,1677462.0,1,1.000122,44561212.0,0.0,0.0,0.999823,12048.599609,1.000171,11651.040039,1.0,1,1,23699.640625,0.999997,0.016775,-0.927443,1.034122,1.0,1.0,-2.4e-05,0.000149,6.1e-05,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000174,8.5e-05,-8.9e-05,0.965657,0.163876,0.40176,0.688889,112056.570312,25.749607,25.743462,4189.092285,0.006,0.006,,,0.000348,,583.709778,6e-06,-0.0,1.468258e-08,0,1.0,0.000348,0.666686,11565594.0,0.516413,22011088.0,-0.968246,1.992433,-1.875,3.973937,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,11846.485352,1,0,0,540,41624.5625,87388.382812,236199.390625,2.000378,0.002114,0.004815
4,4,471,0,331555.6,-1,0.999705,44681628.0,0.0,0.0,0.99982,17374.0,1.00005,4865.839844,1.0,1,1,22239.839844,0.999935,0.562421,-0.985269,3.570607,1.0,1.0,-0.000172,-5.8e-05,-0.000148,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.000115,2.5e-05,-9e-05,0.277483,2.0,0.169327,1.565578,3570.809326,141.160645,135.755951,25.118128,0.004,0.004,,,0.00023,,76.242767,0.000129,-0.0,1.033977e-08,0,1.0,0.00023,0.666596,11258856.0,0.516343,22282360.0,-0.968246,1.999724,-1.875,3.999075,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,7602.491699,1,0,0,540,43320.699219,68085.5,292166.9375,2.000204,0.001925,0.004519


# Reduce memory

In [17]:
df_train = reduce_mem_usage(df_train)
df_valid = reduce_mem_usage(df_valid)
GetMemUsage()

Memory usage of dataframe is 59.58 MB
Memory usage after optimization is: 59.58 MB
Decreased by 0.00%
Memory usage of dataframe is 6.62 MB
Memory usage after optimization is: 6.62 MB
Decreased by 0.00%


'RAM usage = 0.7041 GB'

# rolling?

In [18]:
# df_train.head()

# df_train.reset_index(drop=True, inplace=True)

# df_train.loc[df_train.stock_id==0,:].loc[:,['matched_size','matched_size_shift_1','matched_size_shift_3','matched_size_shift_5','matched_size_shift_10']].head(20)
# test.loc[:,'bid_size']

# Save and Reload df

In [19]:
df_train.to_parquet("./tof_train.parquet")
df_valid.to_parquet("./tof_valid.parquet")

In [20]:
df_train=pd.read_parquet("./tof_train.parquet")
df_valid=pd.read_parquet("./tof_valid.parquet")
GetMemUsage()

'RAM usage = 0.8604 GB'

# Split for the following clusters

In [21]:
#these come from months that were split into 4 clusters
c0="13  14  17  23  45  54  68  72  75  84  90  106  128  133  138  140  157  158  167  175  186"
c0=[int(a) for a in c0.split('  ')]
c1="6  24  26  53  69  111  114  115  118  119  124  156  159  161  166  188  191  196  199"
c1=[int(a) for a in c1.split('  ')]

#verify no intersection
print(f'intersection of c0 and c1={[a for a in c0 if a in c1]}')

#find all other stocks
allothers=[a for a in range(200) if a not in c0 and a not in c1]
clusters=[c0,c1,allothers]


intersection of c0 and c1=[]


## Last optuna run for lightgbm gave these params

lgb_params={'num_leaves': 244,
 'learning_rate': 0.01,
 'max_depth': 11,
 'reg_alpha': 0.23746308577367767,
 'reg_lambda': 2.702844990315888}

# Train Model

In [22]:
def get_mae(model, X_tst,y_tst):
    y_pred = model.predict(X_tst)
    return mean_absolute_error(y_pred, y_tst)

def evaluate_simple(model, X_train, X_val,X_tst, y_train, y_val,y_tst):   
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])
    return get_mae(model, X_tst,y_tst)
    
def average_target(av_target_train, X_train, X_val, X_tst, y_train, y_val, y_tst):   
    # model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
    return len(y_tst), mean_absolute_error([av_target_train]*len(y_tst), y_tst)

In [23]:
# lgb_params={'num_leaves': 244,
#  'learning_rate': 0.01,
#  'max_depth': 11,
#  'reg_alpha': 0.23746308577367767,
#  'reg_lambda': 2.702844990315888}

lgb_params = {
        "objective": "mae",
        "n_estimators": 6000,
        "num_leaves": 256,
        "subsample": 0.6,
        "colsample_bytree": 0.8,
#         "learning_rate": 0.00871,
        "learning_rate": 0.01,
        'max_depth': 11,
        "n_jobs": 4,
        "verbosity": -1,
        "importance_type": "gain",
#         "reg_alpha": 0.1,
        "reg_alpha": 0.2,
        "reg_lambda": 3.25
    }

In [24]:
%%time
models=[]
for cluster in clusters:
    #split dataframe
    t=df_train.loc[df_train.stock_id.isin(cluster),:]
    v=df_valid.loc[df_valid.stock_id.isin(cluster),:]
  
    # pull from y_train using t index
    y_t = y_train.loc[t.index]
    y_v = y_valid.loc[v.index]
    
    # continue with the rest of your code
    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(t, y_t, eval_set=[(v, y_v)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])
    models.append(model)


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 4.12995
Early stopping, best iteration is:
[35]	valid_0's l1: 4.11391
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 5.47664
Early stopping, best iteration is:
[13]	valid_0's l1: 5.41516
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 4.90687
[200]	valid_0's l1: 4.90712
Early stopping, best iteration is:
[128]	valid_0's l1: 4.90378
CPU times: user 1min 15s, sys: 363 ms, total: 1min 15s
Wall time: 19.1 s


In [25]:
# %%time
# # Train a LightGBM model for the current fold
# model = lgb.LGBMRegressor(**lgb_params)
# model.fit(df_train, y_train, eval_set=[(df_valid, y_valid)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])

# GetMemUsage()

#  Run with Kaggle

In [26]:
# cleanup_dataframes() 
global_stock_id_feats = get_global_stock_id_feats(df)
# cleanup(df)
# GetMemUsage()

# Mock API

In [27]:
from data.public_timeseries_testing_util import MockApi
def make_env():
    return MockApi()


# Some visualizations

In [28]:
df_train.date_id.unique()

array([471, 472, 473, 474, 475, 476, 477, 478, 479], dtype=int16)

In [29]:
# import seaborn as sns
# #average kurtosis
# df.columns
# # for func in ["mean", "std", "skew", "kurt"]:
# #         df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
# #         df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
# plt.figure(figsize = (20, 10), dpi = 300)

# sns.lineplot(data = train, x = 'date_id', y = 'target', hue = 'imbalance_buy_sell_flag', errorbar = None, palette = 'viridis')

# plt.title('Average Target Over Days', weight = 'bold', fontsize = 30)
# plt.show()

NameError: name 'plt' is not defined

# Real Kaggle



In [37]:
# %%time
mock_api=True
if mock_api:
    env = make_env()
    iter_test = env.iter_test()
else:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()

cache_size=55000 #5 days worth of data
cache=None #used for Kaggle to calculate rolling features on 200 stocks

def getcache(test):
    #get all dates in orig dataframe
    dates=df.date_id.unique()
    
    #get tests current date
    date=test.iloc[-1].date_id
    print(f'creating cache, test starts at date={date}')

    if (date in dates):
        i=np.where(dates == date)[0]
        prevdate=i-1
        cache=df.loc[df['date_id']==dates[prevdate[0]],:][-cache_size:]
    else:
        cache=df[-cache_size:]
        
    #get rid of extra columns in cache
    dropcols=[c for c in cache.columns if c not in test.columns]
    cache.drop(columns=dropcols, inplace=True)

    return cache


def zero_sum(prices, volumes):
    
#    I got this idea from https://github.com/gotoConversion/goto_conversion/
    
    std_error = np.sqrt(volumes)
    step = np.sum(prices)/np.sum(std_error)
    out = prices-std_error*step
    
    return out

for (test, revealed_targets, sample_prediction) in iter_test:
    # test.drop(columns=['currently_scored'],inplace=True)
    
    #add to the cache
    if cache is None:
        cache=getcache(test)
                
    cache=pd.concat([cache,test])
    
    feat = gaf.generate_all_features(cache)
    # print(f'feat.near_price.isnull().sum()={feat.near_price.isnull().sum()}')
    # x=model.predict(feat[-len(test):])
    # print(type(x))
    # print(x.shape)

    #create a place for the results to go
    res=test.stock_id.copy().to_frame();
    res['final_res']=np.NaN
    
    #do predictions
    for i,mod in enumerate(models):
        res[f'res_{i}']=mod.predict(feat[-len(test):])
    
    # chooses output from the model trained 
    # on the cluster that stock_id is in
    def weight_func(x):
        for i,cluster in enumerate(clusters):
            if x.stock_id in cluster:
                return x[f'res_{i}']

    sample_prediction['target'] = res.apply(weight_func,axis=1)
    sample_prediction['target'] = zero_sum(sample_prediction['target'], test.loc[:,'bid_size'] + test.loc[:,'ask_size'])
    env.predict(sample_prediction)

    #just save the last part of the cache
    cache=cache[-cache_size:]

sample_prediction['target']

creating cache, test starts at date=478


100%|██████████| 11200/11200 [00:02<00:00, 5577.24it/s]
100%|██████████| 11400/11400 [00:02<00:00, 5438.55it/s]
100%|██████████| 11600/11600 [00:02<00:00, 5434.70it/s]
100%|██████████| 11800/11800 [00:02<00:00, 5281.19it/s]
 83%|████████▎ | 9970/12000 [00:01<00:00, 6668.52it/s]


KeyboardInterrupt: 

In [69]:
%debug


> [0;32m/home/keith/anaconda3/envs/p39/lib/python3.9/site-packages/lightgbm/sklearn.py[0m(894)[0;36mpredict[0;34m()[0m
[0;32m    892 [0;31m        [0mn_features[0m [0;34m=[0m [0mX[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    893 [0;31m        [0;32mif[0m [0mself[0m[0;34m.[0m[0m_n_features[0m [0;34m!=[0m [0mn_features[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 894 [0;31m            raise ValueError("Number of features of the model must "
[0m[0;32m    895 [0;31m                             [0;34mf"match the input. Model n_features_ is {self._n_features} and "[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    896 [0;31m                             f"input n_features is {n_features}")
[0m


In [29]:
# def zero_sum(prices, volumes):
#     std_error = np.sqrt(volumes)
#     step = np.sum(prices) / np.sum(std_error)
#     out = prices - std_error * step
#     return out

# if is_infer:
#     import optiver2023
#     env = optiver2023.make_env()
#     iter_test = env.iter_test()
#     counter = 0
#     y_min, y_max = -64, 64
#     qps, predictions = [], []
#     cache = pd.DataFrame()

#     # Weights for each fold model
#     model_weights = [1/len(models)] * len(models) 
    
#     for (test, revealed_targets, sample_prediction) in iter_test:
#         now_time = time.time()
#         cache = pd.concat([cache, test], ignore_index=True, axis=0)
#         if counter > 0:
#             cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
#         feat = generate_all_features(cache)[-len(test):]

#         # added after new API, reference: https://www.kaggle.com/competitions/optiver-trading-at-the-close/discussion/455690#2526672
#         if test.currently_scored.iloc[0]== False:
#             sample_prediction['target'] = 0
#             env.predict(sample_prediction)
#             counter += 1
#             qps.append(time.time() - now_time)
#             if counter % 10 == 0:
#                 print(counter, 'qps:', np.mean(qps))
#             continue

#         feat = feat.drop(columns = ["currently_scored"])    
#         # end of new codes for new API
        
#         # Generate predictions for each model and calculate the weighted average
#         lgb_predictions = np.zeros(len(test))
#         for model, weight in zip(models, model_weights):
#             lgb_predictions += weight * model.predict(feat)

#         lgb_predictions = zero_sum(lgb_predictions, test['bid_size'] + test['ask_size'])
#         clipped_predictions = np.clip(lgb_predictions, y_min, y_max)
#         sample_prediction['target'] = clipped_predictions
#         env.predict(sample_prediction)
#         counter += 1
#         qps.append(time.time() - now_time)
#         if counter % 10 == 0:
#             print(counter, 'qps:', np.mean(qps))

#     time_cost = 1.146 * np.mean(qps)
#     print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")