In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/optiver-trading-at-the-close/public_timeseries_testing_util.py
/kaggle/input/optiver-trading-at-the-close/train.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv
/kaggle/input/optiver-trading-at-the-close/optiver2023/competition.cpython-310-x86_64-linux-gnu.so
/kaggle/input/optiver-trading-at-the-close/optiver2023/__init__.py


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np
train_df = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
train_df.head()
#train_df=train_org.sample(frac=0.01, random_state=42)  # 1% sample


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [4]:
def reduce_mem_usage(df, verbose=1):
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"\nInitial memory usage: {start_mem:.2f} MB")

    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and not pd.api.types.is_categorical_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            old_dtype = df[col].dtype

            if pd.api.types.is_integer_dtype(col_type):
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)

            else:  # floats
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

            if verbose and old_dtype != df[col].dtype:
                print(f"Column '{col}': {old_dtype} → {df[col].dtype}")

        elif col_type == object:
            num_unique = df[col].nunique()
            num_total = len(df[col])
            if num_unique / num_total < 0.5:
                df[col] = df[col].astype('category')
                if verbose:
                    print(f"Column '{col}': object → category")

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"\nFinal memory usage: {end_mem:.2f} MB")
    print(f"Reduced by {(100 * (start_mem - end_mem) / start_mem):.2f}%")

    return df

In [5]:
def build_metrics(train_df):
    # Microstructure features from order book
    train_df['spread'] = train_df['ask_price'] - train_df['bid_price']
    
    train_df['mid_price'] = (train_df['ask_price'] + train_df['bid_price']) / 2
    
    train_df['wap_diff'] = train_df['wap'] - train_df['mid_price']
    
    # Avoid division by zero in imbalance calculation
    train_df['order_flow_imbalance'] = (
        (train_df['bid_size'] - train_df['ask_size']) /
        (train_df['bid_size'] + train_df['ask_size']).replace(0, 1e-6)
    )
    # Relative imbalance (magnitude of excess demand/supply relative to matched volume)
    train_df['relative_imbalance'] = train_df['imbalance_size'] / (train_df['matched_size'] + 1e-6)

    # Binary indicator for buy-side pressure in the auction
    train_df['is_buy_pressure'] = (train_df['imbalance_buy_sell_flag'] == 1).astype(int)

    # Auction signal strength (how far auction price is from current WAP)
    train_df['auction_signal_strength'] = train_df['reference_price'] - train_df['wap']
    # 1-minute = 6 rows (10s intervals), use min_periods=1 to preserve early rows
    train_df['rolling_avg_imbalance'] = (
        train_df
        .groupby(['stock_id', 'date_id'])['imbalance_size']
        .transform(lambda x: x.rolling(window=6, min_periods=1).mean())
    )
    train_df['seconds_to_close'] = 540 - train_df['seconds_in_bucket']
    train_df['spread_change'] = train_df.groupby(['stock_id', 'date_id'])['spread'].diff()
    # 10-second WAP velocity (standard row-to-row diff)
    train_df['wap_velocity'] = (
        train_df
        .groupby(['stock_id', 'date_id'])['wap']
        .diff()
    )

    # 60-second WAP velocity (lag of 6 rows = 60 seconds in 10s interval data)
    train_df['wap_velocity_60s'] = (
        train_df
        .groupby(['stock_id', 'date_id'])['wap']
        .diff(periods=6)
    )
    # WAP 60 seconds earlier
    train_df['wap_lag_60s'] = (
        train_df
        .groupby(['stock_id', 'date_id'])['wap']
        .shift(6)
    )
    
    # Spread 60 seconds earlier
    train_df['spread_lag_60s'] = (
        train_df
        .groupby(['stock_id', 'date_id'])['spread']
        .shift(6)
    )

    # Order imbalance 60 seconds earlier
    train_df['imbalance_lag_60s'] = (
    train_df
    .groupby(['stock_id', 'date_id'])['imbalance_size']
    .shift(6)
    )
    # Ensure synthetic_index_wap is already computed
    train_df['synthetic_index_wap'] = (
        train_df.groupby(['date_id', 'seconds_in_bucket'])['wap'].transform('mean')
    )
    
    #
    # You can repeat this pattern for any feature you've already created:
    train_df['wap_velocity_lag_60s'] = train_df.groupby(['stock_id', 'date_id'])['wap_velocity'].shift(6)
    train_df['spread_change_lag_60s'] = train_df.groupby(['stock_id', 'date_id'])['spread_change'].shift(6)
    # Now compute the ratio
    train_df['stock_vs_index_wap_ratio'] = train_df['wap'] / (train_df['synthetic_index_wap'] + 1e-6)
    bins = [0, 300, 480, 600]
    labels = ['0_300', '300_480', '480_600']    
    train_df['window_label'] = pd.cut(
        train_df['seconds_in_bucket'],
        bins=bins,
        labels=labels,
        right=False  # means 0 <= x < 300, 300 <= x < 400, etc.
    )

    train_df = reduce_mem_usage(train_df)
    return train_df

In [6]:
def basic_agg_metrics(group_level,df,common_list,metric_list):
    column_list=[]
    for key, value in group_level.items():
        for metric_name in metric_list:
            for agg_func in common_list:
                new_col = metric_name+'_'+key+'_'+agg_func
                df[new_col] = (
                    df.groupby(value)[metric_name]
                      .expanding()
                     .agg(agg_func)
                      .reset_index(level=value, drop=True)
                )
    df = reduce_mem_usage(df)
    return df
    
def delta_beg_metrics(df,group_level,metric_list):
    for key, value in group_level.items():
        print(key)
        df = df.sort_values(['stock_id', 'date_id','window_label','seconds_in_bucket']).copy()
        for metric_name in metric_list:
            new_col = metric_name+'_'+key
            if key=='sw':
                # Window beg and initial
                df['window_end_'+new_col]=(df.groupby(value, observed=True)[metric_name].transform('last'))
                df['window_beg_'+new_col]=(df.groupby(value, observed=True)[metric_name].transform('first'))
                # delta
                df['delta_within_window'+new_col]=df[metric_name]-df['window_beg_'+new_col]
            else:
                df['initial_0s_'+metric_name]=(df.groupby(value, observed=True)[metric_name].transform('first'))
                df['lag_'+new_col] = (df.groupby(value, observed=True)[metric_name].shift(1))
                df['delta_'+new_col]=df[metric_name]-df['initial_0s_'+metric_name] 
        df = reduce_mem_usage(df)
        df = df.copy()
    return df        
    
def rolling_mean(df, window_size, metric_list):
    df = df.sort_values(['stock_id', 'seconds_in_bucket', 'date_id'])

    for feat in metric_list:
        rolcol=str(feat)+'_rollmean_'+str(window_size)
        df[rolcol] = (
            df.groupby(['stock_id', 'seconds_in_bucket'])[feat]
              .transform(lambda x: x.shift(1).rolling(window=window_size, min_periods=1).mean())
        )
        df[rolcol] = df[rolcol].fillna(df[feat])
        df = reduce_mem_usage(df)
        df = df.copy()
    return df

def window_agg(df,metric_list):
    # Aggregations to apply
    agg_funcs = ['min', 'max', 'mean', 'std'] 
    # Apply groupby with aggregation
    df_window_agg = df.groupby(['stock_id','date_id','window_label'],observed=True)[metric_list].agg(agg_funcs)
    df_window_agg.columns = ['_'.join(col) for col in df_window_agg.columns]
    df_window_agg = df_window_agg.reset_index()
    agg_cols=[]
    for i in metric_list:
        for j in agg_funcs:
            agg_cols.append(i+'_'+j)
    pivot_data = df_window_agg.pivot(index=['stock_id','date_id'], columns='window_label', values=agg_cols)
    pivot_data.columns = ['_'.join(col) for col in pivot_data.columns]
    pivot_data = pivot_data.reset_index()
    return pivot_data
    

In [7]:
def na_imputation(df):
    # Step 1: Sort so that forward fill respects time order
    df = df.sort_values(['stock_id', 'date_id', 'seconds_in_bucket'])
    cols_with_na = df.columns[df.isna().any()].tolist()
    print(cols_with_na)
    # Step 2: Forward fill missing prices per stock and date
    for col in cols_with_na:
        #df['missing'+str(col)] = df[col].isna().astype(int)  # optional missing flag
        df[col] = df.groupby(['stock_id', 'date_id'])[col].ffill()
    for col in df.columns:
        if 'std' in col:
            #df[f'{col}_was_missing'] = df[col].isna().astype(int)  # optional flag
            df[col] = df[col].fillna(0)  # fill std NaNs with 0  
    df['lag_near_price_sd'] = df['lag_near_price_sd'].fillna(df['near_price'])
    df = reduce_mem_usage(df)
    df = df.copy()
    
    return df

def window_0_300_data(df):
    df_1= df[df['window_label']=='0_300'] 
    cols_df1=['stock_vs_index_wap_ratio','near_price','spread','relative_imbalance','far_price']
    cols_to_drop = [col for col in df_1.columns if any(substr in col for substr in cols_df1)]+['time_id','row_id','window_label']
    cols_df2 = ['window_end']
        # Drop columns that contain substrings in cols_df1 but NOT '0_300'
    cols_to_drop2 = [col for col in df_1.columns 
                        if any(substr in col for substr in cols_df2) ]
    cols_to_drop_fin=cols_to_drop+cols_to_drop2
    df_1.drop(columns=cols_to_drop_fin, inplace=True)
    df_1 = df_1.copy()
    df_1 = reduce_mem_usage(df_1)
    return df_1
    
def window_300_480(df,pivot_data,cols_to_merge_1,merge_keys):
    if '300_480' in df['window_label'].values:
        df_2= df[df['window_label']=='300_480'].copy()
        df_2 = reduce_mem_usage(df_2)
        df_2 = df_2.merge(pivot_data[merge_keys + cols_to_merge_1], on=merge_keys, how='left')
        #Dropping cols which aren't neccessary
        cols_df2=['bid_size','order_flow_imbalance','matched_size','relative_imbalance']
        cols_to_drop = [col for col in df_2.columns if any(substr in col for substr in cols_df2)]
        cols_df2 = ['window_end']
        # Drop columns that contain substrings in cols_df1 but NOT '0_300'
        cols_to_drop2 = [col for col in df_2.columns 
                        if any(substr in col for substr in cols_df2) and '0_300' not in col]
        cols_to_drop1 = [col for col in df_2.columns 
                        if any(substr in col for substr in ['near_price']) and '0_300' in col]
        cols_to_drop_fin=cols_to_drop1+cols_to_drop2+cols_to_drop+['time_id','row_id','window_label']
        df_2 = df_2.drop(columns=cols_to_drop_fin)
        df_2 = reduce_mem_usage(df_2) 
        df_2 =df_2.copy()
    else:
        df_2 = pd.DataFrame(columns=df.columns.union(cols_to_merge_1))  # Or skip entirely

    return df_2




def window_480_600(df,pivot_data,cols_to_merge_1,cols_to_merge_2,merge_keys):
    if '480_600' in df['window_label'].values:
        df_3= df[df['window_label']=='480_600'].copy()
        df_3 = reduce_mem_usage(df_3)  
        df_3 = df_3.merge(pivot_data[merge_keys + cols_to_merge_1], on=merge_keys, how='left')
        df_3 = df_3.merge(pivot_data[merge_keys + cols_to_merge_2], on=merge_keys, how='left')
        #Dropping cols which aren't neccessary
        cols_df3=['order_flow_imbalance','bid_size','matched_size','relative_imbalance']
        cols_to_drop = [col for col in df_3.columns if any(substr in col for substr in cols_df3)]
        cols_df2 = ['window_end']
        cols_to_drop2 = [col for col in df_3.columns 
                        if any(substr in col for substr in cols_df2) and '0_300' not in col]
        cols_to_drop3 = [col for col in df_3.columns 
                        if any(substr in col for substr in cols_df2) and '300_480' not in col]
        
        cols_to_drop1 = [col for col in df_3.columns 
                        if any(substr in col for substr in ['near_price']) and '0_300' in col]
        cols_to_drop_fin=cols_to_drop1+cols_to_drop2+cols_to_drop+['time_id','row_id','window_label']
        df_3 = df_3.drop(columns=cols_to_drop_fin)
        df_3 = reduce_mem_usage(df_3) 
        df_3 =df_3.copy()
    else:
        df_3 = pd.DataFrame(columns=df.columns.union(cols_to_merge_2))  # Or skip entirely
    return df_3

In [8]:
metric_list=['wap_diff','order_flow_imbalance','auction_signal_strength','bid_size','matched_size','stock_vs_index_wap_ratio','near_price','spread','relative_imbalance']

def full_preprocessing_pipeline(data,metric_list, is_test=True):
    
    # 1. Add domain-specific features
    df = build_metrics(data) 
    df=df.sort_values(['stock_id', 'date_id', 'seconds_in_bucket']).copy()
    if not is_test:
        df=df[~(df['stock_id'].astype(int).isin([19,101,158,131]))].copy()
        
    # 2. Building basic metrics like min max avg and index metric to showcase variation across all stocks
    group_level_basic={'sd':['stock_id', 'date_id'],'sw':['stock_id', 'date_id','window_label'],'all':['date_id']}
    common_list=['max','mean','min','std']
    df=basic_agg_metrics(group_level_basic,df,common_list,metric_list)

    # 3. Building basic metrics like delta and window beg window end etc..
    group_level={'sd':['stock_id', 'date_id'],'sw':['stock_id', 'date_id','window_label']}
    df=delta_beg_metrics(df,group_level,metric_list)

    # 4. Rolling Mean across X days 
    df=rolling_mean(df, window_size=10, metric_list=metric_list)
    
    # 5. Window level agg data
    pivot_df=window_agg(df,metric_list)

    # 6. Creating df for 0-300 window 
    df_1=window_0_300_data(df) 

    # 7. Null Imputation 
    df=na_imputation(df)

    #Historical window metrics
    merge_keys = ['stock_id', 'date_id']
    cols_to_merge_1 = [col for col in pivot_df.columns if '0_300' in col]
    cols_to_merge_2 = [col for col in pivot_df.columns if '300_480' in col]
    
    # 8. Creating df for 300-480 window 
    df_2=window_300_480(df,pivot_df,cols_to_merge_1,merge_keys)

    # 9. Creating df for 480-600 window 
    df_3=window_480_600(df,pivot_df,cols_to_merge_1,cols_to_merge_2,merge_keys)
    return df_1,df_2,df_3


In [9]:
metric_list=['wap_diff','order_flow_imbalance','auction_signal_strength','bid_size','matched_size','stock_vs_index_wap_ratio','near_price','spread','relative_imbalance']
train_0_300, train_300_480, train_480_600 = full_preprocessing_pipeline(train_df,metric_list,is_test=False)



Initial memory usage: 1738.49 MB
Column 'stock_id': int64 → int16
Column 'date_id': int64 → int16
Column 'seconds_in_bucket': int64 → int16
Column 'imbalance_size': float64 → float32
Column 'imbalance_buy_sell_flag': int64 → int8
Column 'reference_price': float64 → float16
Column 'matched_size': float64 → float32
Column 'far_price': float64 → float16
Column 'near_price': float64 → float16
Column 'bid_price': float64 → float16
Column 'bid_size': float64 → float32
Column 'ask_price': float64 → float16
Column 'ask_size': float64 → float32
Column 'wap': float64 → float16
Column 'target': float64 → float16
Column 'time_id': int64 → int16
Column 'spread': float64 → float16
Column 'mid_price': float64 → float16
Column 'wap_diff': float64 → float16
Column 'order_flow_imbalance': float64 → float16
Column 'relative_imbalance': float64 → float16
Column 'is_buy_pressure': int64 → int8
Column 'auction_signal_strength': float64 → float16
Column 'rolling_avg_imbalance': float64 → float32
Column 'sec

# Model Training

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from catboost import CatBoostRegressor, Pool
import numpy as np

In [11]:
if not train_0_300.empty:
    cat_features = ['stock_id']
    # Create Y_1 as the target column
    Y_1 = train_0_300['target']
    # Create X_1 by dropping only the target column
    X_1 = train_0_300.drop(columns=['target'])
    final_model_df_1 = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.05,
        depth=8,
        l2_leaf_reg=3,
        cat_features=cat_features,
        task_type='GPU',
        devices='0',
        loss_function='MAE',
        early_stopping_rounds=50,
        verbose=50
    )
    final_model_df_1.fit(X_1, Y_1, cat_features=cat_features)

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 6.8797582	total: 452ms	remaining: 7m 31s
50:	learn: 6.8578939	total: 11.4s	remaining: 3m 32s
100:	learn: 6.8401456	total: 22.2s	remaining: 3m 17s
150:	learn: 6.8256737	total: 33.1s	remaining: 3m 5s
200:	learn: 6.8138446	total: 43.8s	remaining: 2m 54s
250:	learn: 6.8041331	total: 54.6s	remaining: 2m 42s
300:	learn: 6.7960712	total: 1m 5s	remaining: 2m 31s
350:	learn: 6.7893318	total: 1m 15s	remaining: 2m 20s
400:	learn: 6.7837056	total: 1m 26s	remaining: 2m 9s
450:	learn: 6.7789630	total: 1m 37s	remaining: 1m 58s
500:	learn: 6.7749057	total: 1m 47s	remaining: 1m 47s
550:	learn: 6.7714313	total: 1m 58s	remaining: 1m 36s
600:	learn: 6.7683699	total: 2m 8s	remaining: 1m 25s
650:	learn: 6.7657207	total: 2m 19s	remaining: 1m 14s
700:	learn: 6.7633280	total: 2m 30s	remaining: 1m 4s
750:	learn: 6.7612190	total: 2m 40s	remaining: 53.3s
800:	learn: 6.7593021	total: 2m 51s	remaining: 42.5s
850:	learn: 6.7575660	total: 3m 1s	remaining: 31.8s
900:	learn: 6.7559499	total: 3m 11s	remaining:

## Model Training 300-470 seconds

In [12]:
if not train_300_480.empty:
    # Create Y_2 as the target column
    Y_2 = train_300_480['target']
    
    # Create X_2 by dropping only the target column
    X_2 = train_300_480.drop(columns=['target'])
    cat_features = ['stock_id']
    
    # Train the final model using best hyperparameters
    final_model_df_2 = CatBoostRegressor(
        depth=8,
        learning_rate=0.01,
        l2_leaf_reg=1,
        iterations=500,
        task_type='GPU',
        devices='0',
        verbose=100,
        random_state=42
    )
    
    final_model_df_2.fit(X_2, Y_2, cat_features=cat_features)

0:	learn: 8.3827854	total: 193ms	remaining: 1m 36s
100:	learn: 8.2551006	total: 12.5s	remaining: 49.3s
200:	learn: 8.2015454	total: 24.5s	remaining: 36.5s
300:	learn: 8.1631689	total: 36.5s	remaining: 24.1s
400:	learn: 8.1309278	total: 48.5s	remaining: 12s
499:	learn: 8.1038490	total: 1m	remaining: 0us


## Model Training 480-540 seconds

In [13]:
if not train_480_600.empty:
    # Step 1: Split into X_3 and y_3
    X_3 = train_480_600.drop(columns=['target'])
    y_3 = train_480_600['target']
    cat_cols = ['stock_id']
    
    # Step 3: Train final model on all data with best parameters
    final_model_df_3 = CatBoostRegressor(
        depth=9,
        iterations=200,
        l2_leaf_reg=46.41588833612777,
        learning_rate=0.07,
        verbose=0,
        random_state=42
    )       
    final_model_df_3.fit(X_3, y_3, cat_features=cat_cols)

## Test Functions

In [14]:
def get_last_n_days_per_stock(df, n=10):
    """
    Return the last n days of data per stock_id, preserving sort by time.
    """
    return (
        df.sort_values(['stock_id', 'date_id', 'seconds_in_bucket'])
          .groupby('stock_id', group_keys=False)
          .apply(lambda x: x[x['date_id'] >= x['date_id'].max() - (n - 1)])
          .reset_index(drop=True)
    )
def test_preprocess(cache,test,metric_list,key_cols):

    # Merge test + cache and drop duplicates (keep test row)
    merged = pd.concat([cache, test], ignore_index=True)
    merged.drop_duplicates(subset=key_cols, keep='last', inplace=True)
    cache = merged.copy()

    # Run preprocessing (returns 3 windows)
    df_0_300, df_300_480, df_480_600 = full_preprocessing_pipeline(cache,metric_list,is_test=True)

    # Keep only current test batch rows
    test_keys = test[key_cols].drop_duplicates()
    df_0_300 = df_0_300.merge(test_keys, on=key_cols, how='inner')
    df_300_480 = df_300_480.merge(test_keys, on=key_cols, how='inner')
    df_480_600 = df_480_600.merge(test_keys, on=key_cols, how='inner')
    return cache,df_0_300,df_300_480,df_480_600
    

# Submission

In [15]:
# Initial cache: last N days per stock from train
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()


In [16]:
metric_list=['wap_diff','order_flow_imbalance','auction_signal_strength','bid_size','matched_size','stock_vs_index_wap_ratio','near_price','spread','relative_imbalance']
cache = get_last_n_days_per_stock(train_df, n=10)
key_cols = ['stock_id', 'date_id', 'seconds_in_bucket']
# Start test loop
for test, revealed_targets, sample_prediction in iter_test:
    #Initialize final predictions using row_id
    preds_df1 = test[['row_id']].copy()
    preds_df1['target'] = 0.0
    cache,df_0_300,df_300_480,df_480_600=test_preprocess(cache,test,metric_list,key_cols)
    #Predict separately for each window if not empty
    if not df_0_300.empty:
        #df_0_300['stock_id'] = df_0_300['stock_id'].astype(str)  # Same as training
        col_0_300=train_0_300.columns
        preds_0 = final_model_df_1.predict(df_0_300[col_0_300])
        ids_0 = test.merge(df_0_300, on=key_cols, how='inner')['row_id']
        preds_df1.loc[preds_df1['row_id'].isin(ids_0), 'target'] = preds_0

    if not df_300_480.empty:
        col_300_480=train_300_480.columns
       # df_300_480['stock_id'] = df_300_480['stock_id'].astype(str)  # Same as training
        preds_1 = final_model_df_2.predict(df_300_480[col_300_480])
        ids_1 = test.merge(df_300_480, on=key_cols, how='inner')['row_id']
        preds_df1.loc[preds_df1['row_id'].isin(ids_1), 'target'] = preds_1

    if not df_480_600.empty:
       # df_480_600['stock_id'] = df_480_600['stock_id'].astype(str)  # Same as training
        col_480_600=train_480_600.columns
        preds_2 = final_model_df_3.predict(df_480_600[col_480_600])
        ids_2 = test.merge(df_480_600, on=key_cols, how='inner')['row_id']
        preds_df1.loc[preds_df1['row_id'].isin(ids_2), 'target'] = preds_2
    # Submit predictions
    sample_prediction['target'] = preds_df1['target'].values
    env.predict(sample_prediction)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.

Initial memory usage: 32.13 MB
Column 'stock_id': int64 → int16
Column 'date_id': int64 → int16
Column 'seconds_in_bucket': int64 → int16
Column 'imbalance_size': float64 → float32
Column 'imbalance_buy_sell_flag': int64 → int8
Column 'reference_price': float32 → float16
Column 'matched_size': float64 → float32
Column 'far_price': float32 → float16
Column 'near_price': float32 → float16
Column 'bid_price': float32 → float16
Column 'bid_size': float64 → float32
Column 'ask_price': float32 → float16
Column 'ask_size': float64 → float32
Column 'wap': float32 → float16
Column 'time_id': float64 → float16
Column 'spread': float32 → float16
Column 'mid_price': float32 → float16
Column 'wap_diff': float32 → float16
Column 'order_flow_imbalance': float64 → float16
Column 'relative_imbalance': float64 → float16
Column 'is_buy_pressure': int64 → int8
Column 'auction_signa