In [33]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
import numpy as np

from numpy.random import default_rng
RANDOM_STATE = 2 # random state for default_rng
rng = default_rng(RANDOM_STATE)


import random
#import altair as alt
from tqdm import tqdm
import datetime

# CHECKLIST for Kaggle variant
# USE_ALL_STOCK_IDS False to check then True
# USE_TEST_LOCAL_6_ITEMS must be False else we override the local test data
# TEST_SIZE must be 0 to get all items
# Check on Kaggle that "internet" is disabled
# First run with "USE_ALL_STOCK_IDS=False", flip to True, Save Version, it'll take 30 mins to run

# CHECKLIST for home variant
# USE_ALL_STOCK_IDS False for fast dev, True for proper testing
# USE_TEST_LOCAL_6_ITEMS False for fast dev, True for proper testing
# NBR_FOR_SUBSET_OF_STOCK_IDS 4 for quick testing

t1_notebook_start = datetime.datetime.utcnow()

if os.environ.get('USER') == 'ian':
    ENV_HOME = True
    import ipython_memory_usage
    %ipython_memory_usage_start
    USE_ALL_STOCK_IDS = False
    NBR_FOR_SUBSET_OF_STOCK_IDS = 4
    #TEST_SIZE = 0.25 # for single train/test split
    TEST_SIZE = 0
    #USE_TEST_LOCAL_6_ITEMS = True # robust local testing at home
    USE_TEST_LOCAL_6_ITEMS = False # robust local testing at home TEMPORARY WHILST DEBUGGING
    
    from joblib import Memory
    memory = Memory(location='joblib_cache', verbose=0)

else:
    ENV_HOME = False
    USE_ALL_STOCK_IDS = False # for KAGGLE on first-upload for a quick test
    TEST_SIZE = 0
    USE_TEST_LOCAL_6_ITEMS = False
    NBR_FOR_SUBSET_OF_STOCK_IDS = 4
    # kaggle notes:
    # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
    # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


if USE_ALL_STOCK_IDS:
    NBR_FOR_SUBSET_OF_STOCK_IDS=None
print(f'ENV_HOME: {ENV_HOME}, TEST_SIZE {TEST_SIZE}, USE_ALL_STOCK_IDS {USE_ALL_STOCK_IDS}, USE_TEST_LOCAL_6_ITEMS {USE_TEST_LOCAL_6_ITEMS}')
print(f'NBR_FOR_SUBSET_OF_STOCK_IDS: {NBR_FOR_SUBSET_OF_STOCK_IDS}')

ENV_HOME: True, TEST_SIZE 0, USE_ALL_STOCK_IDS False, USE_TEST_LOCAL_6_ITEMS False
NBR_FOR_SUBSET_OF_STOCK_IDS: 4
In [33] used 0.1797 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 395.67 MiB


In [34]:
# OR PASTE IN UTILITY CODE HERE FOR KAGGLE
from utility import make_unique_time_ids, get_training_stock_ids, rmspe_score
from utility import ROOT, TEST_CSV, TRAIN_CSV

In [34] used 0.0000 MiB RAM in 0.10s, peaked 0.00 MiB above current, total RAM usage 395.67 MiB


## Load train set

In [35]:
stock_ids = get_training_stock_ids('book_train.parquet') # all stocks by default

if not USE_ALL_STOCK_IDS:
    # choose a random subset
    print(f"Using a subset of {NBR_FOR_SUBSET_OF_STOCK_IDS}")
    rng.shuffle(stock_ids)
    #random.shuffle(stock_ids)
    stock_ids = stock_ids[:NBR_FOR_SUBSET_OF_STOCK_IDS]
else:
    print("Using all")
stock_ids[:3] # expect 59, 58, 23 if we're using all or 76, 73, 0 on the RANDOM_STATE of 1 if we don't use all stock ids

Using a subset of 4


[119, 2, 16]

In [35] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 395.67 MiB


In [36]:
df_train_all = pd.read_csv(TRAIN_CSV)
df_train_all = df_train_all.set_index(['stock_id', 'time_id'])
print(df_train_all.shape)
#rows_for_stock_id_0 = df_train_all.query('stock_id == 0').shape[0]
#rows_for_stock_id_0

(428932, 1)
In [36] used 0.0273 MiB RAM in 0.24s, peaked 0.00 MiB above current, total RAM usage 395.70 MiB


In [37]:
def show_details(df):
    try:
        nbr_index_levels = len(df.index.levels)
    except AttributeError:
        nbr_index_levels = 1
    nbr_nulls = df.isnull().sum().sum()
    #nulls_msg = "Has no nulls"
    #if nbr_nulls==0:
    nulls_msg = f"{nbr_nulls} nulls"
    is_view_msg = f'is_view {df_train_all._data.is_view}'
    is_single_block_msg = f'is_single_block {df_train_all._data.is_single_block}'
    is_consolidated_msg = f'is_consolidated {df_train_all._data.is_consolidated()}'    
    print(f'[{nbr_index_levels}c] {df.shape[0]:,}x{df.shape[1]:,}, {nulls_msg}, {is_view_msg}, {is_single_block_msg}, {is_consolidated_msg}')

show_details(df_train_all)

[2c] 428,932x1, 0 nulls, is_view True, is_single_block True, is_consolidated True
In [37] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 395.70 MiB


In [38]:
all_time_ids = df_train_all.reset_index().time_id.unique()
rng.shuffle(all_time_ids)
print(f"We have {len(all_time_ids):,} time ids")
time_ids_train, time_ids_test = make_unique_time_ids(all_time_ids, test_size=TEST_SIZE)
assert len(time_ids_train) + len(time_ids_test) == len(all_time_ids)
assert len(time_ids_train.intersection(time_ids_test)) == 0, "Expecting no overlap between train and test time ids"
print(f"Example time ids for training, min first: {sorted(list(time_ids_train))[:5]}")

We have 3,830 time ids
Taking 3,830 for train and 0 for test
Example time ids for training, min first: [5, 11, 16, 31, 62]
In [38] used 0.0664 MiB RAM in 0.12s, peaked 0.00 MiB above current, total RAM usage 395.77 MiB


In [39]:
# make feature columns
def make_features_stats(df_book, agg_type, cols):
    features_var1 = df_book.groupby(['stock_id', 'time_id'])[cols].agg(agg_type)
    #print(type(features_var1))
    if isinstance(features_var1, pd.Series):
        # .size yields a series not a df
        #features_var1.name = str(agg_type)
        features_var1 = pd.DataFrame(features_var1, columns=[agg_type])
        #pass
    else:
        features_var1_col_names = [f"{col}_{agg_type}" for col in cols]
        features_var1.columns = features_var1_col_names
    return features_var1

if True: # lightweight tests
    df_book_train_stock_XX = pd.read_parquet(os.path.join(ROOT, f"book_train.parquet/stock_id=0"))
    df_book_train_stock_XX["stock_id"] = 0
    df_book_train_stock_XX = df_book_train_stock_XX.set_index(['stock_id', 'time_id'])
    display(make_features_stats(df_book_train_stock_XX, 'nunique', ['ask_size1']).head())

Unnamed: 0_level_0,Unnamed: 1_level_0,ask_size1_nunique
stock_id,time_id,Unnamed: 2_level_1
0,5,67
0,11,26
0,16,22
0,31,30
0,62,54


In [39] used 106.7109 MiB RAM in 0.44s, peaked 0.00 MiB above current, total RAM usage 502.48 MiB


In [40]:
def log_return(list_stock_prices):
    return np.log(list_stock_prices).diff()

def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))

In [40] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 502.48 MiB


In [41]:
def _realized_volatility_weighted_sub(ser, weights):
    ser_weighted = ser * weights
    return np.sqrt(np.sum(ser_weighted**2))

def realized_volatility_weighted(ser, weights_type):
    """Weighted volatility"""
    # as a numpy array
    # we drop from 12us to 3us by adding @njit to the _sub function
    # we can't make _sub a closure, it loses all compilation benefits
    # and we can't add njit(cache=True) in Jupyter as it can't
    # find a cache location    
    # as a Series we have 5us and 15us w/wo @njit respectively
    if isinstance(ser, pd.Series):
        ser = ser.to_numpy()
    nbr_items = ser.shape[0]
    if weights_type == 'uniform':
        weights = np.ones(nbr_items)
    elif weights_type == 'linear':
        weights = np.linspace(0.1, 1, nbr_items) # linear increasing weight
    elif weights_type == 'half0half1':
        half_way = int(ser.shape[0] / 2)
        weights = np.concatenate((np.zeros(half_way), np.ones(ser.shape[0] - half_way))) # 0s then 1s weight
    elif weights_type == 'geom':
        weights = np.geomspace(0.01, 1, nbr_items) # geometric increase
    #assert isinstance(weights_type, str) == False, f"Must not be a string like '{weights}' at this point"
    return _realized_volatility_weighted_sub(ser, weights)

if True:
    series_log_return = pd.Series(np.linspace(0, 10, 6))
    print(realized_volatility_weighted(series_log_return, weights_type="uniform"))

    #%timeit realized_volatility_weighted(series_log_return, weights_type="uniform")

14.832396974191326
In [41] used 0.0195 MiB RAM in 0.10s, peaked 0.00 MiB above current, total RAM usage 502.50 MiB


In [42]:
def make_wap(df_book_data, num=1, wap_colname="wap"):
    """Modifies df_book_data"""
    assert num==1 or num==2
    wap_numerator = (df_book_data[f'bid_price{num}'] * df_book_data[f'ask_size{num}'] +
                                     df_book_data[f'ask_price{num}'] * df_book_data[f'bid_size{num}'])
    wap_denominator = df_book_data[f'bid_size{num}'] + df_book_data[f'ask_size{num}']
    df_book_data[wap_colname] = wap_numerator / wap_denominator

@memory.cache
def make_realized_volatility(df_book_data, log_return_name='log_return', wap_colname='wap', weights=None):
    """Consume wap column"""
    df_book_data[log_return_name] = df_book_data.groupby(['stock_id', 'time_id'])[wap_colname].apply(log_return)
    df_book_data = df_book_data[~df_book_data[log_return_name].isnull()]
    df_realized_vol_per_stock =  pd.DataFrame(df_book_data.groupby(['stock_id', 'time_id'])[log_return_name].agg(realized_volatility_weighted, weights))
    return df_realized_vol_per_stock

In [42] used 0.0000 MiB RAM in 0.10s, peaked 0.00 MiB above current, total RAM usage 502.50 MiB


In [43]:
if True: # lightweight tests
    df_book_train_stock_XX = pd.read_parquet(os.path.join(ROOT, f"book_train.parquet/stock_id=0"))
    df_book_train_stock_XX["stock_id"] = 0
    df_book_train_stock_XX = df_book_train_stock_XX.set_index(['stock_id', 'time_id'])
    make_wap(df_book_train_stock_XX, 2) # adds 'wap' column
    #df_realized_vol_per_stockXX = make_realized_volatility(df_book_train_stock_XX, log_return_name="log_return2", weights='linear')
    #display(df_realized_vol_per_stockXX)

In [43] used 25.4062 MiB RAM in 0.29s, peaked 10.00 MiB above current, total RAM usage 527.90 MiB


In [44]:
@memory.cache
def load_data_build_features(stock_id, ROOT, filename, cols, df_target):
    # filename e.g. book_train.parquet
    assert isinstance(stock_id, int)
    df_book_train_stock_X = pd.read_parquet(
        os.path.join(ROOT, f"{filename}/stock_id={stock_id}")
    )
    df_book_train_stock_X["stock_id"] = stock_id
    df_book_train_stock_X = df_book_train_stock_X.set_index(['stock_id', 'time_id'])
    #assert df_book_train_stock_X.shape[0] > rows_for_stock_id_0, (df_book_train_stock_X.shape[0], rows_for_stock_id_0)
    
    #df_book_train_stock_X_gt500 = df_book_train_stock_X.query("seconds_in_bucket>500").copy()
    #df_realized_vol_per_stock_short500 = add_wap_make_realized_volatility(df_book_train_stock_X_gt500, log_return_name='log_return_gt500sec')
    #df_book_train_stock_X_gt300 = df_book_train_stock_X.query("seconds_in_bucket>300").copy()
    #df_realized_vol_per_stock_short300 = add_wap_make_realized_volatility(df_book_train_stock_X_gt300, log_return_name='log_return_gt300sec')
    make_wap(df_book_train_stock_X, 2, "wap2") 
    df_realized_vol_per_stock_wap2_uniform = make_realized_volatility(df_book_train_stock_X, log_return_name="log_return2_uniform", wap_colname="wap2", weights='uniform')    
    df_realized_vol_per_stock_wap2_linear = make_realized_volatility(df_book_train_stock_X, log_return_name="log_return2_linear", wap_colname="wap2", weights='linear')
    df_realized_vol_per_stock_wap2_half0half1 = make_realized_volatility(df_book_train_stock_X, log_return_name="log_return2_half0half1", wap_colname="wap2", weights='half0half1')
    make_wap(df_book_train_stock_X, 1, "wap") # adds 'wap' column
    df_realized_vol_per_stock_wap1_uniform = make_realized_volatility(df_book_train_stock_X, log_return_name="log_return1_uniform", weights='uniform')
    df_realized_vol_per_stock_wap1_linear = make_realized_volatility(df_book_train_stock_X, log_return_name="log_return1_linear", weights='linear')
    df_realized_vol_per_stock_wap1_half0half1 = make_realized_volatility(df_book_train_stock_X, log_return_name="log_return1_half0half1", weights='half0half1')
    
    features_var1 = make_features_stats(df_book_train_stock_X, 'var', cols)
    features_mean1 = make_features_stats(df_book_train_stock_X, 'mean', cols)
    features_size1 = make_features_stats(df_book_train_stock_X, 'size', cols)
    features_min1 = make_features_stats(df_book_train_stock_X, 'min', cols)
    features_max1 = make_features_stats(df_book_train_stock_X, 'max', cols)
    features_nunique1 = make_features_stats(df_book_train_stock_X, 'nunique', cols)
    
    df_train_stock_X = df_target.query('stock_id == @stock_id')
    to_merge = [df_train_stock_X, 
                features_var1, features_mean1, features_size1, 
                features_min1, features_max1, features_nunique1,
                df_realized_vol_per_stock_wap1_uniform,
                df_realized_vol_per_stock_wap2_uniform,
                df_realized_vol_per_stock_wap1_linear,
                df_realized_vol_per_stock_wap2_linear,
                df_realized_vol_per_stock_wap1_half0half1,
                df_realized_vol_per_stock_wap2_half0half1]
    row_lengths = [df.shape[0] for df in to_merge]
    assert len(set(row_lengths)) == 1, row_lengths # should all be same length
    train_merged = pd.concat(to_merge, axis=1)
    
    if 'target' in train_merged.columns:
        features = train_merged.drop(columns='target').columns
        #print(features)
        assert len(set(features)) == len(features), f"Feature duplication! {len(set(features))} vs {len(features)}"

    return train_merged

#if 'memory' in dir():
#    # only setup local cache if we're running locally in development
#    load_data_build_features = memory.cache(load_data_build_features)
    
cols = ['bid_price1', 'ask_price1', 'bid_price2', 'ask_price2',] 
cols += ['bid_size1', 'ask_size1', 'bid_size2', 'ask_size2']

if True:    
    # test...
    train_mergedXX = load_data_build_features(0, ROOT, 'book_train.parquet', cols, df_train_all)
    display(train_mergedXX)

Unnamed: 0_level_0,Unnamed: 1_level_0,target,bid_price1_var,ask_price1_var,bid_price2_var,ask_price2_var,bid_size1_var,ask_size1_var,bid_size2_var,ask_size2_var,bid_price1_mean,...,bid_size1_nunique,ask_size1_nunique,bid_size2_nunique,ask_size2_nunique,log_return1_uniform,log_return2_uniform,log_return1_linear,log_return2_linear,log_return1_half0half1,log_return2_half0half1
stock_id,time_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,5,0.004136,3.557702e-07,3.609267e-07,3.278967e-07,3.649655e-07,6592.215309,4338.669743,7348.756507,5184.163572,1.003314,...,44,67,32,46,0.004499,0.006999,0.002517,0.004500,0.003051,0.004940
0,11,0.001445,8.048454e-08,4.782809e-08,5.987733e-08,4.715270e-08,15492.325402,9691.320578,8465.001985,7875.531633,1.000011,...,58,26,26,44,0.001204,0.002476,0.000904,0.001749,0.000976,0.001981
0,16,0.002168,5.057495e-07,6.204843e-07,5.307066e-07,6.152997e-07,4978.115912,9259.747269,5674.860251,4586.227415,0.999204,...,43,22,43,21,0.002369,0.004801,0.001504,0.003284,0.001898,0.003759
0,31,0.002195,4.746649e-07,3.223060e-07,5.349445e-07,3.222207e-07,8093.880602,10996.043697,5778.103922,6483.167437,0.998445,...,24,30,24,34,0.002574,0.003637,0.001665,0.002465,0.001840,0.002868
0,62,0.001747,4.740689e-08,3.653909e-08,5.247379e-08,3.838027e-08,9499.414513,9603.210909,7059.243117,4527.422208,0.999407,...,23,54,26,43,0.001894,0.003257,0.001402,0.001806,0.001519,0.002106
0,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,32751,0.002611,3.989057e-07,5.059281e-07,3.846314e-07,4.818288e-07,9490.991605,10394.996610,4828.041246,9333.874374,0.997639,...,54,63,31,57,0.002579,0.003821,0.001525,0.002343,0.001617,0.002506
0,32753,0.001190,1.837252e-07,2.433933e-07,2.033434e-07,2.618266e-07,28270.493701,20407.077646,15863.063249,10575.195643,1.000141,...,36,59,39,40,0.002206,0.002847,0.001252,0.001905,0.001487,0.002265
0,32758,0.004264,5.668076e-07,4.536424e-07,6.043470e-07,4.448702e-07,6587.428917,7851.263511,2219.748436,5428.886648,0.999334,...,37,42,35,20,0.002913,0.003266,0.001542,0.002118,0.001600,0.002473
0,32763,0.004352,1.058614e-07,1.051107e-07,1.070506e-07,1.431541e-07,7244.247983,6607.922889,8062.183603,5552.617083,1.002087,...,39,42,27,41,0.003046,0.005105,0.001840,0.003160,0.002004,0.003813


In [44] used -89.9727 MiB RAM in 15.20s, peaked 180.06 MiB above current, total RAM usage 437.93 MiB


In [45]:
from joblib import Parallel, delayed
print(f'Iterating over {len(stock_ids)} stocks:')

all_train_merged = Parallel(n_jobs=-1, verbose=10)(delayed(load_data_build_features)(stock_id, ROOT, 'book_train.parquet', cols, df_train_all) for stock_id in stock_ids)

Iterating over 4 stocks:


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:   21.4s remaining:   21.4s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   24.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   24.5s finished


In [45] used 5.5000 MiB RAM in 24.71s, peaked 0.00 MiB above current, total RAM usage 443.43 MiB


In [46]:
# join all the partial results back together
train_merged = pd.concat(all_train_merged)
show_details(train_merged)

[2c] 15,320x48, 0 nulls, is_view True, is_single_block True, is_consolidated True
In [46] used 0.0039 MiB RAM in 0.12s, peaked 0.00 MiB above current, total RAM usage 443.43 MiB


In [47]:
train_merged.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,target,bid_price1_var,ask_price1_var,bid_price2_var,ask_price2_var,bid_size1_var,ask_size1_var,bid_size2_var,ask_size2_var,bid_price1_mean,...,bid_size1_nunique,ask_size1_nunique,bid_size2_nunique,ask_size2_nunique,log_return1_uniform,log_return2_uniform,log_return1_linear,log_return2_linear,log_return1_half0half1,log_return2_half0half1
stock_id,time_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
119,5,0.002571,2.088262e-07,2.104228e-07,2.088271e-07,2.104224e-07,69683.204623,34785.435798,82425.274566,39846.596395,0.999944,...,116,105,174,140,0.002432,0.003155,0.001474,0.001838,0.001576,0.002043
119,11,0.000839,9.473078e-08,9.466105e-08,9.471785e-08,9.466751e-08,60275.111882,71715.790671,81942.876757,91815.565758,1.001272,...,103,127,47,47,0.00098,0.001166,0.000572,0.000768,0.000643,0.00082
119,16,0.002569,2.464497e-06,2.452061e-06,2.464483e-06,2.452053e-06,64523.049254,96091.792742,104998.758126,96312.95174,1.000416,...,90,147,63,69,0.002676,0.003292,0.001743,0.002101,0.002044,0.002428
119,31,0.002115,1.956015e-06,1.950483e-06,1.956018e-06,1.950486e-06,37212.281811,51410.300536,30782.874772,52436.658755,1.001276,...,99,127,43,48,0.001806,0.002271,0.001044,0.001325,0.001187,0.001588
119,62,0.001549,1.487308e-07,1.499866e-07,1.487252e-07,1.499833e-07,125439.070563,103642.298047,52465.13679,995839.920949,1.000219,...,82,126,54,58,0.001225,0.001742,0.000726,0.001041,0.000861,0.001223


In [47] used 0.0000 MiB RAM in 0.15s, peaked 0.00 MiB above current, total RAM usage 443.43 MiB


In [48]:
features = train_merged.drop(columns='target').columns
print(features)
assert len(set(features)) == len(features), f"{len(set(features))} vs {len(features)} features, we should not have any duplicates"

Index(['bid_price1_var', 'ask_price1_var', 'bid_price2_var', 'ask_price2_var',
       'bid_size1_var', 'ask_size1_var', 'bid_size2_var', 'ask_size2_var',
       'bid_price1_mean', 'ask_price1_mean', 'bid_price2_mean',
       'ask_price2_mean', 'bid_size1_mean', 'ask_size1_mean', 'bid_size2_mean',
       'ask_size2_mean', 'size', 'bid_price1_min', 'ask_price1_min',
       'bid_price2_min', 'ask_price2_min', 'bid_size1_min', 'ask_size1_min',
       'bid_size2_min', 'ask_size2_min', 'bid_price1_max', 'ask_price1_max',
       'bid_price2_max', 'ask_price2_max', 'bid_size1_max', 'ask_size1_max',
       'bid_size2_max', 'ask_size2_max', 'bid_price1_nunique',
       'ask_price1_nunique', 'bid_price2_nunique', 'ask_price2_nunique',
       'bid_size1_nunique', 'ask_size1_nunique', 'bid_size2_nunique',
       'ask_size2_nunique', 'log_return1_uniform', 'log_return2_uniform',
       'log_return1_linear', 'log_return2_linear', 'log_return1_half0half1',
       'log_return2_half0half1'],
      dtype

# Features

In [49]:
if TEST_SIZE == 0:
    # probably we're building on Kaggle
    # we need all data for train, there is no test set
    feature_cols = list(features) + ['stock_id']
    df_train_merged = train_merged.reset_index()[feature_cols+['time_id', 'target']]
    X_train = df_train_merged.drop(columns=['target', 'time_id'])
    y_train = df_train_merged['target']
    print(X_train.shape, y_train.shape)
    #X_test.shape, y_train.shape, y_test.shape

(15320, 48) (15320,)
In [49] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 443.43 MiB


In [50]:
if False:
    def train_test_split(df, target_col, time_ids_train, time_ids_test):
        X_train = df.query('time_id in @time_ids_train').drop(columns=[target_col, 'time_id'])
        X_test = df.query('time_id in @time_ids_test').drop(columns=[target_col, 'time_id'])
        y_train = df.query('time_id in @time_ids_train')[target_col]
        y_test = df.query('time_id in @time_ids_test')[target_col]
        return X_train, X_test, y_train, y_test

    feature_cols = list(features) + ['stock_id']
    X_train, X_test, y_train, y_test = train_test_split(train_merged.reset_index()[feature_cols+['time_id', 'target']], 'target', time_ids_train, time_ids_test)
    X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [50] used 0.0000 MiB RAM in 0.10s, peaked 0.00 MiB above current, total RAM usage 443.43 MiB


In [51]:
X_train.head(3)

Unnamed: 0,bid_price1_var,ask_price1_var,bid_price2_var,ask_price2_var,bid_size1_var,ask_size1_var,bid_size2_var,ask_size2_var,bid_price1_mean,ask_price1_mean,...,ask_size1_nunique,bid_size2_nunique,ask_size2_nunique,log_return1_uniform,log_return2_uniform,log_return1_linear,log_return2_linear,log_return1_half0half1,log_return2_half0half1,stock_id
0,2.088262e-07,2.104228e-07,2.088271e-07,2.104224e-07,69683.204623,34785.435798,82425.274566,39846.596395,0.999944,1.000119,...,105,174,140,0.002432,0.003155,0.001474,0.001838,0.001576,0.002043,119
1,9.473078e-08,9.466105e-08,9.471785e-08,9.466751e-08,60275.111882,71715.790671,81942.876757,91815.565758,1.001272,1.001433,...,127,47,47,0.00098,0.001166,0.000572,0.000768,0.000643,0.00082,119
2,2.464497e-06,2.452061e-06,2.464483e-06,2.452053e-06,64523.049254,96091.792742,104998.758126,96312.95174,1.000416,1.000635,...,147,63,69,0.002676,0.003292,0.001743,0.002101,0.002044,0.002428,119


In [51] used 0.0000 MiB RAM in 0.15s, peaked 0.00 MiB above current, total RAM usage 443.43 MiB


In [52]:
#X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [52] used -0.0273 MiB RAM in 0.10s, peaked 0.03 MiB above current, total RAM usage 443.41 MiB


# ML on a train/test split

In [53]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingRegressor

In [53] used 0.0000 MiB RAM in 0.10s, peaked 0.00 MiB above current, total RAM usage 443.41 MiB


In [54]:
#est = LinearRegression()
#est = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=RANDOM_STATE) # default n_estimators==100
#est = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=RANDOM_STATE) # default n_estimators==100
#est = GradientBoostingRegressor(random_state=RANDOM_STATE)
#est = HistGradientBoostingRegressor(random_state=RANDOM_STATE)

# https://xgboost.readthedocs.io/en/latest/python/python_api.html
#tree_method='exact' default
#est = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)
est = xgb.XGBRegressor(tree_method='hist', )

#est = LGBMRegressor()

if TEST_SIZE == 0:
    print('Fitting estimator on all the data')
    est.fit(X_train, y_train)

Fitting estimator on all the data
In [54] used 19.8203 MiB RAM in 0.70s, peaked 0.00 MiB above current, total RAM usage 463.23 MiB


In [55]:
from sklearn.metrics import r2_score

if False:

    print(f"USE_ALL_STOCK_IDS: {USE_ALL_STOCK_IDS}")

    print(f"{df_train_all.reset_index().stock_id.unique().shape[0]} unique stock ids, test set is {TEST_SIZE*100:0.1f}%")
    print(f"Features:", feature_cols)
    print(est)
    if X_test.shape[0] > 0:
        y_pred = est.predict(X_test)
        score = r2_score(y_test, y_pred)
        rmspe = rmspe_score(y_test, y_pred)
        print(f"rmspe score {rmspe:0.3f}, r^2 score {score:0.3f} on {y_pred.shape[0]:,} predictions")
    else:
        print('No testing rows in X_test')

In [55] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 463.23 MiB


In [56]:
%%time

print(f"USE_ALL_STOCK_IDS: {USE_ALL_STOCK_IDS}")

print(f"{df_train_all.reset_index().stock_id.unique().shape[0]} unique stock ids")
      #, test set is {TEST_SIZE*100:0.1f}%")
print(f"Features:", feature_cols)
print(est)

scores = []
if TEST_SIZE > 0:
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupKFold.html
    # note the splits appear to be deterministic, possibly on discovery order
    from sklearn.model_selection import GroupKFold
    train_merged_no_idx = train_merged.reset_index()
    groups = train_merged_no_idx['time_id']
    group_kfold = GroupKFold(n_splits=3)
    X_all = train_merged_no_idx[feature_cols]
    y_all = train_merged_no_idx['target']
    print(group_kfold.get_n_splits(X_all, y_all, groups))
    for train_index, test_index in group_kfold.split(X_all, y_all, groups):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X_all.loc[train_index], X_all.loc[test_index]
        y_train, y_test = y_all.loc[train_index], y_all.loc[test_index]
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        score = r2_score(y_test, y_pred)
        rmspe = rmspe_score(y_test, y_pred)
        print(f"rmspe score {rmspe:0.3f}, r^2 score {score:0.3f} on {y_pred.shape[0]:,} predictions")
        scores.append({'r2': score, 'rmspe': rmspe})

USE_ALL_STOCK_IDS: False
112 unique stock ids
Features: ['bid_price1_var', 'ask_price1_var', 'bid_price2_var', 'ask_price2_var', 'bid_size1_var', 'ask_size1_var', 'bid_size2_var', 'ask_size2_var', 'bid_price1_mean', 'ask_price1_mean', 'bid_price2_mean', 'ask_price2_mean', 'bid_size1_mean', 'ask_size1_mean', 'bid_size2_mean', 'ask_size2_mean', 'size', 'bid_price1_min', 'ask_price1_min', 'bid_price2_min', 'ask_price2_min', 'bid_size1_min', 'ask_size1_min', 'bid_size2_min', 'ask_size2_min', 'bid_price1_max', 'ask_price1_max', 'bid_price2_max', 'ask_price2_max', 'bid_size1_max', 'ask_size1_max', 'bid_size2_max', 'ask_size2_max', 'bid_price1_nunique', 'ask_price1_nunique', 'bid_price2_nunique', 'ask_price2_nunique', 'bid_size1_nunique', 'ask_size1_nunique', 'bid_size2_nunique', 'ask_size2_nunique', 'log_return1_uniform', 'log_return2_uniform', 'log_return1_linear', 'log_return2_linear', 'log_return1_half0half1', 'log_return2_half0half1', 'stock_id']
XGBRegressor(base_score=0.5, booster='gbt

In [57]:
if len(scores) > 0:
    # only show results if we've used cross validation
    df_scores = pd.DataFrame(scores).T
    folds = df_scores.columns.values
    df_scores['std'] = df_scores[folds].std(axis=1)
    df_scores['mean'] = df_scores[folds].mean(axis=1)
    df_scores

In [57] used 0.0000 MiB RAM in 0.10s, peaked 0.00 MiB above current, total RAM usage 463.23 MiB


In [58]:
#if X_test.shape[0] > 0:
if TEST_SIZE > 0:
    df_preds = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    df_preds['abs_diff'] = (df_preds['y_test'] - df_preds['y_pred']).abs()
    display(df_preds.sort_values('abs_diff', ascending=False))

In [58] used 0.0000 MiB RAM in 0.10s, peaked 0.00 MiB above current, total RAM usage 463.23 MiB


In [59]:
#item_to_debug = 32451
#train_merged.reset_index().loc[item_to_debug][['stock_id', 'time_id', 'target']]

In [59] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 463.23 MiB


In [60]:
try:
    #if X_test.shape[0] > 0:
    if TEST_SIZE > 0:
        from yellowbrick.regressor import PredictionError
        visualizer = PredictionError(est)
        visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        ax_subplot = visualizer.show()        
except ModuleNotFoundError:
    print('no yellowbrick')

In [60] used 0.0000 MiB RAM in 0.10s, peaked 0.00 MiB above current, total RAM usage 463.23 MiB


In [61]:
if ENV_HOME:
    import eli5
    display(eli5.show_weights(est, feature_names=feature_cols, top=30))

Weight,Feature
0.3526,log_return1_linear
0.1535,log_return1_uniform
0.0770,log_return2_half0half1
0.0604,log_return2_linear
0.0478,log_return1_half0half1
0.0272,ask_price2_var
0.0185,ask_size2_min
0.0146,log_return2_uniform
0.0117,stock_id
0.0096,bid_size1_var


In [61] used -2.3516 MiB RAM in 0.23s, peaked 2.37 MiB above current, total RAM usage 460.88 MiB


In [62]:
if 'feature_importances_' in dir(est):
    feature_col = 'feature_importances_'
elif 'coef_' in dir(est):
    feature_col = 'coef_'
df_features = pd.DataFrame(zip(getattr(est, feature_col), feature_cols), columns=['importance', 'feature']).set_index('importance')
df_features.sort_index(ascending=False)

Unnamed: 0_level_0,feature
importance,Unnamed: 1_level_1
0.352618,log_return1_linear
0.153496,log_return1_uniform
0.07697,log_return2_half0half1
0.060365,log_return2_linear
0.047845,log_return1_half0half1
0.027248,ask_price2_var
0.018537,ask_size2_min
0.01457,log_return2_uniform
0.011677,stock_id
0.009556,bid_size1_var


In [62] used 0.0000 MiB RAM in 0.16s, peaked 0.00 MiB above current, total RAM usage 460.88 MiB


# Make predictions

In [63]:
len(stock_ids) # expecting 112

4

In [63] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 460.88 MiB


In [64]:
if USE_TEST_LOCAL_6_ITEMS: # True if debugging
    # book train as a substitute
    df_test_all = pd.read_csv(os.path.join(ROOT, 'test_local.csv'))
    df_test_all = df_test_all.rename(columns={'target': 'train_target'})
    TEST_FOLDER = 'book_test_local.parquet'
    assert ENV_HOME == True
else:
    df_test_all = pd.read_csv(TEST_CSV)
    if df_test_all.shape[0] == 3: # kaggle test data
        df_test_all = df_test_all[:1] # cut out 2 rows so predictions work    
    TEST_FOLDER = 'book_test.parquet'
print(ROOT, TEST_FOLDER)
df_test_all = df_test_all.set_index(['stock_id', 'time_id'])

show_details(df_test_all)

/home/ian/data/kaggle/optiver_volatility/ book_test.parquet
[2c] 1x1, 0 nulls, is_view True, is_single_block True, is_consolidated True
In [64] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 460.88 MiB


In [65]:
test_set_predictions = []
stock_ids_test = get_training_stock_ids(TEST_FOLDER) # all stocks by default

df_test_predictions = pd.DataFrame() # prediction set to build up
for stock_id in tqdm(stock_ids_test):
    df_test_all_X = df_test_all.query('stock_id==@stock_id').copy()
    test_merged = load_data_build_features(stock_id, ROOT, TEST_FOLDER, cols, df_test_all)
    test_set_predictions_X = est.predict(test_merged.reset_index()[list(features) + ['stock_id']])
    df_test_all_X['target'] = test_set_predictions_X
    df_test_predictions = pd.concat((df_test_predictions, df_test_all_X))
    
assert df_test_all.shape[0] == df_test_predictions.shape[0], "Expecting all rows to be predicted"

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.59it/s]

In [65] used 0.8008 MiB RAM in 0.33s, peaked 0.00 MiB above current, total RAM usage 461.68 MiB





In [66]:
print(f"Writing {df_test_predictions.shape[0]} rows to submission.csv on {datetime.datetime.utcnow()}")
df_test_predictions.reset_index()[['row_id', 'target']].to_csv('submission.csv', index=False)
show_details(df_test_predictions)
print(f'Notebook took {datetime.datetime.utcnow()-t1_notebook_start} to run')

Writing 1 rows to submission.csv on 2021-08-25 16:57:05.997390
[2c] 1x2, 0 nulls, is_view True, is_single_block True, is_consolidated True
Notebook took 0:00:45.387779 to run
In [66] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 461.68 MiB


In [67]:
if not ENV_HOME:
    assert USE_ALL_STOCK_IDS, "If we're on Kaggle but not using all stock_ids, we're not ready to submit, so fail here to remind me to change USSE_ALL_STOCK_IDS!"

In [67] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 461.68 MiB
