In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (e.g. the project's `equities` package) are re-imported before each cell
# runs and edits to them take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys

# Split the notebook's working directory into its parent folder and its own
# folder name. The parent folder is used below as the root for both the
# project imports and the dataset paths.
parent_folder_path, current_dir = os.path.split(os.path.abspath(''))

# Make the parent folder importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate sys.path entry.
if parent_folder_path not in sys.path:
    sys.path.append(parent_folder_path)

from pathlib import Path
from typing import Optional
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from decimal import Decimal

from equities.data_processing import lobster_preproc
from equities.data_processing import itch_preproc
from equities.data_processing import lobster_encoding
from equities.data_processing import itch_encoding

In [4]:
# Raw-data folders, resolved relative to the parent folder of the notebook.
lobster_load_path = f'{parent_folder_path}/dataset/raw/LOBSTER/'
itch_load_path = f'{parent_folder_path}/dataset/raw/ITCH/'

# locate LOBSTER data
# (sorted, so the i-th message file lines up with the i-th book file when
# the two lists are zipped together later)
lobster_message_files = glob(f'{lobster_load_path}*message*.csv')
lobster_message_files.sort()
lobster_book_files = glob(f'{lobster_load_path}*orderbook*.csv')
lobster_book_files.sort()

print(f'found {len(lobster_message_files)} LOBSTER message files')
print(f'found {len(lobster_book_files)} LOBSTER book files')
print()

# locate ITCH data (same convention; book files match '*book*.csv')
itch_message_files = glob(f'{itch_load_path}*message*.csv')
itch_message_files.sort()
itch_book_files = glob(f'{itch_load_path}*book*.csv')
itch_book_files.sort()

print(f'found {len(itch_message_files)} ITCH message files')
print(f'found {len(itch_book_files)} ITCH book files')

found 1 LOBSTER message files
found 1 LOBSTER book files

found 1 ITCH message files
found 1 ITCH book files


In [5]:
# load LOBSTER data
# NOTE: each iteration rebinds `lobster_messages` / `lobster_book`, so only
# the last message/book pair remains available after the loop.
for m_f, b_f in tqdm(zip(lobster_message_files, lobster_book_files)):
    print(m_f, b_f, sep='\n')

    lobster_messages = lobster_preproc.load_message_df(m_f)

    # the LOBSTER book file is read without a header row or index column
    lobster_book = pd.read_csv(b_f, index_col=False, header=None)

    # messages and book rows must line up one-to-one
    assert len(lobster_messages) == len(lobster_book)

    print(f"LOBSTER messages shape: {lobster_messages.shape}")
    print(f"LOBSTER book shape: {lobster_book.shape}")

# load ITCH data
# NOTE: as above, only the last pair survives the loop.
for m_f, b_f in tqdm(zip(itch_message_files, itch_book_files)):
    print(m_f, b_f, sep='\n')

    itch_messages = itch_preproc.load_message_df(m_f)

    # the ITCH book file is read with pandas defaults (first row = header)
    itch_book = pd.read_csv(b_f)

    # messages and book rows must line up one-to-one
    assert len(itch_messages) == len(itch_book)

    print(f"ITCH messages shape: {itch_messages.shape}")
    print(f"ITCH book shape: {itch_book.shape}")

0it [00:00, ?it/s]

/home/aaron/Documents/Github/MarketSimT/dataset/raw/LOBSTER/AAPL_2012-06-21_34200000_57600000_message_10.csv
/home/aaron/Documents/Github/MarketSimT/dataset/raw/LOBSTER/AAPL_2012-06-21_34200000_57600000_orderbook_10.csv


1it [00:00,  1.52it/s]


LOBSTER messages shape: (400391, 6)
LOBSTER book shape: (400391, 40)


0it [00:00, ?it/s]

/home/aaron/Documents/Github/MarketSimT/dataset/raw/ITCH/03272019.NASDAQ_ITCH50_AAPL_message.csv
/home/aaron/Documents/Github/MarketSimT/dataset/raw/ITCH/03272019.NASDAQ_ITCH50_AAPL_book_20.csv


1it [00:04,  4.35s/it]

ITCH messages shape: (2010136, 12)
ITCH book shape: (2010136, 81)





### Message Processing

In [6]:
# Preview the LOBSTER message frame loaded above; the commented names below
# are the other loaded frames that can be swapped in for inspection.
lobster_messages
# lobster_book

# itch_messages
# itch_book

Unnamed: 0,time,event_type,order_id,size,price,direction
0,34200.004241176,1,16113575,18,5853300,1
1,34200.00426064,1,16113584,18,5853200,1
2,34200.004447484,1,16113594,18,5853100,1
3,34200.025551909,1,16120456,18,5859100,-1
4,34200.025579546,1,16120480,18,5859200,-1
...,...,...,...,...,...,...
400386,57599.444019561,1,287150868,48,5776100,-1
400387,57599.444794893,1,287150931,100,5777200,-1
400388,57599.625827171,3,286898608,100,5774900,1
400389,57599.913117637,4,287150868,48,5776100,-1


In [7]:
# Show the raw schema and one raw row. Row 1 is used because the processed
# sample printed below (row 0 of the output) has the same order_id, i.e. the
# pre-processing appears to drop the very first message.
print("Columns:", list(lobster_messages.columns))
print("Sample:", lobster_messages.values[1])
print()

lobster_tok = lobster_encoding.Message_Tokenizer()

print('<< pre processing LOBSTER dataset >>')
lobster_m_ = lobster_tok.preproc(lobster_messages, lobster_book)

# Labels for the columns of the pre-processed array, in order. The last four
# columns hold the fields of the referenced original order, so they carry a
# `_ref` suffix instead of repeating 'price', 'size', 'time_s', 'time_ns'
# (consistent with the `_ref` labels used for the ITCH data).
lobster_m_columns = ['order_id', 'event_type', 'direction', 'price_abs', 'price', 'size',
                     'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns',
                     # ref fields
                     'price_ref', 'size_ref', 'time_s_ref', 'time_ns_ref']
# guard against the hand-written labels drifting from the actual output width
assert len(lobster_m_columns) == lobster_m_.shape[1], "column labels out of sync with preproc output"

print("Shape:", lobster_m_.shape)
print("Columns:", lobster_m_columns)
print("Sample:", lobster_m_[0])
lobster_m_

Columns: ['time', 'event_type', 'order_id', 'size', 'price', 'direction']
Sample: [Decimal('34200.00426064') 1 16113584 18 5853200 1]

<< pre processing LOBSTER dataset >>
truncating 0.0000% of prices > 99900
truncating 0.0000% of prices < -99900
Shape: (389058, 14)
Columns: ['order_id', 'event_type', 'direction', 'price_abs', 'price', 'size', 'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns', 'price', 'size', 'time_s', 'time_ns']
Sample: [16113584        1        1  5853200      -31       18        0    19464
    34200  4260640    -9999    -9999    -9999    -9999]


array([[ 16113584,         1,         1, ...,     -9999,     -9999,
            -9999],
       [ 16113594,         1,         1, ...,     -9999,     -9999,
            -9999],
       [ 16120456,         1,         0, ...,     -9999,     -9999,
            -9999],
       ...,
       [287150931,         1,         0, ...,     -9999,     -9999,
            -9999],
       [286898608,         3,         1, ...,       100,     57594,
        339740972],
       [287150868,         4,         0, ...,        48,     57599,
        444019561]])

In [None]:
# # filter out only allowed event types ...
# allowed_event_types=[1,2,3,4]
# lobster_messages = lobster_messages.loc[lobster_messages.event_type.isin(allowed_event_types)].copy()
# # ... and corresponding book changes
# lobster_book = lobster_book.loc[lobster_messages.index]

# # TIME
# # DELTA_T: time since previous order --> 4 tokens of length 3
# lobster_messages.insert(
#     loc=1,
#     column='delta_t_ns',
#     value=lobster_messages['time'].diff().fillna(0)
# )
# lobster_messages.insert(
#     loc=1,
#     column='delta_t_s',
#     value=lobster_messages.delta_t_ns.astype(int)
# )
# lobster_messages.delta_t_ns = ((lobster_messages.delta_t_ns % 1) * 1000000000).astype(int)

# lobster_messages.insert(0, 'time_s', lobster_messages.time.astype(int))
# lobster_messages.rename(columns={'time': 'time_ns'}, inplace=True)
# lobster_messages.time_ns = ((lobster_messages.time_ns % 1) * 1000000000).astype(int)

# # SIZE
# lobster_messages.loc[lobster_messages['size'] > 9999, 'size'] = 9999
# lobster_messages['size'] = lobster_messages['size'].astype(int)

# # PRICE
# lobster_messages['price_abs'] = lobster_messages.price  # keep absolute price for later (simulator)
# # mid-price reference, rounded down to nearest tick_size
# tick_size = 100
# p_ref = ((lobster_book.iloc[:, 0] + lobster_book.iloc[:, 2]) / 2).shift()#.round(-2).astype(int).shift()
# p_ref = (p_ref // tick_size) * tick_size
# # --> 1999 price levels // ...00 since tick size is 100
# lobster_messages.price = lobster_tok._preproc_prices(lobster_messages.price, p_ref, p_lower_trunc=-99900, p_upper_trunc=99900)
# lobster_messages = lobster_messages.iloc[1:]
# lobster_messages.price = lobster_messages.price.astype(int)

# # DIRECTION
# lobster_messages.direction = ((lobster_messages.direction + 1) / 2).astype(int)

# # change column order
# lobster_messages = lobster_messages[['order_id', 'event_type', 'direction', 'price_abs', 'price', 'size',
#         'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns']]

# lobster_messages

In [None]:
# # add original message as feature
# # for all referential order types (2, 3, 4)
# modif_types={2,3,4}
# modif_fields=['price', 'size', 'time_s', 'time_ns']
# nan_val=-9999

# m_changes = pd.merge(
#     lobster_messages.loc[lobster_messages.event_type.isin(modif_types)].reset_index(),
#     lobster_messages.loc[lobster_messages.event_type == 1, ['order_id'] + modif_fields],
#     how='left', on='order_id', suffixes=['', '_ref']).set_index('index')

# # m_changes

# # add new empty columns for referenced order
# modif_cols = [field + '_ref' for field in modif_fields]
# lobster_messages[modif_cols] = nan_val

# # replace order changes by original order and additional new fields
# lobster_messages.loc[m_changes.index] = m_changes
# lobster_messages[modif_cols] = lobster_messages[modif_cols].fillna(nan_val).astype(int)

# lobster_messages

In [8]:
# Preview the ITCH message frame as loaded, before the clean-up cell below
# drops 'mpid', filters by time and reformats the time/price columns.
itch_messages

Unnamed: 0,time,type,id,side,size,price,cancSize,execSize,oldId,oldSize,oldPrice,mpid
0,14400006432545,A,13301,1,18.0,207.85,,,,,,
1,14400008777412,A,15969,0,100.0,129.33,,,,,,
2,14400016498868,A,20677,0,1.0,114.94,,,,,,
3,14400017857990,A,22061,0,1.0,98.39,,,,,,
4,14403597489791,A,98453,1,300.0,192.70,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2010131,72000073334596,D,336886329,0,0.0,186.45,100.0,,,,,
2010132,72000073447563,D,342882157,0,0.0,188.00,1.0,,,,,
2010133,72000073477970,D,335312241,0,0.0,175.00,50.0,,,,,
2010134,72000074023439,D,343725045,1,0.0,195.00,2600.0,,,,,


In [9]:
# Clean up the loaded ITCH messages for the tokenizer.
# NOTE: this cell rebinds `itch_messages` and is not idempotent; re-run the
# loading cell before running it a second time.

# remove mpid field from ITCH data
itch_messages = itch_messages.drop(columns=['mpid'])

# remove pre-market and after-market hours from ITCH data
remove_premarket = True # False
remove_aftermarket = True

# session bounds in the raw integer time unit (nanoseconds):
# 34200 s and 57600 s, i.e. 09:30 and 16:00 if time is counted from midnight
market_open_ns = 34_200 * 10**9
market_close_ns = 57_600 * 10**9

if remove_premarket:
    itch_messages = itch_messages[itch_messages['time'] >= market_open_ns]
if remove_aftermarket:
    itch_messages = itch_messages[itch_messages['time'] <= market_close_ns]

# take an explicit copy so the column assignments below modify our own frame
# and do not raise SettingWithCopyWarning on the boolean-filtered slice
itch_messages = itch_messages.copy()

# format time for pre-processing: integer nanoseconds -> Decimal seconds.
# Shifting the decimal exponent by 9 places is exact and, unlike slicing the
# string form after a fixed 5 digits, works for any number of digits.
itch_messages['time'] = itch_messages['time'].apply(lambda ns: Decimal(int(ns)).scaleb(-9))

# convert price to pennies from dollars
# round before casting: a float product such as 114.99999999999999 would
# otherwise be truncated to 114 and the price would end up one cent too low
itch_messages['price'] = (itch_messages['price'] * 100).round().astype('int')
# oldPrice still contains NaNs, so it stays float here (cast to int once the
# NaNs have been dealt with); rounding removes float noise before that cast
itch_messages['oldPrice'] = (itch_messages['oldPrice'] * 100).round()

itch_messages

Unnamed: 0,time,type,id,side,size,price,cancSize,execSize,oldId,oldSize,oldPrice
12347,34200.010884166,A,9449261,1,100.0,18885,,,,,
12348,34200.010956675,A,9449417,0,100.0,15855,,,,,
12349,34200.011021382,A,9449521,1,100.0,21906,,,,,
12350,34200.020725916,A,9460301,1,100.0,19083,,,,,
12351,34200.020728094,A,9460309,0,100.0,18666,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1973112,57599.979846894,R,352941341,1,1400.0,18991,,,352808321.0,1400.0,18992.0
1973113,57599.990456021,A,352942509,0,10.0,18847,,,,,
1973114,57599.998220184,A,352943165,0,100.0,18847,,,,,
1973115,57599.998239790,D,352930741,1,0.0,18850,100.0,,,,


In [10]:
# Schema and one example row of the cleaned frame that goes into the tokenizer.
print(f"Columns: {list(itch_messages.columns)}")
print("Sample:", itch_messages.values[1])
print()

itch_tok = itch_encoding.Message_Tokenizer()

print('<< pre processing ITCH dataset >>')
itch_m_ = itch_tok.preproc(itch_messages, itch_book)

# Report the output shape, followed by hand-written labels for its 17 columns.
print(f"Shape: {itch_m_.shape}")
print("Columns:", [
    'id', 'type', 'side', 'price_abs', 'price', 'size',
    'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns',
    # ref fields
    'cancSize', 'execSize', 'oldId', 'oldSize', 'oldPrice',
    'time_s_ref', 'time_ns_ref',
])
print("Sample:", itch_m_[0])
itch_m_

Columns: ['time', 'type', 'id', 'side', 'size', 'price', 'cancSize', 'execSize', 'oldId', 'oldSize', 'oldPrice']
Sample: [Decimal('34200.010956675') 'A' 9449417 0 100.0 15855 nan nan nan nan nan]

<< pre processing ITCH dataset >>
truncating 1.1448% of prices > 999
truncating 0.8483% of prices < -999
Shape: (1953936, 17)
Columns: ['id', 'type', 'side', 'price_abs', 'price', 'size', 'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns', 'cancSize', 'execSize', 'oldId', 'oldSize', 'oldPrice', 'time_s_ref', 'time_ns_ref']
Sample: [9449417 'A' 0 15855 -999 100 0 72509 34200 10956675 -9999 -9999 -9999
 -9999 -9999 -9999 -9999]


array([[9449417, 'A', 0, ..., -9999, -9999, -9999],
       [9449521, 'A', 1, ..., -9999, -9999, -9999],
       [9460301, 'A', 1, ..., -9999, -9999, -9999],
       ...,
       [352943165, 'A', 0, ..., -9999, -9999, -9999],
       [352930741, 'D', 1, ..., -9999, 57599, 940363730],
       [352930753, 'D', 1, ..., -9999, 57599, 940388253]], dtype=object)

In [None]:
# itch_tok = itch_encoding.Message_Tokenizer()

# # filter out only allowed event types ...
# allowed_event_types=['A','E','C','D','R']
# itch_messages = itch_messages.loc[itch_messages.type.isin(allowed_event_types)].copy()
# # ... and corresponding book changes
# itch_book = itch_book.loc[itch_messages.index]

# # TIME
# # DELTA_T: time since previous order --> 4 tokens of length 3
# itch_messages.insert(
#     loc=1,
#     column='delta_t_ns',
#     value=itch_messages['time'].diff().fillna(0)
# )
# itch_messages.insert(
#     loc=1,
#     column='delta_t_s',
#     value=itch_messages.delta_t_ns.astype(int)
# )
# itch_messages.delta_t_ns = ((itch_messages.delta_t_ns % 1) * 1000000000).astype(int)

# itch_messages.insert(0, 'time_s', itch_messages.time.astype(int))
# itch_messages.rename(columns={'time': 'time_ns'}, inplace=True)
# itch_messages.time_ns = ((itch_messages.time_ns % 1) * 1000000000).astype(int)

# # SIZE
# itch_messages.loc[itch_messages['size'] > 9999, 'size'] = 9999
# itch_messages.loc[itch_messages['oldSize'] > 9999, 'oldSize'] = 9999
# itch_messages['size'] = itch_messages['size'].astype(int)

# # PRICE
# itch_messages['price_abs'] = itch_messages.price  # keep absolute price for later (simulator)
# # mid-price reference, rounded down to nearest tick_size
# tick_size = 1
# p_ref = (((itch_book.iloc[:, 1] * 100) + (itch_book.iloc[:, 3] * 100)) / 2).shift()#.round(-2).astype(int).shift()
# p_ref = (p_ref // tick_size) * tick_size
# # --> 1999 price levels (-999..999); tick size is 1 here since prices are in cents
# itch_messages.price = itch_tok._preproc_prices(itch_messages.price, p_ref, p_lower_trunc=-999, p_upper_trunc=999)
# itch_messages = itch_messages.iloc[1:]
# itch_messages.price = itch_messages.price.astype(int)

# # # DIRECTION
# # itch_messages.direction = ((itch_messages.direction + 1) / 2).astype(int)

# # change column order
# # m = m[['order_id', 'event_type', 'direction', 'price_abs', 'price', 'size',
# #        'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns']]
# itch_messages = itch_messages[['id', 'type', 'side', 'price_abs', 'price', 'size',
#         'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns',
#         'cancSize', 'execSize', 'oldId', 'oldSize', 'oldPrice']]

# itch_messages

In [None]:
# # add time elements of original message as feature and process NaNs
# # for all referential order types ('E','C','D','R')
# modif_types={'E','C','D'}
# modif_types_special={'R'}
# modif_fields=['time_s', 'time_ns']
# ref_cols = ['cancSize', 'execSize', 'oldId', 'oldSize', 'oldPrice']
# nan_val=-9999

# # make df that converts 'R' values to 'A' values
# itch_r_messages = itch_messages.copy()
# itch_r_messages['type'] = itch_r_messages['type'].replace('R', 'A')

# m_changes = pd.merge(
#     itch_messages.loc[itch_messages.type.isin(modif_types)].reset_index(),
#     itch_r_messages.loc[itch_r_messages.type == 'A', ['id'] + modif_fields],
#     how='left', on='id', suffixes=['', '_ref']).set_index('index')

# m_changes

# # find modif_fields of A events that match with R event oldId
# m_changes_special = pd.merge(
#     itch_messages.loc[itch_messages.type.isin(modif_types_special)].reset_index(),
#     (itch_r_messages.loc[itch_r_messages.type == 'A', ['id'] + modif_fields]).rename(columns={'id': 'oldId'}),
#     how='left', on='oldId', suffixes=['', '_ref']).set_index('index')

# m_changes_special

# # add new empty columns for referenced order
# modif_cols = [field + '_ref' for field in modif_fields]
# itch_messages[modif_cols] = nan_val

# # replace order changes by original order and additional new fields
# itch_messages.loc[m_changes.index] = m_changes
# itch_messages.loc[m_changes_special.index] = m_changes_special
# itch_messages[modif_cols] = itch_messages[modif_cols].fillna(nan_val).astype(int)

# # process other ref fields
# itch_messages[ref_cols] = itch_messages[ref_cols].fillna(nan_val).astype(int)

# itch_messages

### Book Processing

In [11]:
# load ITCH data
# (fresh reload of the raw frames; only the last file pair is kept)
for m_f, b_f in tqdm(zip(itch_message_files, itch_book_files)):
    itch_messages = itch_preproc.load_message_df(m_f)
    itch_book = pd.read_csv(b_f)

    # messages and book rows must line up one-to-one
    assert len(itch_messages) == len(itch_book)

    print(f"ITCH messages shape: {itch_messages.shape}")
    print(f"ITCH book shape: {itch_book.shape}")

# remove pre-market and after-market hours from ITCH data
# (flags come from the message clean-up cell; bounds are raw nanosecond times)
if remove_premarket:
    itch_messages = itch_messages.loc[itch_messages['time'] >= 34200000000000]
if remove_aftermarket:
    itch_messages = itch_messages.loc[itch_messages['time'] <= 57600000000000]

# remove disallowed order types
allowed_events=['A','E','C','D','R']
is_allowed_event = itch_messages['type'].isin(allowed_events)
itch_messages = itch_messages[is_allowed_event]
# keep only the book rows whose index survived the message filtering
itch_book = itch_book.loc[itch_messages.index]

filter_above_lvl = 20
itch_messages, itch_book = itch_preproc.filter_by_lvl(itch_messages, itch_book, filter_above_lvl)

# remove time field from ITCH book data
itch_book = itch_book.drop('time', axis='columns')

# filtering must have kept messages and book aligned
assert len(itch_book) == len(itch_messages)
itch_book

1it [00:03,  3.97s/it]

ITCH messages shape: (2010136, 12)
ITCH book shape: (2010136, 81)





Unnamed: 0,1_bid_price,1_bid_vol,1_ask_price,1_ask_vol,2_bid_price,2_bid_vol,2_ask_price,2_ask_vol,3_bid_price,3_bid_vol,...,18_ask_price,18_ask_vol,19_bid_price,19_bid_vol,19_ask_price,19_ask_vol,20_bid_price,20_bid_vol,20_ask_price,20_ask_vol
12347,188.74,100.0,188.85,117.0,188.58,2.0,188.88,50.0,188.49,31.0,...,189.65,305.0,187.60,100.0,189.66,100.0,187.57,50.0,189.68,100.0
12366,188.58,2.0,188.85,117.0,188.49,31.0,188.88,50.0,188.37,100.0,...,189.65,305.0,187.57,50.0,189.66,100.0,187.53,500.0,189.68,100.0
12368,188.58,2.0,188.82,100.0,188.49,31.0,188.85,117.0,188.37,100.0,...,189.60,100.0,187.57,50.0,189.65,305.0,187.53,500.0,189.66,100.0
12369,188.58,2.0,188.75,100.0,188.49,31.0,188.82,100.0,188.37,100.0,...,189.58,900.0,187.57,50.0,189.60,100.0,187.53,500.0,189.65,305.0
12376,188.58,2.0,188.75,100.0,188.49,31.0,188.82,100.0,188.37,100.0,...,189.58,900.0,187.57,50.0,189.60,100.0,187.53,500.0,189.65,305.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1973110,188.47,8320.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973113,188.47,8330.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973114,188.47,8430.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973115,188.47,8430.0,188.50,100.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0


In [12]:
# itch_messages
# Preview the LOBSTER order book frame (read without a header, so its
# columns are labelled by position).
lobster_book

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,5859400,200,5853300,18,5859800,200,5853000,150,5861000,200,...,5845300,300,5876500,1160,5843800,200,5879000,500,5842700,300
1,5859400,200,5853300,18,5859800,200,5853200,18,5861000,200,...,5846500,300,5876500,1160,5845300,300,5879000,500,5843800,200
2,5859400,200,5853300,18,5859800,200,5853200,18,5861000,200,...,5849300,300,5876500,1160,5846500,300,5879000,500,5845300,300
3,5859100,18,5853300,18,5859400,200,5853200,18,5859800,200,...,5849300,300,5873900,100,5846500,300,5876500,1160,5845300,300
4,5859100,18,5853300,18,5859200,18,5853200,18,5859400,200,...,5849300,300,5871000,10,5846500,300,5873900,100,5845300,300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400386,5776100,48,5775400,410,5776700,300,5775300,1400,5776800,200,...,5774600,100,5777600,100,5774500,200,5777700,100,5774300,100
400387,5776100,48,5775400,410,5776700,300,5775300,1400,5776800,200,...,5774600,100,5777500,600,5774500,200,5777600,100,5774300,100
400388,5776100,48,5775400,410,5776700,300,5775300,1400,5776800,200,...,5774600,100,5777500,600,5774500,200,5777600,100,5774300,100
400389,5776700,300,5775400,410,5776800,200,5775300,1400,5776900,160,...,5774600,100,5777600,100,5774500,200,5777800,200,5774300,100


In [None]:
# # process book
# price_levels = 500 # n_price_series

# # mid-price rounded to nearest tick (100)
# p_ref = ((lobster_book.iloc[:, 0] + lobster_book.iloc[:, 2]) / 2).round(-2).astype(int)
# b_indices = lobster_book.iloc[:, ::2].sub(p_ref, axis=0).div(100).astype(int) # div 100 bc tick
# b_indices = b_indices + price_levels // 2 # make fit between span of 0 to price_levels
# b_indices.columns = list(range(b_indices.shape[1])) # reset col indices
# vol_book = lobster_book.iloc[:, 1::2].copy()
# # convert sell volumes (ask side) to negative
# vol_book.iloc[:, ::2] = vol_book.iloc[:, ::2].mul(-1)
# vol_book.columns = list(range(vol_book.shape[1])) # reset col indices

# # convert to book representation with volume at each price level relative to reference price (mid)
# # whilst preserving empty levels to maintain sparse representation of book
# # i.e. at each time we have a fixed width snapshot around the mid price
# # therefore movement of the mid price needs to be a separate feature (e.g. relative to previous price)

# mybook = np.zeros((len(lobster_book), price_levels), dtype=np.int32)

# a = b_indices.values
# for i in range(a.shape[0]):
#     for j in range(a.shape[1]):
#         price = a[i, j]
#         # remove prices outside of price_levels range
#         if price >= 0 and price < price_levels:
#             mybook[i, price] = vol_book.values[i, j]

# # prepend column with mid-price changes (in ticks)
# mid_diff = p_ref.div(100).diff().fillna(0).astype(int).values
# mybook = np.concatenate([mid_diff[:, None], mybook], axis=1)

# mybook

In [13]:
# Preview the filtered ITCH book that is passed to process_book below.
itch_book

Unnamed: 0,1_bid_price,1_bid_vol,1_ask_price,1_ask_vol,2_bid_price,2_bid_vol,2_ask_price,2_ask_vol,3_bid_price,3_bid_vol,...,18_ask_price,18_ask_vol,19_bid_price,19_bid_vol,19_ask_price,19_ask_vol,20_bid_price,20_bid_vol,20_ask_price,20_ask_vol
12347,188.74,100.0,188.85,117.0,188.58,2.0,188.88,50.0,188.49,31.0,...,189.65,305.0,187.60,100.0,189.66,100.0,187.57,50.0,189.68,100.0
12366,188.58,2.0,188.85,117.0,188.49,31.0,188.88,50.0,188.37,100.0,...,189.65,305.0,187.57,50.0,189.66,100.0,187.53,500.0,189.68,100.0
12368,188.58,2.0,188.82,100.0,188.49,31.0,188.85,117.0,188.37,100.0,...,189.60,100.0,187.57,50.0,189.65,305.0,187.53,500.0,189.66,100.0
12369,188.58,2.0,188.75,100.0,188.49,31.0,188.82,100.0,188.37,100.0,...,189.58,900.0,187.57,50.0,189.60,100.0,187.53,500.0,189.65,305.0
12376,188.58,2.0,188.75,100.0,188.49,31.0,188.82,100.0,188.37,100.0,...,189.58,900.0,187.57,50.0,189.60,100.0,187.53,500.0,189.65,305.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1973110,188.47,8320.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973113,188.47,8330.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973114,188.47,8430.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973115,188.47,8430.0,188.50,100.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0


In [None]:
# # process book
# price_levels = 500 # n_price_series

# # mid-price rounded to nearest tick
# p_ref = ((itch_book.iloc[:, 0] + itch_book.iloc[:, 2]) / 2).mul(100).round().astype(int)
# b_indices = itch_book.iloc[:, ::2].mul(100).sub(p_ref, axis=0).astype(int)
# b_indices = b_indices + price_levels // 2 # make tick differences fit between span of 0 to price_levels
# b_indices.columns = list(range(b_indices.shape[1])) # reset col indices
# vol_book = itch_book.iloc[:, 1::2].copy().astype(int)
# # convert sell volumes (ask side) to negative
# vol_book.iloc[:, 1::2] = vol_book.iloc[:, 1::2].mul(-1)
# vol_book.columns = list(range(vol_book.shape[1])) # reset col indices

# # convert to book representation with volume at each price level relative to reference price (mid)
# # whilst preserving empty levels to maintain sparse representation of book
# # i.e. at each time we have a fixed width snapshot around the mid price
# # therefore movement of the mid price needs to be a separate feature (e.g. relative to previous price)

# mybook = np.zeros((len(itch_book), price_levels), dtype=np.int32)

# a = b_indices.values
# for i in range(a.shape[0]):
#     for j in range(a.shape[1]):
#         price = a[i, j]
#         # remove prices outside of price_levels range
#         if price >= 0 and price < price_levels:
#             mybook[i, price] = vol_book.values[i, j]

# # prepend column with mid-price changes (in ticks)
# mid_diff = p_ref.diff().fillna(0).astype(int).values
# mybook = np.concatenate([mid_diff[:, None], mybook], axis=1)

# mybook

In [14]:
# Convert the filtered ITCH book into its array representation using 500
# price levels, then display the result.
# NOTE(review): presumably equivalent to the commented-out "process book"
# cell above (volume per price level around the mid-price, with the mid-price
# change prepended as first column) — confirm against itch_preproc.process_book.
mybook = itch_preproc.process_book(itch_book, price_levels=500)
mybook

array([[ 0,  0,  0, ...,  0,  0,  0],
       [-8,  0,  0, ...,  0,  0,  0],
       [-2,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  0, ...,  0,  0,  0]])