In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to the project's .py
# files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [9]:
import os
import sys

# os.path.abspath('') resolves to the current working directory; split it
# into (parent folder, name of the folder the notebook runs in).
(parent_folder_path, current_dir) = os.path.split(os.path.abspath(''))

# Make the parent folder importable so the project imports below resolve
# (assumes the `equities` package lives in that folder -- confirm).
# Guarded so that re-running this cell does not add duplicate entries.
if parent_folder_path not in sys.path:
    sys.path.append(parent_folder_path)

from pathlib import Path
from typing import Optional
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from decimal import Decimal

from equities.data_processing import lobster_preproc
from equities.data_processing import itch_preproc
from equities.data_processing import lobster_encoding

In [4]:
def _sorted_matches(folder, pattern):
    """Return the paths in `folder` matching glob `pattern`, sorted ascending."""
    matches = glob(folder + pattern)
    matches.sort()
    return matches


# Folders holding the raw CSV exports, located under the parent folder.
lobster_load_path = parent_folder_path + '/dataset/raw/LOBSTER/'
itch_load_path = parent_folder_path + '/dataset/raw/ITCH/'

# locate LOBSTER data
lobster_message_files = _sorted_matches(lobster_load_path, '*message*.csv')
lobster_book_files = _sorted_matches(lobster_load_path, '*orderbook*.csv')

# locate ITCH data
itch_message_files = _sorted_matches(itch_load_path, '*message*.csv')
itch_book_files = _sorted_matches(itch_load_path, '*book*.csv')

# Report how many files of each kind were discovered.
print('found', len(lobster_message_files), 'LOBSTER message files')
print('found', len(lobster_book_files), 'LOBSTER book files')
print()
print('found', len(itch_message_files), 'ITCH message files')
print('found', len(itch_book_files), 'ITCH book files')

found 1 LOBSTER message files
found 1 LOBSTER book files

found 1 ITCH message files
found 1 ITCH book files


In [56]:
# load LOBSTER data
# NOTE: every iteration rebinds lobster_messages / lobster_book, so after the
# loop only the LAST (message, book) file pair remains available.
# zip() has no length, so `total` is passed explicitly to give tqdm a real
# progress bar instead of a bare iteration counter.
for m_f, b_f in tqdm(
    zip(lobster_message_files, lobster_book_files),
    total=min(len(lobster_message_files), len(lobster_book_files)),
):
    print(m_f)

    lobster_messages = lobster_preproc.load_message_df(m_f)

    # The LOBSTER book CSV is read as header-less, with no index column.
    lobster_book = pd.read_csv(
        b_f,
        index_col=False,
        header=None
    )
    # Later steps pair message i with book row i, so the lengths must agree.
    assert len(lobster_messages) == len(lobster_book), (
        f"LOBSTER message/book row mismatch: "
        f"{len(lobster_messages)} vs {len(lobster_book)} ({m_f}, {b_f})"
    )

    print("LOBSTER messages shape:", lobster_messages.shape)
    print("LOBSTER book shape:", lobster_book.shape)

# load ITCH data
# NOTE: as above, only the last file pair survives the loop.
for m_f, b_f in tqdm(
    zip(itch_message_files, itch_book_files),
    total=min(len(itch_message_files), len(itch_book_files)),
):
    print(m_f)

    itch_messages = itch_preproc.load_message_df(m_f)

    # Unlike the LOBSTER book, the ITCH book CSV is read with pandas defaults
    # (first row used as the header).
    itch_book = pd.read_csv(b_f)
    assert len(itch_messages) == len(itch_book), (
        f"ITCH message/book row mismatch: "
        f"{len(itch_messages)} vs {len(itch_book)} ({m_f}, {b_f})"
    )

    print("ITCH messages shape:", itch_messages.shape)
    print("ITCH book shape:", itch_book.shape)

0it [00:00, ?it/s]

/home/aaron/Documents/Github/MarketSimT/dataset/raw/LOBSTER/AAPL_2012-06-21_34200000_57600000_message_10.csv


1it [00:00,  1.72it/s]


LOBSTER messages shape: (400391, 6)
LOBSTER book shape: (400391, 40)


0it [00:00, ?it/s]

/home/aaron/Documents/Github/MarketSimT/dataset/raw/ITCH/03272019.NASDAQ_ITCH50_AAPL_message.csv


1it [00:04,  4.51s/it]

ITCH messages shape: (2010136, 12)
ITCH book shape: (2010136, 81)





In [114]:
# Display the LOBSTER message frame loaded above. The commented-out names
# below are the other loaded frames; uncomment one (as the last expression)
# to inspect it instead.
lobster_messages
# lobster_book

# itch_messages
# itch_book

Unnamed: 0,time,event_type,order_id,size,price,direction
0,34200.004241176,1,16113575,18,5853300,1
1,34200.00426064,1,16113584,18,5853200,1
2,34200.004447484,1,16113594,18,5853100,1
3,34200.025551909,1,16120456,18,5859100,-1
4,34200.025579546,1,16120480,18,5859200,-1
...,...,...,...,...,...,...
400386,57599.444019561,1,287150868,48,5776100,-1
400387,57599.444794893,1,287150931,100,5777200,-1
400388,57599.625827171,3,286898608,100,5774900,1
400389,57599.913117637,4,287150868,48,5776100,-1


In [30]:
# Raw LOBSTER schema plus one raw record, for comparison with the encoded
# output printed further down. Row 1 (not row 0) is shown because, in the
# recorded output, its order_id matches the first encoded row -- presumably
# preproc drops the first message; confirm against lobster_encoding.
print("Columns:", lobster_messages.columns.tolist())
print("Sample:", lobster_messages.to_numpy()[1])
print()

# Project tokenizer; preproc takes the message frame and the book frame.
lobster_tok = lobster_encoding.Message_Tokenizer()

print('<< pre processing LOBSTER dataset >>')
lobster_m_ = lobster_tok.preproc(lobster_messages, lobster_book)

print("Shape:", lobster_m_.shape)
# Hand-written labels for the encoded columns (not read from the tokenizer).
# The last four repeat earlier names because they are the "ref" fields.
print("Columns:", [
    'order_id', 'event_type', 'direction', 'price_abs', 'price', 'size',
    'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns',
    # ref fields
    'price', 'size', 'time_s', 'time_ns',
])
print("Sample:", lobster_m_[0])
lobster_m_

Columns: ['time', 'event_type', 'order_id', 'size', 'price', 'direction']
Sample: [Decimal('34200.00426064') 1 16113584 18 5853200 1]

<< pre processing LOBSTER dataset >>
truncating 0.0000% of prices > 99900
truncating 0.0000% of prices < -99900
Shape: (389058, 14)
Columns: ['order_id', 'event_type', 'direction', 'price_abs', 'price', 'size', 'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns', 'price', 'size', 'time_s', 'time_ns']
Sample: [16113584        1        1  5853200      -31       18        0    19464
    34200  4260640    -9999    -9999    -9999    -9999]


array([[ 16113584,         1,         1, ...,     -9999,     -9999,
            -9999],
       [ 16113594,         1,         1, ...,     -9999,     -9999,
            -9999],
       [ 16120456,         1,         0, ...,     -9999,     -9999,
            -9999],
       ...,
       [287150931,         1,         0, ...,     -9999,     -9999,
            -9999],
       [286898608,         3,         1, ...,       100,     57594,
        339740972],
       [287150868,         4,         0, ...,        48,     57599,
        444019561]])

In [57]:
# Display the ITCH message frame as produced by the load cell above.
itch_messages

Unnamed: 0,time,type,id,side,size,price,cancSize,execSize,oldId,oldSize,oldPrice,mpid
0,14400006432545,A,13301,1,18.0,207.85,,,,,,
1,14400008777412,A,15969,0,100.0,129.33,,,,,,
2,14400016498868,A,20677,0,1.0,114.94,,,,,,
3,14400017857990,A,22061,0,1.0,98.39,,,,,,
4,14403597489791,A,98453,1,300.0,192.70,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2010131,72000073334596,D,336886329,0,0.0,186.45,100.0,,,,,
2010132,72000073447563,D,342882157,0,0.0,188.00,1.0,,,,,
2010133,72000073477970,D,335312241,0,0.0,175.00,50.0,,,,,
2010134,72000074023439,D,343725045,1,0.0,195.00,2600.0,,,,,


In [58]:
# Session bounds in nanoseconds since midnight
# (34200 s = 09:30, 57600 s = 16:00). Both ends are inclusive.
MARKET_OPEN_NS = 34_200_000_000_000
MARKET_CLOSE_NS = 57_600_000_000_000

# remove mpid field from ITCH data
# errors='ignore' keeps the cell re-runnable once the column is already gone.
itch_messages = itch_messages.drop(columns=['mpid'], errors='ignore')

# remove pre-market and after-market hours from ITCH data
# between() is inclusive on both sides by default, matching the original
# `>=` / `<=` pair. .copy() detaches the result from the unfiltered frame so
# later column assignments do not raise SettingWithCopyWarning.
itch_messages = itch_messages[
    itch_messages['time'].between(MARKET_OPEN_NS, MARKET_CLOSE_NS)
].copy()

itch_messages

Unnamed: 0,time,type,id,side,size,price,cancSize,execSize,oldId,oldSize,oldPrice
12347,34200010884166,A,9449261,1,100.0,188.85,,,,,
12348,34200010956675,A,9449417,0,100.0,158.55,,,,,
12349,34200011021382,A,9449521,1,100.0,219.06,,,,,
12350,34200020725916,A,9460301,1,100.0,190.83,,,,,
12351,34200020728094,A,9460309,0,100.0,186.66,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1973112,57599979846894,R,352941341,1,1400.0,189.91,,,352808321.0,1400.0,189.92
1973113,57599990456021,A,352942509,0,10.0,188.47,,,,,
1973114,57599998220184,A,352943165,0,100.0,188.47,,,,,
1973115,57599998239790,D,352930741,1,0.0,188.50,100.0,,,,


In [59]:
def _ns_to_decimal_seconds(ns):
    """Convert integer nanoseconds since midnight to exact Decimal seconds.

    Values that are already Decimal are returned unchanged, so the cell can
    be re-run safely. scaleb(-9) moves the decimal point nine places left,
    which works for any number of digits, whereas slicing the string at a
    fixed position only works for 14-digit timestamps.
    """
    if isinstance(ns, Decimal):
        return ns
    return Decimal(int(ns)).scaleb(-9)


# format time for pre-processing
itch_messages['time'] = itch_messages['time'].apply(_ns_to_decimal_seconds)

itch_messages['time']

12347      34200.010884166
12348      34200.010956675
12349      34200.011021382
12350      34200.020725916
12351      34200.020728094
                ...       
1973112    57599.979846894
1973113    57599.990456021
1973114    57599.998220184
1973115    57599.998239790
1973116    57599.998277970
Name: time, Length: 1960770, dtype: object

In [85]:
# convert price to pennies from dollars
# Round BEFORE casting: astype('int') truncates toward zero, and binary
# floating point leaves values such as 4.35 * 100 == 434.99999999999994,
# which would otherwise become 434 instead of 435.
# NOTE: not idempotent -- re-running this cell multiplies by 100 again.
# NOTE(review): 'oldPrice' is left in float dollars; confirm whether
# downstream code expects it in the same unit as 'price'.
itch_messages['price'] = (itch_messages['price'] * 100).round().astype('int')

itch_messages['price']

12347      18885
12348      15855
12349      21906
12350      19083
12351      18666
           ...  
1973112    18991
1973113    18847
1973114    18847
1973115    18850
1973116    18850
Name: price, Length: 1960770, dtype: int64

In [69]:
# Spot-check one ITCH message row (position 1), then display the ITCH book.
# NOTE(review): the recorded output below still shows the price in dollars
# (158.55), so this cell last ran before the price conversion cell; re-run
# the notebook top to bottom to refresh it.
print(itch_messages.values[1])
itch_book

[Decimal('34200.010956675') 'A' 9449417 0 100.0 158.55 nan nan nan nan nan]


Unnamed: 0,time,1_bid_price,1_bid_vol,1_ask_price,1_ask_vol,2_bid_price,2_bid_vol,2_ask_price,2_ask_vol,3_bid_price,...,18_ask_price,18_ask_vol,19_bid_price,19_bid_vol,19_ask_price,19_ask_vol,20_bid_price,20_bid_vol,20_ask_price,20_ask_vol
0,14400006432545,,,207.85,18.0,,,,,,...,,,,,,,,,,
1,14400008777412,129.33,100.0,207.85,18.0,,,,,,...,,,,,,,,,,
2,14400016498868,129.33,100.0,207.85,18.0,114.94,1.0,,,,...,,,,,,,,,,
3,14400017857990,129.33,100.0,207.85,18.0,114.94,1.0,,,98.39,...,,,,,,,,,,
4,14403597489791,129.33,100.0,192.70,300.0,114.94,1.0,207.85,18.0,98.39,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010131,72000073334596,188.00,1.0,188.99,1000.0,175.00,50.0,195.00,2600.0,,...,,,,,,,,,,
2010132,72000073447563,175.00,50.0,188.99,1000.0,,,195.00,2600.0,,...,,,,,,,,,,
2010133,72000073477970,,,188.99,1000.0,,,195.00,2600.0,,...,,,,,,,,,,
2010134,72000074023439,,,188.99,1000.0,,,,,,...,,,,,,,,,,


In [109]:
# Reference price for each book row: the previous row's mid-price, in cents,
# snapped down to a multiple of tick_size.
tick_size = 1  # same unit as the prices below (cents)

# Positional columns 1 and 3 are the level-1 bid and ask prices
# ('1_bid_price', '1_ask_price' in the displayed header -- confirm if the
# file layout changes). Round to whole cents first: float error such as
# 18849.999999999996 would otherwise be floored to the wrong tick below.
best_bid = (itch_book.iloc[:, 1] * 100).round()
best_ask = (itch_book.iloc[:, 3] * 100).round()

# shift() moves values down one row, so row i holds the mid-price of row
# i - 1. Row 0, and any row whose predecessor lacks a bid or ask, is NaN.
p_ref = ((best_bid + best_ask) / 2).shift()
p_ref = (p_ref // tick_size) * tick_size

p_ref

0              NaN
1              NaN
2          16859.0
3          16859.0
4          16859.0
            ...   
2010131    18849.0
2010132    18849.0
2010133    18199.0
2010134        NaN
2010135        NaN
Length: 2010136, dtype: float64

In [113]:
# Inspect the column dtypes of the ITCH message frame after the steps above.
itch_messages.dtypes

time         object
type         object
id            int64
side          int64
size        float64
price         int64
cancSize    float64
execSize    float64
oldId       float64
oldSize     float64
oldPrice    float64
dtype: object