In [1]:
import gzip
import shutil
from struct import unpack
from collections import namedtuple, Counter, defaultdict
from pathlib import Path
from urllib.request import urlretrieve
from urllib.parse import urljoin
from datetime import timedelta
from time import time

import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns

In [2]:
def format_time(time):
  """ Returns a time with format HH:MM:SS.ss from numeric time value, in seconds

  Args:
    time: elapsed time in seconds (int or float).

  Returns:
    A string such as '01:01:01.50' (hours, minutes, seconds with two decimals).
  """
  # Round to hundredths of a second *before* splitting into fields; rounding only
  # at format time lets values like 59.999 render as an impossible "00:00:60.00".
  centis = round(time * 100)
  m, cs = divmod(centis, 6000)
  h, m = divmod(m, 60)
  return f"{h:0>2.0f}:{m:0>2.0f}:{cs / 100:0>5.2f}"

# ITCH

ITCH, shorthand for TotalView-ITCH, is Nasdaq's direct data-feed reporting protocol. It allows subscribers to track individual orders from placement to execution or cancellation. With proper parsing, it allows reconstruction of the order book. Being able to read ITCH data is invaluable for algorithmic trading.

So that's what we are going to work on today.

Check the TotalView-ITCH documentation here:
https://www.nasdaqtrader.com/content/technicalsupport/specifications/dataproducts/NQTVITCHSpecification.pdf

#### Set Data Paths

In [3]:
# Folder that holds the downloaded feed and the .h5 stores derived from it
data_path = Path('data')
# NOTE: itch_store is kept as a plain string while order_book_store stays a Path
itch_store = str(data_path.joinpath('itch.h5'))
order_book_store = data_path.joinpath('order_book.h5')

In [4]:
# Base URL of the ITCH files and the name of the gzipped file to fetch from it
# (the leading 10302019 is presumably the trading date as MMDDYYYY — confirm)
FTP_URL = 'ftp://emi.nasdaq.com/ITCH/'
SOURCE_FILE = '10302019.NASDAQ_ITCH50.gz'

#### Download and Unzip

In [6]:
def download(url):
  """ Download and unzip ITCH data, if available

  Creates `data_path` when it is missing, downloads `url` into it unless the
  file is already present, and decompresses the gzip archive to a sibling
  `.bin` file unless that already exists. Both steps write to a temporary
  `.part` file that is renamed only on success, so an interrupted run cannot
  leave a truncated file that a later run would mistake for a finished one.

  Args:
    url: location of the gzipped file; the text after the last '/' is used as
      the local file name.

  Returns:
    Path of the unzipped `.bin` file inside `data_path`.
  """

  # create directory if we need to
  if not data_path.exists():
    data_path.mkdir(parents=True)
    print('Directory Created')
  else:
    print('Directory Already Exists')

  filename = data_path / url.split('/')[-1]

  # Download file if need be
  if not filename.exists():
    print('Downloading file...', url)
    partial = filename.with_name(filename.name + '.part')
    try:
      urlretrieve(url, partial)
      partial.replace(filename)
    finally:
      # only still present if the download failed part-way
      if partial.exists():
        partial.unlink()
  else:
    print('File already downloaded')

  unzipped = data_path / (filename.stem + '.bin')

  # unzip file if need be
  if not unzipped.exists():
    print('Unzipping to', unzipped)
    partial = unzipped.with_name(unzipped.name + '.part')
    try:
      with gzip.open(str(filename), 'rb') as f_in, open(partial, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
      partial.replace(unzipped)
    finally:
      # only still present if decompression failed part-way
      if partial.exists():
        partial.unlink()
  else:
    print('File already unzipped')

  return unzipped

In [7]:
# Fetch (or reuse) the source file; download() hands back the path of the unzipped .bin
file_name = download(url=urljoin(FTP_URL, SOURCE_FILE))
# Everything before the first dot of the file name
date = file_name.name.partition('.')[0]

Directory Created
Downloading file... ftp://emi.nasdaq.com/ITCH/10302019.NASDAQ_ITCH50.gz
Unzipping to data/10302019.NASDAQ_ITCH50.bin


## Processing the file

The ITCH data is in a binary file, but thankfully Python has the `struct` module to help us parse the information. 

I highly recommend taking a look at the module documentation here: 
https://docs.python.org/3/library/struct.html

### Creating Formatting Dictionaries

We will use a few dictionaries to "translate" the ITCH data into something we can more easily read and understand.

In [8]:
# Single-letter system event codes and their descriptions
event_codes = {
    'O': 'Start of Messages',
    'S': 'Start of System Hours',
    'Q': 'Start of Market Hours',
    'M': 'End of Market Hours',
    'E': 'End of System Hours',
    'C': 'End of Messages',
}

In [9]:
# Per-field lookup tables turning single-letter codes into integers
variable_encoding = {
    'primary_market_maker': {'Y': 1, 'N': 0},
    'printable': {'Y': 1, 'N': 0},
    'buy_sell_indicator': {'B': 1, 'S': -1},
    'cross_type': {'O': 0, 'C': 1, 'H': 2},
    # NOTE(review): 'B' and 'N' both map to 0, so they cannot be told apart
    # after encoding — confirm this is intended
    'imbalance_direction': {'B': 0, 'S': 1, 'N': 0, 'O': -1},
}

In [10]:
# (field kind, length in bytes) -> format string; the values appear to be
# `struct` format codes for fields of that size
formatting_dict = {
    # integers
    ('integer', 2): 'H',
    ('integer', 4): 'I',
    # struct has no 6-byte integer code, so this one is read as 6 raw bytes
    # (presumably converted to an int elsewhere — verify)
    ('integer', 6): '6s',
    ('integer', 8): 'Q',
    # fixed-width text
    ('alpha', 1): 's',
    ('alpha', 2): '2s',
    ('alpha', 4): '4s',
    ('alpha', 8): '8s',
    # prices use the same codes as 4- and 8-byte integers
    ('price_4', 4): 'I',
    ('price_8', 8): 'Q',
}

### Message Types

There are many different message types in ITCH, and the protocol includes an encoding system for the different types. It is a pretty long list...but thankfully, Stefan Jansen or one of his collaborators has taken on the tedious task of putting the codes into a .xlsx file for us. 

Get the file here:
https://github.com/stefan-jansen/machine-learning-for-trading/blob/master/02_market_and_fundamental_data/01_NASDAQ_TotalView-ITCH_Order_Book/message_types.xlsx



In [17]:
# Read the 'messages' sheet, order the rows by the `id` column, then drop that column
message_types = (
    pd.read_excel('message_types.xlsx', sheet_name='messages')
    .sort_values('id')
    .drop(columns='id')
)
message_types.head()

Unnamed: 0,Name,Offset,Length,Value,Notes
0,Message Type,0,1,S,System Event Message
1,Stock Locate,1,2,Integer,Always 0
2,Tracking Number,3,2,Integer,Nasdaq internal tracking number
3,Timestamp,5,6,Integer,Nanoseconds since midnight
4,Event Code,11,1,Alpha,See System Event Codes below


#### Clean the message_types Data

In [30]:
def clean_message_types(data):
  """ Cleans up the message_types DataFrame, making it easier to work with in Python

  Works on a copy, so the frame passed in is left untouched and the cell can be
  re-run safely.

  Args:
    data: DataFrame whose columns include (case-insensitively) name, value and notes.

  Returns:
    A new DataFrame with lower-cased, stripped column names; stripped `value`
    and `notes`; `name` turned into a snake_case identifier; and an extra
    `message_type` column holding `value` on rows whose name is 'message_type'
    (NaN on every other row).
  """
  cleaned = data.copy()
  cleaned.columns = [c.lower().strip() for c in cleaned.columns]
  # Bracket indexing rather than attribute access: `df.name = ...` only reaches
  # the column when it already exists and is easy to misread.
  cleaned['value'] = cleaned['value'].str.strip()
  # regex=False: all three patterns are literals, independent of pandas' default
  cleaned['name'] = (cleaned['name'].str.strip()
                     .str.lower()
                     .str.replace(' ', '_', regex=False)
                     .str.replace('-', '_', regex=False)
                     .str.replace('/', '_', regex=False))
  cleaned['notes'] = cleaned['notes'].str.strip()
  # Index alignment leaves NaN on every row that is not a 'message_type' row
  is_type_row = cleaned['name'] == 'message_type'
  cleaned['message_type'] = cleaned.loc[is_type_row, 'value']

  return cleaned

In [32]:
# Replace the raw frame with its cleaned version (the name is rebound to the returned frame)
message_types = clean_message_types(message_types)

In [35]:
# Preview the first rows of the cleaned frame
message_types.head()

Unnamed: 0,name,offset,length,value,notes,message_type
0,message_type,0,1,S,System Event Message,S
1,stock_locate,1,2,Integer,Always 0,
2,tracking_number,3,2,Integer,Nasdaq internal tracking number,
3,timestamp,5,6,Integer,Nanoseconds since midnight,
4,event_code,11,1,Alpha,See System Event Codes below,


We will also extract each message type code together with a readable label into `message_labels`, so that the single-letter codes can later be replaced with descriptive names.

In [40]:
# dropna() keeps only rows where both message_type and notes are present
message_labels = message_types.loc[:, ['message_type', 'notes']].dropna()
message_labels.columns = ['message_type', 'name']
# regex=False: every pattern is a literal — in particular '.' must never be read
# as the regex wildcard, whatever default the installed pandas version uses
message_labels['name'] = (
    message_labels['name']
    .str.lower()
    .str.replace('message', '', regex=False)
    .str.replace('.', '', regex=False)
    .str.strip()
    .str.replace(' ', '_', regex=False)
)
message_labels.head()

Unnamed: 0,message_type,name
0,S,system_event
5,R,stock_directory
23,H,stock_trading_action
31,Y,reg_sho_short_sale_price_test_restricted_indic...
37,L,market_participant_position
