# itch_trade_identification_pandas_numpy_exp

#### Juan Camilo Henao Londono - 07.03.2019
#### AG Guhr - Universitaet Duisburg-Essen

In this implementation I mixed pandas and numpy. I loaded and filtered the data using pandas because it is fast and makes it easy to split the data into the parts I needed. Then I converted each column of the pandas data into a numpy array. With numpy arrays, searching for the order numbers is fast, and the `for` loop runs faster than in the implementation that uses pandas only.

This implementation will be used to test the accuracy of the model used by S. Wang.

In [1]:
import csv
import gzip
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import datetime

%matplotlib inline

In [2]:
# Lists of tickers, months and days (six entries each); presumably the
# i-th entries together identify one available trading day - TODO confirm.
tickers = ['AAPL', 'AAPL', 'GS', 'GS', 'XOM', 'XOM']
months = ['01', '06', '10', '12', '02', '08']
days = ['07', '02', '07', '10', '11', '04']

# Stock and date analysed by the cells below
ticker, year, month, day = 'AAPL', '2008', '01', '07'

In [3]:
# Load the full data, keeping only the columns time, order, type, shares
# and price. The gzip handle is opened in a `with` block so that it is
# closed as soon as pandas has finished parsing (the previous version
# left the file handle open).
with gzip.open('../ITCH_{1}/{1}{2}{3}_{0}.csv.gz'.format(ticker, year, month, day), 'rt') as gz_file:
    data = pd.read_csv(gz_file,
                       usecols=[0, 2, 3, 4, 5],
                       dtype={'Time': 'uint32', 'Order': 'uint64', 'T': str,
                              'Shares': 'uint16', 'Price': 'float64'})

# Rescale the raw price column by 1/10000 (presumably the file stores
# prices in units of 1e-4 dollars - TODO confirm against the data source)
data['Price'] = data['Price'] / 10000

In [4]:
# Time of each message expressed in hours (assumes 'Time' is in
# milliseconds - TODO confirm). The window below is 9.666666 h (about
# 09:40) inclusive to 15.833333 h (about 15:50) exclusive.
hours_of_day = data['Time'] / 3600 / 1000
market_time = (hours_of_day >= 9.666666) & (hours_of_day < 15.833333)

data_market_time = data[market_time]

# Count the messages inside the window by type: 'B'/'S' are counted as
# limit orders, 'E'/'F' as trades
message_type = data_market_time['T']
identified_limit_orders = int(message_type.isin(['B', 'S']).sum())
identified_trades = int(message_type.isin(['E', 'F']).sum())

report_date = (ticker, year, month, day)
print('The stock {} on {}.{}.{} has {} limit orders'.format(*report_date, identified_limit_orders))
print('The stock {} on {}.{}.{} has {} trades'.format(*report_date, identified_trades))

The stock AAPL on 2008.01.07 has 745020 limit orders
The stock AAPL on 2008.01.07 has 120287 trades


In [5]:
# Boolean mask of the trade messages (type 'E' or 'F') in the full data
is_type_e = (data['T'] == 'E').values
is_type_f = (data['T'] == 'F').values
trade_pos = is_type_e | is_type_f
trade_data = data[trade_pos]

# Pull the needed columns out of pandas as plain numpy arrays
trade_data_time = trade_data['Time'].values
trade_data_order = trade_data['Order'].values
trade_data_volume = trade_data['Shares'].values
# Integer type code per trade: 3 for 'E' and 4 for 'F'. Every row of
# trade_data is one of the two because of the mask above.
trade_data_types = np.where((trade_data['T'] == 'E').values, 3, 4)

# Notebook display: every trade message belonging to one example order
trade_data[trade_data['Order'] == 41579]

Unnamed: 0,Time,Order,T,Shares,Price
101,25762517,41579,E,25,0.0
213717,35819184,41579,F,0,0.0


In [6]:
# Boolean mask of the limit-order messages (type 'B' or 'S') in the full data
limit_pos = (data['T'] == 'B').values | (data['T'] == 'S').values
limit_data = data[limit_pos]
# Drop every limit order whose order number never shows up among the
# trade messages
has_trade = limit_data['Order'].isin(trade_data['Order'])
limit_data = limit_data[has_trade]

# Pull the needed columns out of pandas as plain numpy arrays
limit_data_order = limit_data['Order'].values
limit_data_volume = limit_data['Shares'].values
limit_data_price = limit_data['Price'].values
# Integer type code per limit order: +1 for 'S' and -1 for 'B'. Every row
# of limit_data is one of the two because of the mask above.
limit_data_types = np.where((limit_data['T'] == 'S').values, 1, -1)

# Notebook display: the limit order of one example order number
limit_data[limit_data['Order'] == 41579]

Unnamed: 0,Time,Order,T,Shares,Price
93,25741564,41579,S,50,182.88


In [7]:
# Output arrays, one entry per trade message, filled by the
# identification loop
length_trades = trade_data.shape[0]
# Independent copy of the trade times, so the source array is not shared
trade_times = trade_data_time.copy()
trade_signs = np.zeros(length_trades, dtype=float)
trade_volumes = np.zeros(length_trades, dtype=np.uint16)
trade_price = np.zeros_like(trade_signs)

In [8]:
# Match every trade message with the limit order that has the same order
# number and derive the price, sign and volume of the trade from it.
# Wall-clock duration of the loop is printed at the end.
time_01 = datetime.datetime.now()

for t_idx, order_number in enumerate(trade_data_order):
    # Position of the first limit order with the same order number as the
    # trade. Orders that were already fully executed have their order
    # number overwritten with 0 (see below) and are therefore not found
    # again; a trade without a matching limit order raises IndexError.
    l_idx = np.where(limit_data_order == order_number)[0][0]

    # Values that are independent of the trade type

    # Price of the trade (taken from the limit data)
    trade_price[t_idx] = limit_data_price[l_idx]

    # Trade sign identification: +1 when the limit order type is 1
    # (type 'S'), -1 otherwise (type 'B')
    trade_signs[t_idx] = 1. if limit_data_types[l_idx] == 1 else -1.

    # The volume depends on the trade type. If it is 4 the value is
    # taken from the limit data and the order number is deleted from the
    # data. If it is 3 the value is taken from the trade data and the
    # volume left in the limit data is reduced by the traded volume.
    if trade_data_types[t_idx] == 4:

        trade_volumes[t_idx] = limit_data_volume[l_idx]
        # Mark the limit order as consumed
        limit_data_order[l_idx] = 0

    else:

        trade_volumes[t_idx] = trade_data_volume[t_idx]
        # Subtract as Python ints: both operands are unsigned 16-bit, so
        # a traded volume larger than the remaining volume would wrap
        # around to a large positive value and slip past the check below.
        diff_volumes = int(limit_data_volume[l_idx]) - int(trade_data_volume[t_idx])
        assert diff_volumes > 0, \
            'Order {}: traded volume {} is not smaller than the remaining volume {}'.format(
                order_number, trade_data_volume[t_idx], limit_data_volume[l_idx])

        limit_data_volume[l_idx] = diff_volumes

# Every trade must have received a sign
assert not np.any(trade_signs == 0)

time_02 = datetime.datetime.now()
print(time_02 - time_01)

0:00:12.903066


In [9]:
# Trade times expressed in hours (assumes the times are in milliseconds -
# TODO confirm). Keep the trades from 9.666666 h (about 09:40) inclusive
# to 15.833333 h (about 15:50) exclusive.
trade_hours = trade_times / 3600 / 1000
market_time = (trade_hours >= 9.666666) & (trade_hours < 15.833333)

# Apply the same mask to every result array so they stay aligned
trade_times_market, trade_signs_market, trade_volumes_market, trade_price_market = (
    values[market_time]
    for values in (trade_times, trade_signs, trade_volumes, trade_price)
)

In [10]:
# Number of trades that fall inside the market-time window
print('Identified trades =', trade_times_market.shape[0])

Identified trades = 120287
