# TAQ_wang_comparison

#### Juan Camilo Henao Londono - 02.05.2019
#### AG Guhr - Universitaet Duisburg-Essen

In [1]:
import gzip
import numpy as np
import pandas as pd
import pickle

## Trade sign comparison full data

Check whether the full trade data computed here (date, time, price and trade sign of every trade) is the same as the reference data.

In [2]:
# Date of the trading day to load; the three parts are concatenated into the
# pickle file name and reused later to build the 'Date' column.
year = '2008'
month = '01'
day = '02'

# Load the trade times and trade prices of AAPL for the selected day.
# The context manager guarantees the file handle is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only use it on files produced by a trusted source.
with open('../TAQ_2008/TAQ_py/TAQ_AAPL_trades_{}{}{}.pickle'
          .format(year, month, day), 'rb') as trades_file:
    time_t, ask_t = pickle.load(trades_file)

# A price equal to zero is not accepted in the loaded data
assert not np.sum(ask_t == 0)

In [3]:
# Storage for the trade signs obtained with equation (1): one integer per
# trade, 0 meaning "not classified yet".
n_trades = len(time_t)
identified_trades = np.zeros(n_trades, dtype=int)
# Seed the last entry with +1. The loop over the trades reads index
# t_idx - 1, which for the first trade is the last entry of the array.
identified_trades[n_trades - 1] = 1

In [4]:
# Equation (1): the sign of a trade is the sign of the price change with
# respect to the previous trade. When the price does not change, the trade
# keeps the sign of the previous trade.
# For the first trade (t_idx == 0) the index t_idx - 1 is -1, so the
# "previous" price and sign are the last entries of the arrays.

for t_idx in range(len(time_t)):

    price_change = ask_t[t_idx] - ask_t[t_idx - 1]

    identified_trades[t_idx] = (np.sign(price_change) if price_change
                                else identified_trades[t_idx - 1])

In [5]:
# Sanity check: every trade must have received a sign, so no entry may
# still hold the initial value zero
assert not (identified_trades == 0).any()

In [6]:
# Collect my results in a table with the same columns as the reference
# data. The date is a single string, repeated on every row by pandas.
d_trade_sign_full = {
    'Date': '{}-{}-{}'.format(year, month, day),
    'Time': time_t,
    'Price': ask_t,
    'Trade': identified_trades,
}
trade_sign_juan_full = pd.DataFrame(d_trade_sign_full)

In [7]:
# Reference data: text file without header row whose fields are separated
# by three spaces (a multi-character separator needs the python engine).
trade_sign_wang_full = pd.read_csv(
    'AAPLprices_full_time.txt',
    sep='   ',
    header=None,
    engine='python',
)
trade_sign_wang_full.columns = ['Date', 'Time', 'Price', 'Trade']

In [8]:
# For every distinct time stamp in my data compare the prices of both
# tables:
#   set_comp[i] - number of distinct prices present in my data but absent
#                 from the reference data for that time stamp
#   sec_sim[i]  - 1 if both tables hold the same number of rows for that
#                 time stamp, 0 otherwise
time_set = set(trade_sign_juan_full['Time'])
set_comp = np.ones(len(time_set))
sec_sim = np.zeros(len(time_set))

for idx, second in enumerate(time_set):
    prices_juan = trade_sign_juan_full.loc[
        trade_sign_juan_full['Time'] == second, 'Price']
    prices_wang = trade_sign_wang_full.loc[
        trade_sign_wang_full['Time'] == second, 'Price']
    set_comp[idx] = len(set(prices_juan) - set(prices_wang))
    sec_sim[idx] = len(prices_juan) == len(prices_wang)

total_set_differences = np.sum(set_comp)
same_count_percentage = np.sum(sec_sim) / len(time_set) * 100
print('The number of diferences of the set values of both data is {}'.format(total_set_differences))
print('Similarities of the number of values in each second is {}%'.format(same_count_percentage))

The number of diferences of the set values of both data is 0.0
Similarities of the number of values in each second is 100.0%


In [9]:
# Rows of my table whose time stamp is 34801
trade_sign_juan_full.loc[trade_sign_juan_full['Time'] == 34801]

Unnamed: 0,Date,Time,Price,Trade
0,2008-01-02,34801,1990100,1
1,2008-01-02,34801,1990000,-1
2,2008-01-02,34801,1989900,-1
3,2008-01-02,34801,1989900,-1
4,2008-01-02,34801,1990100,1
5,2008-01-02,34801,1990000,-1


In [10]:
# Rows of the reference table whose time stamp is 34801
trade_sign_wang_full.loc[trade_sign_wang_full['Time'] == 34801]

Unnamed: 0,Date,Time,Price,Trade
0,2008-01-02,34801,1990100,1
1,2008-01-02,34801,1990000,-1
2,2008-01-02,34801,1989900,-1
3,2008-01-02,34801,1989900,-1
4,2008-01-02,34801,1990100,1
5,2008-01-02,34801,1990000,-1


In [11]:
# Both tables must have the same number of rows before comparing them
# element by element
assert trade_sign_juan_full.shape[0] == trade_sign_wang_full.shape[0]

In [12]:
# Comparison: for each column, fraction of rows in which the reference
# value and my value are equal (1.0 means the columns are identical)

n_rows_full = len(trade_sign_wang_full)
match_fraction_full = {
    column: np.sum(trade_sign_wang_full[column]
                   == trade_sign_juan_full[column]) / n_rows_full
    for column in ('Date', 'Time', 'Price', 'Trade')
}

trade_date_comp_full = match_fraction_full['Date']
trade_time_comp_full = match_fraction_full['Time']
trade_price_comp_full = match_fraction_full['Price']
trade_trade_comp_full = match_fraction_full['Trade']

In [13]:
# Report every similarity as a percentage with two decimals
for message, fraction in (
        ('The similarity of the full date is {:.2f}%', trade_date_comp_full),
        ('The similarity of the full time is {:.2f}%', trade_time_comp_full),
        ('The similarity of the full prices is {:.2f}%', trade_price_comp_full),
        ('The similarity of the full trades is {:.2f}%', trade_trade_comp_full),
):
    print(message.format(fraction * 100))

The similarity of the full date is 100.00%
The similarity of the full time is 100.00%
The similarity of the full prices is 100.00%
The similarity of the full trades is 100.00%


## Trade sign comparison only one data per second

Compare the trade signs, reduced to one value per second, for the first day of the TAQ data.

In [14]:
# One entry per integer time stamp from 34801 to 57000 (inclusive)
full_time = np.arange(34801, 57001)
# Float arrays of zeros with one entry per time stamp
trade_signs = np.zeros(len(full_time))
price_signs = np.zeros(len(full_time))

# Implementation of equation (2). Trade sign in each second: the sign of
# the sum of the signs of all the trades that fall in that second.
for t_idx, second in enumerate(full_time):

    # Boolean mask of the trades with second <= time < second + 1
    in_second = (time_t >= second) & (time_t < second + 1)

    signs_in_second = identified_trades[in_second]
    trade_signs[t_idx] = int(np.sign(np.sum(signs_in_second)))

    try:
        # Price of the last trade in the second
        price_signs[t_idx] = ask_t[in_second][-1]
    except IndexError:
        # No trade in this second: price and sign stay at zero and the
        # time stamp itself is set to zero as well
        full_time[t_idx] = 0

In [15]:
# Collect my per-second results in a table with the same columns as the
# reference data
d_trade_sign_perse = {
    'Time': full_time,
    'Price': price_signs,
    'Trade': trade_signs,
}
trade_sign_juan_perse = pd.DataFrame(d_trade_sign_perse)

In [16]:
# Reference per-second data: text file without header row whose fields are
# separated by three spaces. Only the columns at positions 1, 2 and 3 are
# read; the column at position 0 is skipped.
trade_sign_wang_perse = pd.read_csv(
    'AAPLtrade_signs.txt',
    sep='   ',
    usecols=(1,2,3),
    header=None,
    engine='python',
)
trade_sign_wang_perse.columns = ['Time', 'Price', 'Trade']

In [17]:
# Comparison: for each column, fraction of rows in which the reference
# value and my value are equal (1.0 means the columns are identical)

n_rows_perse = len(trade_sign_wang_perse)
match_fraction_perse = {
    column: np.sum(trade_sign_wang_perse[column]
                   == trade_sign_juan_perse[column]) / n_rows_perse
    for column in ('Time', 'Price', 'Trade')
}

trade_time_comp_perse = match_fraction_perse['Time']
trade_price_comp_perse = match_fraction_perse['Price']
trade_trade_comp_perse = match_fraction_perse['Trade']

In [18]:
# Report every similarity as a percentage with two decimals
for message, fraction in (
        ('The similarity of the time per second is {:.2f}%', trade_time_comp_perse),
        ('The similarity of the price per second is {:.2f}%', trade_price_comp_perse),
        ('The similarity of the trades per second is {:.2f}%', trade_trade_comp_perse),
):
    print(message.format(fraction * 100))

The similarity of the time per second is 100.00%
The similarity of the price per second is 100.00%
The similarity of the trades per second is 100.00%
