# taq_wang_results_comparison

#### Juan Camilo Henao Londono - 02.05.2019
#### AG Guhr - Universitaet Duisburg-Essen

To find where my results differ from S. Wang's results, I divided the
code into steps so that every intermediate result can be compared and its accuracy checked.

In [1]:
import gzip
import numpy as np
import pandas as pd
import pickle
import sys

import taq_data_tools_comp
import taq_data_analysis_comp

# Length of the arrays in which the self- and cross-responses are accumulated
# (the S. Wang reference files used below are named with 'L=1000').
__tau__ = 1000

## Trade sign comparison

Compare the S. Wang trade sign data with my values

In [2]:
def trade_sign_comparison(ticker, date):
    """
    Compare the trade signs obtained by me from the TAQ data with the ones
    obtained by S. Wang and print the percentage of matching values.
    The trade signs are calculated using the equation (1), (2)
    of https://arxiv.org/pdf/1603.01580.pdf.
    As the trade signs are not directly given by the TAQ data, they must be
    inferred from the trade prices.
        :param ticker: string of the abbreviation of the stock to be analyzed
         (i.e. 'AAPL')
        :param date: string of the date to be analyzed (i.e. '2016-07-07')
        :return: None. The similarities are only printed.
    """

    year, month, day = date.split('-')
    wang_dir = '../../../taq_data/article_reproduction_data_2008/wang/'

    # Load data
    # All transactions data
    time_t, ask_t, identified_trades = taq_data_analysis_comp \
        .taq_trade_signs_all_transactions_data(ticker, year, month, day)

    # Keep only the transactions with 34801 <= time <= 57000
    condition = (time_t >= 34801) & (time_t <= 57000)

    trade_sign_juan_all = pd.DataFrame(data={
        'Date': date,
        'Time': time_t[condition],
        'Price': ask_t[condition],
        'Trade': identified_trades[condition],
    })

    # Raw string: '\s' is not a valid escape sequence in a normal string
    trade_sign_wang_all = pd.read_csv(
        wang_dir + '{}trade_signs_transactions.txt'.format(ticker),
        sep=r'\s+', header=None, engine='python')
    trade_sign_wang_all.columns = ['Date', 'Time', 'Price', 'Trade']

    assert len(trade_sign_juan_all) == len(trade_sign_wang_all)

    # Full time data
    (full_time, price_signs,
     trade_signs) = taq_data_analysis_comp \
        .taq_trade_signs_full_time_data(ticker, date)

    trade_sign_juan_perse = pd.DataFrame(data={
        'Time': full_time,
        'Price': price_signs,
        'Trade': trade_signs,
    })

    trade_sign_wang_perse = pd.read_csv(
        wang_dir + '{}trade_signs_seconds.txt'.format(ticker),
        sep=r'\s+', usecols=(1, 2, 3), header=None, engine='python')
    trade_sign_wang_perse.columns = ['Time', 'Price', 'Trade']

    # Same check as in the midpoint comparison: the element-wise comparison
    # below is only meaningful if both tables have the same length
    assert len(trade_sign_juan_perse) == len(trade_sign_wang_perse)

    # Comparison of all the transactions
    # Every pair is (column name, word used in the printed message)
    columns_all = (('Date', 'dates'), ('Time', 'times'),
                   ('Price', 'prices'), ('Trade', 'trades signs'))
    similarity_all = [
        np.sum(trade_sign_wang_all[column] == trade_sign_juan_all[column])
        / len(trade_sign_wang_all[column])
        for column, _ in columns_all]

    print()
    print('Trade signs results - ' + ticker)
    print()
    print('Comparison of all the transactions')
    for (_, label), similarity in zip(columns_all, similarity_all):
        print('The similarity of all the transaction {} is {:.2f}%'
              .format(label, similarity * 100))
    print()

    # Comparison of the full time values
    columns_perse = (('Time', 'time'), ('Price', 'price'),
                     ('Trade', 'trades'))
    similarity_perse = [
        np.sum(trade_sign_wang_perse[column]
               == trade_sign_juan_perse[column])
        / len(trade_sign_wang_perse[column])
        for column, _ in columns_perse]

    print('Comparison of the full time values')
    for (_, label), similarity in zip(columns_perse, similarity_perse):
        print('The similarity of the {} per second is {:.2f}%'
              .format(label, similarity * 100))
    print()

In [3]:
# Stocks and day used for the trade sign comparison
tickers = ['AAPL', 'MSFT']
year, month, day = '2008', '01', '02'
date = '-'.join((year, month, day))

for ticker in tickers:
    trade_sign_comparison(ticker=ticker, date=date)

TAQ data
taq_trade_signs_all_transactions_data
Processing data for the stock AAPL the 2008.01.02
TAQ data
taq_trade_signs_full_time_data
Processing data for the stock AAPL the 2008.01.02
TAQ data
taq_trade_signs_all_transactions_data
Processing data for the stock AAPL the 2008.01.02

Trade signs results - AAPL

Comparison of all the transactions
The similarity of all the transaction dates is 100.00%
The similarity of all the transaction times is 100.00%
The similarity of all the transaction prices is 100.00%
The similarity of all the transaction trades signs is 100.00%

Comparison of the full time values
The similarity of the time per second is 100.00%
The similarity of the price per second is 100.00%
The similarity of the trades per second is 100.00%

TAQ data
taq_trade_signs_all_transactions_data
Processing data for the stock MSFT the 2008.01.02
TAQ data
taq_trade_signs_full_time_data
Processing data for the stock MSFT the 2008.01.02
TAQ data
taq_trade_signs_all_transactions_data
Pro

## Midpoint price comparison

Compare the S. Wang midpoint price data with my values

In [4]:
def midpoint_comparison(ticker, date):
    """
    Compare the midpoint price results obtained by me from the TAQ data with
    the ones obtained by S. Wang and print the percentage of matching values.
    The comparison is done for all the transactions (date, time, bid, ask,
    midpoint and spread) and for the full time midpoint values.
        :param ticker: string of the abbreviation of the stock to be analyzed
         (i.e. 'AAPL')
        :param date: string of the date to be analyzed (i.e. '2016-07-07')
        :return: None. The similarities are only printed.
    """

    year, month, day = date.split('-')
    wang_dir = '../../../taq_data/article_reproduction_data_2008/wang/'

    # Load data
    # All transactions data
    time_q, bid_q, ask_q, midpoint, spread = taq_data_analysis_comp \
        .taq_midpoint_all_transactions_data(ticker, year, month, day)

    # Keep only the quotes with 34800 <= time <= 56999
    condition = (time_q >= 34800) & (time_q <= 56999)

    midpoint_juan_all = pd.DataFrame(data={
        'Date': date,
        'Time': time_q[condition],
        'Bid': bid_q[condition],
        'Ask': ask_q[condition],
        'Midpoint': midpoint[condition],
        'Spread': spread[condition],
    })

    # Raw string: '\s' is not a valid escape sequence in a normal string
    midpoint_wang_all = pd.read_csv(
        wang_dir + '{}midpoint_transactions.txt'.format(ticker),
        sep=r'\s+', header=None, engine='python')
    midpoint_wang_all.columns = ['Date', 'Time', 'Bid', 'Ask', 'Midpoint',
                                 'Spread']

    assert len(midpoint_juan_all) == len(midpoint_wang_all)

    # Full time data
    midpoint_juan = taq_data_analysis_comp \
        .taq_midpoint_full_time_data(ticker, date)
    midpoint_juan_perse = pd.DataFrame(data={'Midpoint': midpoint_juan})

    midpoint_wang_perse = pd.read_csv(
        wang_dir + '{}midpoint_seconds.txt'.format(ticker),
        sep=r'\s+', usecols=[2], header=None, engine='python')
    midpoint_wang_perse.columns = ['Midpoint']

    assert len(midpoint_juan_perse) == len(midpoint_wang_perse)

    # Comparison transactions
    # Every pair is (column name, word used in the printed message)
    columns_all = (('Date', 'dates'), ('Time', 'times'), ('Bid', 'bids'),
                   ('Ask', 'asks'), ('Midpoint', 'midpoints'),
                   ('Spread', 'spreads'))
    similarity_all = [
        np.sum(midpoint_wang_all[column] == midpoint_juan_all[column])
        / len(midpoint_wang_all[column])
        for column, _ in columns_all]

    print()
    print('Midpoint results - ' + ticker)
    print()
    print('Comparison of all the transactions')
    for (_, label), similarity in zip(columns_all, similarity_all):
        print('The similarity of all the transaction {} is {:.2f}%'
              .format(label, similarity * 100))
    print()

    # Comparison full time
    midpoint_midpoint_comp_perse = (
        np.sum(midpoint_wang_perse['Midpoint']
               == midpoint_juan_perse['Midpoint'])
        / len(midpoint_wang_perse['Midpoint']))

    print('Comparison of the full time values')
    print('The similarity of midpoints is {:.2f}%'
          .format(midpoint_midpoint_comp_perse * 100))
    print()

In [5]:
# Stocks and day used for the midpoint price comparison
tickers = ['AAPL', 'MSFT']
year, month, day = '2008', '01', '02'
date = '-'.join((year, month, day))

for ticker in tickers:
    midpoint_comparison(ticker=ticker, date=date)

TAQ data
taq_midpoint_all_transactions_data
Processing data for the stock AAPL the 2008.01.02
TAQ data
taq_midpoint_full_time_data
Processing data for the stock AAPL the 2008.01.02
TAQ data
taq_midpoint_all_transactions_data
Processing data for the stock AAPL the 2008.01.02


Midpoint results - AAPL

Comparison of all the transactions
The similarity of all the transaction dates is 100.00%
The similarity of all the transaction times is 100.00%
The similarity of all the transaction bids is 100.00%
The similarity of all the transaction asks is 100.00%
The similarity of all the transaction midpoints is 100.00%
The similarity of all the transaction spreads is 100.00%

Comparison of the full time values
The similarity of midpoints is 100.00%

TAQ data
taq_midpoint_all_transactions_data
Processing data for the stock MSFT the 2008.01.02
TAQ data
taq_midpoint_full_time_data
Processing data for the stock MSFT the 2008.01.02
TAQ data
taq_midpoint_all_transactions_data
Processing data for the stoc

## Self response comparison

Compare the S. Wang self responses data with my values

In [6]:
def self_response_avg_number(ticker, dates):
    """
    Add up the self-response data of a stock over several days and divide
    the sum by the total of the second value returned for every day by
    taq_self_response_data (presumably the number of points used in the
    average - to be confirmed in taq_data_analysis_comp).
    The days in which a TypeError is raised are skipped.
        :param ticker: string of the abbreviation of the stock to be analyzed
         (i.e. 'AAPL')
        :param dates: iterable with the dates to be analyzed
        :return: tuple with the normalized self-response and the total used
         to normalize it.
    """

    response_sum = np.zeros(__tau__)
    daily_numbers = []

    for day in dates:

        try:
            day_response, day_number = taq_data_analysis_comp \
                .taq_self_response_data(ticker, day)
            response_sum += day_response
            daily_numbers.append(day_number)

        except TypeError:
            # Nothing is added for this day
            continue

    total_number = np.asarray(daily_numbers).sum(axis=0)

    return response_sum / total_number, total_number

In [7]:
def self_response_comparison(ticker, year):
    """
    Compare the self-response results obtained by me for a year with the
    ones obtained by S. Wang and print the percentage of matching values.
    Both results are rounded to five decimals before they are compared.
        :param ticker: string of the abbreviation of the stock to be analyzed
         (i.e. 'AAPL')
        :param year: string of the year to be analyzed (i.e. '2008')
        :return: tuple with my results and the S. Wang results as DataFrames
         with the columns 'Self' and 'Avg'.
    """

    dates = taq_data_tools_comp.taq_bussiness_days(year)
    self_resp, num = self_response_avg_number(ticker, dates)

    self_juan = pd.DataFrame(data={'Self': self_resp, 'Avg': num})

    # Raw string: '\s' is not a valid escape sequence in a normal string
    self_wang = pd.read_csv(
        '../../../taq_data/article_reproduction_data_2008/wang/'
        + '{0}_{0}_{1}_RDC_L=1000.txt'.format(ticker, year),
        sep=r'\s+', header=None, engine='python', usecols=[1, 3])
    self_wang.columns = ['Self', 'Avg']
    # The first row of the S. Wang file is discarded so both tables have the
    # same length (presumably it has no counterpart in my results - confirm)
    self_wang = self_wang[1:].reset_index(drop=True)

    assert len(self_juan) == len(self_wang)

    # Comparison self-response
    # The values are rounded through the string format so the rounding is
    # the same used to obtain the reported similarities
    self_wang_arr = np.array([float('{:.5f}'.format(i))
                              for i in self_wang['Self']])
    self_juan_arr = np.array([float('{:.5f}'.format(i))
                              for i in self_juan['Self']])

    self_comp = np.sum(self_juan_arr == self_wang_arr) / len(self_wang_arr)
    print('Self-response results - ' + ticker)
    print()
    print('Comparison of all self-responses')
    print('The similarity of all the self-responses is {:.2f}%'
          .format(self_comp * 100))

    return self_juan, self_wang

In [8]:
# Stocks and year used for the self-response comparison
tickers = ['AAPL', 'MSFT']
year = '2008'

for ticker in tickers:
    self_response_comparison(ticker=ticker, year=year)

Self-response results - AAPL

Comparison of all self-responses
The similarity of all the self-responses is 99.70%
Self-response results - MSFT

Comparison of all self-responses
The similarity of all the self-responses is 99.30%


## Cross response comparison

Compare the S. Wang cross responses data with my values

In [9]:
def cross_response_avg_number(ticker_i, ticker_j, dates):
    """
    Add up the cross-response data of a pair of stocks over several days and
    divide the sum by the total of the second value returned for every day by
    taq_cross_response_data (presumably the number of points used in the
    average - to be confirmed in taq_data_analysis_comp).
    The days in which a TypeError is raised are skipped.
        :param ticker_i: string of the abbreviation of the first stock
         (i.e. 'AAPL')
        :param ticker_j: string of the abbreviation of the second stock
         (i.e. 'MSFT')
        :param dates: iterable with the dates to be analyzed
        :return: tuple with the normalized cross-response and the total used
         to normalize it.
    """

    response_sum = np.zeros(__tau__)
    daily_numbers = []

    for day in dates:

        try:
            day_response, day_number = taq_data_analysis_comp \
                .taq_cross_response_data(ticker_i, ticker_j, day)
            response_sum += day_response
            daily_numbers.append(day_number)

        except TypeError:
            # Nothing is added for this day
            continue

    total_number = np.asarray(daily_numbers).sum(axis=0)

    return response_sum / total_number, total_number

In [10]:
def cross_response_comparison(ticker_i, ticker_j, year):
    """
    Compare the cross-response results obtained by me for a year with the
    ones obtained by S. Wang and print the percentage of matching values.
    Both results are rounded to five decimals before they are compared.
        :param ticker_i: string of the abbreviation of the first stock
         (i.e. 'AAPL')
        :param ticker_j: string of the abbreviation of the second stock
         (i.e. 'MSFT')
        :param year: string of the year to be analyzed (i.e. '2008')
        :return: tuple with my results and the S. Wang results as DataFrames
         with the columns 'Cross' and 'Avg'.
    """

    dates = taq_data_tools_comp.taq_bussiness_days(year)
    cross, num = cross_response_avg_number(ticker_i, ticker_j, dates)

    cross_juan = pd.DataFrame(data={'Cross': cross, 'Avg': num})

    # Raw string: '\s' is not a valid escape sequence in a normal string
    cross_wang = pd.read_csv(
        '../../../taq_data/article_reproduction_data_2008/wang/'
        + '{0}_{1}_{2}_RDC_L=1000.txt'.format(ticker_i, ticker_j, year),
        sep=r'\s+', header=None, engine='python', usecols=[1, 3])
    cross_wang.columns = ['Cross', 'Avg']
    # The first row of the S. Wang file is discarded so both tables have the
    # same length (presumably it has no counterpart in my results - confirm)
    cross_wang = cross_wang[1:].reset_index(drop=True)

    assert len(cross_juan) == len(cross_wang)

    # Comparison cross-response
    # The values are rounded through the string format so the rounding is
    # the same used to obtain the reported similarities
    cross_wang_arr = np.array([float('{:.5f}'.format(i))
                               for i in cross_wang['Cross']])
    cross_juan_arr = np.array([float('{:.5f}'.format(i))
                               for i in cross_juan['Cross']])

    cross_comp = (np.sum(cross_juan_arr == cross_wang_arr)
                  / len(cross_wang_arr))
    print('Cross-response results - {0}-{1}'.format(ticker_i, ticker_j))
    print()
    print('Comparison of all cross-responses')
    print('The similarity of all the cross-responses is {:.2f}%'
          .format(cross_comp * 100))

    return cross_juan, cross_wang

In [11]:
# Pair of stocks and year used for the cross-response comparison
ticker_i, ticker_j = 'AAPL', 'MSFT'
year = '2008'

# The returned DataFrames are not needed here, only the printed similarity
_, _ = cross_response_comparison(ticker_i=ticker_i, ticker_j=ticker_j,
                                 year=year)

Cross-response results - AAPL-MSFT

Comparison of all cross-responses
The similarity of all the cross-responses is 98.70%
