# Example

In [4]:
import numpy as np
import os
import pandas as pd

# Number of time lags (tau) evaluated by the self-response functions in this
# file; it sizes their result arrays and bounds their tau loops.
__tau__ = 1000

In [26]:
def taq_data_extract(ticker, type, year):
    """Extracts the data for every day in a year.

    Reads the yearly trades and quotes (TAQ) CSV file
    ``{ticker}_{year}_NASDAQ_{type}.csv`` in chunks and, for every business
    day, appends the rows whose ``Time`` lies in [34200, 57600) seconds
    (9:30 to 16:00, the open market time) to the HDF5 file
    ``hdf5_dayly_data_{year}/taq_{ticker}_{type}_{date}.h5``.

    The columns 'Mode' and 'Cond' (and 'Corr' for trades) are dropped before
    saving. The files are opened in append mode, so running the extraction
    twice for the same inputs stores the rows twice.

    :param ticker: string of the abbreviation of the stock to be analyzed
     (i.e. 'AAPL').
    :param type: string with the type of the data to be extracted
     (i.e. 'trades' or 'quotes'). The name shadows the builtin ``type``; it
     is kept because it is part of the function's interface.
    :param year: string of the year to be analyzed (i.e. '2016').
    :return: None -- The function extracts the data and does not return a
     value.
    """

    try:

        chunksize = 10 ** 7

        date_list = taq_data_tools_article_reproduction \
            .taq_bussiness_days(year)

        # Load data
        csv_file = f'{ticker}_{year}_NASDAQ_{type}.csv'

        df_type = {'quotes': {
                        'Date': 'str',
                        'Time': 'int',
                        'Bid': 'int',
                        'Ask': 'int',
                        'Vol_Bid': 'int',
                        'Vol_Ask': 'int',
                        'Mode': 'int',
                        'Cond': 'str',
                    },
                   'trades': {
                        'Date': 'str',
                        'Time': 'int',
                        'Ask': 'int',
                        'Vol_Ask': 'int',
                        'Mode': 'int',
                        'Corr': 'int',
                        'Cond': 'str',
                    }}

        col_names = {'quotes': ['Date', 'Time', 'Bid', 'Ask', 'Vol_Bid',
                                'Vol_Ask', 'Mode', 'Cond'],
                     'trades': ['Date', 'Time', 'Ask', 'Vol_Ask', 'Mode',
                                'Corr', 'Cond']}

        # Columns that are not stored in the daily files
        if type == 'quotes':
            unused_columns = ['Mode', 'Cond']
        else:
            unused_columns = ['Mode', 'Corr', 'Cond']

        # Save data
        save_dir = f'hdf5_dayly_data_{year}/'

        if not os.path.isdir(save_dir):

            try:
                os.mkdir(save_dir)
                print('Folder to save data created')

            except FileExistsError:
                # The folder appeared between the check and the creation
                print('Folder exists. The folder was not created')

        # The separator is a regular expression, hence the raw string
        for chunk in pd.read_csv(csv_file, chunksize=chunksize, sep=r'\s+',
                                 names=col_names[type], dtype=df_type[type],
                                 na_filter=False, low_memory=False):

            chunk['Date'] = pd.to_datetime(chunk['Date'], format='%Y-%m-%d')
            chunk.set_index('Date', inplace=True)
            chunk.drop(unused_columns, axis=1, inplace=True)

            # The open-market mask does not depend on the day, so it is
            # computed once per chunk instead of once per day
            open_market = (chunk['Time'] >= 34200) & (chunk['Time'] < 57600)

            for date in date_list:
                day = chunk.index.isin([date])
                df = chunk.loc[day & open_market]

                if not df.empty:
                    # Any whitespace is stripped from the file name, as the
                    # formatted values may contain it
                    file_name = ''.join(
                        f'{save_dir}taq_{ticker}_{type}_{date}.h5'.split())
                    df.to_hdf(file_name, key=type, format='table',
                              append=True)

        print('Data Saved')
        print()

        return None

    # NOTE(review): nothing in this function asserts explicitly; the handler
    # is kept for backward compatibility in case a helper raises it.
    except AssertionError:
        print('No data')
        print()
        return None

In [20]:
def taq_midpoint_event_data(ticker, date):
    """Computes the midpoint price and the spread of every quote event.

    Loads the daily quotes from
    ``hdf5_dayly_data_{year}/taq_{ticker}_quotes_{date}.h5`` (the year is the
    first component of ``date``), discards the rows whose ask is zero and
    adds two columns: 'Midpoint' = (Bid + Ask) / 2 and 'Spread' = Ask - Bid.
    No time-range filtering is applied here.

    :param ticker: string of the abbreviation of the stock to be analyzed
     (i.e. 'AAPL').
    :param date: string with the date of the data to be extracted
     (i.e. '2008-01-02').
    :return: pandas DataFrame -- the quotes of the day with the additional
     'Midpoint' and 'Spread' columns.
    """

    year = date.split('-')[0]

    # Load data
    # TAQ data gives directly the quotes data in every second that there is
    # a change in the quotes
    data_quotes_event = pd.read_hdf(
        f'hdf5_dayly_data_{year}/taq_{ticker}_quotes_{date}.h5')

    # Some files are corrupted, so there are some zero values that do not
    # make sense. The copy makes the filtered frame independent of the loaded
    # one, so the new columns are not assigned on a slice.
    data_quotes_event = data_quotes_event[
        data_quotes_event['Ask'] != 0].copy()

    data_quotes_event['Midpoint'] = (data_quotes_event['Bid']
                                     + data_quotes_event['Ask']) / 2
    data_quotes_event['Spread'] = data_quotes_event['Ask'] \
        - data_quotes_event['Bid']

    return data_quotes_event

In [6]:
def taq_midpoint_time_data(ticker, date):
    """Computes the midpoint price of every second.

    Using the taq_midpoint_event_data function computes the midpoint price of
    every second in the interval [34800, 56999] (9h40 to 15h50). When a
    second has several quotes the last one is used; when a second has no
    quote, the most recent previous midpoint is replicated. Quotes that
    happened before 9h40 are used to fill the first seconds of the interval;
    if there is no earlier quote at all, those seconds hold NaN.

    The result is also saved, with every value converted to string, in
    ``taq_midpoint_time_data/taq_midpoint_time_data_quotes_{year}{month}
    {day}_{ticker}.h5`` under the key 'data_quotes_time'.

    :param ticker: string of the abbreviation of the stock to be analyzed
     (i.e. 'AAPL').
    :param date: string with the date of the data to be extracted
     (i.e. '2008-01-02').
    :return: pandas DataFrame with the columns 'Time' and 'Midpoint', or
     None when the daily quotes file does not exist.
    """

    date_sep = date.split('-')

    year = date_sep[0]
    month = date_sep[1]
    day = date_sep[2]

    function_name = taq_midpoint_time_data.__name__

    try:
        # Calculate the values of the midpoint price for all the events
        data_quotes_event = taq_midpoint_event_data(ticker, date)

    except FileNotFoundError as e:
        print('No data')
        print(e)
        print()
        return None

    # 34800 s = 9h40 - 57000 s = 15h50
    # Reproducing S. Wang values. In her results the time interval for the
    # midpoint is [34800, 56999]
    full_time = np.arange(34800, 57000)

    # As there can be several values for the same second, the last row of
    # each second is kept. The index is sorted because the forward fill of
    # reindex needs a monotonic index.
    last_per_second = (data_quotes_event
                       .drop_duplicates(subset='Time', keep='last')
                       .set_index('Time')['Midpoint']
                       .sort_index())

    # For every second take the latest midpoint known at that moment
    midpoint_per_second = last_per_second.reindex(full_time, method='ffill')

    data_quotes_time = pd.DataFrame({'Time': full_time,
                                     'Midpoint': midpoint_per_second.values})

    # Saving data

    if (not os.path.isdir(f'{function_name}/')):

        try:
            os.mkdir(f'{function_name}/')
            print('Folder to save data created')

        except FileExistsError:
            print('Folder exists. The folder was not created')

    # Any whitespace is stripped from the file name
    file_name = ''.join((f'{function_name}/'
                         + f'{function_name}_quotes_{year}{month}{day}'
                         + f'_{ticker}.h5').split())

    data_quotes_time.astype(str).to_hdf(file_name, key='data_quotes_time',
                                        mode='w', format='table')

    print('Data saved')
    print()

    return data_quotes_time

In [7]:
def taq_trade_signs_event_data(ticker, year, month, day):
    """Computes the trade signs of every event.

    Using the dayly TAQ data computes the trade signs of every event in a day.
    The trade signs are computed using the equation (1) of the
    `paper <https://arxiv.org/pdf/1603.01580.pdf>`_.
    As the trades signs are not directly given by the TAQ data, they must be
    inferred from the trades prices: the sign of a trade is the sign of the
    price change with respect to the previous trade and, when the price does
    not change, the sign of the previous trade.

    The first trade has no previous trade, so it is compared with the last
    trade of the day (index -1); if both prices are equal its sign is +1.
    No time-range filtering is applied here.

    :param ticker: string of the abbreviation of the stock to be analyzed
        (i.e. 'AAPL').
    :param year: string of the year to be analyzed (i.e '2016').
    :param month: string of the month to be analyzed (i.e '07').
    :param day: string of the day to be analyzed (i.e '07').
    :return: tuple -- (time_t, ask_t, identified_trades): the times and
     prices loaded from the file and the array with the sign of every trade.
    :raises AssertionError: if a trade has price zero or a sign could not be
     identified.
    """

    function_name = taq_trade_signs_event_data.__name__
    taq_data_tools_article_reproduction \
        .taq_function_header_print_data(function_name, ticker, ticker, year,
                                        month, day)

    # Load data
    # NOTE: pickle must only be used with trusted files. The context manager
    # closes the file even if the loading fails.
    pickle_path = (
        '../../taq_data/pickle_dayly_data_{1}/TAQ_{0}_trades_{1}{2}{3}.pickle'
        .format(ticker, year, month, day))

    with open(pickle_path, 'rb') as pickle_file:
        time_t, ask_t, _ = pickle.load(pickle_file)

    # All the trades must have a price different to zero
    assert not np.sum(ask_t == 0), 'There are trades with price zero'

    # Trades identified using equation (1)
    identified_trades = np.zeros(len(time_t))

    # A day without trades has nothing to identify
    if len(time_t) == 0:
        return (time_t, ask_t, identified_trades)

    # Seed used by the first trade when its price equals the last price
    identified_trades[-1] = 1

    # Implementation of equation (1). Sign of the price change between
    # consecutive trades

    for t_idx in range(len(time_t)):

        # For t_idx == 0 the index -1 wraps around to the last trade
        diff = ask_t[t_idx] - ask_t[t_idx - 1]

        if (diff):
            identified_trades[t_idx] = np.sign(diff)

        else:
            identified_trades[t_idx] = identified_trades[t_idx - 1]

    # All the identified trades must be different to zero
    assert not np.sum(identified_trades == 0), 'There are unidentified trades'

    return (time_t, ask_t, identified_trades)

In [8]:
def taq_trade_signs_time_data(ticker, date):
    """Computes the trade signs of every second.

    Takes the per-event trade signs given by taq_trade_signs_event_data and
    reduces them to one sign per second using the equation (2) of the
    `paper <https://arxiv.org/pdf/1603.01580.pdf>`_: the sign of a second is
    the sign of the sum of the signs of the trades in that second.
    The seconds evaluated are [34801, 57000].
    A second without trades gets trade sign zero (neither a buy nor a sell),
    price zero, and its entry in the returned time array is set to zero.

    :param ticker: string of the abbreviation of the stock to be analyzed
     (i.e. 'AAPL').
    :param date: string with the date of the data to be extracted
     (i.e. '2008-01-02').
    :return: tuple -- (full_time, price_signs, trade_signs) numpy arrays, or
     None when the data file is not found.
    """

    pieces = date.split('-')
    year, month, day = pieces[0], pieces[1], pieces[2]

    function_name = taq_trade_signs_time_data.__name__
    taq_data_tools_article_reproduction.taq_function_header_print_data(
        function_name, ticker, ticker, year, month, day)

    try:
        # Trade signs of all the events of the day
        events = taq_trade_signs_event_data(ticker, year, month, day)
        time_t, ask_t, identified_trades = events

        # Reproducing S. Wang values. In her results the time interval for the
        # trade signs is [34801, 57000]
        full_time = np.arange(34801, 57001)

        trade_signs = np.zeros(full_time.shape)
        price_signs = np.zeros(full_time.shape)

        # Implementation of equation (2). Trade sign in each second
        for second_idx, second in enumerate(full_time):

            in_second = (time_t >= second) & (time_t < second + 1)

            # Empirical
            signs_in_second = identified_trades[in_second]
            trade_signs[second_idx] = int(np.sign(np.sum(signs_in_second)))

            try:
                # Price of the last trade of the second
                price_signs[second_idx] = ask_t[in_second][-1]
            except IndexError:
                # Seconds without trades are marked with time zero
                full_time[second_idx] = 0

        result = (full_time, price_signs, trade_signs)

        # Saving data
        taq_data_tools_article_reproduction.taq_save_data(
            function_name, result, ticker, ticker, year, month, day)

        return result

    except FileNotFoundError as e:
        print('No data')
        print(e)
        print()
        return None

In [9]:
def taq_self_response_day_data(ticker, date):
    r"""Computes the self response of a day.

    Using the midpoint price and trade signs of a ticker computes the self-
    response during different time lags (:math:`\tau`) for a day.

    For every lag from 1 to ``__tau__`` the function sums the product of the
    midpoint price return over that lag and the trade sign at the start of
    the lag, and counts the trade signs different from zero. The sums are not
    divided by the counts here; both arrays are returned.

    :param ticker: string of the abbreviation of the stock to be analyzed
     (i.e. 'AAPL').
    :param date: string with the date of the data to be extracted
     (i.e. '2008-01-02').
    :return: tuple -- (self_response_tau, num) numpy arrays of length
     ``__tau__``, or None when a data file is not found.
    :raises AssertionError: if the midpoint and trade signs lengths differ.
    """

    date_sep = date.split('-')

    year = date_sep[0]
    month = date_sep[1]
    day = date_sep[2]

    function_name = taq_self_response_day_data.__name__
    taq_data_tools_article_reproduction \
        .taq_function_header_print_data(function_name, ticker, ticker, year,
                                        month, day)

    # NOTE(review): the inputs are read from pickle files under
    # '../../taq_data/'; confirm that the functions producing the midpoint
    # and trade signs data save them in that location and format.
    data_dir = f'../../taq_data/article_reproduction_data_{year}'
    midpoint_path = (f'{data_dir}/taq_midpoint_time_data/'
                     + f'taq_midpoint_time_data_midpoint_{year}{month}{day}'
                     + f'_{ticker}.pickle')
    trade_sign_path = (f'{data_dir}/taq_trade_signs_time_data/'
                       + f'taq_trade_signs_time_data_{year}{month}{day}'
                       + f'_{ticker}.pickle')

    try:
        # Load data
        # NOTE: pickle must only be used with trusted files. The context
        # managers close the files even if the loading fails.
        with open(midpoint_path, 'rb') as midpoint_file:
            midpoint = pickle.load(midpoint_file)
        with open(trade_sign_path, 'rb') as trade_sign_file:
            _, _, trade_sign = pickle.load(trade_sign_file)

    except FileNotFoundError as e:
        print('No data')
        print(e)
        print()
        return None

    assert len(midpoint) == len(trade_sign)

    # Array of the average of each tau. 10^3 s used by Wang
    self_response_tau = np.zeros(__tau__)
    num = np.zeros(__tau__)

    # Calculating the midpoint price return and the self response function

    # Depending on the tau value
    for tau_idx in range(__tau__):

        # The position tau_idx holds the values of the lag tau_idx + 1
        lag = tau_idx + 1

        trade_sign_tau = trade_sign[:-lag]
        trade_sign_no_0_len = len(trade_sign_tau[trade_sign_tau != 0])
        num[tau_idx] = trade_sign_no_0_len

        # Midpoint price return. The numerator is displaced lag values to
        # the right with respect to the denominator
        midpoint_start = midpoint[:-lag]
        midpoint_return = (midpoint[lag:] - midpoint_start) / midpoint_start

        # Obtain the self response value
        if (trade_sign_no_0_len != 0):
            product = midpoint_return * trade_sign_tau
            self_response_tau[tau_idx] = np.sum(product)

    return (self_response_tau, num)

In [10]:
def taq_self_response_year_data(ticker, year):
    """Computes the self response of a year.

    Using the taq_self_response_day_data function computes the self-response
    function for a year: the daily sums and the daily counts of trade signs
    different from zero are accumulated over all the business days, and the
    total sum is divided by the total count for every time lag. Days without
    data (the daily function returns None) are skipped. A time lag whose
    total count is zero gives NaN.

    The self-response is also saved with the taq_save_data helper.

    :param ticker: string of the abbreviation of stock to be analyzed
     (i.e. 'AAPL').
    :param year: string of the year to be analyzed (i.e '2016').
    :return: tuple -- (self_response, num_s_t) numpy arrays with the
     self-response and the total counts of every time lag.
    """

    function_name = taq_self_response_year_data.__name__
    taq_data_tools_article_reproduction \
        .taq_function_header_print_data(function_name, ticker, ticker, year,
                                        '', '')

    dates = taq_data_tools_article_reproduction.taq_bussiness_days(year)

    self_ = np.zeros(__tau__)
    num_s_t = np.zeros(__tau__)

    for date in dates:

        day_result = taq_self_response_day_data(ticker, date)

        # Only the days without data are skipped; any other error of the
        # daily computation is not hidden
        if day_result is None:
            continue

        data, avg_num = day_result
        self_ += data
        num_s_t += avg_num

    self_response = self_ / num_s_t

    # Saving data
    taq_data_tools_article_reproduction \
        .taq_save_data(function_name, self_response, ticker, ticker, year,
                       '', '')

    return (self_response, num_s_t)

In [18]:
# Example parameters used to try out the functions defined above
ticker = 'AAPL'
year = '2008'
date = '2008-01-02'

In [25]:
%%timeit
mid_data = taq_midpoint_event_data(ticker, date)

36.6 ms ± 301 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
