# taq_data_extract

#### Juan Camilo Henao Londono - 26.03.2019
#### AG Guhr - Universitaet Duisburg-Essen

In [1]:
# Modules

import numpy as np
import os
import pandas as pd

import pickle

import taq_data_tools

__tau__ = 1000

In [2]:
def _taq_read_day_rows(filename, date, n_fields):
    """
    Collect the rows of a whitespace-separated TAQ file that belong to one
    day and whose time lies in the trading session, from 9:30 to 16:00
    (including both), expressed in seconds after midnight.
        :param filename: string with the path of the file to scan
        :param date: string of the date to keep, as written in the first
                     column of the file (i.e. '2008-03-10')
        :param n_fields: number of leading columns to keep from each row
        :return: list of rows, each one a list with the first ``n_fields``
                 column strings
    """

    market_open = 34200   # 9:30 in seconds after midnight
    market_close = 57600  # 16:00 in seconds after midnight
    rows = []

    # Read line per line so the full year file is never held in memory
    with open(filename) as f_data:
        for line in f_data:
            fields = line.split()

            # Blank or truncated lines have no date/time pair to test
            if len(fields) < 2 or fields[0] != date:
                continue

            # The time must be compared as a number: as strings, '4000'
            # sorts between '34200' and '57600' and would be wrongly kept
            if market_open <= int(fields[1]) <= market_close:
                rows.append(fields[:n_fields])

    return rows


def taq_data_extract(ticker, year, month, day):
    """
    Extract the trades and quotes (TAQ) data for a day, from a CSV file with
    the full information of a year. The time range for each day is from 9:30
    to 16:00 (including both). The extracted arrays are saved as pickle files
    in '../../taq_data/pickle_dayly_data_{year}/' and also returned.
        :param ticker: string of the abbreviation of the stock to be analyzed
                       (i.e. 'AAPL')
        :param year: string of the year to be analyzed (i.e '2008')
        :param month: string of the month to be analyzed (i.e '07')
        :param day: string of the day to be analyzed (i.e '07')
        :return: tuple (time_q, bid_q, ask_q, vol_bid_q, vol_ask_q, time_t,
                 ask_t, vol_ask_t) of integer numpy arrays; the first five
                 come from the quotes file, the last three from the trades
                 file
        :raises AssertionError: if no quote or no trade is found for the day
                                inside the time range
    """

    function_name = taq_data_extract.__name__
    taq_data_tools.taq_function_header_print_data(function_name, ticker,
                                                  ticker, year, month, day)

    # Load data
    # Date of the day to be saved
    date = '{}-{}-{}'.format(year, month, day)
    csv_template = '../../taq_data/csv_year_data_{1}/{0}_{1}_NASDAQ_{2}.csv'
    quotes_filename = csv_template.format(ticker, year, 'quotes')
    trades_filename = csv_template.format(ticker, year, 'trades')

    # Quotes keep the first six columns, trades the first four
    quotes_day_list = _taq_read_day_rows(quotes_filename, date, 6)
    assert len(quotes_day_list) != 0, \
        'No quotes found for {} the {}'.format(ticker, date)

    trades_day_list = _taq_read_day_rows(trades_filename, date, 4)
    assert len(trades_day_list) != 0, \
        'No trades found for {} the {}'.format(ticker, date)

    # Pandas dataframes with the filtered data
    quotes_df = pd.DataFrame(quotes_day_list,
                             columns=['Date', 'Time', 'Bid', 'Ask',
                                      'Vol_Bid', 'Vol_Ask'])
    trades_df = pd.DataFrame(trades_day_list,
                             columns=['Date', 'Time', 'Ask', 'Vol_Ask'])

    # Dataframes to arrays
    time_q = np.array(quotes_df['Time']).astype(int)
    bid_q = np.array(quotes_df['Bid']).astype(int)
    ask_q = np.array(quotes_df['Ask']).astype(int)
    vol_bid_q = np.array(quotes_df['Vol_Bid']).astype(int)
    vol_ask_q = np.array(quotes_df['Vol_Ask']).astype(int)

    time_t = np.array(trades_df['Time']).astype(int)
    ask_t = np.array(trades_df['Ask']).astype(int)
    vol_ask_t = np.array(trades_df['Vol_Ask']).astype(int)

    # Save data
    # The folder depends on the requested year, so the files must be written
    # to that same folder and not to a fixed one
    save_dir = '../../taq_data/pickle_dayly_data_{}/'.format(year)

    if (not os.path.isdir(save_dir)):

        try:

            os.mkdir(save_dir)
            print('Folder to save data created')

        except FileExistsError:

            print('Folder exists. The folder was not created')

    pickle_template = save_dir + 'TAQ_{0}_{4}_{1}{2}{3}.pickle'

    # Context managers guarantee the files are flushed and closed
    with open(pickle_template.format(ticker, year, month, day, 'quotes'),
              'wb') as f_quotes_pickle:
        pickle.dump((time_q, bid_q, ask_q, vol_bid_q, vol_ask_q),
                    f_quotes_pickle)

    with open(pickle_template.format(ticker, year, month, day, 'trades'),
              'wb') as f_trades_pickle:
        pickle.dump((time_t, ask_t, vol_ask_t), f_trades_pickle)

    print('Data Saved')
    print()

    return (time_q, bid_q, ask_q, vol_bid_q, vol_ask_q,
            time_t, ask_t, vol_ask_t)

In [3]:
# Test: run the extraction for two stocks over five days of March 2008

tickers = ['AAPL', 'MSFT']
year, month = '2008', '03'
days = ['10', '11', '12', '13', '14']

# Ticker-major order: every day of one ticker is processed before moving on
# to the next ticker
ticker_day_pairs = [(t, d) for t in tickers for d in days]

for ticker, day in ticker_day_pairs:
    taq_data_extract(ticker, year, month, day)

TAQ data
taq_data_extract
Processing data for the stock AAPL the 2008.03.10
Folder exists. The folder was not created
Data Saved

TAQ data
taq_data_extract
Processing data for the stock AAPL the 2008.03.11
Folder exists. The folder was not created
Data Saved

TAQ data
taq_data_extract
Processing data for the stock AAPL the 2008.03.12
Folder exists. The folder was not created
Data Saved

TAQ data
taq_data_extract
Processing data for the stock AAPL the 2008.03.13
Folder exists. The folder was not created
Data Saved

TAQ data
taq_data_extract
Processing data for the stock AAPL the 2008.03.14
Folder exists. The folder was not created
Data Saved

TAQ data
taq_data_extract
Processing data for the stock MSFT the 2008.03.10
Folder exists. The folder was not created
Data Saved

TAQ data
taq_data_extract
Processing data for the stock MSFT the 2008.03.11
Folder exists. The folder was not created
Data Saved

TAQ data
taq_data_extract
Processing data for the stock MSFT the 2008.03.12
Folder exists.