# taq_data_load

In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.multiprocessing import get
import pickle
import matplotlib.pyplot as plt
import swifter
%matplotlib inline

In [2]:
def taq_data_extract(ticker, year, month):
    """Load one month of NASDAQ TAQ quotes and trades for a ticker.

    The yearly quotes and trades CSV files of ``ticker`` are read with dask,
    indexed by the parsed ``Date`` column, restricted with the selector
    '<year>-<month>' and finally evaluated with ``compute()``.

    :param ticker: ticker symbol used in the CSV file names (e.g. 'AAPL').
    :param year: year, used in the directory name, the file names and the
     date selector.
    :param month: month, combined with ``year`` into the date selector.
    :return: tuple ``(quotes, trades)``. The quotes keep the columns Time,
     Bid and Ask; the trades keep the columns Time and Ask. Both are
     indexed by Date.
    """

    def _read_month(path_pattern, columns):
        # Only the leading len(columns) space-separated fields are read and
        # they are named after ``columns``.
        lazy_frame = dd.read_csv(path_pattern.format(ticker, year),
                                 usecols=range(len(columns)),
                                 sep=' ',
                                 names=columns,
                                 parse_dates=['Date'])
        indexed = lazy_frame.set_index('Date')
        return indexed[month_selector].compute()

    print('Obtaining data from ticker {}'.format(ticker))
    month_selector = '{}-{}'.format(year, month)

    quotes_month = _read_month('../TAQ_{1}/Data/{0}_{1}_NASDAQ_quotes.csv',
                               ['Date', 'Time', 'Bid', 'Ask'])
    trades_month = _read_month('../TAQ_{1}/Data/{0}_{1}_NASDAQ_trades.csv',
                               ['Date', 'Time', 'Ask'])

    return quotes_month, trades_month

In [3]:
def get_sec(time_str):
    """Convert an 'H:M:S' time string into a number of seconds.

    :param time_str: string with hours, minutes and seconds separated by
     ':' (e.g. '09:40:00').
    :return: integer ``hours * 3600 + minutes * 60 + seconds``.
    """
    hours, minutes, seconds = (int(field) for field in time_str.split(':'))
    return (hours * 60 + minutes) * 60 + seconds
# Print the second counts of 09:40:00 and 15:50:00.
print(get_sec('09:40:00'))
print(get_sec('15:50:00'))

34800
57000


In [4]:
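# The data for one trading week is saved in pickle files to test the S. Wang paper.
# (This note originally named the week 2008.04.07-2008.04.11; the run parameters below select 2008-03-10 to 2008-03-14.)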

In [5]:
def data_to_array(quotes, trades, year, month, days, ticker=None):
    """Export per-day quote and trade arrays to pickle files.

    For every day in ``days`` the rows of that date are taken from
    ``quotes`` and ``trades``, the 'Time' strings are converted to seconds
    with ``get_sec`` and only rows with 34800 <= Time < 57000 (09:40:00
    inclusive to 15:50:00 exclusive) are kept. The result is written as
    tuples of numpy arrays:

    * quotes -> ``(time, bid, ask)`` in
      ``../TAQ_<year>/TAQ_py/TAQ_<ticker>_quotes_<year><month><day>.pickle``
    * trades -> ``(time, ask)``, sorted by time and then by price, in
      ``../TAQ_<year>/TAQ_py/TAQ_<ticker>_trades_<year><month><day>.pickle``

    :param quotes: DataFrame indexed by date with the columns 'Time', 'Bid'
     and 'Ask'.
    :param trades: DataFrame indexed by date with the columns 'Time' and
     'Ask'.
    :param year: year as a string (e.g. '2008').
    :param month: month as a string (e.g. '03').
    :param days: iterable of day strings (e.g. ['10', '11']).
    :param ticker: ticker symbol used in the output file names. When it is
     None the module-level variable ``ticker`` is used, which is how
     existing five-argument callers supply it.
    :return: None.
    :raises NameError: if ``ticker`` is None and no module-level ``ticker``
     exists.
    """

    if ticker is None:
        # Backward compatibility: callers that pass five arguments rely on
        # a global named ``ticker`` (e.g. the loop variable of the caller).
        try:
            ticker = globals()['ticker']
        except KeyError:
            raise NameError("name 'ticker' is not defined") from None

    # Bounds of the kept time window in seconds: 09:40:00 and 15:50:00.
    window_start = 34800
    window_end = 57000

    print('Processing data')
    print()

    for d in days:

        date = year + '-' + month + '-' + d

        # Copies, so the 'Time' assignment below does not touch the inputs.
        data_q = quotes.loc[date].copy()
        data_t = trades.loc[date].copy()

        data_q.loc[:, 'Time'] = data_q['Time'].apply(get_sec)
        data_t.loc[:, 'Time'] = data_t['Time'].apply(get_sec)

        data_q = data_q.loc[(data_q['Time'] >= window_start)
                            & (data_q['Time'] < window_end)]
        data_t = data_t.loc[(data_t['Time'] >= window_start)
                            & (data_t['Time'] < window_end)]

        print('Saving data ' + date)

        print('Quotes')
        time_q = np.array(data_q['Time'])
        bid_q = np.array(data_q['Bid'])
        ask_q = np.array(data_q['Ask'])

        print('Time, bid and ask')
        # The context manager closes (and flushes) the file even on error.
        with open('../TAQ_{1}/TAQ_py/TAQ_{0}_quotes_{1}{2}{3}.pickle'
                  .format(ticker, year, month, d), 'wb') as quotes_file:
            pickle.dump((time_q, bid_q, ask_q), quotes_file)

        print('Trades')
        time_t = np.array(data_t['Time'])
        ask_t = np.array(data_t['Ask'])
        # zip(*sorted(...)) cannot be unpacked when there are no trades in
        # the window, so the sort is skipped for empty arrays.
        if len(time_t) > 0:
            time_t, ask_t = zip(*sorted(zip(time_t, ask_t)))
        time_t = np.asarray(time_t)
        ask_t = np.asarray(ask_t)

        print('Time and ask')
        with open('../TAQ_{1}/TAQ_py/TAQ_{0}_trades_{1}{2}{3}.pickle'
                  .format(ticker, year, month, d), 'wb') as trades_file:
            pickle.dump((time_t, ask_t), trades_file)

        print()

    return None

In [6]:
# Run parameters. year, month and days are strings because data_to_array
# concatenates them into the date labels and the output file names.
tickers = ['AAPL', 'MSFT']
year = '2008'
month = '03'
days = ['10', '11', '12', '13', '14']
# NOTE: the loop variable has to be called `ticker`: data_to_array is called
# without a ticker argument and reads the global `ticker` to build the names
# of the pickle files it writes.
for ticker in tickers:
    # Load the month for this ticker, then write one quotes and one trades
    # pickle file per day in `days`.
    data_quotes, data_trades = taq_data_extract(ticker, year, month)
    data_to_array(data_quotes, data_trades, year, month, days)

Processing data

Saving data 2008-03-10
Quotes
Time, bid and ask
Trades
Time and ask

Saving data 2008-03-11
Quotes
Time, bid and ask
Trades
Time and ask

Saving data 2008-03-12
Quotes
Time, bid and ask
Trades
Time and ask

Saving data 2008-03-13
Quotes
Time, bid and ask
Trades
Time and ask

Saving data 2008-03-14
Quotes
Time, bid and ask
Trades
Time and ask

Processing data

Saving data 2008-03-10
Quotes
Time, bid and ask
Trades
Time and ask

Saving data 2008-03-11
Quotes
Time, bid and ask
Trades
Time and ask

Saving data 2008-03-12
Quotes
Time, bid and ask
Trades
Time and ask

Saving data 2008-03-13
Quotes
Time, bid and ask
Trades
Time and ask

Saving data 2008-03-14
Quotes
Time, bid and ask
Trades
Time and ask

