# TAQ data extract

In [None]:
# Modules

import numpy as np
import os
import pandas as pd

In [None]:
def taq_data_extract(ticker, type, year):
    """Extracts the open-market data of every business day in a year.

    Reads the trades or quotes (TAQ) CSV file of a whole year,
    ``data/{ticker}_{year}_NASDAQ_{type}.csv``, in chunks and writes one HDF5
    file per day, ``hdf5_dayly_data_{year}/taq_{ticker}_{type}_{date}.h5``.
    Only the rows whose ``Time`` lies in [34200, 57600) are kept, that is,
    the open market time from 9:30 to 16:00 expressed in seconds.

    The saved tables are indexed by ``Date``. The ``Mode`` and ``Cond``
    columns (and ``Corr`` for trades) are dropped before saving.

    Data is appended to the daily files, so calling the function again for
    the same ticker, type and year adds the same rows a second time.

    :param ticker: string of the abbreviation of the stock to be analyzed
     (i.e. 'AAPL').
    :param type: string with the type of the data to be extracted
     (i.e. 'trades' or 'quotes'). It is also used as the HDF5 key.
    :param year: string of the year to be analyzed (i.e. '2016').
    :return: None -- The function extracts the data and does not return a
     value.
    """

    try:

        chunksize = 10 ** 7

        # NOTE(review): `taq_data_tools_article_reproduction` is not imported
        # in the visible part of this file -- confirm it is in scope before
        # calling this function.
        date_list = taq_data_tools_article_reproduction \
            .taq_bussiness_days(year)

        # Load data
        csv_file = f'data/{ticker}_{year}_NASDAQ_{type}.csv'

        df_type = {'quotes': {
                        'Date': 'str',
                        'Time': 'int',
                        'Bid': 'int',
                        'Ask': 'int',
                        'Vol_Bid': 'int',
                        'Vol_Ask': 'int',
                        'Mode': 'int',
                        'Cond': 'str',
                    },
                   'trades': {
                        'Date': 'str',
                        'Time': 'int',
                        'Ask': 'int',
                        'Vol_Ask': 'int',
                        'Mode': 'int',
                        'Corr': 'int',
                        'Cond': 'str',
                    }}

        col_names = {'quotes': ['Date', 'Time', 'Bid', 'Ask', 'Vol_Bid',
                                'Vol_Ask', 'Mode', 'Cond'],
                     'trades': ['Date', 'Time', 'Ask', 'Vol_Ask', 'Mode',
                                'Corr', 'Cond']}

        # Columns that are read but not saved. The choice does not depend on
        # the chunk, so it is made once.
        if (type == 'quotes'):
            unused_cols = ['Mode', 'Cond']
        else:
            unused_cols = ['Mode', 'Corr', 'Cond']

        # Save data
        if (not os.path.isdir(f'hdf5_dayly_data_{year}/')):

            try:
                os.mkdir(f'hdf5_dayly_data_{year}/')
                print('Folder to save data created')

            except FileExistsError:
                # The folder appeared between the check and the creation.
                print('Folder exists. The folder was not created')

        # The separator is a regular expression, hence the raw string.
        for chunk in pd.read_csv(csv_file, chunksize=chunksize, sep=r'\s+',
                                 names=col_names[type], dtype=df_type[type],
                                 na_filter=False, low_memory=False):

            chunk['Date'] = pd.to_datetime(chunk['Date'], format='%Y-%m-%d')
            chunk.set_index('Date', inplace=True)
            chunk.drop(unused_cols, axis=1, inplace=True)

            # The open market filter is the same for every day, so it is
            # applied once per chunk instead of once per day.
            market_hours = chunk.loc[(chunk['Time'] >= 34200)
                                     & (chunk['Time'] < 57600)]

            for date in date_list:
                day_data = market_hours.loc[market_hours.index.isin([date])]

                if day_data.empty:
                    continue

                # split() + join() removes any whitespace from the path
                # (for instance, if the text form of `date` contains a
                # space).
                h5_file = ''.join(
                    f'hdf5_dayly_data_{year}/taq_{ticker}_{type}_{date}.h5'
                    .split())

                # A day can be spread over several chunks, therefore the
                # data is appended to the file of the day.
                day_data.to_hdf(h5_file, key=type, format='table',
                                append=True)

        print('Data Saved')
        print()

        return None

    except AssertionError:
        # NOTE(review): nothing visible in this function raises
        # AssertionError -- presumably it comes from a called helper; confirm.
        print('No data')
        print()
        return None

In [None]:
# Parameters shared by the extraction cells below.
ticker_i, ticker_j = 'AAPL', 'MSFT'    # stocks to extract
type_q, type_t = 'quotes', 'trades'    # kinds of TAQ data
year = '2008'                          # year of the data

In [None]:
%%timeit
taq_data_extract(ticker_i, type_q, year)

In [None]:
%%timeit
taq_data_extract(ticker_i, type_t, year)

In [None]:
%%timeit
taq_data_extract(ticker_j, type_q, year)

In [None]:
%%timeit
taq_data_extract(ticker_j, type_t, year)