# taq_data_extract

#### Juan Camilo Henao Londono - 26.03.2019
#### AG Guhr - Universitaet Duisburg-Essen

Time test between my load method and Droopy's method.

In [1]:
# Modules
import pandas as pd
import numpy as np

In [34]:
def taq_data_extract_juan(ticker, year, month, day):
    """
    Extract the trades and quotes (TAQ) data for a day, from a CSV file with
    the full information of a year. The time range for each day is from 9:40
    to 15:50 (including both).
        :param ticker: string of the abbreviation of the stock to be analized
                       (i.e. 'AAPL')
        :param year: string of the year to be analized (i.e '2008')
        :param month: string of the month to be analized (i.e '07')
        :param day: string of the day to be analized (i.e '07')
    """

    # Load data
    date = '{}-{}-{}'.format(year, month, day)
    quotes_filename = '../TAQ_{1}/Data/{0}_{1}_NASDAQ.quotes' \
                      .format(ticker, year)
    trades_filename = '../TAQ_{1}/Data/{0}_{1}_NASDAQ.trades' \
                      .format(ticker, year)
    quotes_day_list = []
    trades_day_list = []

    with open(quotes_filename, 'rb') as f_quotes:
        for idx, line in enumerate(f_quotes):
            print(chardet.detect(line))
            list_line = line.split()
            if (list_line[0] == date
                    and list_line[1] >= '34800'
                    and list_line[1] <= '57000'):
                quotes_day_list.append(list_line[:4])

    assert len(quotes_day_list) != 0

    with open(trades_filename, 'rb') as f_trades:
        for idx, line in enumerate(f_trades):
            list_line = line.split()
            if (list_line[0] == date
                    and list_line[1] >= '34800'
                    and list_line[1] <= '57000'):
                trades_day_list.append(list_line[:3])

    assert len(trades_day_list) != 0

    quotes_df = pd.DataFrame(quotes_day_list,
                             columns=['Date', 'Time', 'Bid', 'Ask'])
    trades_df = pd.DataFrame(trades_day_list,
                             columns=['Date', 'Time', 'Ask'])

    return (quotes_df, trades_df)

In [35]:
def taq_data_extract_droopy(ticker, year, month, day, chunksize):
    """
    Extract the trades and quotes (TAQ) data for a day, from a CSV file with
    the full information of a year. The time range for each day is from 9:40
    to 15:50 (including both).
        :param ticker: string of the abbreviation of the stock to be analized
                       (i.e. 'AAPL')
        :param year: string of the year to be analized (i.e '2008')
        :param month: string of the month to be analized (i.e '07')
        :param day: string of the day to be analized (i.e '07')
    """

    # Load data
    date = '{}-{}-{}'.format(year, month, day)
    quotes_filename = '../TAQ_{1}/Data/{0}_{1}_NASDAQ.quotes' \
                      .format(ticker, year)
    trades_filename = '../TAQ_{1}/Data/{0}_{1}_NASDAQ.trades' \
                      .format(ticker, year)
    
    quotes_df = pd.DataFrame()
    trades_df = pd.DataFrame()
    
    for chunk in pd.read_csv(quotes_filename, chunksize=chunksize, sep="\s+",
                             names = ['Date', 'Time', 'Bid', 'Ask'], usecols=range(4)
                        ):
    
        copy = chunk.loc[(chunk['Date'] == date) & (chunk['Time'] >= 34800)
                         & (chunk['Time'] <= 57000)]
        quotes_df = quotes_df.append(copy)
        
    for chunk in pd.read_csv(trades_filename, chunksize=chunksize, sep="\s+",
                             names = ['Date', 'Time', 'Ask'], usecols=range(3)
                        ):
    
        copy = chunk.loc[(chunk['Date'] == date) & (chunk['Time'] >= 34800)
                         & (chunk['Time'] <= 57000)]
        trades_df = trades_df.append(copy)

    return (quotes_df, trades_df)

In [36]:
ticker = 'AAPL'
year = '2008'
month = '01'
day = '02'
chunksize = 1000000

In [37]:
%%time
x, y = taq_data_extract_juan(ticker, year, month, day)

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'KOI8-R', 'confidence': 0.4388398420352105, 'language': 'Russian'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'ISO

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'KOI8-R', 'confidence': 0.42664984642312126, 'language': 'Russian'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'con

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'windows-1253', 'confidence': 0.29845147772483654, 'language': 'Greek'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1251', 'confidence': 0.22771526273139162, 'language': 'Russian'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'windows-1251', 'confidence': 0.21720355817904355, 'language': 'Russian'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1251', 'confidence': 0.25598990785387277, 'language': 'Russian'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'lan

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'MacCyrillic', 'confidence': 0.20168901830911187, 'language': 'Russian'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
{'encodin

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language'

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1251', 'confidence': 0.43763763703684805, 'language': 'Bulgairan'}
{'encoding': 'Windows-1251', 'confidence': 0.2865484528217458, 'languag

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1252', 'confidence': 0.6778571428571428, 'language': ''}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'MacCyrillic', 'confidence': 0.7313997367253507, 'language': 'Russian'}
{'encoding': 'Windows-1252', 'confidence': 

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1251', 'confidence': 0.2792617176587703, 'language': 'Russian'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'IBM855', 'confidence': 0.21332492321156066, 'language': 'Russian'}
{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'lan

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'ISO-8859-9', 'confidence': 0.3435399038775349, 'language': 'Turkish'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, '

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1253', 'confidence': 0.5652490108424935, 'language': 'Greek'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, '

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'windows-1253', 'confidence': 0.20348964390329766, 'language': 'Greek'}
{'e

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'windows-1253', 'confidence': 0.20088080231479383, 'language': 'Greek'}
{'encoding': 'MacCyrillic', 'confidence': 0.26255375164499767, 'language': 'Russian'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'langua

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'KOI8-R', 'confidence': 0.7313997367253507, 'language': 'Russian'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'conf

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1253', 'confidence': 0.35610687683077086, 'language': 'Greek'}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'Windows-1252', 'confidence': 0.27114285714285713, 'language': ''}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': None, 'confidence': 0.0, 'language': N

IndexError: list index out of range

In [6]:
%%time
w, z = taq_data_extract_droopy(ticker, year, month, day, chunksize)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xad in position 1: invalid start byte

In [29]:
import chardet
quotes_filename = '../TAQ_2008/Data/AAPL_2008_NASDAQ.quotes'
x = open(quotes_filename, 'rb')
chardet.detect(x)

TypeError: Expected object of type bytes or bytearray, got: <class '_io.BufferedReader'>