# Midpoint data

In [None]:
# Import modules

import gzip
import os
import pickle

import numpy as np

### List of order types: 

* "B" = 1 -- Add buy order
* "S" = 2 -- Add sell order
* "E" = 3 -- Execute outstanding order in part
* "C" = 4 -- Cancel outstanding order in part
* "F" = 5 -- Execute outstanding order in full
* "D" = 6 -- Delete outstanding order in full
* "X" = 7 -- Bulk volume for the cross event
* "T" = 8 -- Execute non-displayed order

In [None]:
# Stocks (ticker symbols) and days of the month that can be processed

tickers = [
    'AAL', 'AAPL', 'ADBE', 'ADI', 'ADP', 'ADSK', 'AKAM', 'ALXN',
    'AMAT', 'AMGN', 'AMZN', 'ATVI', 'AVGO', 'BBBY', 'BIDU', 'BIIB',
    'BMRN', 'CA', 'CELG', 'CERN', 'CHKP', 'CHRW', 'CHTR', 'CMCSA',
    'COST', 'CSCO', 'CTSH', 'CTXS', 'DISCA', 'DISH', 'DLTR', 'EA',
    'EBAY', 'EQIX', 'ESRX', 'EXPD', 'FAST', 'FB', 'FISV', 'FOXA',
    'GILD', 'GOOG', 'GRMN', 'HSIC', 'ILMN', 'INTC', 'INTU', 'ISRG',
    'JD', 'KHC', 'KLAC', 'LBTYA', 'LLTC', 'LMCA', 'LRCX', 'LVNTA',
    'MAR', 'MAT', 'MDLZ', 'MNST', 'MSFT', 'MU', 'MYL', 'NFLX',
    'NTAP', 'NVDA', 'NXPI', 'ORLY', 'PAYX', 'PCAR', 'PCLN', 'QCOM',
    'REGN', 'ROST', 'SBAC', 'SBUX', 'SIRI', 'SNDK', 'SPLS', 'SRCL',
    'STX', 'SYMC', 'TRIP', 'TSCO', 'TSLA', 'TXN', 'VIAB', 'VIP',
    'VOD', 'VRSK', 'VRTX', 'WDC', 'WFM', 'WYNN', 'XLNX', 'YHOO',
]

# Two-digit day strings '07' ... '11'
days = ['{:02d}'.format(day_number) for day_number in range(7, 12)]

In [None]:
def midpoint_data(ticker, day):
    """
    Obtain the midpoint price from the ITCH 2016 data.

    The order messages of one stock and one day are replayed to track the
    best bid and the best ask. Every change of the best quotes that happens
    strictly between 9h30 and 16h is kept and then sampled on the full
    millisecond grid from 34 200 000 ms to 57 599 999 ms (23.4 million
    points). When a millisecond has several changes the first one is used;
    when it has none the last known value is replicated. Milliseconds before
    the first change are filled with the first value of the same series.

    A pickle file is saved in '../Data/midpoint_data/' for each of the
    following arrays: best bid, best ask, spread, midpoint price and time
    (the time grid is in milliseconds and its file name does not depend on
    the stock or the day).

        :param ticker: String of the abbreviation of the stock to be analyzed
                       (i.e. 'AAPL')
        :param day: String of the day to be analyzed (i.e '07')
        :return: None
    """

    print('Midpoint price data')
    print('Processing data for the stock', ticker, 'the day', day +
          ' March, 2016')

    # Load data

    # The with statement closes the compressed file as soon as it is parsed
    with gzip.open('../../ITCH_2016/201603{}_{}.csv.gz'
                   .format(day, ticker)) as itch_file:
        data = np.genfromtxt(itch_file, dtype='str', skip_header=1,
                             delimiter=',')

    # Lists of times, ids, types and prices
    # Columns used: 0 = time, 2 = order id, 3 = order type, 5 = price

    times_ = np.array([int(mytime) for mytime in data[:, 0]])
    ids_ = np.array([int(myid) for myid in data[:, 2]])

    # List of order types:
    # "B" = 1 - > Add buy order
    # "S" = 2 - > Add sell order
    # "E" = 3 - > Execute outstanding order in part
    # "C" = 4 - > Cancel outstanding order in part
    # "F" = 5 - > Execute outstanding order in full
    # "D" = 6 - > Delete outstanding order in full
    # "X" = 7 - > Bulk volume for the cross event
    # "T" = 8 - > Execute non-displayed order
    # Any other symbol is encoded as 0

    type_codes = {'B': 1, 'S': 2, 'E': 3, 'C': 4,
                  'F': 5, 'D': 6, 'X': 7, 'T': 8}
    types_ = np.array([type_codes.get(mytype, 0) for mytype in data[:, 3]])
    prices_ = np.array([int(myprice) for myprice in data[:, 5]])

    # Drop the cross ('X') and the non-displayed ('T') events

    displayed = types_ < 7
    ids = ids_[displayed]
    times = times_[displayed]
    types = types_[displayed]
    prices = prices_[displayed]

    # Reference lists
    # For the messages 'E', 'C', 'F' and 'D' the reference lists hold the
    # price and the type of the add order they refer to. For the add orders
    # prices_ref keeps the own price and types_ref stays 0.

    prices_ref = prices.copy()
    types_ref = np.zeros_like(types)
    # Position (row in the filtered lists) of the add order of each id
    add_position = {}

    for iii, (order_id, order_type) in enumerate(zip(ids, types)):

        # If the data is a sell or buy order remember where it was inserted
        if (order_type < 3):

            add_position[order_id] = iii

        # Otherwise copy the price and type of the order it refers to
        else:

            origin = add_position[order_id]
            prices_ref[iii] = prices[origin]
            types_ref[iii] = types[origin]

    # Minimum and maximum trade price

    # Prices are stored as integers and divided by 10000 before use
    full_exec_prices = 1. * prices_ref[types == 5] / 10000
    # The minimum price allowed is 0.9 times the price of
    # the minimum value of all full executed orders.
    minP = round(0.9 * full_exec_prices.min(), 2)
    # The maximum price allowed is 1.1 times the price of
    # the maximum value of all full executed orders.
    maxP = round(1.1 * full_exec_prices.max(), 2)
    # Price grid starting at minP with a step of 0.01
    valuesP = minP + 0.01 * np.arange(int((maxP - minP) / 0.01))

    # Construct quotes and spread

    # Number of sell and buy orders resting at each price of the grid. The
    # last ask level and the first bid level are set to 1 so the search for
    # a new best quote always finds at least one occupied level.
    nAsk = np.zeros_like(valuesP)
    nAsk[-1] = 1
    nBid = np.zeros_like(valuesP)
    nBid[0] = 1
    bestAsk = 10000000.     # Set bestAsk a high value
    bestBid = 0.            # Set bestBid a low value
    bestAsks = []           # Create lists for best asks, bids and times
    bestBids = []
    bestTimes = []

    # Finding the best asks and best bids

    # For the data in the length of the ids list (all data)
    for iii in range(len(ids)):

        # Position of the (reference) price in the price grid
        myPriceIndex = int(round(1. * (1. * prices_ref[iii] / 10000 - minP)
                           / 0.01))

        # Best quotes before processing this message
        bestAskOld = bestAsk
        bestBidOld = bestBid

        # Messages whose price is outside the grid are ignored
        if (0 <= myPriceIndex < len(valuesP)):

            # Incoming limit orders

            # If the order is a sell
            if (types[iii] == 2):

                # A new occupied level can only improve the best ask
                if (nAsk[myPriceIndex] == 0):

                    bestAsk = min(bestAsk, valuesP[myPriceIndex])

                # One more sell order at this price
                nAsk[myPriceIndex] += 1

            # If the order is a buy
            if (types[iii] == 1):

                # A new occupied level can only improve the best bid
                if (nBid[myPriceIndex] == 0):

                    bestBid = max(bestBid, valuesP[myPriceIndex])

                # One more buy order at this price
                nBid[myPriceIndex] += 1

            # Limit orders completely leaving

            # If the order is a full executed order or if the order is a full
            # delete order
            if (types[iii] == 5 or
                    types[iii] == 6):

                # If the order that leaves is a sell
                if (types_ref[iii] == 2):

                    # One sell order less at this price
                    nAsk[myPriceIndex] -= 1

                    # If the level is now empty and it was the best ask, the
                    # best ask is the lowest price still in the book
                    if (nAsk[myPriceIndex] == 0 and
                            valuesP[myPriceIndex] == bestAsk):

                        bestAsk = valuesP[nAsk > 0].min()

                # Every other reference type is handled as a buy
                else:

                    # One buy order less at this price
                    nBid[myPriceIndex] -= 1

                    # If the level is now empty and it was the best bid, the
                    # best bid is the highest price still in the book
                    if (nBid[myPriceIndex] == 0 and
                            valuesP[myPriceIndex] == bestBid):

                        bestBid = valuesP[nBid > 0].max()

        # Record the quotes only when the best ask or the best bid changed
        if (bestAsk != bestAskOld or
                bestBid != bestBidOld):

            bestTimes.append(times[iii])
            bestAsks.append(bestAsk)
            bestBids.append(bestBid)

    # Calculating the spread, midpoint and time

    bestAsks = np.array(bestAsks)
    bestBids = np.array(bestBids)
    timesS = np.array(bestTimes)
    spread_ = bestAsks - bestBids
    midpoint_ = (bestAsks + bestBids) / 2

    # Setting the values in the open market time

    # Both conditions must hold (logical and): strictly after 9h30 and
    # strictly before 16h, with the time in milliseconds converted to hours
    hours = timesS / 3600 / 1000
    day_times_ind = (hours > 9.5) & (hours < 16)

    midpoint = midpoint_[day_times_ind]
    times_spread = timesS[day_times_ind]
    bestAsks = bestAsks[day_times_ind]
    bestBids = bestBids[day_times_ind]
    spread = spread_[day_times_ind]

    # Completing the full time entrances

    # 34 200 000 ms = 9h30 - 57 600 000 ms = 16h
    full_time = np.arange(34200000, 57600000)

    # As there can be several values for the same millisecond, we use the
    # first value of each millisecond. np.unique returns the index of the
    # first occurrence of every distinct time; this matches a sequential
    # scan as long as the times are in chronological order.
    event_ms, first_event = np.unique(times_spread, return_index=True)

    # For every millisecond of the grid, index of the latest distinct event
    # time that is not later than it (-1 if there is none yet)
    last_seen = np.searchsorted(event_ms, full_time, side='right') - 1
    # Milliseconds before the first event use the first event, so each
    # series is filled with its own first value
    np.maximum(last_seen, 0, out=last_seen)
    source = first_event[last_seen]
    del last_seen

    midpoint_first_val = midpoint[source]
    bestAsks_first_val = bestAsks[source]
    bestBids_first_val = bestBids[source]
    spread_first_val = spread[source]

    # Saving data

    out_dir = '../Data/midpoint_data/'

    if (not os.path.isdir(out_dir)):

        # makedirs also creates '../Data' when it does not exist yet
        os.makedirs(out_dir, exist_ok=True)
        print('Folder to save data created')

    outputs = (
        ('bestAsks_201603{}_{}.pickl'.format(day, ticker),
         bestAsks_first_val),
        ('bestBids_201603{}_{}.pickl'.format(day, ticker),
         bestBids_first_val),
        ('spread_201603{}_{}.pickl'.format(day, ticker),
         spread_first_val),
        ('time.pickl', full_time),
        ('midpoint_201603{}_{}.pickl'.format(day, ticker),
         midpoint_first_val),
    )

    for file_name, series in outputs:

        # Each file is closed (and flushed) before the next one is written
        with open(out_dir + file_name, 'wb') as out_file:
            pickle.dump(series, out_file)

    print('Midpoint price data saved')
    print()

    return None