<left>FINM 33150 - Quantitative Trading Strategies</left>
<left>Winter 2023</left>
<br>
<h1><center> Homework 4: Accumulation Opportunity </center></h1>
<center>Due - 23:00 [CST] February 2nd, 2023</center>
<br>
<h3>Ki Hyun</h3>
<h3>Student ID: 12125881</h3>

<h5> Imports </h5>

In [1]:
%matplotlib inline

In [2]:
import os
import datetime
import pickle
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
from enum import Enum

<h5> Constants </h5>

In [3]:
# --- constants needed for grabbing and cleaning data ---
# Root folder holding the raw `book_narrow_*` / `trades_narrow_*` files.
# The CRYPTO_DATA_DIR environment variable, when set, overrides the original
# machine-specific default so the notebook can run on another machine
# without editing this cell.
data_home_dir = os.environ.get(
    "CRYPTO_DATA_DIR",
    "C:\\Users\\kwhyu\\OneDrive - The University of Chicago\\2023-1 Winter\\FINM "
    "33150\\FINM-33150-W23\\Data\\Crypto\\",
)
pairs = ['BTC-USD', 'ETH-USD', 'ETH-BTC']
years = ['2021', '2022', '2023']
# --- constants needed for trading simulation ---
reaction_time = int(0.1 * (10 ** 9)) # in nano seconds (0.1 s)
# proportional costs applied to traded notional: 50 * 1e-4 = 0.5%
# for every pair except ETH-BTC, which uses 10 * 1e-4 = 0.1%
currency_crypto_cost = 50 * (10 ** (-4))
crypto_crypto_cost = 10 * (10 ** (-4))
p = 0.05 # 5% participation rate

<h5> Helper Functions </h5>

In [4]:
class Order(Enum):
    """Side of a trade.

    The member values are also used as positional indices elsewhere in this
    notebook: ``read_data`` returns ``(book, buy, pair, year, sell)`` and
    ``data_factory`` stores ``[book, buy, sell]``, so indexing either
    container with ``Order.Buy.value`` (1) yields the buy frame and with
    ``Order.Sell.value`` (-1, i.e. the last element) yields the sell frame.
    Changing a value therefore changes which frame is selected.
    """
    # -1 -> last element of the containers described above (sell trades)
    Sell = -1
    # 1 -> second element of the containers described above (buy trades)
    Buy = 1

In [5]:
def read_data(pair, year):
    """Load the book and trades files of one pair/year from ``data_home_dir``.

    The files are expected to be named ``book_narrow_{pair}_{year}.delim.gz``
    and ``trades_narrow_{pair}_{year}.delim.gz`` inside the module-level
    ``data_home_dir`` folder.

    Parameters
    ----------
    pair : str
        Pair name used in the file name, e.g. ``'BTC-USD'``.
    year : str
        Year used in the file name, e.g. ``'2021'``.

    Returns
    -------
    tuple
        ``(book, buy, pair, year, sell)``.  ``book`` is the raw book table or
        ``None`` when its file cannot be opened.  ``buy`` / ``sell`` are the
        trades with ``Side > 0`` / ``Side < 0``, indexed by
        ``('Time', 'Nano second')`` and sorted by that index; both are
        ``None`` when the trades file cannot be opened.
    """
    # file names for the pair and year (data_home_dir is only read, so no
    # `global` statement is required)
    book_file = f'book_narrow_{pair}_{year}.delim.gz'
    trades_file = f'trades_narrow_{pair}_{year}.delim.gz'

    # retrieving book data:
    print(">>> Retrieving Book data for ", pair, " for the year ", year, "...")
    try:
        book = pd.read_table(os.path.join(data_home_dir, book_file))
        print(">>> Success!")
    except OSError:
        # Only a missing/unreadable file is reported as "does not exist";
        # parsing problems and KeyboardInterrupt are no longer swallowed.
        print(">>> Book data for ", pair, " for the year ", year, " does not exist")
        book = None

    # retrieving trades data:
    print(">>> Retrieving Trades data for ", pair, " for the year ", year, "...")
    try:
        trades = pd.read_table(os.path.join(data_home_dir, trades_file))
        print(">>> Success!")
    except OSError:
        print(">>> Trades data for ", pair, " for the year ", year, " does not exist")
        return book, None, pair, year, None

    # indexing trades data by Time and Nano second
    stamps = trades['timestamp_utc_nanoseconds']
    # Integer floor division keeps the whole-second part exact.  Dividing by
    # 10**9 in floating point can round a ~1.6e18 ns timestamp up into the
    # next second while the remainder below still reports 999,999,999 ns.
    # NOTE(review): fromtimestamp() renders the UTC epoch in the machine's
    # local time zone; kept as-is so previously saved outputs stay comparable.
    trades['Time'] = (stamps // 10**9).apply(
        lambda sec: datetime.datetime.fromtimestamp(int(sec)))
    trades['Nano second'] = stamps % 10**9
    trades = (trades.set_index(['Time', 'Nano second'])
                    .sort_index()
                    .drop(columns=['received_utc_nanoseconds']))

    # sorting into buy and sell data
    sell = trades[trades['Side'] < 0]
    buy = trades[trades['Side'] > 0]

    return book, buy, pair, year, sell

In [6]:
def clean_trading_data(raw_df, side, reaction_time):
    """Prepare one side's trades for the participation algorithm.

    Parameters
    ----------
    raw_df : sequence
        The ``(book, buy, pair, year, sell)`` tuple returned by ``read_data``.
        ``raw_df[side.value]`` selects the trades frame, which must contain
        the ``timestamp_utc_nanoseconds`` and ``Side`` columns.
    side : Order
        Which side's trades to clean.
    reaction_time : int
        Minimum elapsed time, in nanoseconds, between two qualifying rows.

    Returns
    -------
    pandas.DataFrame
        The trades without ``timestamp_utc_nanoseconds`` / ``Side`` and with:

        * ``hold_time_nanoseconds`` - time until the next trade of the same
          frame; rows whose hold time is not positive (including the final
          row, which has no successor) are removed;
        * ``qualify`` - ``True`` on the row where the hold time accumulated
          since the previous qualifying row reaches ``reaction_time``.
    """
    df = raw_df[side.value].copy()
    pair = raw_df[2]
    year = raw_df[3]
    detail = "Buy" if side == Order.Buy else "Sell"
    print(">>> Cleaning", pair, "for the year", year, detail, "Trades data for VWAP Algorithm based on reaction time",
          reaction_time/(10**9), " seconds...")

    # Hold time = next timestamp - own timestamp, computed on the raw integer
    # array.  Going through shift(-1) introduces NaN and therefore float64,
    # which cannot represent ~1.6e18 ns exactly and quantizes the differences.
    stamps = df['timestamp_utc_nanoseconds'].to_numpy()
    # the last row is paired with itself -> hold time 0 -> filtered out below
    next_stamps = np.concatenate([stamps[1:], stamps[-1:]])
    df['hold_time_nanoseconds'] = (next_stamps - stamps).astype('int64')
    df = df.drop(columns=['timestamp_utc_nanoseconds', 'Side'])
    # .copy() so that adding 'qualify' below does not write into a slice
    df = df[df['hold_time_nanoseconds'] > 0].copy()

    # Greedy scan: flag a row once at least `reaction_time` has elapsed since
    # the last flagged row, then restart the clock.  Iterating the array
    # avoids one MultiIndex label lookup per row.
    flags = []
    elapsed = 0
    for hold in df['hold_time_nanoseconds'].to_numpy():
        elapsed += int(hold)
        if elapsed >= reaction_time:
            flags.append(True)
            elapsed = 0
        else:
            flags.append(False)
    df['qualify'] = flags
    print(">>> Success!")
    return df

In [7]:
def data_factory(pairs, years, avoid_file_load = False):
    """Return the cleaned data of every (pair, year), using a pickle cache.

    Parameters
    ----------
    pairs : iterable of str
        Pair names passed to ``read_data``.
    years : iterable of str
        Years passed to ``read_data``.
    avoid_file_load : bool, default False
        When ``False`` the cache ``./data/full_data.pkl`` is returned if it
        can be loaded, and is (re)written after a fresh computation.  When
        ``True`` the cache is neither read nor written.

    Returns
    -------
    dict
        ``{(pair, year): [book, buy, sell]}`` where ``buy`` / ``sell`` are the
        outputs of ``clean_trading_data`` (or ``None`` when no trades file
        was available).  When the cache is used, whatever object it holds is
        returned unchanged.
    """
    cache_path = r'./data/full_data.pkl'

    if not avoid_file_load:
        try:
            # open() is inside the try block so that a cache that does not
            # exist yet falls through to the computation instead of raising.
            with open(cache_path, 'rb') as f:
                # NOTE(security): unpickling executes code embedded in the
                # file - only load caches produced by this notebook.
                ret_dict = pickle.load(f)
        except Exception:
            # The cache is best-effort: a missing, truncated or incompatible
            # file triggers a rebuild (KeyboardInterrupt is not swallowed).
            print(">>> No data found")
            print(">>> Proceeding to compute the data...")
        else:
            print(">>> Found data...")
            return ret_dict

    ret_dict = {}
    for pair in pairs:
        for year in years:
            raw_df = read_data(pair, year)
            if raw_df[1] is None:
                # no trades file for this pair/year
                buy = None
                sell = None
            else:
                # reaction_time is the module-level constant (read-only here)
                buy = clean_trading_data(raw_df, Order.Buy, reaction_time)
                sell = clean_trading_data(raw_df, Order.Sell, reaction_time)

            ret_dict[(pair, year)] = [raw_df[0], buy, sell]

    if not avoid_file_load:
        # make sure ./data exists before writing the cache
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(ret_dict, f)

    return ret_dict

In [8]:
def VWAP_participation(df_cleaned, Q, p, start_time,
                       side, transaction_cost):
    """Simulate participating in trades from ``start_time`` until ``Q`` is filled.

    The rows flagged ``qualify`` split the data (from ``start_time`` onward)
    into consecutive windows, each ending on a flagged row.  From every window
    one trade is taken: the highest-priced one when ``side`` is ``Order.Buy``,
    otherwise the lowest-priced one.  A fraction ``p`` of that trade's
    ``SizeBillionths`` is counted as filled.  Windows are processed while the
    filled amount is still below ``Q``.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Frame with ``PriceMillionths``, ``SizeBillionths`` and ``qualify``
        columns, sliceable by ``start_time`` and by its own index labels.
    Q : number
        Target amount, compared against the running sum of filled size.
    p : float
        Fraction of each selected trade's size that is taken.
    start_time : label
        First index label (inclusive) to consider.
    side : Order
        Determines which price extreme is selected in each window.
    transaction_cost : float
        Multiplier applied to the notional to obtain the trading cost.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``('datetime', 'Nano seconds')`` with the columns
        ``PriceMillionths``, ``SizeBillionths``, ``NotionalMillionths``,
        ``TradingCostsMillionths`` and ``VWAPMillionths`` (running notional
        over running size, scaled by 1e9 and truncated to int).
    """
    trades = df_cleaned.loc[start_time:].copy()

    # Buy -> descending sort (highest price first); anything else -> ascending
    sort_ascending = side != Order.Buy

    checkpoints = trades[trades['qualify']].index
    fills = {'datetime': [],
             'Nano seconds': [],
             'PriceMillionths': [],
             'SizeBillionths': []}

    filled = 0
    previous = None
    for checkpoint in checkpoints:
        if not (filled < Q):
            break

        if previous is None:
            # first window: everything up to and including the first checkpoint
            window = trades.loc[:checkpoint]
        else:
            # later windows: drop the first row of the slice, which is the
            # previous checkpoint itself
            window = trades.loc[previous:checkpoint].iloc[1:]

        chosen = window.sort_values(by = ['PriceMillionths'], ascending = sort_ascending).iloc[0]
        taken = chosen['SizeBillionths'] * p

        fills['datetime'].append(chosen.name[0])
        fills['Nano seconds'].append(chosen.name[1])
        fills['PriceMillionths'].append(chosen['PriceMillionths'])
        fills['SizeBillionths'].append(taken)

        filled += taken
        previous = checkpoint

    result = pd.DataFrame(fills).set_index(['datetime', 'Nano seconds'])
    result['NotionalMillionths'] = result['PriceMillionths'] * result['SizeBillionths'] / (10**9)
    result['TradingCostsMillionths'] = result['NotionalMillionths'] * transaction_cost
    running_vwap = result['NotionalMillionths'].cumsum() / result['SizeBillionths'].cumsum() * (10**9)
    result['VWAPMillionths'] = running_vwap.apply(int)

    return result

In [9]:
def total_participation_opportunities(full_dict, p,
                                      currency_crypto_cost, crypto_crypto_cost):
    """Write the full buy/sell participation tables of every pair/year to CSV.

    For each ``(pair, year)`` key and each side, ``VWAP_participation`` is run
    over the whole cleaned frame with an unbounded target and the result is
    saved as ``./data/{pair}_{year}_total_{buy|sell}_opportunities.csv``.
    Files that already exist are reported and left untouched.

    Parameters
    ----------
    full_dict : dict
        ``{(pair, year): [book, buy, sell]}`` as produced by ``data_factory``.
        A ``None`` or empty ``buy`` / ``sell`` entry (no trades available) is
        skipped.
    p : float
        Participation rate forwarded to ``VWAP_participation``.
    currency_crypto_cost : float
        Cost rate used for every pair except ``'ETH-BTC'``.
    crypto_crypto_cost : float
        Cost rate used for ``'ETH-BTC'``.
    """
    for pair, year in full_dict.keys():
        if pair == 'ETH-BTC':
            transaction_cost = crypto_crypto_cost
        else:
            transaction_cost = currency_crypto_cost

        # same treatment for both sides; Order.<side>.value indexes the list
        for side, label in ((Order.Buy, 'buy'), (Order.Sell, 'sell')):
            out_path = r'./data/' + pair + '_' + year + '_total_' + label + '_opportunities.csv'
            if os.path.exists(out_path):
                print(out_path, 'in data file')
                continue

            trades = full_dict[(pair, year)][side.value]
            if trades is None or len(trades) == 0:
                # data_factory stores None when the trades file was missing;
                # there is no first timestamp to start from in either case
                print(">>> No", label, "trades for", pair, year, "- skipping")
                continue

            os.makedirs(r'./data/', exist_ok=True)
            # np.inf (np.infty was removed in NumPy 2.0): never stop early
            VWAP_participation(trades, np.inf, p, trades.index[0][0],
                               side, transaction_cost).to_csv(out_path)

In [20]:
def accumulation_analytics(dfs, Q):
    """Measure, for every start row, how long it takes to accumulate ``Q``.

    For each row except the last one, taken as a starting point, the function
    finds the first row at which the size accumulated from the starting row
    (inclusive) reaches ``Q`` and records the span between the two.  Scanning
    stops at the first starting row from which ``Q`` can no longer be reached.

    Parameters
    ----------
    dfs : pandas.DataFrame
        Frame with a two-level index ``(datetime, nanoseconds)`` and the
        columns ``SizeBillionths``, ``NotionalMillionths`` and
        ``TradingCostsMillionths``.  Sizes are assumed to be non-negative so
        that the cumulative size never decreases (required by the binary
        search below).
    Q : number
        Target amount in the units of ``SizeBillionths``.

    Returns
    -------
    pandas.DataFrame
        One row per usable starting point with the columns ``Start Time``,
        ``Start Nano``, ``End Time``, ``End Nano``, ``Accumulated Billionth``,
        ``Notional Millionth``, ``Duration Nano`` (end minus start, in
        nanoseconds) and ``Transaction Cost Millionth``.
    """
    columns = ['Start Time', 'Start Nano', 'End Time', 'End Nano', 'Accumulated Billionth',
               'Notional Millionth', 'Duration Nano', 'Transaction Cost Millionth']
    n_rows = len(dfs)
    # the last row is never a starting point, so fewer than two rows -> nothing
    if n_rows < 2:
        return pd.DataFrame({name: [] for name in columns})

    # running totals, computed once instead of once per starting row
    size_cum = dfs['SizeBillionths'].cumsum().to_numpy(dtype='float64')
    notional_cum = dfs['NotionalMillionths'].cumsum().to_numpy(dtype='float64')
    cost_cum = dfs['TradingCostsMillionths'].cumsum().to_numpy(dtype='float64')

    def _total_before(cumulative):
        # running total *before* each row (0 before the first row)
        return np.concatenate(([0.0], cumulative[:-1]))

    size_before = _total_before(size_cum)
    notional_before = _total_before(notional_cum)
    cost_before = _total_before(cost_cum)

    # candidate starting rows: every row but the last
    starts = np.arange(n_rows - 1)
    # first row whose running size is >= (size before the start) + Q
    ends = np.searchsorted(size_cum, size_before[starts] + Q, side='left')
    reachable = ends < n_rows
    # stop at the first start from which Q is not reachable
    n_valid = len(starts) if reachable.all() else int(np.argmin(reachable))
    starts = starts[:n_valid]
    ends = ends[:n_valid]

    time_level = dfs.index.get_level_values(0)
    nano_level = dfs.index.get_level_values(1).to_numpy(dtype='int64')

    # absolute position of every row in integer nanoseconds:
    # whole seconds since the epoch * 1e9 + the separate nanosecond level
    clock = pd.DatetimeIndex(time_level)
    if clock.tz is not None:
        clock = clock.tz_convert(None)
    epoch_seconds = ((clock - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1))
    total_nanos = epoch_seconds.to_numpy(dtype='int64') * (10**9) + nano_level

    return pd.DataFrame({
        'Start Time': time_level[starts],
        'Start Nano': nano_level[starts],
        'End Time': time_level[ends],
        'End Nano': nano_level[ends],
        'Accumulated Billionth': size_cum[ends] - size_before[starts],
        'Notional Millionth': notional_cum[ends] - notional_before[starts],
        # end minus start, so that durations are non-negative
        'Duration Nano': total_nanos[ends] - total_nanos[starts],
        'Transaction Cost Millionth': cost_cum[ends] - cost_before[starts],
    }, columns=columns)

In [11]:
def read_pair_data(pair, year):
    """Read the saved buy and sell opportunity tables of one pair and year.

    Parameters
    ----------
    pair : str
        Pair name used in the CSV file names.
    year : str
        Year used in the CSV file names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(buy_df, sell_df)``, each read from ``./data/`` with the
        ``datetime`` column parsed to datetimes and the index set to
        ``('datetime', 'Nano seconds')``.
    """
    def _load(suffix):
        # one CSV -> frame indexed by (datetime, Nano seconds)
        frame = pd.read_csv(r'./data/' + pair + '_' + year + suffix)
        frame['datetime'] = pd.to_datetime(frame['datetime'])
        return frame.set_index(['datetime', 'Nano seconds'])

    # buy data first, then sell data
    return _load('_total_buy_opportunities.csv'), _load('_total_sell_opportunities.csv')

<h2> 2. Data </h2>

<h3> 2-1. Raw Data </h3>

<h3> 2-2. Data Processing </h3>

In [12]:
# {(pair, year): [book, buy, sell]} for every pair/year combination;
# data_factory returns the pickled cache ./data/full_data.pkl when it loads
data_dict = data_factory(pairs, years)

>>> Found data...


In [13]:
# Writes one buy and one sell opportunities CSV per (pair, year) under ./data/;
# files that already exist are only reported (see the output below)
total_participation_opportunities(data_dict, p, currency_crypto_cost, crypto_crypto_cost)

./data/BTC-USD_2021_total_buy_opportunities.csv in data file
./data/BTC-USD_2021_total_sell_opportunities.csv in data file
./data/BTC-USD_2022_total_buy_opportunities.csv in data file
./data/BTC-USD_2022_total_sell_opportunities.csv in data file
./data/BTC-USD_2023_total_buy_opportunities.csv in data file
./data/BTC-USD_2023_total_sell_opportunities.csv in data file
./data/ETH-USD_2021_total_buy_opportunities.csv in data file
./data/ETH-USD_2021_total_sell_opportunities.csv in data file
./data/ETH-USD_2022_total_buy_opportunities.csv in data file
./data/ETH-USD_2022_total_sell_opportunities.csv in data file
./data/ETH-USD_2023_total_buy_opportunities.csv in data file
./data/ETH-USD_2023_total_sell_opportunities.csv in data file
./data/ETH-BTC_2021_total_buy_opportunities.csv in data file
./data/ETH-BTC_2021_total_sell_opportunities.csv in data file
./data/ETH-BTC_2022_total_buy_opportunities.csv in data file
./data/ETH-BTC_2022_total_sell_opportunities.csv in data file
./data/ETH-BTC_2

In [14]:
# Each variable is a (buy_df, sell_df) tuple read back from the saved CSVs
BTC_USD_2021 = read_pair_data('BTC-USD', '2021')
BTC_USD_2022 = read_pair_data('BTC-USD', '2022')
BTC_USD_2023 = read_pair_data('BTC-USD', '2023')

<h3> 2-3. Data Description </h3>

In [21]:
# element 0 of the tuple is the buy-side frame
BTC_USD_2021[0].head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,PriceMillionths,SizeBillionths,NotionalMillionths,TradingCostsMillionths,VWAPMillionths
datetime,Nano seconds,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-10 16:28:53,470961000,59122080000,189904.5,11227550.0,56137.75,59122080000
2021-04-10 16:28:53,596148000,59119960000,126228.5,7462624.0,37313.12,59121233506
2021-04-10 16:28:54,64537000,59119960000,27776.5,1642146.0,8210.728,59121130649
2021-04-10 16:28:54,835128000,59123810000,984083.5,58182770.0,290913.8,59123116130
2021-04-10 16:28:55,30223000,59123810000,11798.0,697542.7,3487.714,59123122240
2021-04-10 16:28:55,577343000,59123810000,3785645.0,223821800.0,1119109.0,59123630219
2021-04-10 16:29:00,30069000,59123580000,16450622.0,972619700.0,4863098.0,59123591929
2021-04-10 16:29:02,519128000,59121540000,81632.0,4826210.0,24131.05,59123584195
2021-04-10 16:29:02,810133000,59117620000,5060.5,299164.7,1495.824,59123582802
2021-04-10 16:29:03,5146000,59117620000,81630.0,4825771.0,24128.86,59123560417


In [22]:
# element 1 of the tuple is the sell-side frame
BTC_USD_2021[1].head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,PriceMillionths,SizeBillionths,NotionalMillionths,TradingCostsMillionths,VWAPMillionths
datetime,Nano seconds,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-10 16:28:52,515484000,59125070000,406122.5,24012020.0,120060.106205,59125070000
2021-04-10 16:28:53,74765000,59119950000,849794.5,50239810.0,251199.041751,59121605640
2021-04-10 16:28:53,613354000,59119950000,656146.5,38791350.0,193956.741363,59121037488
2021-04-10 16:28:54,195667000,59119950000,891207.0,52688110.0,263440.566398,59120691757
2021-04-10 16:28:54,716996000,59119950000,1250000.0,73899940.0,369499.6875,59120463004
2021-04-10 16:28:55,71608000,59119950000,132392.0,7827008.0,39135.042102,59120446778
2021-04-10 16:28:55,358280000,59119950000,891184.5,52686780.0,263433.915404,59120359574
2021-04-10 16:28:55,987817000,59121860000,891028.5,52679260.0,263396.311165,59120583594
2021-04-10 16:28:56,830404000,59121860000,352749.0,20855180.0,104275.884966,59120654829
2021-04-10 16:28:57,426911000,59119960000,9283.0,548810.6,2744.052943,59120653810


In [23]:
# Accumulation statistics on the 2021 buy-side frame for a target Q of
# 10,000,000,000 (compared against the cumulative SizeBillionths column).
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# run was stopped before it finished.
accumulation_analytics(BTC_USD_2021[0], 10000000000)

KeyboardInterrupt: 

<h2> 3. Analysis </h2>