# Description

# Imports

In [2]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import warnings
from typing import Tuple, Any, Optional, List, Dict
from functools import reduce
warnings.filterwarnings('ignore')

In [41]:
df_binance_futures = pd.read_parquet(r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\Binance_futures')
df_binance_futures = df_binance_futures.assign(exchange_id = 'binance_futures')
df_binance_futures = df_binance_futures.rename(columns={'timestamp':'timestamp.1'})

In [42]:
df_binance_spot = pd.read_parquet(r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\Binance_spot')
df_binance_spot = df_binance_spot.assign(exchange_id = 'binance_spot')
df_binance_spot = df_binance_spot.rename(columns={'timestamp':'timestamp.1'})

In [43]:
df_binance_us = pd.read_parquet(r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\Binanceus_spot')
df_binance_us = df_binance_us.rename(columns={'timestamp':'timestamp.1'})

In [44]:
df_okx = pd.read_parquet(r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\OKX_futures')
df_okx = df_okx.rename(columns={'timestamp':'timestamp.1'})

# Functions

In [45]:
def convert_to_multiindex(
    exchange_df: pd.DataFrame, keep_single: Optional[bool] = False
) -> pd.DataFrame:
    """
    Rearrange the given exchange dataframe such that the index is time
    and at any time, features of all coins on the exchange can be 
    determined.

    :param exchange_df: data from some exchange
    :return: nicer dataframe that will be input to define_levels
    """
    # Move timestamp to a column and localize it.
    exchange_df = exchange_df.reset_index()
    exchange_df["timestamp"] = pd.to_datetime(exchange_df["timestamp"])
    exchange_df["timestamp"] = exchange_df["timestamp"].dt.tz_localize(None)
    exchange_df["timestamp"] = exchange_df["timestamp"].astype("datetime64[ns]")
    exchange_df = exchange_df.sort_values(by="timestamp")
    # Get the name of the exchange.
    exchange_id = exchange_df["exchange_id"].unique()[0]
    # Drop all irrelevant columns.
    exchange_df = exchange_df.drop(columns=["timestamp.1", "knowledge_timestamp", "year", "month", "exchange_id"])
    # Group the dataframe by currency pair.
    currency_pair_dfs = exchange_df.groupby("currency_pair")
    currency_pair_dfs = [currency_pair_dfs.get_group(currency_pair) for currency_pair in currency_pair_dfs.groups]
    # Initialize the dataframe that we will return, which starts as just time.
    return_df = pd.DataFrame(exchange_df["timestamp"].unique())
    return_df = return_df.rename(columns={0:"timestamp"})
    # Calls calculate_vwap helper function that also renames OHLCV columns.
    currency_pair_dfs = calculate_vwap(currency_pair_dfs, exchange_id) 
    # Merge all currency pair dataframes into the return dataframe
    for currency_pair in currency_pair_dfs:
        return_df = pd.merge_asof(return_df, currency_pair, on="timestamp")
    # Set index as timestamp which was lost during merging.
    return_df = return_df.set_index("timestamp")
    # Sort by column name to the order is consistent.
    return_df = return_df.sort_index(axis=1)
    # Drop duplicate columns if there are any
    return_df = return_df.loc[:,~return_df.columns.duplicated()]
    # Call define_levels function for next step.
    if keep_single:
        return return_df
    return define_levels(return_df)

def calculate_vwap(
    currency_pair_dfs: List[pd.DataFrame],
    exchange_id: str
) -> List[pd.DataFrame]:
    """
    Calculates volume weighted average price for each currency pair dataframe
    in the given list of currency pair dataframes.

    :param currency_pair_dfs: list of currency pair dataframes
    :param exchange_id: str name of the given exchange
    :return: currency pair dataframes with vwap calculations
    """
    for df in currency_pair_dfs:
        # Get name of currency_pair for renaming purposes.
        currency_pair = df["currency_pair"].unique()[0]
        vwap_column_name = f"vwap-{exchange_id}::{currency_pair}"
        volume_column_name = f"volume-{exchange_id}::{currency_pair}"
        open_column_name = f"open-{exchange_id}::{currency_pair}"
        high_column_name = f"high-{exchange_id}::{currency_pair}"
        low_column_name = f"low-{exchange_id}::{currency_pair}"
        close_column_name = f"close-{exchange_id}::{currency_pair}"
        # Calculate vwap.
        midprice = (df["high"] + df["low"]) / 2
        numerator = np.cumsum(np.multiply(df["volume"], midprice))
        denominator = np.cumsum(df["volume"])
        df[vwap_column_name] = np.divide(numerator, denominator)
        # Now rename the OHLCV columns.
        df.rename(columns={"volume" : volume_column_name, 
                           "open": open_column_name, 
                           "high": high_column_name, 
                           "low": low_column_name, 
                           "close": close_column_name}, inplace=True)
        # Drop irrelevant columns and set timestamp as index.
        df.drop(columns=["currency_pair"], inplace=True)
        df.set_index("timestamp", inplace=True)
    return currency_pair_dfs

def define_levels(single_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create all of the column levels such that we can transform 
    the single_index_df into multi_index.
    
    :param single_df: dataframe returned by convert_to_multi_index
    :return: a multi-index dataframe
    """
    # Store the timestamp for later use.
    timestamp = single_df.index
    # Create a list of all column names.
    columns = list(single_df.columns)
    num_pairs = int(len(columns) / 6)
    # Create outer level (feature).
    close_string = "close " * num_pairs
    high_string = "high " * num_pairs
    low_string = "low " * num_pairs
    open_string = "open " * num_pairs
    volume_string = "volume " * num_pairs
    vwap_string = "vwap " * num_pairs
    feature_string = "".join([close_string, high_string, low_string, open_string, volume_string, vwap_string])
    # Simultaneously inner level (exchange::currency_pair).
    currency_pair_string = ""
    for column_name in columns:
        hyphen = column_name.rfind("-")
        currency_pair_string += column_name[hyphen + 1:] + " "
    # Convert the given dataframe to multi-index.
    return_df = pd.DataFrame(np.array(single_df), columns=[feature_string.split(), currency_pair_string.split()])
    # Restore the initial timestamp.
    return_df.index = timestamp
    # Drop duplicate columns if there are any.
    return_df = return_df.loc[:,~return_df.columns.duplicated()].copy()
    return return_df

def merge_and_convert_to_multiindex(
    exchange_dfs: List[pd.DataFrame]
) -> List[pd.DataFrame]:
    """
    Converts a list of exchange dataframes into one large
    multi-index dataframe.

    :param exchange_dfs: list of all exchange dataframes
    :return: multi-index dataframe
    """
    # Edge case if exchange_dfs of size == 1
    if len(exchange_dfs) == 1:
        return convert_to_multiindex(exchange_df)
    # Make all dataframes in exchange_dfs easily convertible to multi-index
    for i, exchange_df in enumerate(exchange_dfs):
        exchange_dfs[i] = convert_to_multiindex(exchange_df, True)
    # Merge dataframes using reduce().
    return_df = reduce(lambda df1, df2: pd.merge(df1, df2, on="timestamp", how="outer"), exchange_dfs)
    # Sort by time and columns before passing into define_levels
    return_df = return_df.sort_index()
    return_df = return_df.sort_index(axis=1)
    # Drop duplicate columns if there are any.
    return_df = return_df.loc[:,~return_df.columns.duplicated()]
    return define_levels(return_df)

def get_symbols(multindex_df: pd.MultiIndex) -> List[str]:
    """
    Retrieves a list of all of the unique symbols (currency pairs) given a 
    multiindex dataframe.

    :param multiindex_df: a df returned by convert_to_multiindex
    :return: list of symbols
    """
    symbols_with_prefix = multindex_df["close"].columns
    symbols = [symbol[symbol.rfind(":")+1:] for symbol in symbols_with_prefix]
    return list(set(symbols))

def get_symbol_info(multiindex_df: pd.MultiIndex, symbol: str) -> pd.MultiIndex:
    """
    Returns a two-level dataframe with only the given symbol.

    :param multiindex_df: a df returned by convert_to_multiindex
    :param symbol: the desired symbol (currency_pair)
    :return: all data associated with the symbol
    """
    columns_list = multiindex_df.columns
    columns = [column for column in columns_list if symbol in column[1]]
    return multiindex_df[columns]

In [46]:
df = merge_and_convert_to_multiindex([df_binance_futures, df_binance_us, df_binance_spot, df_okx])
df

Unnamed: 0_level_0,close,close,close,close,close,close,close,close,close,close,...,vwap,vwap,vwap,vwap,vwap,vwap,vwap,vwap,vwap,vwap
Unnamed: 0_level_1,binance_futures::APE_USDT,binance_futures::AVAX_USDT,binance_futures::AXS_USDT,binance_futures::BAKE_USDT,binance_futures::BNB_USDT,binance_futures::BTC_BUSD,binance_futures::BTC_USDT,binance_futures::CRV_USDT,binance_futures::CTK_USDT,binance_futures::DOGE_USDT,...,okx::FTM_USDT,okx::GMT_USDT,okx::LINK_USDT,okx::MATIC_USDT,okx::NEAR_USDT,okx::SAND_USDT,okx::SOL_USDT,okx::STORJ_USDT,okx::WAVES_USDT,okx::XRP_USDT
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-09-08 17:57:00,,,,,,,10000.0,,,,...,,,,,,,,,,
2019-09-08 17:58:00,,,,,,,10000.0,,,,...,,,,,,,,,,
2019-09-08 17:59:00,,,,,,,10000.0,,,,...,,,,,,,,,,
2019-09-08 18:00:00,,,,,,,10000.0,,,,...,,,,,,,,,,
2019-09-08 18:01:00,,,,,,,10000.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-31 23:55:00,5.867,19.865,10.900,0.1960,312.58,23145.7,23145.6,1.020,0.8066,0.09630,...,,,,,,,,,,
2023-01-31 23:56:00,5.858,19.821,10.882,0.1958,312.32,23124.6,23127.0,1.017,0.8055,0.09610,...,,,,,,,,,,
2023-01-31 23:57:00,5.864,19.834,10.892,0.1960,312.43,23134.5,23133.6,1.018,0.8059,0.09630,...,,,,,,,,,,
2023-01-31 23:58:00,5.858,19.812,10.884,0.1957,312.19,23124.0,23123.1,1.019,0.8051,0.09619,...,,,,,,,,,,


In [53]:
get_symbols(df)

['GMT_USDT',
 'NEAR_USDT',
 'ETH_USDT',
 'DYDX_USDT',
 'STORJ_USDT',
 'LINK_USDT',
 'WAVES_USDT',
 'SOL_USDT',
 'ETH_BUSD',
 'DOGE_USDT',
 'RUNE_USDT',
 'BAKE_USDT',
 'BNB_USDT',
 'AVAX_USDT',
 'XRP_USDT',
 'CTK_USDT',
 'MATIC_USDT',
 'DOT_USDT',
 'FTM_USDT',
 'OGN_USDT',
 'BTC_USDT',
 'AXS_USDT',
 'CRV_USDT',
 'BTC_BUSD',
 'APE_USDT',
 'SAND_USDT',
 'UNFI_USDT']

In [55]:
get_symbol_info(df, 'BTC_USDT')

Unnamed: 0_level_0,close,close,close,close,high,high,high,high,low,low,...,open,open,volume,volume,volume,volume,vwap,vwap,vwap,vwap
Unnamed: 0_level_1,binance_futures::BTC_USDT,binance_spot::BTC_USDT,binanceus::BTC_USDT,okx::BTC_USDT,binance_futures::BTC_USDT,binance_spot::BTC_USDT,binanceus::BTC_USDT,okx::BTC_USDT,binance_futures::BTC_USDT,binance_spot::BTC_USDT,...,binanceus::BTC_USDT,okx::BTC_USDT,binance_futures::BTC_USDT,binance_spot::BTC_USDT,binanceus::BTC_USDT,okx::BTC_USDT,binance_futures::BTC_USDT,binance_spot::BTC_USDT,binanceus::BTC_USDT,okx::BTC_USDT
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-09-08 17:57:00,10000.0,,,,10000.0,,,,10000.0,,...,,,0.001,,,,10000.000000,,,
2019-09-08 17:58:00,10000.0,,,,10000.0,,,,10000.0,,...,,,0.000,,,,10000.000000,,,
2019-09-08 17:59:00,10000.0,,,,10000.0,,,,10000.0,,...,,,0.001,,,,10000.000000,,,
2019-09-08 18:00:00,10000.0,,,,10000.0,,,,10000.0,,...,,,0.000,,,,10000.000000,,,
2019-09-08 18:01:00,10000.0,,,,10000.0,,,,10000.0,,...,,,0.000,,,,10000.000000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-31 23:55:00,23145.6,23149.98,23149.63,,23150.0,23159.53,23154.55,,23141.7,23145.27,...,23149.43,,276.408,132.98878,0.665210,,27982.567315,28969.897150,27207.485001,
2023-01-31 23:56:00,23127.0,23134.07,23129.44,,23145.7,23151.77,23145.36,,23124.6,23126.94,...,23145.36,,198.219,186.18800,2.498794,,27982.565185,28969.884672,27207.471198,
2023-01-31 23:57:00,23133.6,23140.33,23137.48,,23136.0,23144.59,23141.68,,23124.6,23130.01,...,23135.53,,149.375,110.49868,1.791509,,27982.563578,28969.877264,27207.461297,
2023-01-31 23:58:00,23123.1,23131.61,23130.71,,23136.6,23144.61,23144.15,,23123.0,23128.46,...,23137.92,,130.329,163.33895,1.062466,,27982.562175,28969.866312,27207.455428,
