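# Description

Load OHLCV data from several centralized exchanges (Binance futures, Binance spot, Binance US, OKX), compute a cumulative volume-weighted average price (VWAP) for every currency pair, and combine the results into one time-indexed dataframe whose columns form a (feature, exchange, currency pair) multi-index.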

# Imports

In [21]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import warnings
from typing import Tuple, Any, Optional, List, Dict
warnings.filterwarnings('ignore')

In [4]:
# Read the Binance futures data, label every row with its exchange id, and
# rename the `timestamp` column to `timestamp.1` (the name that
# `convert_to_multi_index` later drops).
# NOTE(review): absolute local path -- adjust it to wherever the data lives.
df_binance_futures = (
    pd.read_parquet(r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\Binance_futures')
    .assign(exchange_id='binance_futures')
    .rename(columns={'timestamp': 'timestamp.1'})
)

In [5]:
# Read the Binance spot data, label every row with its exchange id, and
# rename the `timestamp` column to `timestamp.1` (the name that
# `convert_to_multi_index` later drops).
# NOTE(review): absolute local path -- adjust it to wherever the data lives.
df_binance_spot = (
    pd.read_parquet(r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\Binance_spot')
    .assign(exchange_id='binance_spot')
    .rename(columns={'timestamp': 'timestamp.1'})
)

In [6]:
# Read the Binance US spot data and rename the `timestamp` column to
# `timestamp.1` (the name that `convert_to_multi_index` later drops).
# NOTE(review): absolute local path -- adjust it to wherever the data lives.
df_binance_us = (
    pd.read_parquet(r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\Binanceus_spot')
    .rename(columns={'timestamp': 'timestamp.1'})
)

In [7]:
# Read the OKX futures data and rename the `timestamp` column to
# `timestamp.1` (the name that `convert_to_multi_index` later drops).
# NOTE(review): absolute local path -- adjust it to wherever the data lives.
df_okx = (
    pd.read_parquet(r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\OKX_futures')
    .rename(columns={'timestamp': 'timestamp.1'})
)

# Functions

In [85]:
def convert_to_multi_index(
    exchange_df: pd.DataFrame,
    keep_single: Optional[bool] = False
) -> pd.DataFrame:
    """
    Rearrange the given exchange dataframe such that the index is time
    and at any time, features of all coins on the exchange can be
    determined.

    The input must expose `timestamp` after `reset_index()` and contain the
    columns `exchange_id`, `currency_pair`, `high`, `low`, `volume`,
    `timestamp.1`, `knowledge_timestamp`, `open`, `close`, `year` and `month`.

    :param exchange_df: data from some exchange
    :param keep_single: if True, return the flat dataframe whose columns are
        named `<feature>-<exchange>:<currency_pair>` instead of passing it
        through define_levels
    :return: dataframe indexed by timestamp with one volume and one vwap
        column per currency pair; multi-index columns unless `keep_single`
    """
    # Move timestamp to a column and make it timezone-naive.
    exchange_df = exchange_df.reset_index()
    timestamps = pd.to_datetime(exchange_df["timestamp"]).dt.tz_localize(None)
    exchange_df["timestamp"] = timestamps.astype("datetime64[ns]")
    # merge_asof below requires both sides to be sorted by the key.
    exchange_df = exchange_df.sort_values(by="timestamp")
    # Get the name of the exchange (the first id found is used for all rows).
    exchange_id = exchange_df["exchange_id"].unique()[0]
    # Drop all irrelevant columns.
    exchange_df = exchange_df.drop(
        columns=[
            "timestamp.1",
            "knowledge_timestamp",
            "open",
            "close",
            "year",
            "month",
            "exchange_id",
        ]
    )
    # One dataframe per currency pair; copies so the helper never touches
    # slices of `exchange_df`.
    currency_pair_dfs = [
        group.copy() for _, group in exchange_df.groupby("currency_pair")
    ]
    # Initialize the dataframe that we will return, which starts as just time.
    return_df = pd.DataFrame({"timestamp": exchange_df["timestamp"].unique()})
    # Add the vwap column and exchange-qualified column names.
    currency_pair_dfs = calculate_vwap(currency_pair_dfs, exchange_id)
    # Merge all currency pair dataframes into the return dataframe.
    # merge_asof matches backward by default, so a timestamp without a row
    # for some pair takes that pair's most recent earlier row.
    for currency_pair_df in currency_pair_dfs:
        return_df = pd.merge_asof(return_df, currency_pair_df, on="timestamp")
    # Set index as timestamp which was lost during merging.
    return_df = return_df.set_index("timestamp")
    # Sort by column name so the order is consistent (all volume columns,
    # then all vwap columns). The result was previously assigned to a
    # misspelt variable and silently discarded.
    return_df = return_df.sort_index(axis=1)
    # Drop duplicate columns if there are any.
    return_df = return_df.loc[:, ~return_df.columns.duplicated()]
    if keep_single:
        return return_df
    # Call define_levels function for next step.
    return define_levels(return_df)

def calculate_vwap(
    currency_pair_dfs: List[pd.DataFrame],
    exchange_id: str
) -> List[pd.DataFrame]:
    """
    Calculates volume weighted average price for each currency pair dataframe
    in the given list of currency pair dataframes.

    The vwap is cumulative over the rows of each dataframe and uses the
    midprice `(high + low) / 2` as the price of a row. The input dataframes
    are left untouched; new dataframes are returned.

    :param currency_pair_dfs: list of currency pair dataframes, each with
        `timestamp`, `currency_pair`, `high`, `low` and `volume` columns
    :param exchange_id: str name of the given exchange
    :return: one dataframe per input, indexed by timestamp, where `volume` is
        renamed to `volume-<exchange_id>:<currency_pair>` and the new column
        `vwap-<exchange_id>:<currency_pair>` holds the vwap
    """
    vwap_dfs = []
    for df in currency_pair_dfs:
        # Get name of currency_pair for renaming purposes.
        currency_pair = df["currency_pair"].unique()[0]
        vwap_column_name = f"vwap-{exchange_id}:{currency_pair}"
        volume_column_name = f"volume-{exchange_id}:{currency_pair}"
        # Calculate vwap as running traded value over running volume.
        midprice = (df["high"] + df["low"]) / 2
        cumulative_value = (df["volume"] * midprice).cumsum()
        cumulative_volume = df["volume"].cumsum()
        # Build a new frame instead of editing `df` in place, so callers that
        # pass groupby slices or shared frames are not modified.
        vwap_df = (
            df.assign(**{vwap_column_name: cumulative_value / cumulative_volume})
            .rename(columns={"volume": volume_column_name})
            .drop(columns=["high", "low", "currency_pair"])
            .set_index("timestamp")
        )
        vwap_dfs.append(vwap_df)
    return vwap_dfs

def define_levels(single_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create all of the column levels such that we can transform
    the single_index_df into multi_index.

    Every column name is expected to look like
    `<feature>-<exchange>:<currency_pair>`. The feature is the text before
    the first hyphen and the currency pair is the text after the last colon,
    so each label is read from its own column name and does not depend on the
    order of the columns.

    :param single_df: dataframe returned by convert_to_multi_index
    :return: a multi-index dataframe with column levels
        (feature, exchange, currency pair) and the same index
    """
    features = []
    exchanges = []
    currency_pairs = []
    for column_name in single_df.columns:
        # Split on the first "-" and the last ":" so hyphens inside the
        # exchange or currency pair do not shift the boundaries.
        feature, _, remainder = column_name.partition("-")
        exchange, _, currency_pair = remainder.rpartition(":")
        features.append(feature)
        exchanges.append(exchange)
        currency_pairs.append(currency_pair)
    # Convert the given dataframe to multi-index, keeping the original index.
    return_df = pd.DataFrame(
        np.array(single_df),
        index=single_df.index,
        columns=pd.MultiIndex.from_arrays([features, exchanges, currency_pairs]),
    )
    # Drop duplicate columns if there are any.
    return_df = return_df.loc[:, ~return_df.columns.duplicated()].copy()
    return return_df

def merge_and_convert_to_multi_index(
    exchange_dfs: List[pd.DataFrame]
) -> pd.DataFrame:
    """
    Converts a list of exchange dataframes into one large
    multi-index dataframe.

    The given list is not modified.

    :param exchange_dfs: list of all exchange dataframes
    :return: multi-index dataframe
    :raises ValueError: if `exchange_dfs` is empty
    """
    if not exchange_dfs:
        raise ValueError("exchange_dfs must contain at least one dataframe")
    # Edge case if exchange_dfs of size == 1
    if len(exchange_dfs) == 1:
        return convert_to_multi_index(exchange_dfs[0])
    # Make all dataframes easily convertible to multi-index; build a new list
    # so the caller's list keeps its original dataframes.
    single_dfs = [
        convert_to_multi_index(exchange_df, True) for exchange_df in exchange_dfs
    ]
    # Outer-merge everything on timestamp, one exchange at a time.
    return_df = single_dfs[0]
    for single_df in single_dfs[1:]:
        return_df = pd.merge(return_df, single_df, on="timestamp", how="outer")
    # Sort by time and columns before passing into define_levels
    return_df = return_df.sort_index()
    return_df = return_df.sort_index(axis=1)
    # Drop duplicate columns if there are any
    return_df = return_df.loc[:, ~return_df.columns.duplicated()]
    return define_levels(return_df)

In [86]:
# Pass each exchange exactly once: a repeated dataframe collides on column
# names in the outer merge and comes back with `_x`/`_y` suffixed columns.
df = merge_and_convert_to_multi_index([df_binance_futures, df_binance_spot, df_binance_us, df_okx])
df

Unnamed: 0_level_0,volume,volume,volume,volume,volume,volume,volume,volume,volume,volume,...,vwap,vwap,vwap,vwap,vwap,vwap,vwap,vwap,vwap,vwap
Unnamed: 0_level_1,binance_futures,binance_futures,binance_futures,binance_futures,binance_futures,binance_futures,binance_futures,binance_futures,binance_futures,binance_futures,...,okx,okx,okx,okx,okx,okx,okx,okx,okx,okx
Unnamed: 0_level_2,APE_USDT,AVAX_USDT,AXS_USDT,BAKE_USDT,BNB_USDT,BTC_BUSD,BTC_USDT,CRV_USDT,CTK_USDT,DOGE_USDT,...,FTM_USDT,GMT_USDT,LINK_USDT,MATIC_USDT,NEAR_USDT,SAND_USDT,SOL_USDT,STORJ_USDT,WAVES_USDT,XRP_USDT
timestamp,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2019-09-08 17:57:00,,,,,,,0.001,,,,...,,,,,,,,,,
2019-09-08 17:58:00,,,,,,,0.000,,,,...,,,,,,,,,,
2019-09-08 17:59:00,,,,,,,0.001,,,,...,,,,,,,,,,
2019-09-08 18:00:00,,,,,,,0.000,,,,...,,,,,,,,,,
2019-09-08 18:01:00,,,,,,,0.000,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-31 23:55:00,14535.0,4127.0,3298.0,3810.0,357.12,74.177,276.408,141575.9,5638.0,6800853.0,...,,,,,,,,,,
2023-01-31 23:56:00,13328.0,3890.0,12681.0,23143.0,301.87,36.406,198.219,145940.2,4929.0,4902330.0,...,,,,,,,,,,
2023-01-31 23:57:00,10989.0,5453.0,2843.0,13094.0,403.85,14.833,149.375,101247.9,4621.0,6779943.0,...,,,,,,,,,,
2023-01-31 23:58:00,15542.0,1896.0,5678.0,25428.0,181.45,34.365,130.329,8523.5,10357.0,3246096.0,...,,,,,,,,,,
