# Description

# Imports

In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import warnings
from typing import Tuple, Any, Optional, List, Dict
warnings.filterwarnings('ignore')

In [66]:
# Load the Binance futures parquet data, tag every row with its exchange name,
# and rename the 'timestamp' column to 'timestamp.1' (the column that
# convert_to_multi_index later drops).
df_binance_futures = (
    pd.read_parquet(r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\Binance_futures')
    .assign(exchange_id='binance_futures')
    .rename(columns={'timestamp': 'timestamp.1'})
)

In [67]:
# Load the Binance spot parquet data, tag every row with its exchange name,
# and rename the 'timestamp' column to 'timestamp.1' (the column that
# convert_to_multi_index later drops).
df_binance_spot = (
    pd.read_parquet(r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\Binance_spot')
    .assign(exchange_id='binance_spot')
    .rename(columns={'timestamp': 'timestamp.1'})
)

In [68]:
# Load the Binance US spot parquet data and rename the 'timestamp' column to
# 'timestamp.1' (the column that convert_to_multi_index later drops).
# NOTE(review): unlike the Binance futures/spot cells, no exchange_id is
# assigned here, yet convert_to_multi_index reads an 'exchange_id' column --
# confirm the parquet data already contains it.
df_binance_us = pd.read_parquet(
    r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\Binanceus_spot'
).rename(columns={'timestamp': 'timestamp.1'})

In [2]:
# Load the OKX futures parquet data and rename the 'timestamp' column to
# 'timestamp.1' (the column that convert_to_multi_index later drops).
df_okx = pd.read_parquet(
    r'C:\Users\James Zhang\Desktop\sorrentum\sorrentum_sandbox\projects\SorrIssue2_Cross_exchange_arbitrage_CEX_CEX\data\OKX_futures'
).rename(columns={'timestamp': 'timestamp.1'})

# Functions

In [46]:
def convert_to_multi_index(
    exchange_df: pd.DataFrame, keep_single: Optional[bool] = False
) -> pd.DataFrame:
    """
    Pivot one exchange's long-format data into a wide, time-indexed frame
    holding the OHLCV and vwap columns of every currency pair side by side.

    :param exchange_df: data from a single exchange; after `reset_index()`
        it must expose the columns "timestamp", "timestamp.1",
        "knowledge_timestamp", "year", "month", "exchange_id" and
        "currency_pair" in addition to the OHLCV columns
    :param keep_single: if truthy, return the wide frame with flat
        "<feature>-<exchange>::<pair>" column names instead of passing it
        through `define_levels`
    :return: time-indexed dataframe, multi-index on the columns unless
        `keep_single` is set
    """
    # Turn the index into a regular column and normalize it to a
    # timezone-naive datetime64[ns] (tz_localize(None) strips tz info).
    frame = exchange_df.reset_index()
    naive_time = pd.to_datetime(frame["timestamp"]).dt.tz_localize(None)
    frame["timestamp"] = naive_time.astype("datetime64[ns]")
    frame = frame.sort_values(by="timestamp")
    # The exchange name is read from the first row (in time order).
    exchange_name = frame["exchange_id"].unique()[0]
    # Remove bookkeeping columns that are not features.
    frame = frame.drop(
        columns=["timestamp.1", "knowledge_timestamp", "year", "month", "exchange_id"]
    )
    # One sub-frame per currency pair.
    grouped = frame.groupby("currency_pair")
    per_pair_frames = [grouped.get_group(pair_name) for pair_name in grouped.groups]
    # Skeleton of the result: one row per distinct timestamp.
    wide_df = pd.DataFrame(frame["timestamp"].unique()).rename(
        columns={0: "timestamp"}
    )
    # calculate_vwap adds the vwap column and renames the OHLCV columns.
    per_pair_frames = calculate_vwap(per_pair_frames, exchange_name)
    # Attach each pair with an as-of join (pandas default direction), so a
    # timestamp missing for a pair takes that pair's most recent earlier row.
    for pair_frame in per_pair_frames:
        wide_df = pd.merge_asof(wide_df, pair_frame, on="timestamp")
    # Restore the time index and put the columns in a deterministic order.
    wide_df = wide_df.set_index("timestamp").sort_index(axis=1)
    # Keep only the first occurrence of any repeated column name.
    unique_columns = ~wide_df.columns.duplicated()
    wide_df = wide_df.loc[:, unique_columns]
    if not keep_single:
        return define_levels(wide_df)
    return wide_df

def calculate_vwap(
    currency_pair_dfs: List[pd.DataFrame],
    exchange_id: str
) -> List[pd.DataFrame]:
    """
    Add a cumulative volume weighted average price column to each currency
    pair dataframe and rename its OHLCV columns.

    For every frame the bar price is approximated by the midprice
    `(high + low) / 2`; vwap at a row is the running sum of
    `volume * midprice` divided by the running sum of `volume`, both taken
    from the first row of that frame. Rows whose cumulative volume is zero
    yield NaN/inf rather than raising.

    Columns are renamed to "<feature>-<exchange_id>::<currency_pair>", the
    "currency_pair" column is dropped and "timestamp" becomes the index.

    The input frames are NOT modified: each one is copied first, so the
    function can safely be called on groupby slices and called repeatedly
    on the same list.

    :param currency_pair_dfs: list of currency pair dataframes, each with
        "timestamp", "currency_pair", "open", "high", "low", "close" and
        "volume" columns and a single currency pair
    :param exchange_id: str name of the given exchange
    :return: new list of currency pair dataframes with vwap calculations
    """
    ohlcv_columns = ("open", "high", "low", "close", "volume")
    result_dfs = []
    for pair_df in currency_pair_dfs:
        # Get name of currency_pair for renaming purposes.
        currency_pair = pair_df["currency_pair"].unique()[0]
        suffix = f"{exchange_id}::{currency_pair}"
        # Work on a copy so the caller's frame is left untouched.
        pair_df = pair_df.copy()
        # Calculate vwap.
        midprice = (pair_df["high"] + pair_df["low"]) / 2
        cumulative_notional = (pair_df["volume"] * midprice).cumsum()
        cumulative_volume = pair_df["volume"].cumsum()
        pair_df[f"vwap-{suffix}"] = cumulative_notional / cumulative_volume
        # Now rename the OHLCV columns in a single pass.
        pair_df = pair_df.rename(
            columns={name: f"{name}-{suffix}" for name in ohlcv_columns}
        )
        # Drop irrelevant columns and set timestamp as index.
        pair_df = pair_df.drop(columns=["currency_pair"]).set_index("timestamp")
        result_dfs.append(pair_df)
    return result_dfs

def define_levels(single_df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn flat "<feature>-<exchange>::<currency_pair>" column names into a
    two-level column index (feature, exchange::currency_pair).

    Both levels are parsed from each column name itself, so the result is
    correct regardless of the column order or of how many features each
    pair has. The name is split on its first hyphen, so hyphens inside the
    exchange or currency pair part are preserved.

    :param single_df: dataframe returned by convert_to_multi_index with
        keep_single set, i.e. with flat column names
    :return: a copy of the data with multi-index columns and the same row
        index; only the first of any duplicated column is kept
    :raises ValueError: if a column name contains no hyphen
    """
    features = []
    exchange_pairs = []
    for column_name in single_df.columns:
        # Outer level is the feature, inner level is exchange::currency_pair.
        feature, separator, exchange_pair = column_name.partition("-")
        if not separator:
            raise ValueError(
                f"Column name {column_name!r} is not of the form "
                "'<feature>-<exchange>::<currency_pair>'"
            )
        features.append(feature)
        exchange_pairs.append(exchange_pair)
    # Copy so the input frame keeps its flat columns; dtypes and the
    # timestamp index carry over unchanged.
    return_df = single_df.copy()
    return_df.columns = pd.MultiIndex.from_arrays([features, exchange_pairs])
    # Drop duplicate columns if there are any
    return_df = return_df.loc[:, ~return_df.columns.duplicated()].copy()
    return return_df

def merge_and_convert_to_multi_index(
    exchange_dfs: List[pd.DataFrame]
) -> pd.DataFrame:
    """
    Converts a list of exchange dataframes into one large
    multi-index dataframe.

    Each exchange frame is first flattened with convert_to_multi_index,
    the flat frames are outer-joined on "timestamp", and the combined
    frame is passed to define_levels. The input list is not modified.

    :param exchange_dfs: list of all exchange dataframes
    :return: multi-index dataframe
    :raises ValueError: if exchange_dfs is empty
    """
    if len(exchange_dfs) == 0:
        raise ValueError(
            "exchange_dfs must contain at least one exchange dataframe"
        )
    # Edge case if exchange_dfs of size == 1
    if len(exchange_dfs) == 1:
        return convert_to_multi_index(exchange_dfs[0])
    # Make all dataframes easily convertible to multi-index, collecting the
    # results in a new list so the caller's list keeps its raw frames.
    single_index_dfs = [
        convert_to_multi_index(exchange_df, True) for exchange_df in exchange_dfs
    ]
    # Outer-join the frames one after another on the timestamp index.
    return_df = single_index_dfs[0]
    for single_index_df in single_index_dfs[1:]:
        return_df = pd.merge(return_df, single_index_df, on="timestamp", how="outer")
    # Sort by time and columns before passing into define_levels
    return_df = return_df.sort_index()
    return_df = return_df.sort_index(axis=1)
    # Drop duplicate columns if there are any
    return_df = return_df.loc[:, ~return_df.columns.duplicated()]
    return define_levels(return_df)

In [64]:
# Convert the OKX data on its own into the multi-index layout
# (feature on the outer column level, exchange::currency_pair on the inner).
convert_to_multi_index(df_okx)

Unnamed: 0_level_0,close,close,close,close,close,close,close,close,close,close,...,vwap,vwap,vwap,vwap,vwap,vwap,vwap,vwap,vwap,vwap
Unnamed: 0_level_1,okx::APE_USDT,okx::AVAX_USDT,okx::AXS_USDT,okx::BNB_USDT,okx::BTC_USDT,okx::DOGE_USDT,okx::DOT_USDT,okx::ETH_USDT,okx::FTM_USDT,okx::GMT_USDT,...,okx::FTM_USDT,okx::GMT_USDT,okx::LINK_USDT,okx::MATIC_USDT,okx::NEAR_USDT,okx::SAND_USDT,okx::SOL_USDT,okx::STORJ_USDT,okx::WAVES_USDT,okx::XRP_USDT
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-01-01 15:00:00,,3.728,,,29329.3,0.004949,8.666,738.21,0.0173,,...,0.017400,,12.136000,,1.365400,,1.642000,,6.225000,0.237200
2021-01-01 15:01:00,,3.728,,,29292.2,0.004938,8.673,737.80,0.0174,,...,0.017420,,12.147935,,1.365355,,1.641388,0.299200,6.223844,0.237389
2021-01-01 15:02:00,,3.726,,,29259.0,0.004953,8.667,736.80,0.0174,,...,0.017413,,12.149790,,1.365157,,1.638708,0.299200,6.222253,0.237936
2021-01-01 15:03:00,,3.719,,,29291.8,0.004950,8.678,736.54,0.0175,,...,0.017410,,12.153976,,1.365119,,1.637566,0.299200,6.218422,0.238043
2021-01-01 15:04:00,,3.719,,,29269.2,0.004952,8.673,736.10,0.0175,,...,0.017408,,12.159146,,1.365020,,1.637444,0.299200,6.216047,0.238044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-31 13:15:00,5.8213,19.580,10.74,311.29,22931.8,0.092770,6.222,1577.23,0.4763,0.5449,...,1.246672,1.029788,21.705751,1.406497,7.209707,1.480779,56.795526,1.373347,13.888414,0.667274
2023-01-31 13:16:00,5.8285,19.580,10.74,311.31,22935.4,0.092920,6.226,1578.30,0.4766,0.5452,...,1.246670,1.029785,21.705744,1.406496,7.209700,1.480774,56.795422,1.373345,13.888358,0.667273
2023-01-31 13:17:00,5.8286,19.580,10.74,311.33,22943.5,0.092910,6.224,1578.30,0.4775,0.5450,...,1.246657,1.029781,21.705736,1.406496,7.209685,1.480773,56.795367,1.373345,13.888155,0.667272
2023-01-31 13:18:00,5.8307,19.570,10.74,311.32,22947.2,0.092900,6.226,1578.47,0.4783,0.5450,...,1.246635,1.029778,21.705457,1.406495,7.209616,1.480773,56.795308,1.373345,13.888130,0.667272


In [69]:
# Combine several exchanges into a single multi-index dataframe.
# NOTE(review): df_binance_spot is passed twice and df_binance_us is never
# used -- presumably the third entry was meant to be df_binance_us; confirm.
df = merge_and_convert_to_multi_index([df_binance_futures, df_binance_spot, df_binance_spot, df_okx])
df