In [1]:
# TODO(Max): convert to master notebook.
# TODO(Max): the notebook is runnable only from branch: `CMTask2703_Perform_manual_reconciliation_of_OB_data`.

- CCXT data = CCXT real-time DB bid-ask data collection for futures
- CC data = CryptoChassis historical Parquet bid-ask futures data

# Imports

In [2]:
%load_ext autoreload
%autoreload 2

import logging
import os

import pandas as pd

import core.config.config_ as cconconf
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint
import helpers.hs3 as hs3
import helpers.hsql as hsql
import im_v2.ccxt.data.client as icdcl
import im_v2.crypto_chassis.data.client as iccdc
import im_v2.im_lib_tasks as imvimlita

  from tqdm.autonotebook import tqdm


In [3]:
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-5daa77d4-e29c-460a-b14a-20583b10f164.json'
[31m-----------------------------------------------------------------------------
This code is not in sync with the container:
code_version='1.1.1' != container_version='1.1.0'
-----------------------------------------------------------------------------
You need to:
- merge origin/master into your branch with `invoke git_merge_master`
- pull the latest container with `invoke docker_pull`[0m
INFO  # Git
  branch_name='CMTask2703_Perform_manual_reconciliation_of_OB_data'
  hash='2741c745f'
  # Last commits:
    * 2741c745f max-rsrch Checkpoint                                                        (54 minutes ago) Tue Sep 27 17:52:40 2022  (HEAD -> CMTask2703_Perform_manual_reconciliation_of_OB_data)
    *   33e777b20 Max Sergeychikov Merge branch 'master' into CMTask2703_Perform_manual_reconciliation_of_OB_data (    

# Config

In [4]:
def get_cmtask2703_config() -> cconconf.Config:
    """
    Get task2360-specific config.
    """
    config = cconconf.Config()
    param_dict = {
        "data": {
            # Parameters for client initialization.
            "cc_im_client": {
                "universe_version": None,
                "resample_1min": True,
                "root_dir": os.path.join(
                    hs3.get_s3_bucket_path("ck"),
                    "reorg",
                    "daily_staged.airflow.pq",
                ),
                "partition_mode": "by_year_month",
                "dataset": "bid_ask",
                "contract_type": "futures",
                "data_snapshot": "",
                "aws_profile": "ck",
            },
            "ccxt_im_client": {
                "resample_1min": False,
                "db_connection": hsql.get_connection(
                    *hsql.get_connection_info_from_env_file(
                        imvimlita.get_db_env_path("dev")
                    )
                ),
                "table_name": "ccxt_bid_ask_futures_test",
            },
            # Parameters for data query.
            "read_data": {
                # DB data starts from here.
                "start_ts": pd.Timestamp("2022-09-08 22:06:00+00:00"),
                "end_ts": pd.Timestamp("2022-09-13 00:00:00+00:00"),
                "columns": None,
                "filter_data_mode": "assert",
            },
        },
        "column_names": {
            "bid_ask_cols": [
                "bid_price",
                "bid_size",
                "ask_price",
                "ask_size",
                "full_symbol",
            ],
        },
        "order_level": 1,
    }
    config = cconconf.Config.from_dict(param_dict)
    return config


config = get_cmtask2703_config()
print(config)

data:
  cc_im_client:
    universe_version: None
    resample_1min: True
    root_dir: s3://cryptokaizen-data/reorg/daily_staged.airflow.pq
    partition_mode: by_year_month
    dataset: bid_ask
    contract_type: futures
    data_snapshot: 
    aws_profile: ck
  ccxt_im_client:
    resample_1min: False
    db_connection: <connection object; dsn: 'user=postgres password=xxx dbname=im_data_db host=dev-im-db.cpox8ul7pzan.eu-north-1.rds.amazonaws.com port=5432', closed: 0>
    table_name: ccxt_bid_ask_futures_test
  read_data:
    start_ts: 2022-09-08 22:06:00+00:00
    end_ts: 2022-09-13 00:00:00+00:00
    columns: None
    filter_data_mode: assert
column_names:
  bid_ask_cols: ['bid_price', 'bid_size', 'ask_price', 'ask_size', 'full_symbol']
order_level: 1


# Functions

In [5]:
def load_and_transform_the_data(
    universe,
    bid_ask_cols,
    is_ccxt: bool,
    start_ts,
    end_ts,
    columns,
    filter_data_mode,
):
    """
    - Load the data through ImClient
       - For CCXT data also choose the order level data
    - Transform to the desired multiindex format with specific format

    :param bid_ask_cols: specify cols with bid-ask data
    """
    # Load the data.
    if is_ccxt:
        df = ccxt_im_client.read_data(
            universe, start_ts, end_ts, columns, filter_data_mode
        )
        # CCXT timestamp data goes up to milliseconds, so one needs to round it to minutes.
        df.index = df.reset_index()["timestamp"].apply(
            lambda x: x.round(freq="T")
        )
        # Choose the specific order level (first level by default).
        df = clean_data_for_orderbook_level(df)
    else:
        df = cc_parquet_client.read_data(
            universe, start_ts, end_ts, columns, filter_data_mode
        )
    # Apply transformation.
    df = df[bid_ask_cols]
    df = df.reset_index().set_index(["timestamp", "full_symbol"])
    return df


def clean_data_for_orderbook_level(
    df: pd.DataFrame, level: int = 1
) -> pd.DataFrame:
    """
    Specify the order level in CCXT bid ask data.

    :param df: Data with multiple levels (e.g., bid_price_1, bid_price_2, etc.)
    :return: Data where specific level has common name (i.e., bid_price)
    """
    level_cols = [col for col in df.columns if col.endswith(f"_{level}")]
    level_cols_cleaned = [elem[:-2] for elem in level_cols]
    #
    zip_iterator = zip(level_cols, level_cols_cleaned)
    col_dict = dict(zip_iterator)
    #
    df = df.rename(columns=col_dict)
    #
    return df

# Initialize clients

In [6]:
# CCXT client.
ccxt_im_client = icdcl.CcxtSqlRealTimeImClient(**config["data"]["ccxt_im_client"])
# CC client.
cc_parquet_client = iccdc.CryptoChassisHistoricalPqByTileClient(
    **config["data"]["cc_im_client"]
)

# Specify universe

In [7]:
# DB universe
ccxt_universe = ccxt_im_client.get_universe()
# CC universe.
cc_universe = cc_parquet_client.get_universe()
# Intersection of universes that will be used for analysis.
universe = list(set(ccxt_universe) & set(cc_universe))

In [8]:
compare_universe = hprint.set_diff_to_str(
    cc_universe, ccxt_universe, add_space=True
)
print(compare_universe)

* obj1: (11) binance::ADA_USDT binance::BNB_USDT binance::BTC_USD binance::BTC_USDT binance::DOGE_USDT binance::DOT_USDT binance::EOS_USDT binance::ETH_USD binance::ETH_USDT binance::SOL_USDT binance::XRP_USDT

* obj2: (27) binance::APE_USDT binance::AVAX_USDT binance::AXS_USDT binance::BAKE_USDT binance::BNB_USDT binance::BTC_BUSD binance::BTC_USDT binance::CRV_USDT binance::CTK_USDT binance::DOGE_USDT binance::DOT_USDT binance::DYDX_USDT binance::ETH_BUSD binance::ETH_USDT binance::FTM_USDT binance::GMT_USDT binance::LINK_USDT binance::MATIC_USDT binance::NEAR_USDT binance::OGN_USDT binance::RUNE_USDT binance::SAND_USDT binance::SOL_USDT binance::STORJ_USDT binance::UNFI_USDT binance::WAVES_USDT binance::XRP_USDT

* intersect=(7) binance::BNB_USDT binance::BTC_USDT binance::DOGE_USDT binance::DOT_USDT binance::ETH_USDT binance::SOL_USDT binance::XRP_USDT

* obj1-obj2=(4) binance::ADA_USDT binance::BTC_USD binance::EOS_USDT binance::ETH_USD

* obj2-obj1=(20) binance::APE_USDT binance:

# Load the data

## Adjust universe

In [9]:
# Even though they're in the intersected universe,
# they are not downloaded in CC.
universe.remove("binance::XRP_USDT")
universe.remove("binance::DOT_USDT")
# These two symbols crashes the downloads on `tz-conversion` stage.
universe

['binance::DOGE_USDT',
 'binance::SOL_USDT',
 'binance::ETH_USDT',
 'binance::BNB_USDT',
 'binance::BTC_USDT']

## Load data

In [10]:
# CCXT data.
bid_ask_cols = config["column_names"]["bid_ask_cols"]
is_ccxt = True
#
data_ccxt = load_and_transform_the_data(
    universe, bid_ask_cols, is_ccxt, **config["data"]["read_data"]
)

In [11]:
# CC data.
is_ccxt = False
#
data_cc = load_and_transform_the_data(
    universe, bid_ask_cols, is_ccxt, **config["data"]["read_data"]
)

# Analysis

## Merge CC and DB data into one DataFrame

In [12]:
data = data_ccxt.merge(
    data_cc,
    how="outer",
    left_index=True,
    right_index=True,
    suffixes=("_ccxt", "_cc"),
)
_LOG.info("Start date = %s", data.reset_index()["timestamp"].min())
_LOG.info("End date = %s", data.reset_index()["timestamp"].max())
_LOG.info(
    "Avg observations per coin = %s",
    len(data) / len(data.reset_index()["full_symbol"].unique()),
)
# Move the same metrics from two vendors together.
data = data.reindex(sorted(data.columns), axis=1)
# NaNs observation.
_LOG.info(
    "Number of observations with NaNs in CryptoChassis = %s",
    len(data[data["bid_price_cc"].isna()]),
)
_LOG.info(
    "Number of observations with NaNs in CCXT = %s",
    len(data[data["bid_price_ccxt"].isna()]),
)
# Remove NaNs.
data = hpandas.dropna(data, report_stats=True)
#
display(data.tail())

INFO  Start date = 2022-09-08 22:06:00+00:00
INFO  End date = 2022-09-13 00:00:00+00:00
INFO  Avg observations per coin = 5875.0
INFO  Number of observations with NaNs in CryptoChassis = 0
INFO  Number of observations with NaNs in CCXT = 5
INFO  removed rows with nans: 5 / 29375 = 0.02%


Unnamed: 0_level_0,Unnamed: 1_level_0,ask_price_cc,ask_price_ccxt,ask_size_cc,ask_size_ccxt,bid_price_cc,bid_price_ccxt,bid_size_cc,bid_size_ccxt
timestamp,full_symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-09-12 23:59:00+00:00,binance::BNB_USDT,293.755573,293.67,1097.51,12.39,293.703382,293.66,1289.08,22.82
2022-09-12 23:59:00+00:00,binance::BTC_USDT,22383.539757,22379.3,675.229,9.022,22384.719934,22379.2,852.933,11.326
2022-09-12 23:59:00+00:00,binance::DOGE_USDT,0.063935,0.06397,16203310.0,45400.0,0.06394,0.06396,19796710.0,274591.0
2022-09-12 23:59:00+00:00,binance::ETH_USDT,1715.297304,1715.11,2245.335,9.534,1715.270648,1715.1,2897.342,68.793
2022-09-12 23:59:00+00:00,binance::SOL_USDT,37.439642,37.45,83043.0,738.0,37.430815,37.44,100321.0,2338.0


## Calculate differences

In [13]:
# Full symbol will not be relevant in calculation loops below.
bid_ask_cols.remove("full_symbol")
# Each bid ask value will have a notional and a relative difference between two sources.
for col in bid_ask_cols:
    # Notional difference: CC value - DB value.
    data[f"{col}_diff"] = data[f"{col}_cc"] - data[f"{col}_ccxt"]
    # Relative value: (CC value - DB value)/DB value.
    data[f"{col}_relative_diff_pct"] = (
        100 * (data[f"{col}_cc"] - data[f"{col}_ccxt"]) / data[f"{col}_ccxt"]
    )
#
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ask_price_cc,ask_price_ccxt,ask_size_cc,ask_size_ccxt,bid_price_cc,bid_price_ccxt,bid_size_cc,bid_size_ccxt,bid_price_diff,bid_price_relative_diff_pct,bid_size_diff,bid_size_relative_diff_pct,ask_price_diff,ask_price_relative_diff_pct,ask_size_diff,ask_size_relative_diff_pct
timestamp,full_symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-09-08 22:06:00+00:00,binance::BNB_USDT,280.452998,280.37,1661.8,19.86,280.407683,280.36,1811.39,30.87,0.04768349,0.017008,1780.52,5767.800454,0.082998,0.029603,1641.94,8267.573011
2022-09-08 22:06:00+00:00,binance::BTC_USDT,19302.452881,19307.2,405.041,3.03,19304.282182,19307.1,1052.963,13.845,-2.817818,-0.014595,1039.118,7505.366558,-4.747119,-0.024587,402.011,13267.689769
2022-09-08 22:06:00+00:00,binance::DOGE_USDT,0.060794,0.06079,16984660.0,166509.0,0.06078,0.06078,22626990.0,459769.0,3.458295e-07,0.000569,22167220.0,4821.381607,4e-06,0.006939,16818150.0,10100.447423
2022-09-08 22:06:00+00:00,binance::ETH_USDT,1635.415386,1635.75,1782.946,41.053,1635.541073,1635.74,4864.447,38.886,-0.1989267,-0.012161,4825.561,12409.507278,-0.334614,-0.020456,1741.893,4243.034614
2022-09-08 22:06:00+00:00,binance::SOL_USDT,33.550721,33.56,59813.0,34.0,33.546178,33.55,91326.0,2596.0,-0.003822351,-0.011393,88730.0,3417.950693,-0.009279,-0.02765,59779.0,175820.588235


In [14]:
# Calculate the mean value of differences for each coin.
diff_stats = []
grouper = data.groupby(["full_symbol"])
for col in bid_ask_cols:
    diff_stats.append(grouper[f"{col}_diff"].mean())
    diff_stats.append(grouper[f"{col}_relative_diff_pct"].mean())
#
diff_stats = pd.concat(diff_stats, axis=1)

## Show stats for differences (in %)

### Prices

In [15]:
diff_stats[["bid_price_relative_diff_pct", "ask_price_relative_diff_pct"]]

Unnamed: 0_level_0,bid_price_relative_diff_pct,ask_price_relative_diff_pct
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
binance::BNB_USDT,-0.000208,0.001182
binance::BTC_USDT,-3.1e-05,0.002573
binance::DOGE_USDT,-0.000632,0.001197
binance::ETH_USDT,0.000206,0.001268
binance::SOL_USDT,-0.00042,0.001413


As one can see, the difference between bid and ask prices in DB and CC are less than 1%.

### Sizes

In [16]:
diff_stats[["bid_size_relative_diff_pct", "ask_size_relative_diff_pct"]]

Unnamed: 0_level_0,bid_size_relative_diff_pct,ask_size_relative_diff_pct
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
binance::BNB_USDT,68818.017135,70078.013276
binance::BTC_USDT,267910.987962,350682.580486
binance::DOGE_USDT,50040.46191,83404.551224
binance::ETH_USDT,884531.524803,775768.012765
binance::SOL_USDT,15302.541386,17541.304393


The difference between bid and ask sizes in DB and CC is solid and accounts for more than 100% for each full symbol.

## Correlations

### Bid price

In [17]:
bid_price_corr_matrix = (
    data[["bid_price_cc", "bid_price_ccxt"]].groupby(level=1).corr()
)
bid_price_corr_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,bid_price_cc,bid_price_ccxt
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
binance::BNB_USDT,bid_price_cc,1.0,0.999529
binance::BNB_USDT,bid_price_ccxt,0.999529,1.0
binance::BTC_USDT,bid_price_cc,1.0,0.999901
binance::BTC_USDT,bid_price_ccxt,0.999901,1.0
binance::DOGE_USDT,bid_price_cc,1.0,0.999466
binance::DOGE_USDT,bid_price_ccxt,0.999466,1.0
binance::ETH_USDT,bid_price_cc,1.0,0.999573
binance::ETH_USDT,bid_price_ccxt,0.999573,1.0
binance::SOL_USDT,bid_price_cc,1.0,0.999864
binance::SOL_USDT,bid_price_ccxt,0.999864,1.0


Correlation stats confirms the stats above: bid prices in DB and CC are highly correlated.

### Ask price

In [18]:
ask_price_corr_matrix = (
    data[["ask_price_cc", "ask_price_ccxt"]].groupby(level=1).corr()
)
ask_price_corr_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,ask_price_cc,ask_price_ccxt
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
binance::BNB_USDT,ask_price_cc,1.0,0.999529
binance::BNB_USDT,ask_price_ccxt,0.999529,1.0
binance::BTC_USDT,ask_price_cc,1.0,0.999886
binance::BTC_USDT,ask_price_ccxt,0.999886,1.0
binance::DOGE_USDT,ask_price_cc,1.0,0.999453
binance::DOGE_USDT,ask_price_ccxt,0.999453,1.0
binance::ETH_USDT,ask_price_cc,1.0,0.999518
binance::ETH_USDT,ask_price_ccxt,0.999518,1.0
binance::SOL_USDT,ask_price_cc,1.0,0.999861
binance::SOL_USDT,ask_price_ccxt,0.999861,1.0


Correlation stats confirms the stats above: ask prices in DB and CC are highly correlated.

### Bid size

In [19]:
bid_size_corr_matrix = (
    data[["bid_size_cc", "bid_size_ccxt"]].groupby(level=1).corr()
)
bid_size_corr_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,bid_size_cc,bid_size_ccxt
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
binance::BNB_USDT,bid_size_cc,1.0,0.386222
binance::BNB_USDT,bid_size_ccxt,0.386222,1.0
binance::BTC_USDT,bid_size_cc,1.0,0.360117
binance::BTC_USDT,bid_size_ccxt,0.360117,1.0
binance::DOGE_USDT,bid_size_cc,1.0,0.599602
binance::DOGE_USDT,bid_size_ccxt,0.599602,1.0
binance::ETH_USDT,bid_size_cc,1.0,0.261788
binance::ETH_USDT,bid_size_ccxt,0.261788,1.0
binance::SOL_USDT,bid_size_cc,1.0,0.630683
binance::SOL_USDT,bid_size_ccxt,0.630683,1.0


Correlation stats confirms the stats above: bid sizes in DB and CC are not correlated.

### Ask size

In [20]:
ask_size_corr_matrix = (
    data[["ask_size_cc", "ask_size_ccxt"]].groupby(level=1).corr()
)
ask_size_corr_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,ask_size_cc,ask_size_ccxt
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
binance::BNB_USDT,ask_size_cc,1.0,0.382751
binance::BNB_USDT,ask_size_ccxt,0.382751,1.0
binance::BTC_USDT,ask_size_cc,1.0,0.372992
binance::BTC_USDT,ask_size_ccxt,0.372992,1.0
binance::DOGE_USDT,ask_size_cc,1.0,0.588139
binance::DOGE_USDT,ask_size_ccxt,0.588139,1.0
binance::ETH_USDT,ask_size_cc,1.0,0.185124
binance::ETH_USDT,ask_size_ccxt,0.185124,1.0
binance::SOL_USDT,ask_size_cc,1.0,0.606954
binance::SOL_USDT,ask_size_ccxt,0.606954,1.0


Correlation stats confirms the stats above: ask sizes in DB and CC are not correlated.