# Description

This notebook conducts the cross-vendor QA between following datasets:

- periodic.airflow.websocket.postgres.bid_ask.futures.v7_3.ccxt.binance.
- periodic.airflow.downloaded_EOD.postgres.bid_ask.futures.v3.cryptochassis.binance

The QA consists of the following data checks:

- Start and End date for both datasets
- Number of observations pet coin for both datasets
- Number of NaNs per dataset
- Notional difference (CC value - CCXT value) for `bid_price`, `ask_price`, `bid_size`, `ask_size` columns
- Relative difference (CC value - CCXT value)/CCXT value for `bid_price`, `ask_price`, `bid_size`, `ask_size` columns
- Pearson correlation for `bid_price`, `ask_price`, `bid_size`, `ask_size` between both datasets

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

import logging

import pandas as pd

import core.config as cconfig
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint
import helpers.hsql as hsql
import im_v2.ccxt.data.client as icdcl
import im_v2.common.data.transform.transform_utils as imvcdttrut
import im_v2.crypto_chassis.data.client as iccdc
import im_v2.im_lib_tasks as imvimlita

  from tqdm.autonotebook import tqdm


In [2]:
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-c5ca8c0a-faba-4e07-89c9-e416bdcce797.json'
[31m-----------------------------------------------------------------------------
This code is not in sync with the container:
code_version='1.4.1' != container_version='1.4.0'
-----------------------------------------------------------------------------
You need to:
- merge origin/master into your branch with `invoke git_merge_master`
- pull the latest container with `invoke docker_pull`[0m
INFO  # Git
  branch_name='CMTask3430_factor_out_loading_functions'
  hash='3e80d971d'
  # Last commits:
    * 3e80d971d Daniil Tikhomirov CMTask3430: Factor out config and MarketData                      (22 minutes ago) Wed Dec 21 11:48:48 2022  (HEAD -> CMTask3430_factor_out_loading_functions, origin/CMTask3430_factor_out_loading_functions)
    * c784b406c Nina Lee CMTask3423_add_more_tags (#3426)                              

# Config

In [3]:
def get_example_config() -> cconfig.Config:
    """
    Config for comparison of 1sec CryptoChassis and 1sec CCXT bid/ask data.
    """
    config = cconfig.Config()
    param_dict = {
        "data": {
            # Whether to resample 1sec data to 1min using our production flow.
            # TODO(Danya): Variable overlaps with `resample_1min` parameter for clients.
            "resample_1sec_to_1min": False,
            # Parameters for client initialization.
            "cc_im_client": {
                "universe_version": None,
                "resample_1min": False,
                "contract_type": "futures",
                "tag": "downloaded_1sec",
            },
            "ccxt_im_client": {
                "universe_version": "infer_from_data",
                "resample_1min": False,
                "db_connection": hsql.get_connection(
                    *hsql.get_connection_info_from_env_file(
                        imvimlita.get_db_env_path("dev")
                    )
                ),
                "table_name": "ccxt_bid_ask_futures_raw",
            },
            # Parameters for data query.
            "read_data": {
                # Get start/end ts as inputs to script.
                #  Note: DB data is archived to S3 every 3 days, so we should use
                #  only the latest dates.
                "start_ts": pd.Timestamp("2022-11-28 00:00:00+00:00"),
                "end_ts": pd.Timestamp("2022-11-29 00:00:00+00:00"),
                "columns": None,
                "filter_data_mode": "assert",
            },
        },
        "column_names": {
            "bid_ask_cols": [
                "bid_price",
                "bid_size",
                "ask_price",
                "ask_size",
            ],
        },
        "order_level": 1,
    }
    config = cconfig.Config.from_dict(param_dict)
    return config


config = get_example_config()
print(config)

data: 
  resample_1sec_to_1min: False
  cc_im_client: 
    universe_version: None
    resample_1min: False
    contract_type: futures
    tag: downloaded_1sec
  ccxt_im_client: 
    resample_1min: False
    db_connection: <connection object; dsn: 'user=cryptokaizen_dev password=xxx dbname=im_data_db host=dev-im-db.cpox8ul7pzan.eu-north-1.rds.amazonaws.com port=5432', closed: 0>
    table_name: ccxt_bid_ask_futures_raw
  read_data: 
    start_ts: 2022-11-28 00:00:00+00:00
    end_ts: 2022-11-29 00:00:00+00:00
    columns: None
    filter_data_mode: assert
column_names: 
  bid_ask_cols: ['bid_price', 'bid_size', 'ask_price', 'ask_size']
order_level: 1


# Clients

In [None]:
# CCXT client.
ccxt_im_client_config = config.get_and_mark_as_used(("data", "ccxt_im_client"))
ccxt_im_client = icdcl.CcxtSqlRealTimeImClient(**ccxt_im_client_config)
# CC client.
cc_parquet_client_config = config.get_and_mark_as_used(("data", "cc_im_client"))
cc_parquet_client = iccdc.get_CryptoChassisHistoricalPqByTileClient_example2(
    **cc_parquet_client_config
)

  df = pd.read_sql_query(query, connection)


# Universe

In [None]:
# DB universe
ccxt_universe = ccxt_im_client.get_universe()
# CC universe.
cc_universe = cc_parquet_client.get_universe()
# Intersection of universes that will be used for analysis.
universe = list(set(ccxt_universe) & set(cc_universe))

In [None]:
compare_universe = hprint.set_diff_to_str(
    cc_universe, ccxt_universe, add_space=True
)
print(compare_universe)

# Load data

In [None]:
read_data_config = config.get_and_mark_as_used(("data", "read_data"))

## Load CCXT

In [None]:
ccxt_df = ccxt_im_client.read_data(universe, **read_data_config)

In [None]:
display(ccxt_df.head(10))

On the first glance:
- It has levels where they are not expected to be
- The level columns are empty

### Clean CCXT data

In [None]:
# TODO(Danya): What can be done to make these transformations universal?
#  "if"-switches based on vendor and type?

# Remove level suffix in the TOB column name.
ccxt_df.columns = ccxt_df.columns.str.replace("_l1", "")
# Remove all levels.
target_columns = [col for col in ccxt_df.columns if not col[-1].isnumeric()]
target_columns = [
    col for col in target_columns if col != "end_download_timestamp"
]
ccxt_df = ccxt_df[target_columns]
# CCXT timestamp data goes up to milliseconds, so one needs to round it to seconds.
ccxt_df.index = ccxt_df.reset_index()["timestamp"].apply(
    lambda x: x.ceil(freq="S")
)
display(ccxt_df.head(10))

## Load СС

In [None]:
cc_df = cc_parquet_client.read_data(universe, **read_data_config)
display(cc_df.head(10))

# Resampling data

In [None]:
# Perform VWAP resampling if required by config.
resample_1min = config.get_and_mark_as_used(("data", "resample_1sec_to_1min"))
if resample_1min:
    # TODO(Danya): Function as-is has VWAP and TWAP modes and removes the `full_symbol` column.
    ccxt_df = imvcdttrut.resample_bid_ask_data_to_1min(ccxt_df, mode="VWAP")
    # Fixed during #CmTask3225
    cc_df = imvcdttrut.resample_bid_ask_data_to_1min(cc_df, mode="VWAP")

# Analysis

## Merge CC and DB data into one DataFrame


In [13]:
ccxt_df = ccxt_df.reset_index().set_index(["timestamp", "full_symbol"])
cc_df = cc_df.reset_index().set_index(["timestamp", "full_symbol"])

In [14]:
data = ccxt_df.merge(
    cc_df,
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_ccxt", "_cc"),
)

In [15]:
# Conduct a data sanity check.
# Get number of values for both datasets.
len_cc_data = len(cc_df)
len_ccxt_data = len(ccxt_df)
_LOG.info("Start date = %s", data.reset_index()["timestamp"].min())
_LOG.info("End date = %s", data.reset_index()["timestamp"].max())
_LOG.info(
    "Avg observations per coin = %s",
    len(data) / len(data.reset_index()["full_symbol"].unique()),
)
# Move the same metrics from two vendors together.
data = data.reindex(sorted(data.columns), axis=1)
# NaNs observation.
nans_cc = len(data[data["bid_price_cc"].isna()])
nans_ccxt = len(data[data["bid_price_ccxt"].isna()])
_LOG.info(
    "Number of observations with NaNs in CryptoChassis = %s (%s%%)",
    nans_cc,
    nans_cc / len_cc_data,
)
_LOG.info(
    "Number of observations with NaNs in CCXT = %s (%s%%)",
    nans_ccxt,
    nans_ccxt / len_ccxt_data,
)
# Remove NaNs.
data = hpandas.dropna(data, report_stats=True)
#
# Zero bid size.
zero_bid_size_cc = len(data[data["bid_size_cc"] == 0])
_LOG.info(
    "Number of observations with bid_size=0 in CryptoChassis = %s (%s%%)",
    zero_bid_size_cc,
    zero_bid_size_cc / len_cc_data,
)
zero_bid_size_ccxt = len(data[data["bid_size_ccxt"] == 0])
_LOG.info(
    "Number of observations with bid_size=0 in CCXT = %s (%s%%)",
    zero_bid_size_cc,
    zero_bid_size_ccxt / len_ccxt_data,
)
# Zero ask size.
zero_ask_size_cc = len(data[data["ask_size_cc"] == 0])
_LOG.info(
    "Number of observations with ask_size=0 in CryptoChassis = %s (%s%%)",
    zero_ask_size_cc,
    zero_ask_size_cc / len_cc_data,
)
zero_ask_size_ccxt = len(data[data["ask_size_ccxt"] == 0])
_LOG.info(
    "Number of observations with ask_size=0 in CCXT = %s (%s%%)",
    zero_ask_size_cc,
    zero_ask_size_ccxt / len_ccxt_data,
)
#
# Bid !< Ask.
small_bid_cc = len(data[data["ask_price_cc"] >= data["bid_price_cc"]])
_LOG.info(
    "Number of observations with ask_price >= bid_price in CryptoChassis = %s (%s%%)",
    small_bid_cc,
    small_bid_cc / len_cc_data,
)
small_bid_ccxt = len(data[data["ask_price_ccxt"] >= data["bid_price_ccxt"]])
_LOG.info(
    "Number of observations with ask_price >= bid_price in CCXT = %s (%s%%)",
    small_bid_ccxt,
    small_bid_ccxt / len_ccxt_data,
)
#
display(data.tail())

INFO  Start date = 2022-11-28 00:00:01+00:00
INFO  End date = 2022-11-28 23:59:59+00:00
INFO  Avg observations per coin = 312973.71428571426
INFO  Number of observations with NaNs in CryptoChassis = 0 (0.0%)
INFO  Number of observations with NaNs in CCXT = 78392 (0.03569186219896146%)
INFO  removed rows with nans: 78392 / 2190816 = 3.58%
INFO  Number of observations with bid_size=0 in CryptoChassis = 0 (0.0%)
INFO  Number of observations with bid_size=0 in CCXT = 0 (0.0%)
INFO  Number of observations with ask_size=0 in CryptoChassis = 0 (0.0%)
INFO  Number of observations with ask_size=0 in CCXT = 0 (0.0%)
INFO  Number of observations with ask_price >= bid_price in CryptoChassis = 2112424 (3.501286199215021%)
INFO  Number of observations with ask_price >= bid_price in CCXT = 2112424 (0.9617862321892409%)


Unnamed: 0_level_0,Unnamed: 1_level_0,ask_price_cc,ask_price_ccxt,ask_size_cc,ask_size_ccxt,bid_price_cc,bid_price_ccxt,bid_size_cc,bid_size_ccxt,knowledge_timestamp_cc,knowledge_timestamp_ccxt
timestamp,full_symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-11-28 23:59:59+00:00,binance::SOL_USDT,13.313,13.313,37.0,34.0,13.312,13.312,114.0,114.0,2022-11-29 11:03:41.070183+00:00,2022-11-28 23:59:58.876401+00:00
2022-11-28 23:59:59+00:00,binance::SOL_USDT,13.313,13.313,37.0,37.0,13.312,13.312,114.0,114.0,2022-11-29 11:03:41.070183+00:00,2022-11-29 00:00:03.580887+00:00
2022-11-28 23:59:59+00:00,binance::XRP_USDT,0.3886,0.3889,146086.5,7027.7,0.3885,0.3888,115627.9,80514.4,2022-11-29 11:03:57.272522+00:00,2022-11-28 23:59:58.876401+00:00
2022-11-28 23:59:59+00:00,binance::XRP_USDT,0.3886,0.3889,146086.5,7027.7,0.3885,0.3888,115627.9,80514.4,2022-11-29 11:03:57.272522+00:00,2022-11-28 23:59:58.876401+00:00
2022-11-28 23:59:59+00:00,binance::XRP_USDT,0.3886,0.3887,146086.5,118488.5,0.3885,0.3886,115627.9,123064.8,2022-11-29 11:03:57.272522+00:00,2022-11-29 00:00:03.580887+00:00


## Calculate differences

In [16]:
# Full symbol will not be relevant in calculation loops below.
bid_ask_cols = config.get_and_mark_as_used(("column_names", "bid_ask_cols"))
# Each bid ask value will have a notional and a relative difference between two sources.
for col in bid_ask_cols:
    # Notional difference: CC value - DB value.
    data[f"{col}_diff"] = data[f"{col}_cc"] - data[f"{col}_ccxt"]
    # Relative value: (CC value - DB value)/DB value.
    data[f"{col}_relative_pct_diff"] = (
        100 * (data[f"{col}_cc"] - data[f"{col}_ccxt"]) / data[f"{col}_ccxt"]
    )

In [17]:
# Calculate the mean value of differences for each coin.
diff_stats = []
grouper = data.groupby(["full_symbol"])
for col in bid_ask_cols:
    diff_stats.append(grouper[f"{col}_diff"].mean())
    diff_stats.append(grouper[f"{col}_relative_pct_diff"].mean())
#
diff_stats = pd.concat(diff_stats, axis=1)

## Show stats for differences (in %)

### Prices

In [18]:
display(
    diff_stats[["bid_price_relative_pct_diff", "ask_price_relative_pct_diff"]]
)

Unnamed: 0_level_0,bid_price_relative_pct_diff,ask_price_relative_pct_diff
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
binance::BNB_USDT,3.3e-05,3.9e-05
binance::BTC_USDT,-1.6e-05,-1.5e-05
binance::DOGE_USDT,-2.9e-05,-3e-05
binance::DOT_USDT,-1.7e-05,-1.8e-05
binance::ETH_USDT,-1.7e-05,-1.7e-05
binance::SOL_USDT,-7.3e-05,-7.3e-05
binance::XRP_USDT,-8e-06,-6e-06


### Sizes

In [19]:
display(diff_stats[["bid_size_relative_pct_diff", "ask_size_relative_pct_diff"]])

Unnamed: 0_level_0,bid_size_relative_pct_diff,ask_size_relative_pct_diff
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
binance::BNB_USDT,394.706292,418.207265
binance::BTC_USDT,958.550336,901.738706
binance::DOGE_USDT,583.672081,452.044739
binance::DOT_USDT,88.567735,111.915672
binance::ETH_USDT,4579.044709,4285.462987
binance::SOL_USDT,274.383705,335.253219
binance::XRP_USDT,711.506376,92.460548


## Correlations

### Bid price

In [20]:
bid_price_corr_matrix = (
    data[["bid_price_cc", "bid_price_ccxt"]].groupby(level=1).corr()
)
display(bid_price_corr_matrix)

Unnamed: 0_level_0,Unnamed: 1_level_0,bid_price_cc,bid_price_ccxt
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
binance::BNB_USDT,bid_price_cc,1.0,0.999977
binance::BNB_USDT,bid_price_ccxt,0.999977,1.0
binance::BTC_USDT,bid_price_cc,1.0,0.99996
binance::BTC_USDT,bid_price_ccxt,0.99996,1.0
binance::DOGE_USDT,bid_price_cc,1.0,0.999919
binance::DOGE_USDT,bid_price_ccxt,0.999919,1.0
binance::DOT_USDT,bid_price_cc,1.0,0.999961
binance::DOT_USDT,bid_price_ccxt,0.999961,1.0
binance::ETH_USDT,bid_price_cc,1.0,0.999953
binance::ETH_USDT,bid_price_ccxt,0.999953,1.0


### Ask price

In [21]:
ask_price_corr_matrix = (
    data[["ask_price_cc", "ask_price_ccxt"]].groupby(level=1).corr()
)
display(ask_price_corr_matrix)

Unnamed: 0_level_0,Unnamed: 1_level_0,ask_price_cc,ask_price_ccxt
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
binance::BNB_USDT,ask_price_cc,1.0,0.999978
binance::BNB_USDT,ask_price_ccxt,0.999978,1.0
binance::BTC_USDT,ask_price_cc,1.0,0.99996
binance::BTC_USDT,ask_price_ccxt,0.99996,1.0
binance::DOGE_USDT,ask_price_cc,1.0,0.999919
binance::DOGE_USDT,ask_price_ccxt,0.999919,1.0
binance::DOT_USDT,ask_price_cc,1.0,0.999962
binance::DOT_USDT,ask_price_ccxt,0.999962,1.0
binance::ETH_USDT,ask_price_cc,1.0,0.999953
binance::ETH_USDT,ask_price_ccxt,0.999953,1.0


### Bid size

In [22]:
bid_size_corr_matrix = (
    data[["bid_size_cc", "bid_size_ccxt"]].groupby(level=1).corr()
)
display(bid_size_corr_matrix)

Unnamed: 0_level_0,Unnamed: 1_level_0,bid_size_cc,bid_size_ccxt
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
binance::BNB_USDT,bid_size_cc,1.0,0.690022
binance::BNB_USDT,bid_size_ccxt,0.690022,1.0
binance::BTC_USDT,bid_size_cc,1.0,0.809734
binance::BTC_USDT,bid_size_ccxt,0.809734,1.0
binance::DOGE_USDT,bid_size_cc,1.0,0.626774
binance::DOGE_USDT,bid_size_ccxt,0.626774,1.0
binance::DOT_USDT,bid_size_cc,1.0,0.905538
binance::DOT_USDT,bid_size_ccxt,0.905538,1.0
binance::ETH_USDT,bid_size_cc,1.0,0.792837
binance::ETH_USDT,bid_size_ccxt,0.792837,1.0


### Ask size

In [23]:
ask_size_corr_matrix = (
    data[["ask_size_cc", "ask_size_ccxt"]].groupby(level=1).corr()
)
display(ask_size_corr_matrix)

Unnamed: 0_level_0,Unnamed: 1_level_0,ask_size_cc,ask_size_ccxt
full_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
binance::BNB_USDT,ask_size_cc,1.0,0.52904
binance::BNB_USDT,ask_size_ccxt,0.52904,1.0
binance::BTC_USDT,ask_size_cc,1.0,0.817335
binance::BTC_USDT,ask_size_ccxt,0.817335,1.0
binance::DOGE_USDT,ask_size_cc,1.0,0.71604
binance::DOGE_USDT,ask_size_ccxt,0.71604,1.0
binance::DOT_USDT,ask_size_cc,1.0,0.862673
binance::DOT_USDT,ask_size_ccxt,0.862673,1.0
binance::ETH_USDT,ask_size_cc,1.0,0.830717
binance::ETH_USDT,ask_size_ccxt,0.830717,1.0


# Check unused variables in config

In [24]:
display(config)

data (marked_as_used=False, writer=None, val_type=core.config.config_.Config): 
  resample_1sec_to_1min (marked_as_used=True, writer=/tmp/ipykernel_2138/2074502587.py::2::<module>, val_type=bool): False
  cc_im_client (marked_as_used=False, writer=None, val_type=core.config.config_.Config): 
    universe_version (marked_as_used=True, writer=/tmp/ipykernel_2138/3273197507.py::5::<module>, val_type=NoneType): None
    resample_1min (marked_as_used=True, writer=/tmp/ipykernel_2138/3273197507.py::5::<module>, val_type=bool): False
    contract_type (marked_as_used=True, writer=/tmp/ipykernel_2138/3273197507.py::5::<module>, val_type=str): futures
    tag (marked_as_used=True, writer=/tmp/ipykernel_2138/3273197507.py::5::<module>, val_type=str): downloaded_1sec
  ccxt_im_client (marked_as_used=False, writer=None, val_type=core.config.config_.Config): 
    resample_1min (marked_as_used=True, writer=/tmp/ipykernel_2138/3273197507.py::2::<module>, val_type=bool): False
    db_connection (marke