# Imports

In [1]:
import logging
import os

import pandas as pd

import core.config.config_ as cconconf
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hprint as hprint
import helpers.hs3 as hs3
import im_v2.ccxt.data.client as icdcl
import im_v2.ccxt.universe.universe as imvccunun
import research_amp.cc.statistics as ramccsta

In [2]:
# Initialize logging for the notebook at INFO verbosity.
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

# Log the first element of the system signature so the run environment is recorded.
_LOG.info("%s", henv.get_system_signature()[0])

# Apply the project's notebook configuration (presumably display defaults -- see `hprint`).
hprint.config_notebook()

# AWS profile name used when building the configs and resolving the S3 bucket path.
AM_AWS_PROFILE = "am"

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-311ab241-30eb-4fc8-8c35-3c103a682854.json'
>>ENV<<: is_inside_container=True: code_version=1.0.6, container_version=1.0.6, is_inside_docker=True, is_inside_ci=False, CI_defined=True, CI=''
>>ENV<<: AM_AWS_PROFILE=True AM_ECR_BASE_PATH=True AM_S3_BUCKET=True AM_TELEGRAM_TOKEN=True AWS_ACCESS_KEY_ID=False AWS_DEFAULT_REGION=False AWS_SECRET_ACCESS_KEY=False GH_ACTION_ACCESS_TOKEN=True
INFO  generated new fontManager
INFO  # Git
    branch_name='CMTask919_Refactor_CddClient_Fix_notebooks'
    hash='646a7a324'
    # Last commits:
      *   646a7a324 max-rsrch Checkpoint                                                        (25 seconds ago) Thu Jan 27 12:07:41 2022  (HEAD -> CMTask919_Refactor_CddClient_Fix_notebooks)
      |\  
      * \   9d64529f4 max-rsrch Resolve conflicts                                                 ( 5 minutes ago) Thu Jan 27 12:02:52 202

# Configs

In [3]:
# Generate configs for `CDD` and `CCXT`.

In [4]:
def get_cmtask324_config_ccxt() -> cconconf.Config:
    """
    Build the CMTask324 config for the `CCXT` vendor.

    The config consists of three sections:
    - `load`: AWS profile and the S3 data directory
    - `data`: target frequency, universe version, vendor and file extension
    - `column_names`: names of the columns used by the analysis

    :return: populated config
    """
    config = cconconf.Config()
    # Section name -> parameters, in the order they are added to the config.
    sections = {
        # Load parameters.
        "load": {
            "aws_profile": AM_AWS_PROFILE,
            "data_dir": os.path.join(
                hs3.get_s3_bucket_path(AM_AWS_PROFILE), "data"
            ),
        },
        # Data parameters.
        "data": {
            "target_frequency": "T",
            "universe_version": "v03",
            "vendor": "CCXT",
            "extension": "csv.gz",
        },
        # Column names.
        "column_names": {
            "close_price": "close",
            "currency_pair": "currency_pair",
            "exchange_id": "exchange_id",
        },
    }
    for section_name, params in sections.items():
        config.add_subconfig(section_name)
        for param_name, param_value in params.items():
            config[section_name][param_name] = param_value
    return config

In [5]:
# Build the `CCXT` config and print it for inspection.
config_ccxt = get_cmtask324_config_ccxt()
print(config_ccxt)

load:
  aws_profile: am
  data_dir: s3://alphamatic-data/data
data:
  data_type: OHLCV
  target_frequency: T
  universe_version: v03
  vendor: CCXT
column_names:
  close_price: close
  currency_pair: currency_pair
  exchange_id: exchange_id


In [6]:
def get_cmtask324_config_cdd() -> cconconf.Config:
    """
    Build the CMTask324 config for the `CDD` vendor.

    The config consists of three sections:
    - `load`: AWS profile and the S3 data directory
    - `data`: target frequency, universe version, vendor and file extension
    - `column_names`: names of the columns used by the analysis

    :return: populated config
    """
    config = cconconf.Config()
    # Section name -> parameters, in the order they are added to the config.
    sections = {
        # Load parameters.
        "load": {
            "aws_profile": AM_AWS_PROFILE,
            "data_dir": os.path.join(
                hs3.get_s3_bucket_path(AM_AWS_PROFILE), "data"
            ),
        },
        # Data parameters.
        "data": {
            "target_frequency": "T",
            "universe_version": "v01",
            "vendor": "CDD",
            "extension": "csv.gz",
        },
        # Column names.
        "column_names": {
            "close_price": "close",
            "currency_pair": "currency_pair",
            "exchange_id": "exchange_id",
        },
    }
    for section_name, params in sections.items():
        config.add_subconfig(section_name)
        for param_name, param_value in params.items():
            config[section_name][param_name] = param_value
    return config

In [7]:
# Build the `CDD` config and print it for inspection.
config_cdd = get_cmtask324_config_cdd()
print(config_cdd)

load:
  aws_profile: am
  data_dir: s3://alphamatic-data/data
data:
  data_type: OHLCV
  target_frequency: T
  universe_version: v01
  vendor: CDD
column_names:
  close_price: close
  currency_pair: currency_pair
  exchange_id: exchange_id


# Load the data universe

## CCXT

In [8]:
# Load the vendor universe, version "v3" (the default vendor is used; this section treats it as `CCXT`).
ccxt_universe = imvccunun.get_vendor_universe(version="v3")

## CDD

In [9]:
# TODO(Juraj): this got deprecated with #CmTask1493 and #CmTask1487
# Load the `CDD` universe and keep only USDT-quoted full symbols, since the
# other ones are not of interest here.
cdd_universe = [
    full_symbol
    for full_symbol in imvccunun.get_vendor_universe(version="v01", vendor="CDD")
    if full_symbol.endswith("USDT")
]

# Compare universes

In [10]:
# Report how many full symbols each vendor universe contains.
_LOG.info("Number of full symbols in 'CCXT': %s", len(ccxt_universe))
_LOG.info("Number of full symbols in 'CDD': %s", len(cdd_universe))

INFO  Number of full symbols in 'CCXT': 38
INFO  Number of full symbols in 'CDD': 58


In [11]:
# Full symbols that appear in both the `CCXT` and the `CDD` universe.
currency_pair_intersection = set(ccxt_universe) & set(cdd_universe)
_LOG.info("Number of similar full symbols: %s", len(currency_pair_intersection))
display(currency_pair_intersection)

INFO  Number of similar full symbols: 18


{'binance::ADA_USDT',
 'binance::BNB_USDT',
 'binance::BTC_USDT',
 'binance::EOS_USDT',
 'binance::ETH_USDT',
 'binance::LINK_USDT',
 'binance::SOL_USDT',
 'ftx::BNB_USDT',
 'ftx::BTC_USDT',
 'ftx::ETH_USDT',
 'ftx::LINK_USDT',
 'ftx::XRP_USDT',
 'kucoin::ADA_USDT',
 'kucoin::BNB_USDT',
 'kucoin::BTC_USDT',
 'kucoin::EOS_USDT',
 'kucoin::ETH_USDT',
 'kucoin::XRP_USDT'}

In [12]:
# `CCXT` full symbols that have no counterpart in `CDD`.
ccxt_and_not_cdd = set(ccxt_universe) - set(cdd_universe)
_LOG.info(
    "Number of full symbols that are included in 'CCXT' but not in 'CDD': %s",
    len(ccxt_and_not_cdd),
)
display(ccxt_and_not_cdd)

INFO  Number of full symbols that are included in 'CCXT' but not in 'CDD': 20


{'binance::AVAX_USDT',
 'binance::DOGE_USDT',
 'ftx::DOGE_USDT',
 'ftx::SOL_USDT',
 'gateio::ADA_USDT',
 'gateio::AVAX_USDT',
 'gateio::BNB_USDT',
 'gateio::BTC_USDT',
 'gateio::DOGE_USDT',
 'gateio::EOS_USDT',
 'gateio::ETH_USDT',
 'gateio::FIL_USDT',
 'gateio::LINK_USDT',
 'gateio::SOL_USDT',
 'gateio::XRP_USDT',
 'kucoin::AVAX_USDT',
 'kucoin::DOGE_USDT',
 'kucoin::FIL_USDT',
 'kucoin::LINK_USDT',
 'kucoin::SOL_USDT'}

In [13]:
# `CDD` full symbols that have no counterpart in `CCXT`.
cdd_and_not_ccxt = set(cdd_universe) - set(ccxt_universe)
_LOG.info(
    "Number of full symbols that are included in 'CDD' but not in 'CCXT': %s",
    len(cdd_and_not_ccxt),
)
display(cdd_and_not_ccxt)

INFO  Number of full symbols that are included in 'CDD' but not in 'CCXT': 40


{'binance::AAVE_USDT',
 'binance::BAT_USDT',
 'binance::BTT_USDT',
 'binance::CELR_USDT',
 'binance::CVC_USDT',
 'binance::DAI_USDT',
 'binance::DASH_USDT',
 'binance::DOT_USDT',
 'binance::ETC_USDT',
 'binance::FIL_USDT',
 'binance::ICP_USDT',
 'binance::ICX_USDT',
 'binance::LRC_USDT',
 'binance::LTC_USDT',
 'binance::MATIC_USDT',
 'binance::MKR_USDT',
 'binance::NEO_USDT',
 'binance::ONE_USDT',
 'binance::PAX_USDT',
 'binance::QTUM_USDT',
 'binance::SCU_USDT',
 'binance::TRX_USDT',
 'binance::TUSD_USDT',
 'binance::UNI_USDT',
 'binance::USDC_USDT',
 'binance::VET_USDT',
 'binance::XLM_USDT',
 'binance::XMR_USDT',
 'binance::XRP_USDT',
 'binance::ZEC_USDT',
 'ftx::BCH_USDT',
 'ftx::LTC_USDT',
 'ftx::TRX_USDT',
 'kucoin::BCH_USDT',
 'kucoin::DASH_USDT',
 'kucoin::LTC_USDT',
 'kucoin::NEO_USDT',
 'kucoin::TRX_USDT',
 'kucoin::XTZ_USDT',
 'kucoin::ZEC_USDT'}

# Compare close prices / returns from Binance

## Load the data

The code below can be used to load all the existing data from the two vendors, `CDD` and `CCXT`. The current version is restricted to Binance only; however, even for one exchange there is too much data to operate on, so the output is limited to the intersection of currency pairs between the two universes, since only currency pairs available from both vendors can be compared.

In [9]:
# Load Binance-specific universe for `CCXT`.
ccxt_binance_universe = [
    element for element in ccxt_universe if element.startswith("binance")
]
# Load Binance-specific universe for `CDD`.
cdd_binance_universe_initial = [
    element for element in cdd_universe if element.startswith("binance")
]
# SCU_USDT has incorrect columns, so can not be downloaded.
# See CMTask244 - Cannot load CDD - binance - SCU/USDT from s3 for the reference.
# Filter instead of calling `list.remove()`, so the cell does not raise when
# the symbol is already absent from `cdd_universe`.
cdd_binance_universe = [
    element
    for element in cdd_binance_universe_initial
    if element != "binance::SCU_USDT"
]
# The intersection of Binance currency pairs from two universes. Use the
# cleaned `CDD` universe so that a symbol that cannot be loaded never ends up
# in the set passed to the clients.
currency_pair_intersection_binance = set(ccxt_binance_universe).intersection(
    cdd_binance_universe
)

### "CDD"

In [10]:
# Build the client for the `CDD` vendor from the `CDD` config.
vendor_cdd = config_cdd["data"]["vendor"]
# NOTE(review): `config_cdd` declares universe version "v01" while "v3" is
# passed to the client here -- confirm which one is expected for `CDD`.
universe_version = "v3"
resample_1min = True
root_dir_cdd = config_cdd["load"]["data_dir"]
# Read the extension from `config_cdd`: there is no plain `config` variable in
# this notebook, so the previous `config[...]` lookup fails on a fresh kernel.
extension_cdd = config_cdd["data"]["extension"]
aws_profile_cdd = config_cdd["load"]["aws_profile"]
cdd_csv_client = icdcl.CcxtCddCsvParquetByAssetClient(
    vendor_cdd,
    universe_version,
    resample_1min,
    root_dir_cdd,
    extension_cdd,
    aws_profile=aws_profile_cdd,
)

# No time bounds: read the whole available history.
start_ts = None
end_ts = None
# Sort the symbols so that the loading order does not depend on set ordering.
cdd_binance_df = cdd_csv_client.read_data(
    sorted(currency_pair_intersection_binance),
    start_ts,
    end_ts,
)

Removed 5231 / 849688 = 0.62% rows
Removed 441 / 835286 = 0.05% rows
Removed 463 / 1057890 = 0.04% rows
Removed 587 / 882807 = 0.07% rows
Removed 388 / 943303 = 0.04% rows
Removed 214 / 869850 = 0.02% rows


In [11]:
# Preview the first rows and the overall shape of the loaded `CDD` data.
display(cdd_binance_df.head(3))
display(cdd_binance_df.shape)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume,currency_pair,exchange_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-09-08 17:59:00+00:00,binance::BTC_USDT,10000.0,10000.0,10000.0,10000.0,0.001,BTC_USDT,binance
2019-09-08 18:00:00+00:00,binance::BTC_USDT,,,,,,,
2019-09-08 18:01:00+00:00,binance::BTC_USDT,,,,,,,


(5599806, 8)

### "CCXT"

In [12]:
# Build the client for the `CCXT` vendor from the `CCXT` config.
vendor_ccxt = config_ccxt["data"]["vendor"]
universe_version = "v3"
resample_1min = True
root_dir_ccxt = config_ccxt["load"]["data_dir"]
# Read the extension from `config_ccxt`: there is no plain `config` variable in
# this notebook, so the previous `config[...]` lookup fails on a fresh kernel.
extension_ccxt = config_ccxt["data"]["extension"]
aws_profile_ccxt = config_ccxt["load"]["aws_profile"]
ccxt_csv_client = icdcl.CcxtCddCsvParquetByAssetClient(
    vendor_ccxt,
    universe_version,
    resample_1min,
    root_dir_ccxt,
    extension_ccxt,
    aws_profile=aws_profile_ccxt,
)

# No time bounds: read the whole available history.
start_ts = None
end_ts = None
# Sort the symbols so that the loading order does not depend on set ordering.
ccxt_binance_df = ccxt_csv_client.read_data(
    sorted(currency_pair_intersection_binance),
    start_ts,
    end_ts,
)

Removed 48775 / 1617480 = 3.02% rows
Removed 1253 / 1617775 = 0.08% rows
Removed 210 / 1615122 = 0.01% rows
Removed 6067 / 1617924 = 0.37% rows
Removed 215 / 1615369 = 0.01% rows
Removed 108869 / 1398630 = 7.78% rows
Removed 12528 / 574916 = 2.18% rows


In [13]:
# Preview the first rows and the overall shape of the loaded `CCXT` data.
display(ccxt_binance_df.head(3))
display(ccxt_binance_df.shape)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume,currency_pair,exchange_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-08-17 00:00:00+00:00,binance::ADA_USDT,0.0946,0.0948,0.09442,0.09479,41334.2,ADA_USDT,binance
2018-08-17 00:00:00+00:00,binance::BNB_USDT,9.7779,9.7791,9.7538,9.7778,520.66,BNB_USDT,binance
2018-08-17 00:00:00+00:00,binance::BTC_USDT,6316.0,6319.04,6310.32,6311.64,9.967395,BTC_USDT,binance


(10084929, 8)

## Calculate returns and correlation

In [14]:
def resample_close_price(df: pd.DataFrame, resampling_freq: str) -> pd.Series:
    """
    Resample close price on the currency level to the specified frequency using
    the last close price.

    The timestamps are taken from the index of `df`. The index is expected to
    be named "timestamp"; an unnamed index is also accepted and is treated as
    the timestamp.

    :param df: OHLCV data with `currency_pair` and `close` columns
    :param resampling_freq: frequency from `pd.date_range()` to resample to
    :return: resampled close price per currency, indexed by
        (`currency_pair`, `timestamp`)
    """
    # Reset the DateTime index, since `pd.Grouper(key=...)` works on columns.
    data = df.reset_index()
    if df.index.name is None and "timestamp" not in data.columns:
        # An unnamed index becomes the "index" column: give it the name the
        # grouper below looks for.
        data = data.rename(columns={"index": "timestamp"})
    # Group by currency pairs and simultaneously resample to the desired frequency.
    resampler = data.groupby(
        ["currency_pair", pd.Grouper(key="timestamp", freq=resampling_freq)]
    )
    # Take the last close value from each resampling period.
    close_series = resampler["close"].last()
    return close_series

In [15]:
def calculate_correlations(
    ccxt_close_price: pd.Series, cdd_close_price: pd.Series, compute_returns: bool
) -> pd.DataFrame:
    """
    Take CCXT and CDD close prices and calculate the correlations for each
    specific currency pair.

    Both series must have an index level named `currency_pair`; only the
    index entries present in both series are compared.

    :param ccxt_close_price: resampled close price per currency for CCXT
    :param cdd_close_price: resampled close price per currency for CDD
    :param compute_returns: if True - compare returns, if False - compare close prices
    :return: correlation matrix per currency, with columns
        `cdd_returns`, `ccxt_returns` or `cdd_close`, `ccxt_close`
    """
    if compute_returns:
        # Group by currency pairs in order to calculate the percentage returns.
        cdd_close_price = cdd_close_price.groupby("currency_pair").pct_change()
        ccxt_close_price = ccxt_close_price.groupby("currency_pair").pct_change()
        cdd_name, ccxt_name = "cdd_returns", "ccxt_returns"
    else:
        cdd_name, ccxt_name = "cdd_close", "ccxt_close"
    # Name each series before merging, so that the labels cannot get out of
    # sync with the merge order.
    combined = pd.merge(
        cdd_close_price.rename(cdd_name),
        ccxt_close_price.rename(ccxt_name),
        left_index=True,
        right_index=True,
    )
    # Group by the first index level to calculate the correlation for each currency pair.
    corr_matrix = combined.groupby(level=0).corr()
    return corr_matrix

In [16]:
# Resample the close prices of both vendors to each frequency of interest.
daily_frequency = "1D"
five_min_frequency = "5min"

# Daily series: `CCXT` first, then `CDD`.
ccxt_binance_series_1d, cdd_binance_series_1d = (
    resample_close_price(vendor_df, daily_frequency)
    for vendor_df in (ccxt_binance_df, cdd_binance_df)
)
# 5-minute series: `CCXT` first, then `CDD`.
ccxt_binance_series_5min, cdd_binance_series_5min = (
    resample_close_price(vendor_df, five_min_frequency)
    for vendor_df in (ccxt_binance_df, cdd_binance_df)
)

### 1-day returns

In [17]:
# Correlate the 1-day returns of `CCXT` and `CDD` for each currency pair.
compute_returns = True
returns_corr_1day = calculate_correlations(
    ccxt_binance_series_1d, cdd_binance_series_1d, compute_returns
)
display(returns_corr_1day)

Unnamed: 0_level_0,Unnamed: 1_level_0,ccxt_returns,cdd_returns
currency_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ADA_USDT,ccxt_returns,1.0,0.997807
ADA_USDT,cdd_returns,0.997807,1.0
BNB_USDT,ccxt_returns,1.0,0.99849
BNB_USDT,cdd_returns,0.99849,1.0
BTC_USDT,ccxt_returns,1.0,0.997763
BTC_USDT,cdd_returns,0.997763,1.0
EOS_USDT,ccxt_returns,1.0,0.998294
EOS_USDT,cdd_returns,0.998294,1.0
ETH_USDT,ccxt_returns,1.0,0.995563
ETH_USDT,cdd_returns,0.995563,1.0


### 5-min returns

In [18]:
# Correlate the 5-minute returns of `CCXT` and `CDD` for each currency pair.
compute_returns = True
returns_corr_5min = calculate_correlations(
    ccxt_binance_series_5min, cdd_binance_series_5min, compute_returns
)
display(returns_corr_5min)

Unnamed: 0_level_0,Unnamed: 1_level_0,ccxt_returns,cdd_returns
currency_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ADA_USDT,ccxt_returns,1.0,0.986929
ADA_USDT,cdd_returns,0.986929,1.0
BNB_USDT,ccxt_returns,1.0,0.987301
BNB_USDT,cdd_returns,0.987301,1.0
BTC_USDT,ccxt_returns,1.0,0.990777
BTC_USDT,cdd_returns,0.990777,1.0
EOS_USDT,ccxt_returns,1.0,0.99396
EOS_USDT,cdd_returns,0.99396,1.0
ETH_USDT,ccxt_returns,1.0,0.974921
ETH_USDT,cdd_returns,0.974921,1.0


## Compare close prices

### 1-day close prices

In [19]:
# Correlate the 1-day close prices of `CCXT` and `CDD` for each currency pair.
compute_returns = False
close_corr_1day = calculate_correlations(
    ccxt_binance_series_1d, cdd_binance_series_1d, compute_returns
)
display(close_corr_1day)

Unnamed: 0_level_0,Unnamed: 1_level_0,cdd_close,ccxt_close
currency_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ADA_USDT,cdd_close,1.0,0.999995
ADA_USDT,ccxt_close,0.999995,1.0
BNB_USDT,cdd_close,1.0,0.999999
BNB_USDT,ccxt_close,0.999999,1.0
BTC_USDT,cdd_close,1.0,1.0
BTC_USDT,ccxt_close,1.0,1.0
EOS_USDT,cdd_close,1.0,0.999906
EOS_USDT,ccxt_close,0.999906,1.0
ETH_USDT,cdd_close,1.0,0.999994
ETH_USDT,ccxt_close,0.999994,1.0


### 5-min close prices

In [20]:
# Correlate the 5-minute close prices of `CCXT` and `CDD` for each currency pair.
compute_returns = False
close_corr_5min = calculate_correlations(
    ccxt_binance_series_5min, cdd_binance_series_5min, compute_returns
)
display(close_corr_5min)

Unnamed: 0_level_0,Unnamed: 1_level_0,cdd_close,ccxt_close
currency_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ADA_USDT,cdd_close,1.0,1.0
ADA_USDT,ccxt_close,1.0,1.0
BNB_USDT,cdd_close,1.0,1.0
BNB_USDT,ccxt_close,1.0,1.0
BTC_USDT,cdd_close,1.0,1.0
BTC_USDT,ccxt_close,1.0,1.0
EOS_USDT,cdd_close,1.0,0.999999
EOS_USDT,ccxt_close,0.999999,1.0
ETH_USDT,cdd_close,1.0,1.0
ETH_USDT,ccxt_close,1.0,1.0


# Statistical properties of a full symbol in CDD

In [9]:
# Clearing `CDD` currency pairs that are incorrect.
# Each step builds a new filtered list instead of calling `list.remove()`, so
# the cell can be re-run without raising `ValueError`.

# Binance.
cdd_universe = [
    element for element in cdd_universe if element != "binance::SCU_USDT"
]

# FTX has some critical mistakes in the downloading process, so can not continue analysis with them.
# see CMTask801 - Downloading issues of FTX exchange from 'CDD' universe for further reference.
cdd_ftx_universe = [
    element for element in cdd_universe if element.startswith("ftx")
]
cdd_universe = [
    element for element in cdd_universe if not element.startswith("ftx")
]

# Kucoin exchange: the timestamps are obviously wrong and with too short time period.
# See CMTask253 - Fix timestamp for CDD - kucoin for reference.
cdd_kucoin_universe = [
    element for element in cdd_universe if element.startswith("kucoin")
]
cdd_universe = [
    element for element in cdd_universe if not element.startswith("kucoin")
]

## Comparison of intersection of full symbols between 'CCXT' and 'CDD'

In [10]:
# Full symbols that are included in both `CCXT` and `CDD` (cleaned from unavailable full symbols).
cdd_and_ccxt_cleaned = set(ccxt_universe).intersection(cdd_universe)
len(cdd_and_ccxt_cleaned)

7

### Load the intersection of full symbols for 'CDD' and 'CCXT'

#### CDD

In [11]:
def compute_start_end_stats(data):
    """
    Compute start-end stats for `data` using the `CDD` config.
    """
    return ramccsta.compute_start_end_stats(data, config_cdd)


# Start-end stats for the full symbols shared by both vendors, from `CDD` data.
cdd_start_end_table = ramccsta.compute_stats_for_universe(
    cdd_and_ccxt_cleaned, config_cdd, compute_start_end_stats
)

Removed 463 / 1057890 = 0.04% rows
Removed 214 / 869850 = 0.02% rows
Removed 388 / 943303 = 0.04% rows
Removed 441 / 835286 = 0.05% rows
Removed 587 / 882807 = 0.07% rows
Removed 5231 / 849688 = 0.62% rows


In [12]:
# Preview the first rows of the `CDD` start-end stats table.
cdd_start_end_table.head(3)

Unnamed: 0,exchange_id,currency_pair,min_timestamp,max_timestamp,n_data_points,coverage,days_available,avg_data_points_per_day,longest_not_nan_seq_days,longest_not_nan_seq_perc,longest_not_nan_seq_start_date,longest_not_nan_seq_end_date,vendor
0,binance,BTC_USDT,2019-09-08 17:59:00+00:00,2021-09-16 01:35:00+00:00,1057427,99.459168,738,1432.827913,445,60.367559,2019-09-11 13:06:00+00:00,2020-11-30 05:59:00+00:00,CDD
1,binance,LINK_USDT,2020-01-17 08:00:00+00:00,2021-09-16 01:38:00+00:00,869636,99.371186,607,1432.678748,112,18.439471,2020-06-28 03:10:00+00:00,2020-10-18 04:40:00+00:00,CDD
2,binance,ETH_USDT,2019-11-27 07:45:00+00:00,2021-09-16 01:35:00+00:00,942915,99.401639,658,1433.00152,275,41.825824,2020-02-28 17:24:00+00:00,2020-11-30 05:59:00+00:00,CDD


#### CCXT

In [13]:
def compute_start_end_stats(data):
    """
    Compute start-end stats for `data` using the `CCXT` config.
    """
    return ramccsta.compute_start_end_stats(data, config_ccxt)


# Start-end stats for the full symbols shared by both vendors, from `CCXT` data.
ccxt_start_end_table = ramccsta.compute_stats_for_universe(
    cdd_and_ccxt_cleaned, config_ccxt, compute_start_end_stats
)

Removed 210 / 1615122 = 0.01% rows
Removed 108869 / 1398630 = 7.78% rows
Removed 215 / 1615369 = 0.01% rows
Removed 1253 / 1617775 = 0.08% rows
Removed 6067 / 1617924 = 0.37% rows
Removed 48775 / 1617480 = 3.02% rows
Removed 12528 / 574916 = 2.18% rows


In [14]:
# Preview the first rows of the `CCXT` start-end stats table.
ccxt_start_end_table.head(3)

Unnamed: 0,exchange_id,currency_pair,min_timestamp,max_timestamp,n_data_points,coverage,days_available,avg_data_points_per_day,longest_not_nan_seq_days,longest_not_nan_seq_perc,longest_not_nan_seq_start_date,longest_not_nan_seq_end_date,vendor
0,binance,BTC_USDT,2018-08-17 00:00:00+00:00,2021-09-14 18:00:00+00:00,1614912,99.708022,1124,1436.754448,155,13.782684,2020-06-28 05:30:00+00:00,2020-11-30 05:59:00+00:00,CCXT
1,binance,LINK_USDT,2019-01-16 10:00:00+00:00,2021-09-16 09:19:00+00:00,1289761,91.960257,973,1325.550874,109,11.26508,2021-04-25 08:45:00+00:00,2021-08-13 01:59:00+00:00,CCXT
2,binance,ETH_USDT,2018-08-17 00:00:00+00:00,2021-09-14 22:08:00+00:00,1615154,99.707696,1124,1436.969751,155,13.780574,2020-06-28 05:30:00+00:00,2020-11-30 05:59:00+00:00,CCXT


### Display the union results

In [15]:
def unify_start_end_tables(
    cdd_df: pd.DataFrame, ccxt_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Put the 'CDD' and 'CCXT' start-end stats side by side in a single table.

    Both tables are indexed by (`exchange_id`, `currency_pair`), every stats
    column gets a vendor suffix (`_cdd` or `_ccxt`) and the resulting columns
    are sorted alphabetically.

    :param cdd_df: start-end table for 'CDD'
    :param ccxt_df: start-end table for 'CCXT'
    :return: unified start-end table
    """
    index_cols = ["exchange_id", "currency_pair"]
    # Index each table by full symbol parts and tag its columns with the vendor.
    suffixed_tables = [
        table.set_index(index_cols).add_suffix(suffix)
        for table, suffix in ((cdd_df, "_cdd"), (ccxt_df, "_ccxt"))
    ]
    # Align the two vendors on the shared index.
    unified = pd.concat(suffixed_tables, axis=1)
    # Sort the columns so that the two vendors' versions of a stat are adjacent.
    return unified[sorted(unified.columns.to_list())]

In [16]:
# Combine the per-vendor start-end stats into one table and show it.
union_cdd_ccxt_stats = unify_start_end_tables(
    cdd_start_end_table, ccxt_start_end_table
)
display(union_cdd_ccxt_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_data_points_per_day_ccxt,avg_data_points_per_day_cdd,coverage_ccxt,coverage_cdd,days_available_ccxt,days_available_cdd,longest_not_nan_seq_days_ccxt,longest_not_nan_seq_days_cdd,longest_not_nan_seq_end_date_ccxt,longest_not_nan_seq_end_date_cdd,...,longest_not_nan_seq_start_date_ccxt,longest_not_nan_seq_start_date_cdd,max_timestamp_ccxt,max_timestamp_cdd,min_timestamp_ccxt,min_timestamp_cdd,n_data_points_ccxt,n_data_points_cdd,vendor_ccxt,vendor_cdd
exchange_id,currency_pair,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
binance,BTC_USDT,1436.754448,1432.827913,99.708022,99.459168,1124,738,155,445,2020-11-30 05:59:00+00:00,2020-11-30 05:59:00+00:00,...,2020-06-28 05:30:00+00:00,2019-09-11 13:06:00+00:00,2021-09-14 18:00:00+00:00,2021-09-16 01:35:00+00:00,2018-08-17 00:00:00+00:00,2019-09-08 17:59:00+00:00,1614912,1057427,CCXT,CDD
binance,LINK_USDT,1325.550874,1432.678748,91.960257,99.371186,973,607,109,112,2021-08-13 01:59:00+00:00,2020-10-18 04:40:00+00:00,...,2021-04-25 08:45:00+00:00,2020-06-28 03:10:00+00:00,2021-09-16 09:19:00+00:00,2021-09-16 01:38:00+00:00,2019-01-16 10:00:00+00:00,2020-01-17 08:00:00+00:00,1289761,869636,CCXT,CDD
binance,ETH_USDT,1436.969751,1433.00152,99.707696,99.401639,1124,658,155,275,2020-11-30 05:59:00+00:00,2020-11-30 05:59:00+00:00,...,2020-06-28 05:30:00+00:00,2020-02-28 17:24:00+00:00,2021-09-14 22:08:00+00:00,2021-09-16 01:35:00+00:00,2018-08-17 00:00:00+00:00,2019-11-27 07:45:00+00:00,1615154,942915,CCXT,CDD
binance,BNB_USDT,1435.632327,1431.981132,99.644146,99.318324,1126,583,155,143,2020-11-30 05:59:00+00:00,2020-11-30 05:59:00+00:00,...,2020-06-28 05:30:00+00:00,2020-07-09 21:14:00+00:00,2021-09-16 14:14:00+00:00,2021-09-16 01:35:00+00:00,2018-08-17 00:00:00+00:00,2020-02-10 08:01:00+00:00,1616522,834845,CCXT,CDD
binance,EOS_USDT,1431.489343,1432.175325,99.347466,99.33836,1126,616,84,75,2021-07-18 21:32:00+00:00,2021-08-13 01:59:00+00:00,...,2021-04-25 08:45:00+00:00,2021-05-29 20:22:00+00:00,2021-09-16 16:43:00+00:00,2021-09-16 01:35:00+00:00,2018-08-17 00:00:00+00:00,2020-01-08 08:00:00+00:00,1611857,882220,CCXT,CDD
binance,ADA_USDT,1393.166075,1424.042159,96.714242,98.769673,1126,593,109,77,2021-08-13 01:59:00+00:00,2020-09-12 01:12:00+00:00,...,2021-04-25 08:45:00+00:00,2020-06-26 21:58:00+00:00,2021-09-16 09:19:00+00:00,2021-09-16 01:36:00+00:00,2018-08-17 00:00:00+00:00,2020-01-31 08:01:00+00:00,1568705,844457,CCXT,CDD
binance,SOL_USDT,1405.97,1449.235955,97.61308,99.791106,400,89,109,67,2021-08-13 01:59:00+00:00,2021-08-13 01:59:00+00:00,...,2021-04-25 08:45:00+00:00,2021-06-06 06:57:00+00:00,2021-09-15 08:19:00+00:00,2021-09-04 01:08:00+00:00,2020-08-11 06:00:00+00:00,2021-06-06 06:57:00+00:00,562388,128982,CCXT,CDD


## Comparison of full symbols that are included in 'CDD' but not available in 'CCXT'

In [17]:
# `CDD` full symbols (already cleaned from unavailable ones) that have no counterpart in `CCXT`.
cdd_and_not_ccxt_cleaned = set(cdd_universe) - set(ccxt_universe)
len(cdd_and_not_ccxt_cleaned)

29

In [18]:
# For 'avg_data_points_per_day' the amount of "days_available" is equal to 0, so it crashes the calculations.
# `discard()`, unlike `remove()`, does not raise when the symbol is already
# gone, so the cell can be re-run safely.
cdd_and_not_ccxt_cleaned.discard("binance::DAI_USDT")

In [19]:
def compute_start_end_stats(data):
    """
    Compute start-end stats for `data` using the `CDD` config.
    """
    return ramccsta.compute_start_end_stats(data, config_cdd)


# Start-end stats for the full symbols that only `CDD` provides.
cdd_unique_start_end_table = ramccsta.compute_stats_for_universe(
    cdd_and_not_ccxt_cleaned, config_cdd, compute_start_end_stats
)

Removed 35892 / 526368 = 6.82% rows
Removed 102240 / 526368 = 19.42% rows
Removed 210914 / 512111 = 41.19% rows
Removed 568 / 881359 = 0.06% rows
Removed 27921 / 843922 = 3.31% rows
Removed 5063 / 526368 = 0.96% rows
Removed 4592 / 444306 = 1.03% rows
Removed 6367 / 871287 = 0.73% rows
Removed 9838 / 865526 = 1.14% rows
Removed 1058 / 113086 = 0.94% rows
Removed 10093 / 872723 = 1.16% rows
Removed 12164 / 825163 = 1.47% rows
Removed 3486 / 411190 = 0.85% rows
Removed 100002 / 526368 = 19.00% rows
Removed 607 / 120314 = 0.50% rows
Removed 11557 / 526368 = 2.20% rows
Removed 41461 / 526368 = 7.88% rows
Removed 56193 / 820882 = 6.85% rows
Removed 310 / 885667 = 0.04% rows
Removed 25830 / 830963 = 3.11% rows
Removed 16883 / 842487 = 2.00% rows
Removed 16159 / 845366 = 1.91% rows


In [20]:
# Show the start-end stats for the full symbols that only `CDD` provides.
display(cdd_unique_start_end_table)

Unnamed: 0,exchange_id,currency_pair,min_timestamp,max_timestamp,n_data_points,coverage,days_available,avg_data_points_per_day,longest_not_nan_seq_days,longest_not_nan_seq_perc,longest_not_nan_seq_start_date,longest_not_nan_seq_end_date,vendor
0,binance,MATIC_USDT,2020-09-11 20:41:00+00:00,2021-09-16 01:36:00+00:00,490476,92.25439,369,1329.203252,75,20.377462,2021-05-29 20:22:00+00:00,2021-08-13 01:59:00+00:00,CDD
1,binance,UNI_USDT,2021-06-06 06:57:00+00:00,2021-09-07 01:08:00+00:00,133302,99.797862,92,1448.934783,67,73.086425,2021-06-06 06:57:00+00:00,2021-08-13 01:59:00+00:00,CDD
2,binance,CELR_USDT,2020-09-11 20:41:00+00:00,2021-09-16 01:36:00+00:00,424128,79.774892,369,1149.398374,32,8.717479,2021-03-11 05:15:00+00:00,2021-04-12 09:41:00+00:00,CDD
3,binance,FIL_USDT,2021-06-06 06:57:00+00:00,2021-09-04 01:08:00+00:00,128982,99.791106,89,1449.235955,67,75.529199,2021-06-06 06:57:00+00:00,2021-08-13 01:59:00+00:00,CDD
4,binance,PAX_USDT,2020-09-11 20:43:00+00:00,2021-09-06 03:59:00+00:00,301197,58.213905,359,838.988858,0,0.050445,2021-05-19 12:15:00+00:00,2021-05-19 16:35:00+00:00,CDD
5,binance,ICP_USDT,2021-06-06 06:57:00+00:00,2021-09-07 01:08:00+00:00,133302,99.797862,92,1448.934783,67,73.086425,2021-06-06 06:57:00+00:00,2021-08-13 01:59:00+00:00,CDD
6,binance,LTC_USDT,2020-01-09 08:09:00+00:00,2021-09-16 01:35:00+00:00,880791,99.339534,615,1432.180488,75,12.218842,2021-05-29 20:22:00+00:00,2021-08-13 01:59:00+00:00,CDD
7,binance,DASH_USDT,2020-02-04 08:06:00+00:00,2021-09-16 01:36:00+00:00,816001,96.089311,589,1385.400679,49,8.326788,2021-05-29 20:22:00+00:00,2021-07-17 22:53:00+00:00,CDD
8,binance,USDC_USDT,2020-09-11 20:41:00+00:00,2021-09-16 01:36:00+00:00,521305,98.053064,369,1412.750678,17,4.684608,2021-05-09 17:32:00+00:00,2021-05-27 00:37:00+00:00,CDD
9,binance,LRC_USDT,2020-10-19 07:05:00+00:00,2021-08-24 00:40:00+00:00,439714,98.906374,308,1427.642857,102,33.039795,2021-03-02 01:59:00+00:00,2021-06-12 02:05:00+00:00,CDD
