# Description

This notebook computes data statistics per exchange id and currency pair for a given vendor universe.

# Imports

In [1]:
import logging
import os

import core.config.config_ as cconconf
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hprint as hprint
import helpers.hs3 as hs3
import im_v2.common.universe as ivcu
import research_amp.cc.statistics as ramccsta

In [2]:
# Initialize logging at INFO verbosity for this notebook.
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

# Log the first element of the system signature (environment and Git info)
# so that the run can be traced back to a code version.
_LOG.info("%s", henv.get_system_signature()[0])

# NOTE(review): presumably applies the project's notebook display settings --
# confirm in `helpers.hprint`.
hprint.config_notebook()

# AWS profile name; used when building the config (S3 bucket path and
# `load.aws_profile`).
AM_AWS_PROFILE = "am"

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-64dc6f60-8b8e-479f-bd58-e8ce0ae7c161.json'
>>ENV<<: is_inside_container=True: code_version=0, container_version=cmamp-1.0.0, is_inside_docker=True, is_inside_ci=False, CI_defined=True, CI=''
>>ENV<<: AM_AWS_PROFILE=True AM_ECR_BASE_PATH=True AM_S3_BUCKET=True AM_TELEGRAM_TOKEN=True AWS_ACCESS_KEY_ID=False AWS_DEFAULT_REGION=False AWS_SECRET_ACCESS_KEY=False GH_ACTION_ACCESS_TOKEN=True
# Git
    branch_name='CMTask587_propagate_universe_changes'
    hash='6c5711c72'
    # Last commits:
      *   6c5711c72 PomazkinG Merge branch 'master' into CMTask587_propagate_universe_changes   (22 seconds ago) Thu Nov 25 22:23:05 2021  (HEAD -> CMTask587_propagate_universe_changes)
      |\  
      * | 967699d8a PomazkinG remove import                                                     (29 seconds ago) Thu Nov 25 22:22:58 2021           
      * | db04c88b7 PomazkinG fix sta

# Config

In [3]:
def get_cmtask232_config() -> cconconf.Config:
    """
    Build the config with the parameters specific to task232.

    The config consists of three sections:
    - `load`: AWS profile and the S3 directory the data is stored in
    - `data`: data type, target frequency, universe version and vendor
    - `column_names`: names of the columns referenced by the config consumers

    :return: populated config
    """
    s3_data_dir = os.path.join(hs3.get_s3_bucket_path(AM_AWS_PROFILE), "data")
    # Section name -> parameters; dict order is the order in which sections
    # and keys are added to the config.
    sections = {
        "load": {
            "aws_profile": AM_AWS_PROFILE,
            "data_dir": s3_data_dir,
        },
        "data": {
            "data_type": "OHLCV",
            "target_frequency": "T",
            "universe_version": "v03",
            "vendor": "CCXT",
        },
        "column_names": {
            "close_price": "close",
            "currency_pair": "currency_pair",
            "exchange_id": "exchange_id",
        },
    }
    task_config = cconconf.Config()
    for section_name, params in sections.items():
        task_config.add_subconfig(section_name)
        for param_name, param_value in params.items():
            task_config[section_name][param_name] = param_value
    return task_config


# Build the config once; the cells below read their parameters from it.
config = get_cmtask232_config()
# Print the string form of the config so the effective parameters are recorded
# in the cell output.
print(config)

load:
  aws_profile: am
  data_dir: s3://alphamatic-data/data
data:
  data_type: OHLCV
  target_frequency: T
  universe_version: v03
  vendor: CCXT
column_names:
  close_price: close
  currency_pair: currency_pair
  exchange_id: exchange_id


# Compute start-end table

## Per exchange id and currency pair for a specified vendor

In [4]:
# Get the universe for the vendor and universe version set in the config.
# With `as_full_symbol=True` the entries come back as full symbols,
# e.g. `binance::ADA_USDT` (see the output below).
vendor_universe = ivcu.get_vendor_universe(
    config["data"]["vendor"],
    version=config["data"]["universe_version"],
    as_full_symbol=True,
)
vendor_universe

['binance::ADA_USDT',
 'binance::AVAX_USDT',
 'binance::BNB_USDT',
 'binance::BTC_USDT',
 'binance::DOGE_USDT',
 'binance::EOS_USDT',
 'binance::ETH_USDT',
 'binance::LINK_USDT',
 'binance::SOL_USDT',
 'ftx::BNB_USDT',
 'ftx::BTC_USDT',
 'ftx::DOGE_USDT',
 'ftx::ETH_USDT',
 'ftx::LINK_USDT',
 'ftx::SOL_USDT',
 'ftx::XRP_USDT',
 'gateio::ADA_USDT',
 'gateio::AVAX_USDT',
 'gateio::BNB_USDT',
 'gateio::BTC_USDT',
 'gateio::DOGE_USDT',
 'gateio::EOS_USDT',
 'gateio::ETH_USDT',
 'gateio::FIL_USDT',
 'gateio::LINK_USDT',
 'gateio::SOL_USDT',
 'gateio::XRP_USDT',
 'kucoin::ADA_USDT',
 'kucoin::AVAX_USDT',
 'kucoin::BNB_USDT',
 'kucoin::BTC_USDT',
 'kucoin::DOGE_USDT',
 'kucoin::EOS_USDT',
 'kucoin::ETH_USDT',
 'kucoin::FIL_USDT',
 'kucoin::LINK_USDT',
 'kucoin::SOL_USDT',
 'kucoin::XRP_USDT']

In [5]:
def compute_start_end_stats(data):
    """
    Compute start-end stats for `data` using the notebook-level `config`.

    Single-argument adapter around `ramccsta.compute_start_end_stats()` that
    fixes the `config` argument, so that it can be passed as the stats
    callback to `ramccsta.compute_stats_for_universe()`.

    :param data: data to compute the stats for, forwarded unchanged
        (presumably the data for one exchange id / currency pair -- confirm
        in `ramccsta`)
    :return: whatever `ramccsta.compute_start_end_stats(data, config)` returns
    """
    # `config` is looked up at call time, like in the original lambda.
    return ramccsta.compute_start_end_stats(data, config)


# Apply the stats callback across the whole vendor universe.
start_end_table = ramccsta.compute_stats_for_universe(
    vendor_universe, config, compute_start_end_stats
)

Reading CCXT data for exchange id='binance', currencies='ADA_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/binance/ADA_USDT.csv.gz'...
Processing CCXT data for exchange id='binance', currencies='ADA_USDT'...
Index length increased by 4520 = 1622000 - 1617480
Reading CCXT data for exchange id='binance', currencies='AVAX_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/binance/AVAX_USDT.csv.gz'...
Processing CCXT data for exchange id='binance', currencies='AVAX_USDT'...
Index length increased by 1224 = 517498 - 516274
Reading CCXT data for exchange id='binance', currencies='BNB_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/binance/BNB_USDT.csv.gz'...
Processing CCXT data for exchange id='binance', currencies='BNB_USDT'...
Index length increased by 4520 = 1622295 - 1617775
Reading CCXT data for exchange id='binance', currencies='BTC_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/binance/BTC_USDT.csv.gz'...
Processing CCXT data for exchange id='binan

Index length increased by 94496 = 1619786 - 1525290
Reading CCXT data for exchange id='kucoin', currencies='DOGE_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/kucoin/DOGE_USDT.csv.gz'...
Processing CCXT data for exchange id='kucoin', currencies='DOGE_USDT'...
Index length=314817 has not changed
Reading CCXT data for exchange id='kucoin', currencies='EOS_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/kucoin/EOS_USDT.csv.gz'...
Processing CCXT data for exchange id='kucoin', currencies='EOS_USDT'...
Index length increased by 232290 = 1621431 - 1389141
Reading CCXT data for exchange id='kucoin', currencies='ETH_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/kucoin/ETH_USDT.csv.gz'...
Processing CCXT data for exchange id='kucoin', currencies='ETH_USDT'...
Index length increased by 117548 = 1619960 - 1502412
Reading CCXT data for exchange id='kucoin', currencies='FIL_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/kucoin/FIL_USDT.csv.gz'...
Processing 

In [6]:
# Post-process results.
cols_to_sort_by = ["coverage", "longest_not_nan_seq_perc"]
cols_to_round = [
    "coverage",
    "avg_data_points_per_day",
    "longest_not_nan_seq_perc",
]
stats_table = ramccsta.postprocess_stats_table(
    start_end_table, cols_to_sort_by, cols_to_round
)
stats_table

Unnamed: 0,exchange_id,currency_pair,min_timestamp,max_timestamp,n_data_points,coverage,days_available,avg_data_points_per_day,longest_not_nan_seq_days,longest_not_nan_seq_perc,longest_not_nan_seq_start_date,longest_not_nan_seq_end_date,vendor
32,kucoin,EOS_USDT,2018-08-16 20:10:00-04:00,2021-09-15 20:00:00-04:00,1389141,85.673766,1125,1234.792,941,83.571055,2019-02-17 18:54:00-05:00,2021-09-15 20:00:00-04:00,CCXT
37,kucoin,XRP_USDT,2018-12-03 11:58:00-05:00,2021-09-15 11:25:00-04:00,1357381,92.692715,1016,1336.004921,940,92.498368,2019-02-17 18:51:00-05:00,2021-09-15 11:25:00-04:00,CCXT
33,kucoin,ETH_USDT,2018-08-16 20:01:00-04:00,2021-09-14 19:20:00-04:00,1502412,92.743771,1124,1336.66548,939,83.555458,2019-02-17 18:56:00-05:00,2021-09-14 19:20:00-04:00,CCXT
30,kucoin,BTC_USDT,2018-08-16 20:00:00-04:00,2021-09-14 16:25:00-04:00,1525290,94.166143,1124,1357.019573,939,83.55363,2019-02-17 18:56:00-05:00,2021-09-14 16:25:00-04:00,CCXT
6,binance,ETH_USDT,2018-08-16 20:00:00-04:00,2021-09-14 18:08:00-04:00,1615369,99.720969,1124,1437.161032,155,13.780574,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
3,binance,BTC_USDT,2018-08-16 20:00:00-04:00,2021-09-14 14:00:00-04:00,1615122,99.720988,1124,1436.941281,155,13.782684,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
0,binance,ADA_USDT,2018-08-16 20:00:00-04:00,2021-09-16 05:19:00-04:00,1617480,99.721332,1126,1436.483126,155,13.762639,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
2,binance,BNB_USDT,2018-08-16 20:00:00-04:00,2021-09-16 10:14:00-04:00,1617775,99.721382,1126,1436.745115,155,13.760136,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
5,binance,EOS_USDT,2018-08-16 20:00:00-04:00,2021-09-16 12:43:00-04:00,1617924,99.721408,1126,1436.877442,155,13.758872,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
7,binance,LINK_USDT,2019-01-16 05:00:00-05:00,2021-09-16 05:19:00-04:00,1398630,99.722642,973,1437.440904,155,15.916351,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT


Looking at the results, we can see that all the exchanges except for Bitfinex have a substantial longest not-NaN sequence (at least 13% of the data) in combination with high data coverage (>85%). Bitfinex has very low data coverage, and its longest not-NaN sequences are less than 1 day long and comprise less than 1% of the original data. This means that the Bitfinex data is too spotty, with gaps scattered throughout, and we should exclude it from our analysis until we get cleaner data for it. Note: Bitfinex is not part of the `v03` universe listed above, so these remarks presumably refer to a run on an earlier universe version.

In [7]:
# The row count of `start_end_table` is reported as the number of unique
# exchange / currency pair combinations.
_LOG.info(
    "The number of unique exchange and currency pair combinations=%s",
    start_end_table.shape[0],
)
# Display `start_end_table` itself, i.e. the input to the post-processing step.
start_end_table

The number of unique exchange and currency pair combinations=38


Unnamed: 0,exchange_id,currency_pair,min_timestamp,max_timestamp,n_data_points,coverage,days_available,avg_data_points_per_day,longest_not_nan_seq_days,longest_not_nan_seq_perc,longest_not_nan_seq_start_date,longest_not_nan_seq_end_date,vendor
0,binance,ADA_USDT,2018-08-16 20:00:00-04:00,2021-09-16 05:19:00-04:00,1617480,99.721332,1126,1436.483126,155,13.762639,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
1,binance,AVAX_USDT,2020-09-22 02:30:00-04:00,2021-09-16 11:27:00-04:00,516274,99.763477,359,1438.089136,109,30.530553,2021-04-25 04:45:00-04:00,2021-08-12 21:59:00-04:00,CCXT
2,binance,BNB_USDT,2018-08-16 20:00:00-04:00,2021-09-16 10:14:00-04:00,1617775,99.721382,1126,1436.745115,155,13.760136,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
3,binance,BTC_USDT,2018-08-16 20:00:00-04:00,2021-09-14 14:00:00-04:00,1615122,99.720988,1124,1436.941281,155,13.782684,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
4,binance,DOGE_USDT,2019-07-05 08:00:00-04:00,2021-09-16 05:19:00-04:00,1154731,99.75216,803,1438.021171,155,19.283863,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
5,binance,EOS_USDT,2018-08-16 20:00:00-04:00,2021-09-16 12:43:00-04:00,1617924,99.721408,1126,1436.877442,155,13.758872,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
6,binance,ETH_USDT,2018-08-16 20:00:00-04:00,2021-09-14 18:08:00-04:00,1615369,99.720969,1124,1437.161032,155,13.780574,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
7,binance,LINK_USDT,2019-01-16 05:00:00-05:00,2021-09-16 05:19:00-04:00,1398630,99.722642,973,1437.440904,155,15.916351,2020-06-28 01:30:00-04:00,2020-11-30 00:59:00-05:00,CCXT
8,binance,SOL_USDT,2020-08-11 02:00:00-04:00,2021-09-15 04:19:00-04:00,574916,99.787552,400,1437.29,110,27.743257,2020-08-11 02:00:00-04:00,2020-11-30 00:59:00-05:00,CCXT
9,ftx,BNB_USDT,2020-04-09 16:55:00-04:00,2021-09-21 21:13:00-04:00,763247,99.972232,530,1440.088679,99,18.738793,2021-05-31 04:41:00-04:00,2021-09-07 13:03:00-04:00,CCXT


## Per currency pair

In [8]:
# Derive a per-currency-pair table from the per-exchange one. In the output
# below each currency pair has one row, with the exchanges it is available on
# collected into a list.
currency_start_end_table = ramccsta.compute_start_end_table_by_currency(
    start_end_table
)
currency_start_end_table

The number of unique currency pairs=11


Unnamed: 0,currency_pair,min_timestamp,max_timestamp,exchange_id,days_available
0,BNB_USDT,2018-08-16 20:00:00-04:00,2021-09-21 21:13:00-04:00,"[binance, ftx, gateio, kucoin]",1132
1,BTC_USDT,2018-08-16 20:00:00-04:00,2021-09-21 17:13:00-04:00,"[binance, ftx, gateio, kucoin]",1131
2,ETH_USDT,2018-08-16 20:00:00-04:00,2021-09-21 17:54:00-04:00,"[binance, ftx, gateio, kucoin]",1131
3,ADA_USDT,2018-08-16 20:00:00-04:00,2021-09-20 12:39:00-04:00,"[binance, gateio, kucoin]",1130
4,EOS_USDT,2018-08-16 20:00:00-04:00,2021-09-20 12:39:00-04:00,"[binance, gateio, kucoin]",1130
5,XRP_USDT,2018-12-03 11:58:00-05:00,2021-09-21 19:13:00-04:00,"[ftx, gateio, kucoin]",1023
6,LINK_USDT,2019-01-16 05:00:00-05:00,2021-09-21 20:32:00-04:00,"[binance, ftx, gateio, kucoin]",979
7,DOGE_USDT,2019-07-05 08:00:00-04:00,2021-09-21 19:51:00-04:00,"[binance, ftx, gateio, kucoin]",809
8,SOL_USDT,2020-07-26 20:13:00-04:00,2021-09-21 18:33:00-04:00,"[binance, ftx, gateio, kucoin]",421
9,AVAX_USDT,2020-09-22 02:30:00-04:00,2021-09-20 12:39:00-04:00,"[binance, gateio, kucoin]",363
