# Imports

In [61]:
import logging
import time

import ccxt
import pandas as pd

import core.config.config_ as cconconf
import helpers.hdatetime as hdateti
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hprint as hprint
import helpers.hsecrets as hsecret
import im_v2.ccxt.data.client as icdcl
import im_v2.ccxt.data.extract.exchange_class as imvcdeexcl

In [62]:
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

INFO  # Git
    branch_name='CMTask1905_Check_CCXT_data_against_source'
    hash='6cf2fa939'
    # Last commits:
      *   6cf2fa939 DanilYachmenev Merge branch 'master' into CMTask1905_Check_CCXT_data_against_source (18 minutes ago) Wed May 18 16:22:28 2022  (HEAD -> CMTask1905_Check_CCXT_data_against_source, origin/CMTask1905_Check_CCXT_data_against_source)
      |\  
      | * 07beed00f Daniil Tikhomirov CMTask1950: enact minor TODOs (#1954)                             (27 minutes ago) Wed May 18 16:13:19 2022  (origin/master, origin/HEAD)
      * | a4712fe4e Nina Lee fixed, linter                                                     (40 minutes ago) Wed May 18 16:00:32 2022           
# Machine info
    system=Linux
    node name=175097e404cb
    release=5.13.0-1022-aws
    version=#24~20.04.1-Ubuntu SMP Thu Apr 7 22:10:15 UTC 2022
    machine=x86_64
    processor=x86_64
    cpu count=8
    cpu freq=scpufreq(current=2499.998, min=0.0, max=0.0)
    memory=svmem(total=33294798848, ava

In [63]:
def get_cmtask1905_config_ccxt() -> cconconf.Config:
    """
    Get task1905-specific config.
    """
    config = cconconf.Config()
    # Load parameters.
    config.add_subconfig("load")
    config["load"]["aws_profile"] = "ck"
    #
    config["load"]["data_dir"] = "s3://cryptokaizen-data/historical"
    # Data parameters.
    config.add_subconfig("data")
    config["data"]["vendor"] = "CCXT"
    config["data"]["data_snapshot"] = "latest"
    config["data"]["version"] = "v3"
    config["data"]["resample_1min"] = True
    config["data"]["partition_mode"] = "by_year_month"
    config["data"]["start_ts"] = None
    config["data"]["end_ts"] = None
    config["data"]["columns"] = None
    config["data"]["filter_data_mode"] = "assert"
    return config

In [64]:
config = get_cmtask1905_config_ccxt()
print(config)

load:
  aws_profile: ck
  data_dir: s3://cryptokaizen-data/historical
data:
  vendor: CCXT
  data_snapshot: latest
  version: v3
  resample_1min: True
  partition_mode: by_year_month
  start_ts: None
  end_ts: None
  columns: None
  filter_data_mode: assert


# Functions

In [65]:
def _get_ccxt_ohlcv_data(
    exchange: ccxt.Exchange,
    currency_pair: str,
    start_timestamp: pd.Timestamp,
    end_timestamp: pd.Timestamp,
) -> pd.DataFrame:
    """
    Get OHLCV data for a given exchange, currency pair and time period.
    """
    start_timestamp = start_timestamp.asm8.astype(int) // 1000000
    end_timestamp = end_timestamp.asm8.astype(int) // 1000000
    all_bars = []
    duration = exchange.parse_timeframe("1m") * 100
    for t in range(
        start_timestamp,
        end_timestamp + duration,
        duration * 500,
    ):
        bars = _get_ccxt_bar_data(currency_pair, t, exchange)
        all_bars.append(bars)
        time.sleep(1)
    all_data = pd.concat(all_bars)
    return all_data


def _get_ccxt_bar_data(
    currency_pair: str, since: "start timestamp", exchange: ccxt.Exchange
):
    """
    Get data for a single bar from CCXT.
    """
    ccxt_data = exchange.fetch_ohlcv(
        currency_pair, timeframe="1m", since=since, limit=500
    )
    columns = ["timestamp", "open", "high", "low", "close", "volume"]
    bars = pd.DataFrame(ccxt_data, columns=columns)
    return bars


def _get_data_for_year_month(
    df: pd.DataFrame, year: int, month: int
) -> pd.DataFrame:
    """
    :return: data for a specific year and month
    """
    df = df.loc[(df.index.year == year) & (df.index.month == month)]
    return df


def _get_data_with_volume_0(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute % of data points where volume = 0.

    :return: data with volume = 0.
    """
    df_volume_0 = df.loc[df["volume"] == 0]
    return df_volume_0


def _log_into_exchange(exchange: str) -> ccxt.Exchange:
    """
    Log into an exchange via CCXT and return the corresponding `ccxt.Exchange`
    object.
    """
    # Select credentials for provided exchange.
    credentials = hsecret.get_secret(exchange)
    # Enable rate limit.
    credentials["rateLimit"] = True
    exchange_class = getattr(ccxt, exchange)
    # Create a CCXT Exchange class object.
    exchange = exchange_class(credentials)
    hdbg.dassert(
        exchange.checkRequiredCredentials(),
        msg="Required credentials not passed",
    )
    return exchange


def _set_index_ts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert epoch column to timestamp index.
    """
    df["timestamp"] = df["timestamp"].apply(
        lambda x: hdateti.convert_unix_epoch_to_timestamp(x)
    )
    df = df.set_index("timestamp")
    return df

# CcxtHistoricalPqByTileClient

In [6]:
client = icdcl.CcxtHistoricalPqByTileClient(
    config["data"]["version"],
    config["data"]["resample_1min"],
    config["load"]["data_dir"],
    config["data"]["partition_mode"],
    aws_profile=config["load"]["aws_profile"],
)

In [7]:
universe = client.get_universe()
universe

['binance::ADA_USDT',
 'binance::AVAX_USDT',
 'binance::BNB_USDT',
 'binance::BTC_USDT',
 'binance::DOGE_USDT',
 'binance::EOS_USDT',
 'binance::ETH_USDT',
 'binance::LINK_USDT',
 'binance::SOL_USDT',
 'ftx::BNB_USDT',
 'ftx::BTC_USDT',
 'ftx::DOGE_USDT',
 'ftx::ETH_USDT',
 'ftx::LINK_USDT',
 'ftx::SOL_USDT',
 'ftx::XRP_USDT',
 'gateio::ADA_USDT',
 'gateio::AVAX_USDT',
 'gateio::BNB_USDT',
 'gateio::BTC_USDT',
 'gateio::DOGE_USDT',
 'gateio::EOS_USDT',
 'gateio::ETH_USDT',
 'gateio::FIL_USDT',
 'gateio::LINK_USDT',
 'gateio::SOL_USDT',
 'gateio::XRP_USDT',
 'kucoin::ADA_USDT',
 'kucoin::AVAX_USDT',
 'kucoin::BNB_USDT',
 'kucoin::BTC_USDT',
 'kucoin::DOGE_USDT',
 'kucoin::EOS_USDT',
 'kucoin::ETH_USDT',
 'kucoin::FIL_USDT',
 'kucoin::LINK_USDT',
 'kucoin::SOL_USDT',
 'kucoin::XRP_USDT']

# Binance::DOGE_USDT

In [8]:
full_symbol_binance = ["binance::DOGE_USDT"]
binance_data = client.read_data(
    full_symbol_binance,
    config["data"]["start_ts"],
    config["data"]["end_ts"],
    config["data"]["columns"],
    config["data"]["filter_data_mode"],
)

In [9]:
binance_2019_09 = _get_data_for_year_month(binance_data, 2019, 9)
binance_2019_09_volume_0 = _get_data_with_volume_0(binance_2019_09)
binance_2019_09.head(3)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-01 00:00:00+00:00,binance::DOGE_USDT,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:01:00+00:00,binance::DOGE_USDT,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:02:00+00:00,binance::DOGE_USDT,0.002452,0.002452,0.002452,0.002452,6927.0


In [10]:
binance_2019_09_volume_0.head(3)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-01 00:00:00+00:00,binance::DOGE_USDT,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:01:00+00:00,binance::DOGE_USDT,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:03:00+00:00,binance::DOGE_USDT,0.002452,0.002452,0.002452,0.002452,0.0


# Extractor

In [11]:
ccxt_binance_DOGE_exchange = imvcdeexcl.CcxtExchange("binance")

In [12]:
currency_pair_binance = "DOGE/USDT"
start_timestamp = pd.Timestamp("2019-09-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2019-09-30 23:59:59+00:00")
sleep_time_in_secs = 1
ccxt_binance_DOGE = ccxt_binance_DOGE_exchange.download_ohlcv_data(
    currency_pair_binance,
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [01:56<00:00,  1.34s/it]


In [14]:
ccxt_binance_DOGE = _set_index_ts(ccxt_binance_DOGE)
ccxt_binance_DOGE = _get_data_for_year_month(ccxt_binance_DOGE, 2019, 9)
ccxt_binance_DOGE_volume_0 = _get_data_with_volume_0(ccxt_binance_DOGE)
ccxt_binance_DOGE.head(3)

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-01 00:00:00+00:00,0.002453,0.002453,0.002453,0.002453,0.0,2022-05-18 13:45:19.357361+00:00
2019-09-01 00:01:00+00:00,0.002453,0.002453,0.002453,0.002453,0.0,2022-05-18 13:45:19.357361+00:00
2019-09-01 00:02:00+00:00,0.002452,0.002452,0.002452,0.002452,6927.0,2022-05-18 13:45:19.357361+00:00


In [15]:
ccxt_binance_DOGE_volume_0.head(3)

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-01 00:00:00+00:00,0.002453,0.002453,0.002453,0.002453,0.0,2022-05-18 13:45:19.357361+00:00
2019-09-01 00:01:00+00:00,0.002453,0.002453,0.002453,0.002453,0.0,2022-05-18 13:45:19.357361+00:00
2019-09-01 00:03:00+00:00,0.002452,0.002452,0.002452,0.002452,0.0,2022-05-18 13:45:19.357361+00:00


# CCXT w/o Extractor

In [16]:
ccxt_exchange = _log_into_exchange("binance")
start_ts = pd.Timestamp("2019-09-01 00:00:00+00:00")
end_ts = pd.Timestamp("2019-09-30 23:59:59+00:00")
ccxt_df = _get_ccxt_ohlcv_data(
    ccxt_exchange, currency_pair_binance, start_ts, end_ts
)
ccxt_df = _set_index_ts(ccxt_df)
ccxt_df = _get_data_for_year_month(ccxt_df, 2019, 9)
ccxt_df_volume_0 = _get_data_with_volume_0(ccxt_df)
ccxt_df

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-09-01 00:00:00+00:00,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:01:00+00:00,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:02:00+00:00,0.002452,0.002452,0.002452,0.002452,6927.0
2019-09-01 00:03:00+00:00,0.002452,0.002452,0.002452,0.002452,0.0
2019-09-01 00:04:00+00:00,0.002452,0.002452,0.002452,0.002452,0.0
...,...,...,...,...,...
2019-09-30 23:55:00+00:00,0.002375,0.002375,0.002375,0.002375,0.0
2019-09-30 23:56:00+00:00,0.002375,0.002375,0.002375,0.002375,0.0
2019-09-30 23:57:00+00:00,0.002375,0.002375,0.002375,0.002375,0.0
2019-09-30 23:58:00+00:00,0.002375,0.002375,0.002375,0.002375,0.0


In [17]:
ccxt_df_volume_0.head(3)

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-09-01 00:00:00+00:00,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:01:00+00:00,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:03:00+00:00,0.002452,0.002452,0.002452,0.002452,0.0


# Summary


|CCXT | | ||            Extractor    | | | |Client | | |
|------|--|-||-------------|-|-|-|------|-|-|
|date|Number of NaN rows %|    Total number of rows| `volume=0` %    |Number of NaN rows %|    Total number of rows| `volume=0` %| Number of NaN rows %|    Total number of rows| `volume=0` %|
|2019-09|    0          |                       429750|          73.22%       |    0          |                       43200|          73.3%   |      0|                43200| 73.3%|


- The huge amount of data from CCXT is duplicated.
- Where volume = 0, data from columns open, high, low, close is exactly the same as in the last row with `volume != 0`. It could mean that volume = 0 rows are NaNs at the source, so it could be the way exchange handles missing data.

# ftx::BTC_USDT

## Client

In [18]:
full_symbol_ftx = ["ftx::BTC_USDT"]
ftx_data = client.read_data(
    full_symbol_ftx,
    config["data"]["start_ts"],
    config["data"]["end_ts"],
    config["data"]["columns"],
    config["data"]["filter_data_mode"],
)

In [19]:
ftx_2020_04 = _get_data_for_year_month(ftx_data, 2020, 4)
ftx_2020_04_volume_0 = _get_data_with_volume_0(ftx_2020_04)
ftx_2020_04.head(3)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-01 00:00:00+00:00,ftx::BTC_USDT,6410.0,6422.5,6410.0,6419.0,0.0
2020-04-01 00:01:00+00:00,ftx::BTC_USDT,6419.0,6421.0,6411.0,6417.0,0.0
2020-04-01 00:02:00+00:00,ftx::BTC_USDT,6417.0,6419.5,6415.5,6418.0,0.0


In [20]:
ftx_2020_04_volume_0.head(3)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-01 00:00:00+00:00,ftx::BTC_USDT,6410.0,6422.5,6410.0,6419.0,0.0
2020-04-01 00:01:00+00:00,ftx::BTC_USDT,6419.0,6421.0,6411.0,6417.0,0.0
2020-04-01 00:02:00+00:00,ftx::BTC_USDT,6417.0,6419.5,6415.5,6418.0,0.0


In [21]:
ftx_2020_04.loc[ftx_2020_04["open"].isna()]

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


## Extractor

In [22]:
ccxt_ftx_BTC_exchange = imvcdeexcl.CcxtExchange("ftx")
currency_pair_ftx = "BTC/USDT"
start_timestamp = pd.Timestamp("2020-04-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2020-04-30 23:59:59+00:00")
sleep_time_in_secs = 1
ccxt_ftx_BTC = ccxt_ftx_BTC_exchange.download_ohlcv_data(
    currency_pair_ftx,
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [02:10<00:00,  1.50s/it]


In [23]:
ccxt_ftx_BTC = _set_index_ts(ccxt_ftx_BTC)
ccxt_ftx_BTC = _get_data_for_year_month(ccxt_ftx_BTC, 2020, 4)
ccxt_ftx_BTC_volume_0 = _get_data_with_volume_0(ccxt_ftx_BTC)
ccxt_ftx_BTC.head(3)

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-01 00:00:00+00:00,6410.0,6422.5,6410.0,6419.0,0.0,2022-05-18 14:07:12.381197+00:00
2020-04-01 00:01:00+00:00,6419.0,6421.0,6411.0,6417.0,0.0,2022-05-18 14:07:12.381197+00:00
2020-04-01 00:02:00+00:00,6417.0,6419.5,6415.5,6418.0,0.0,2022-05-18 14:07:12.381197+00:00


In [24]:
ccxt_ftx_BTC_volume_0.loc[ccxt_ftx_BTC["high"] == 7493.50000000].head(3)

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-24 00:34:00+00:00,7491.5,7493.5,7488.5,7488.5,0.0,2022-05-18 14:08:51.659306+00:00
2020-04-24 01:41:00+00:00,7492.5,7493.5,7488.0,7490.0,0.0,2022-05-18 14:08:51.659306+00:00
2020-04-24 02:04:00+00:00,7487.0,7493.5,7487.0,7493.5,0.0,2022-05-18 14:08:51.659306+00:00


In [25]:
ccxt_ftx_BTC.loc[(ccxt_ftx_BTC.index.day == 25) & (ccxt_ftx_BTC.index.hour == 3)][
    30:43
]

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-25 03:30:00+00:00,7497.0,7500.0,7497.0,7500.0,0.0,2022-05-18 14:08:56.774049+00:00
2020-04-25 03:31:00+00:00,7500.0,7518.5,7500.0,7515.5,277272.18765,2022-05-18 14:08:56.774049+00:00
2020-04-25 03:32:00+00:00,7515.5,7515.5,7509.0,7509.0,0.0,2022-05-18 14:08:56.774049+00:00
2020-04-25 03:33:00+00:00,7509.0,7510.5,7507.5,7508.5,0.0,2022-05-18 14:08:56.774049+00:00
2020-04-25 03:34:00+00:00,7508.5,7512.5,7505.5,7512.5,60814.8,2022-05-18 14:08:56.774049+00:00
2020-04-25 03:35:00+00:00,7512.5,7513.5,7510.5,7512.0,60847.2,2022-05-18 14:08:56.774049+00:00
2020-04-25 03:36:00+00:00,7512.0,7513.5,7512.0,7512.0,0.0,2022-05-18 14:08:56.774049+00:00
2020-04-25 03:37:00+00:00,7512.0,7514.0,7511.5,7511.5,0.0,2022-05-18 14:08:56.774049+00:00
2020-04-25 03:38:00+00:00,7511.5,7512.0,7511.0,7511.5,9.7643,2022-05-18 14:08:56.774049+00:00
2020-04-25 03:39:00+00:00,7511.5,7513.5,7511.0,7511.0,0.0,2022-05-18 14:08:56.774049+00:00


## CCXT w/o Extractor

In [68]:
ccxt_exchange_ftx = _log_into_exchange("ftx")
start_ts = pd.Timestamp("2020-04-01 00:00:00+00:00")
end_ts = pd.Timestamp("2020-04-30 23:59:59+00:00")
ccxt_df_ftx = _get_ccxt_ohlcv_data(
    ccxt_exchange_ftx, currency_pair_ftx, start_ts, end_ts
)
ccxt_df_ftx = _set_index_ts(ccxt_df_ftx)
ccxt_df_ftx = _get_data_for_year_month(ccxt_df_ftx, 2020, 4)
ccxt_df_ftx_volume_0 = _get_data_with_volume_0(ccxt_df_ftx)
print(len(ccxt_df_ftx.index.unique()))
display(ccxt_df_ftx.head(3))

43200


Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-01 00:00:00+00:00,6410.0,6422.5,6410.0,6419.0,0.0
2020-04-01 00:01:00+00:00,6419.0,6421.0,6411.0,6417.0,0.0
2020-04-01 00:02:00+00:00,6417.0,6419.5,6415.5,6418.0,0.0



|CCXT | | ||            Extractor    | | | |Client | | |
|------|--|-||-------------|-|-|-|------|-|-|
|date|Number of NaN rows %|    Total number of rows| `volume=0` %    |Number of NaN rows %|    Total number of rows| `volume=0` %| Number of NaN rows %|    Total number of rows| `volume=0` %|
|2020-04|    0          |                       429750|          86.09%       |    0          |                       43200|          85.97%   |      0|                43200| 85.97%|


Values in `volume = 0` rows change and don't have the same pattern as binance.

# gateio::ETH_USDT w/o `volume = 0` in data

Data from `gateio` has NaN spikes in September, October and November in 2021.

## Client

In [31]:
full_symbols_gateio = ["gateio::ETH_USDT", "gateio::ADA_USDT"]
gateio_data = client.read_data(
    [full_symbols_gateio[0]],
    config["data"]["start_ts"],
    config["data"]["end_ts"],
    config["data"]["columns"],
    config["data"]["filter_data_mode"],
)

### October 2021 - 100% of `NaNs`

In [32]:
gateio_data_2021_10 = _get_data_for_year_month(gateio_data, 2021, 10)
gateio_data_2021_10.head(3)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-10-01 00:00:00+00:00,gateio::ETH_USDT,,,,,
2021-10-01 00:01:00+00:00,gateio::ETH_USDT,,,,,
2021-10-01 00:02:00+00:00,gateio::ETH_USDT,,,,,


In [33]:
gateio_data_2021_10.isna().value_counts()

full_symbol  open  high  low   close  volume
False        True  True  True  True   True      44640
dtype: int64

### 34.46% of `NaNs`

In [35]:
gateio_data_2021_09 = _get_data_for_year_month(gateio_data, 2021, 9)
gateio_data_2021_09.head(3)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-09-01 00:00:00+00:00,gateio::ETH_USDT,3428.86,3429.16,3417.4,3418.1,60825.830384
2021-09-01 00:01:00+00:00,gateio::ETH_USDT,3417.89,3421.68,3416.13,3418.18,75574.610423
2021-09-01 00:02:00+00:00,gateio::ETH_USDT,3418.74,3419.99,3407.96,3408.506591,66806.84125


In [36]:
gateio_data_2021_09.isna().value_counts()

full_symbol  open   high   low    close  volume
False        False  False  False  False  False     28314
             True   True   True   True   True      14886
dtype: int64

### No `NaNs`

In [37]:
gateio_data.loc[
    (gateio_data.index.year == 2021) & (gateio_data.index.month == 12)
].head(3)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-12-01 00:00:00+00:00,gateio::ETH_USDT,4629.99,4632.1,4616.58,4616.77,245362.324425
2021-12-01 00:01:00+00:00,gateio::ETH_USDT,4616.59,4618.73,4606.01,4607.25,127593.364191
2021-12-01 00:02:00+00:00,gateio::ETH_USDT,4607.42,4614.05,4602.99,4604.49,436587.493125


## Extractor

In [38]:
ccxt_gateio_exchange = imvcdeexcl.CcxtExchange("gateio")
currency_pair_gateio = ["ETH/USDT", "ADA/USDT"]
start_timestamp = pd.Timestamp("2021-09-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2021-09-30 23:59:59+00:00")
sleep_time_in_secs = 1
ccxt_gateio_ETH = ccxt_gateio_exchange.download_ohlcv_data(
    currency_pair_gateio[0],
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [01:49<00:00,  1.26s/it]


In [39]:
ccxt_gateio_ETH

Unnamed: 0,timestamp,open,high,low,close,volume,end_download_timestamp


In [41]:
start_timestamp = pd.Timestamp("2021-10-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2021-10-31 23:59:59+00:00")
sleep_time_in_secs = 1
ccxt_gateio_ETH_10 = ccxt_gateio_exchange.download_ohlcv_data(
    currency_pair_gateio[0],
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:54<00:00,  1.27s/it]


In [42]:
ccxt_gateio_ETH_10

Unnamed: 0,timestamp,open,high,low,close,volume,end_download_timestamp


In [43]:
start_timestamp = pd.Timestamp("2021-12-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2021-12-31 23:59:59+00:00")
sleep_time_in_secs = 1
ccxt_gateio_ETH_12 = ccxt_gateio_exchange.download_ohlcv_data(
    currency_pair_gateio[0],
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:53<00:00,  1.26s/it]


In [44]:
ccxt_gateio_ETH_12

Unnamed: 0,timestamp,open,high,low,close,volume,end_download_timestamp


Empty datasets are returned for the dates earlier than December 2021. Probably data is accessible only for a certain amount of time, e.g., 1 year.

In [45]:
# Load recent data to make sure API and Exctractor are working.
start_timestamp = pd.Timestamp("2022-04-25 00:00:00+00:00")
end_timestamp = pd.Timestamp("2022-05-14 23:59:59+00:00")
sleep_time_in_secs = 1
ccxt_gateio_ETH_2022 = ccxt_gateio_exchange.download_ohlcv_data(
    currency_pair_gateio[0],
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [01:14<00:00,  1.28s/it]


In [46]:
ccxt_gateio_ETH_2022.head(3)

Unnamed: 0,timestamp,open,high,low,close,volume,end_download_timestamp
0,1650844800000,2920.97,2924.54,2919.81,2924.54,69040.066928,2022-05-18 15:13:05.656362+00:00
1,1650844860000,2924.54,2926.13,2923.59,2923.93,29546.298121,2022-05-18 15:13:05.656362+00:00
2,1650844920000,2924.03,2924.75,2921.06,2921.06,57124.841251,2022-05-18 15:13:05.656362+00:00


## CCXT w/o Extractor

Empty data is also returned from CCXT directly which means that the problem is at source.

In [49]:
ccxt_exchange = _log_into_exchange("gateio")
start_ts = pd.Timestamp("2021-09-01 00:00:00+00:00")
end_ts = pd.Timestamp("2021-09-30 23:59:59+00:00")
ccxt_df = _get_ccxt_ohlcv_data(
    ccxt_exchange, currency_pair_gateio[0], start_ts, end_ts
)
ccxt_df

Unnamed: 0,timestamp,open,high,low,close,volume


### Summary for `gateio` `volume != 0` data.

- Data before January 2022 is not accessible from `gateio` via CCXT
- There are spikes of NaNs in September, October, November 2021 that are common for all coins

# gateio::ADA_USDT with `volume = 0` in data

## Client

In [50]:
gateio_ADA_data = client.read_data(
    [full_symbols_gateio[1]],
    config["data"]["start_ts"],
    config["data"]["end_ts"],
    config["data"]["columns"],
    config["data"]["filter_data_mode"],
)

In [51]:
gateio_ADA_data_2021_09 = _get_data_for_year_month(gateio_ADA_data, 2021, 9)
gateio_ADA_data_2021_09_volume_0 = _get_data_with_volume_0(
    gateio_ADA_data_2021_09
)
gateio_ADA_data_2021_09.head(3)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-09-01 00:00:00+00:00,gateio::ADA_USDT,2.7688,2.769,2.7633,2.7633,5081.382101
2021-09-01 00:01:00+00:00,gateio::ADA_USDT,2.7633,2.7663,2.7618,2.7632,2492.706958
2021-09-01 00:02:00+00:00,gateio::ADA_USDT,2.7639,2.764,2.7571,2.7572,8067.851931


In [52]:
gateio_ADA_data_2021_09_volume_0

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-09-01 10:54:00+00:00,gateio::ADA_USDT,2.8252,2.8252,2.8252,2.8252,0.0
2021-09-03 20:10:00+00:00,gateio::ADA_USDT,2.9862,2.9862,2.9862,2.9862,0.0
2021-09-05 03:57:00+00:00,gateio::ADA_USDT,2.8642,2.8642,2.8642,2.8642,0.0
2021-09-11 00:00:00+00:00,gateio::ADA_USDT,2.3843,2.3843,2.3843,2.3843,0.0
2021-09-11 05:21:00+00:00,gateio::ADA_USDT,2.4122,2.4122,2.4122,2.4122,0.0


In [53]:
gateio_ADA_data_2021_09.loc[
    (gateio_ADA_data_2021_09.index.day == 5)
    & (gateio_ADA_data_2021_09.index.hour == 3)
].tail(3)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-09-05 03:57:00+00:00,gateio::ADA_USDT,2.8642,2.8642,2.8642,2.8642,0.0
2021-09-05 03:58:00+00:00,gateio::ADA_USDT,2.8641,2.8641,2.863,2.863,1674.800775
2021-09-05 03:59:00+00:00,gateio::ADA_USDT,2.863,2.863,2.8562,2.8588,33621.183701


In [55]:
# `volume = 0` has the same % as bad data
gateio_ADA_data_2021_07 = _get_data_for_year_month(gateio_ADA_data, 2021, 7)
gateio_ADA_data_2021_07_volume_0 = _get_data_with_volume_0(
    gateio_ADA_data_2021_07
)
gateio_ADA_data_2021_07.head(3)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-07-01 00:00:00+00:00,gateio::ADA_USDT,1.3851,1.3856,1.3823,1.3823,7390.530593
2021-07-01 00:01:00+00:00,gateio::ADA_USDT,1.3831,1.3835,1.3794,1.3801,9420.448341
2021-07-01 00:02:00+00:00,gateio::ADA_USDT,1.3794,1.3795,1.3772,1.3778,4909.162421


In [56]:
gateio_ADA_data_2021_07_volume_0[:10]

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-07-03 09:21:00+00:00,gateio::ADA_USDT,1.4126,1.4126,1.4126,1.4126,0.0
2021-07-03 09:22:00+00:00,gateio::ADA_USDT,1.4126,1.4126,1.4126,1.4126,0.0
2021-07-03 09:24:00+00:00,gateio::ADA_USDT,1.4117,1.4117,1.4117,1.4117,0.0
2021-07-03 09:25:00+00:00,gateio::ADA_USDT,1.4117,1.4117,1.4117,1.4117,0.0
2021-07-03 09:28:00+00:00,gateio::ADA_USDT,1.4125,1.4125,1.4125,1.4125,0.0
2021-07-03 09:31:00+00:00,gateio::ADA_USDT,1.4103,1.4103,1.4103,1.4103,0.0
2021-07-03 09:33:00+00:00,gateio::ADA_USDT,1.4103,1.4103,1.4103,1.4103,0.0
2021-07-03 09:34:00+00:00,gateio::ADA_USDT,1.4103,1.4103,1.4103,1.4103,0.0
2021-07-03 13:43:00+00:00,gateio::ADA_USDT,1.4192,1.4192,1.4192,1.4192,0.0
2021-07-03 16:34:00+00:00,gateio::ADA_USDT,1.4204,1.4204,1.4204,1.4204,0.0


In [57]:
gateio_ADA_data_2021_07.loc[
    gateio_ADA_data_2021_07.index >= "2021-07-03 09:20:00+00:00"
].head(10)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-07-03 09:20:00+00:00,gateio::ADA_USDT,1.4148,1.4148,1.4123,1.4126,1983.83057
2021-07-03 09:21:00+00:00,gateio::ADA_USDT,1.4126,1.4126,1.4126,1.4126,0.0
2021-07-03 09:22:00+00:00,gateio::ADA_USDT,1.4126,1.4126,1.4126,1.4126,0.0
2021-07-03 09:23:00+00:00,gateio::ADA_USDT,1.4111,1.4117,1.4108,1.4117,338.153496
2021-07-03 09:24:00+00:00,gateio::ADA_USDT,1.4117,1.4117,1.4117,1.4117,0.0
2021-07-03 09:25:00+00:00,gateio::ADA_USDT,1.4117,1.4117,1.4117,1.4117,0.0
2021-07-03 09:26:00+00:00,gateio::ADA_USDT,1.4113,1.4118,1.4113,1.4118,65.399008
2021-07-03 09:27:00+00:00,gateio::ADA_USDT,1.4118,1.4125,1.4118,1.4125,5151.105443
2021-07-03 09:28:00+00:00,gateio::ADA_USDT,1.4125,1.4125,1.4125,1.4125,0.0
2021-07-03 09:29:00+00:00,gateio::ADA_USDT,1.4124,1.4124,1.4103,1.4103,1644.847671


The pattern is: all values in price-related columns with `volume = 0` are the same as the value of `close` of the last row where `volume != 0`.

## Extractor

In [58]:
start_timestamp = pd.Timestamp("2021-07-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2021-07-31 23:59:59+00:00")
sleep_time_in_secs = 1
ccxt_gateio_ADA = ccxt_gateio_exchange.download_ohlcv_data(
    currency_pair_gateio[1],
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:54<00:00,  1.27s/it]


In [59]:
ccxt_gateio_ADA

Unnamed: 0,timestamp,open,high,low,close,volume,end_download_timestamp


## Summary for `gateio`

- Empty data is returned from CCXT and Extractor directly which means that the problem is at source.
- The pattern is: all values in price-related columns with `volume = 0` are the same as the value of `close` of the last row where `volume != 0`.
- Data before January 2022 is not accessible from `gateio` via CCXT
- There are spikes of NaNs in September, October, November 2021 that are common for all coins