In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Imports

In [3]:
import logging
import os
import requests
import time

import ccxt
import matplotlib.pyplot as plt
import pandas as pd

import core.config.config_ as cconconf
import core.statistics as costatis
import helpers.hdatetime as hdateti
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hprint as hprint
import helpers.hs3 as hs3
import helpers.hsecrets as hsecret
import im_v2.ccxt.data.client as icdcl
import im_v2.ccxt.data.extract.exchange_class as imvcdeexcl

  from tqdm.autonotebook import tqdm


In [4]:
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-f627a239-a49d-4c7b-b927-30443ce8b189.json'
INFO  # Git
    branch_name='CMTask1905_Check_CCXT_data_against_source'
    hash='c7dfbd74d'
    # Last commits:
      * c7dfbd74d Nina Lee fix output                                                        (71 minutes ago) Fri May 13 19:50:38 2022  (HEAD -> CMTask1905_Check_CCXT_data_against_source, origin/CMTask1905_Check_CCXT_data_against_source)
      * 2c4e04c50 Nina Lee CMTask1905: Check CCXT data against source                        (76 minutes ago) Fri May 13 19:45:34 2022           
      * 54aaa90f6 Daniil Tikhomirov CMTask1897: add csv support (#1912)                               (   6 hours ago) Fri May 13 15:07:50 2022  (origin/master, origin/HEAD, master)
# Machine info
    system=Linux
    node name=3b01b48470d2
    release=5.13.0-1022-aws
    version=#24~20.04.1-Ubuntu SMP Thu Apr 7 22:10:15 UTC 2022
 

In [5]:
def get_cmtask1866_config_ccxt() -> cconconf.Config:
    """
    Get task1866-specific config.
    """
    config = cconconf.Config()
    # Load parameters.
    config.add_subconfig("load")
    config["load"]["aws_profile"] = "ck"
    #
    s3_bucket_path = hs3.get_s3_bucket_path(config["load"]["aws_profile"])
    s3_path = "s3://cryptokaizen-data/historical"
    config["load"]["data_dir"] = s3_path
    # Data parameters.
    config.add_subconfig("data")
    config["data"]["vendor"] = "CCXT"
    config["data"]["data_snapshot"] = "latest"
    config["data"]["version"] = "v3"
    config["data"]["resample_1min"] = True
    config["data"]["partition_mode"] = "by_year_month"
    config["data"]["start_ts"] = None
    config["data"]["end_ts"] = None
    config["data"]["columns"] = None
    config["data"]["filter_data_mode"] = "assert"
    # Column names.
    config.add_subconfig("column_names")
    config["column_names"]["full_symbol"] = "full_symbol"
    config["column_names"]["close_price"] = "close"
    return config

In [6]:
config = get_cmtask1866_config_ccxt()
print(config)

load:
  aws_profile: ck
  data_dir: s3://cryptokaizen-data/historical
data:
  vendor: CCXT
  data_snapshot: latest
  version: v3
  resample_1min: True
  partition_mode: by_year_month
  start_ts: None
  end_ts: None
  columns: None
  filter_data_mode: assert
column_names:
  full_symbol: full_symbol
  close_price: close


In [22]:
 pd.set_option("display.float_format", "{:.8f}".format)

# Functions

In [7]:
def _get_qa_stats(data: pd.DataFrame, config: cconconf.Config) -> pd.DataFrame:
    """
    Get quality assurance stats per full symbol in data.
    """
    res_stats = []
    for full_symbol, symbol_data in data.groupby(
        config["column_names"]["full_symbol"]
    ):
        # Compute stats for a full symbol.
        symbol_stats = pd.Series(dtype="object", name=full_symbol)
        symbol_stats["min_timestamp"] = symbol_data.index.min()
        symbol_stats["max_timestamp"] = symbol_data.index.max()
        symbol_stats["NaNs %"] = 100 * (
            costatis.compute_frac_nan(
                symbol_data[config["column_names"]["close_price"]]
            )
        )
        symbol_stats["volume=0 %"] = 100 * (
            symbol_data[symbol_data["volume"] == 0].shape[0]
            / symbol_data.shape[0]
        )
        symbol_stats["bad data %"] = symbol_stats["NaNs %"] + symbol_stats["volume=0 %"]
        res_stats.append(symbol_stats)
    # Combine all full symbol stats.
    res_stats_df = pd.concat(res_stats, axis=1).T
    return res_stats_df


def _get_qa_stats_by_year_month(
    data: pd.DataFrame, config: cconconf.Config
) -> pd.DataFrame:
    """
    Get quality assurance stats per full symbol, year, and month.
    """
    #
    data["year"] = data.index.year
    data["month"] = data.index.month
    #
    res_stats = []
    columns_to_groupby = [config["column_names"]["full_symbol"], "year", "month"]
    for index, symbol_data in data.groupby(columns_to_groupby):
        #
        full_symbol, year, month = index
        # Get stats for a full symbol and add them to overall stats.
        symbol_stats = pd.Series(dtype="object", name=full_symbol)
        symbol_stats["year"] = year
        symbol_stats["month"] = month
        symbol_stats["NaNs %"] = 100 * (
            costatis.compute_frac_nan(
                symbol_data[config["column_names"]["close_price"]]
            )
        )
        symbol_stats["volume=0 %"] = 100 * (
            symbol_data[symbol_data["volume"] == 0].shape[0]
            / symbol_data.shape[0]
        )
        symbol_stats["bad data %"] = symbol_stats["NaNs %"] + symbol_stats["volume=0 %"]
        res_stats.append(symbol_stats)
    res_stats_df = pd.concat(res_stats, axis=1).T
    #
    res_stats_df["year"] = res_stats_df["year"].astype(int)
    res_stats_df["month"] = res_stats_df["month"].astype(int)
    # Set index by full symbol, year, and month.
    res_stats_df = res_stats_df.set_index([res_stats_df.index, "year", "month"])
    return res_stats_df


def _plot_bad_data_stats(bad_data_stats: pd.DataFrame) -> None:
    """
    Plot bad data stats per unique full symbol in data.
    """
    full_symbols = bad_data_stats.index.get_level_values(0).unique()
    for full_symbol in full_symbols:
        bad_data_col_name = "bad data %"
        _ = bad_data_stats.loc[full_symbol].plot.bar(
            y=bad_data_col_name, rot=0, title=full_symbol
        )

In [8]:
def set_index_ts(df):
    df["timestamp"] = df["timestamp"].apply(
        lambda x: hdateti.convert_unix_epoch_to_timestamp(x)
    )
    df = df.set_index("timestamp")
    return df

In [9]:
def percentage(df, df_loc):
    result = 100*len(df_loc)/len(df)
    return round(result, 2)

def log_into_exchange(exchange) -> ccxt.Exchange:
    """
    Log into an exchange via CCXT and return the corresponding
    `ccxt.Exchange` object.
    """
    # Select credentials for provided exchange.
    credentials = hsecret.get_secret(exchange)
    # Enable rate limit.
    credentials["rateLimit"] = True
    exchange_class = getattr(ccxt, exchange)
    # Create a CCXT Exchange class object.
    exchange = exchange_class(credentials)
    hdbg.dassert(
        exchange.checkRequiredCredentials(),
        msg="Required credentials not passed",
    )
    return exchange

In [49]:
def load_ccxt_data(currency_pair, since, exchange):
    ccxt_data = exchange.fetch_ohlcv(
            currency_pair,
            timeframe="1m",
            since=since,
            limit=500)
    columns = ["timestamp", "open", "high", "low", "close", "volume"]
    bars = pd.DataFrame(ccxt_data, columns=columns)
    return bars

In [52]:
def get_all_data(exchange, currency_pair, start_timestamp, end_timestamp):
    all_bars = []
    duration = exchange.parse_timeframe("1m") * 100
    for t in range(
            start_timestamp,
            end_timestamp + duration,
            duration * 500,
        ):
        bars = load_ccxt_data(currency_pair, t, exchange)
        all_bars.append(bars)
        time.sleep(1)
    return pd.concat(all_bars)

# CcxtHistoricalPqByTileClient

In [12]:
client = icdcl.CcxtHistoricalPqByTileClient(
    config["data"]["version"],
    config["data"]["resample_1min"],
    config["load"]["data_dir"],
    config["data"]["partition_mode"],
    aws_profile=config["load"]["aws_profile"],
)

In [10]:
universe = client.get_universe()
universe

['binance::ADA_USDT',
 'binance::AVAX_USDT',
 'binance::BNB_USDT',
 'binance::BTC_USDT',
 'binance::DOGE_USDT',
 'binance::EOS_USDT',
 'binance::ETH_USDT',
 'binance::LINK_USDT',
 'binance::SOL_USDT',
 'ftx::BNB_USDT',
 'ftx::BTC_USDT',
 'ftx::DOGE_USDT',
 'ftx::ETH_USDT',
 'ftx::LINK_USDT',
 'ftx::SOL_USDT',
 'ftx::XRP_USDT',
 'gateio::ADA_USDT',
 'gateio::AVAX_USDT',
 'gateio::BNB_USDT',
 'gateio::BTC_USDT',
 'gateio::DOGE_USDT',
 'gateio::EOS_USDT',
 'gateio::ETH_USDT',
 'gateio::FIL_USDT',
 'gateio::LINK_USDT',
 'gateio::SOL_USDT',
 'gateio::XRP_USDT',
 'kucoin::ADA_USDT',
 'kucoin::AVAX_USDT',
 'kucoin::BNB_USDT',
 'kucoin::BTC_USDT',
 'kucoin::DOGE_USDT',
 'kucoin::EOS_USDT',
 'kucoin::ETH_USDT',
 'kucoin::FIL_USDT',
 'kucoin::LINK_USDT',
 'kucoin::SOL_USDT',
 'kucoin::XRP_USDT']

# Binance::DOGE_USDT

In [11]:
binance_data = client.read_data(
    ["binance::DOGE_USDT"],
    config["data"]["start_ts"],
    config["data"]["end_ts"],
    config["data"]["columns"],
    config["data"]["filter_data_mode"],
)

Unnamed: 0,full_symbol,open,high,low,close,volume
2019-10-01 00:00:00+00:00,binance::DOGE_USDT,0.002379,0.00238,0.002379,0.00238,143877.0
,...,...,...,...,...,...
2022-05-06 13:56:00+00:00,binance::DOGE_USDT,0.1241,0.1242,0.124,0.1241,176923.0


Unnamed: 0,col_name,dtype,num_unique,num_nans,first_elem,type(first_elem)
0,index,"datetime64[ns, UTC]",1488968 / 1488968 = 100.00%,0 / 1488968 = 0.00%,2019-10-01T00:00:00.000000000,<class 'numpy.datetime64'>
1,full_symbol,object,1 / 1488968 = 0.00%,0 / 1488968 = 0.00%,binance::DOGE_USDT,<class 'str'>
2,open,float64,191026 / 1488968 = 12.83%,0 / 1488968 = 0.00%,0.0023791,<class 'numpy.float64'>
3,high,float64,177607 / 1488968 = 11.93%,0 / 1488968 = 0.00%,0.00238,<class 'numpy.float64'>
4,low,float64,178090 / 1488968 = 11.96%,0 / 1488968 = 0.00%,0.0023791,<class 'numpy.float64'>
5,close,float64,189862 / 1488968 = 12.75%,0 / 1488968 = 0.00%,0.00238,<class 'numpy.float64'>
6,volume,float64,834051 / 1488968 = 56.02%,0 / 1488968 = 0.00%,143877.0,<class 'numpy.float64'>


# df=
index=[2019-07-05 12:00:00+00:00, 2022-05-06 13:56:00+00:00]
columns=full_symbol,open,high,low,close,volume
shape=(1488968, 6)
* type=



Unnamed: 0,full_symbol,open,high,low,close,volume
2019-10-01 00:00:00+00:00,binance::DOGE_USDT,0.002379,0.00238,0.002379,0.00238,143877.0
2019-10-01 00:01:00+00:00,binance::DOGE_USDT,0.00238,0.00238,0.00238,0.00238,0.0
2019-10-01 00:02:00+00:00,binance::DOGE_USDT,0.00238,0.00238,0.00238,0.00238,0.0
,...,...,...,...,...,...
2022-05-06 13:54:00+00:00,binance::DOGE_USDT,0.1241,0.1243,0.124,0.1242,1233386.0
2022-05-06 13:55:00+00:00,binance::DOGE_USDT,0.1243,0.1244,0.1241,0.1242,834687.0
2022-05-06 13:56:00+00:00,binance::DOGE_USDT,0.1241,0.1242,0.124,0.1241,176923.0


Unnamed: 0,full_symbol,open,high,low,close,volume
2019-07-05 12:00:00+00:00,binance::DOGE_USDT,0.00449,0.0046,0.00376,0.0042,60726008.0
2019-07-05 12:01:00+00:00,binance::DOGE_USDT,0.0042,0.004387,0.0042,0.0043,84307704.0
2019-07-05 12:02:00+00:00,binance::DOGE_USDT,0.0043,0.004475,0.0043,0.004475,48182744.0
,...,...,...,...,...,...
2022-05-06 13:54:00+00:00,binance::DOGE_USDT,0.1241,0.1243,0.124,0.1242,1233386.0
2022-05-06 13:55:00+00:00,binance::DOGE_USDT,0.1243,0.1244,0.1241,0.1242,834687.0
2022-05-06 13:56:00+00:00,binance::DOGE_USDT,0.1241,0.1242,0.124,0.1241,176923.0


Unnamed: 0,full_symbol,open,high,low,close,volume
2019-07-05 12:00:00+00:00,binance::DOGE_USDT,0.00449,0.0046,0.00376,0.0042,60726008.0
2019-07-05 12:01:00+00:00,binance::DOGE_USDT,0.0042,0.004387,0.0042,0.0043,84307704.0
2019-07-05 12:02:00+00:00,binance::DOGE_USDT,0.0043,0.004475,0.0043,0.004475,48182744.0
,...,...,...,...,...,...
2022-05-06 13:54:00+00:00,binance::DOGE_USDT,0.1241,0.1243,0.124,0.1242,1233386.0
2022-05-06 13:55:00+00:00,binance::DOGE_USDT,0.1243,0.1244,0.1241,0.1242,834687.0
2022-05-06 13:56:00+00:00,binance::DOGE_USDT,0.1241,0.1242,0.124,0.1241,176923.0


In [12]:
binance_2019_09 = binance_data.loc[(binance_data.index.year == 2019) & (binance_data.index.month == 9)]
binance_2019_09_volume_0 = binance_2019_09.loc[binance_2019_09["volume"] == 0]

In [13]:
binance_2019_09

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-01 00:00:00+00:00,binance::DOGE_USDT,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:01:00+00:00,binance::DOGE_USDT,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:02:00+00:00,binance::DOGE_USDT,0.002452,0.002452,0.002452,0.002452,6927.0
2019-09-01 00:03:00+00:00,binance::DOGE_USDT,0.002452,0.002452,0.002452,0.002452,0.0
2019-09-01 00:04:00+00:00,binance::DOGE_USDT,0.002452,0.002452,0.002452,0.002452,0.0
...,...,...,...,...,...,...
2019-09-30 23:55:00+00:00,binance::DOGE_USDT,0.002375,0.002375,0.002375,0.002375,0.0
2019-09-30 23:56:00+00:00,binance::DOGE_USDT,0.002375,0.002375,0.002375,0.002375,0.0
2019-09-30 23:57:00+00:00,binance::DOGE_USDT,0.002375,0.002375,0.002375,0.002375,0.0
2019-09-30 23:58:00+00:00,binance::DOGE_USDT,0.002375,0.002375,0.002375,0.002375,0.0


In [14]:
_LOG.info(binance_2019_09_volume_0.shape)
binance_2019_09_volume_0

INFO  (31664, 6)


Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-01 00:00:00+00:00,binance::DOGE_USDT,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:01:00+00:00,binance::DOGE_USDT,0.002453,0.002453,0.002453,0.002453,0.0
2019-09-01 00:03:00+00:00,binance::DOGE_USDT,0.002452,0.002452,0.002452,0.002452,0.0
2019-09-01 00:04:00+00:00,binance::DOGE_USDT,0.002452,0.002452,0.002452,0.002452,0.0
2019-09-01 00:06:00+00:00,binance::DOGE_USDT,0.002454,0.002454,0.002454,0.002454,0.0
...,...,...,...,...,...,...
2019-09-30 23:55:00+00:00,binance::DOGE_USDT,0.002375,0.002375,0.002375,0.002375,0.0
2019-09-30 23:56:00+00:00,binance::DOGE_USDT,0.002375,0.002375,0.002375,0.002375,0.0
2019-09-30 23:57:00+00:00,binance::DOGE_USDT,0.002375,0.002375,0.002375,0.002375,0.0
2019-09-30 23:58:00+00:00,binance::DOGE_USDT,0.002375,0.002375,0.002375,0.002375,0.0


# Extractor

In [77]:
ccxt_binance_DOGE_exchange = imvcdeexcl.CcxtExchange("binance")

In [78]:
sleep_time_in_secs = 1
start_timestamp = pd.Timestamp("2019-09-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2019-09-30 23:59:59+00:00")
ccxt_binance_DOGE = ccxt_binance_DOGE_exchange.download_ohlcv_data(
    "DOGE/USDT",
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [01:58<00:00,  1.36s/it]


In [None]:
ccxt_binance_DOGE = set_index_ts(ccxt_binance_DOGE)

In [81]:
ccxt_binance_DOGE = ccxt_binance_DOGE.loc[ccxt_binance_DOGE.index.month == 9]

In [83]:
ccxt_binance_DOGE.loc[ccxt_binance_DOGE['volume'] == 0]

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-01 00:00:00+00:00,0.00245280,0.00245280,0.00245280,0.00245280,0.00000000,2022-05-13 23:20:45.520757+00:00
2019-09-01 00:01:00+00:00,0.00245280,0.00245280,0.00245280,0.00245280,0.00000000,2022-05-13 23:20:45.520757+00:00
2019-09-01 00:03:00+00:00,0.00245200,0.00245200,0.00245200,0.00245200,0.00000000,2022-05-13 23:20:45.520757+00:00
2019-09-01 00:04:00+00:00,0.00245200,0.00245200,0.00245200,0.00245200,0.00000000,2022-05-13 23:20:45.520757+00:00
2019-09-01 00:06:00+00:00,0.00245360,0.00245360,0.00245360,0.00245360,0.00000000,2022-05-13 23:20:45.520757+00:00
...,...,...,...,...,...,...
2019-09-30 23:55:00+00:00,0.00237470,0.00237470,0.00237470,0.00237470,0.00000000,2022-05-13 23:22:42.513275+00:00
2019-09-30 23:56:00+00:00,0.00237470,0.00237470,0.00237470,0.00237470,0.00000000,2022-05-13 23:22:42.513275+00:00
2019-09-30 23:57:00+00:00,0.00237470,0.00237470,0.00237470,0.00237470,0.00000000,2022-05-13 23:22:42.513275+00:00
2019-09-30 23:58:00+00:00,0.00237470,0.00237470,0.00237470,0.00237470,0.00000000,2022-05-13 23:22:42.513275+00:00


In [84]:
ccxt_binance_DOGE

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-01 00:00:00+00:00,0.00245280,0.00245280,0.00245280,0.00245280,0.00000000,2022-05-13 23:20:45.520757+00:00
2019-09-01 00:01:00+00:00,0.00245280,0.00245280,0.00245280,0.00245280,0.00000000,2022-05-13 23:20:45.520757+00:00
2019-09-01 00:02:00+00:00,0.00245200,0.00245200,0.00245200,0.00245200,6927.00000000,2022-05-13 23:20:45.520757+00:00
2019-09-01 00:03:00+00:00,0.00245200,0.00245200,0.00245200,0.00245200,0.00000000,2022-05-13 23:20:45.520757+00:00
2019-09-01 00:04:00+00:00,0.00245200,0.00245200,0.00245200,0.00245200,0.00000000,2022-05-13 23:20:45.520757+00:00
...,...,...,...,...,...,...
2019-09-30 23:55:00+00:00,0.00237470,0.00237470,0.00237470,0.00237470,0.00000000,2022-05-13 23:22:42.513275+00:00
2019-09-30 23:56:00+00:00,0.00237470,0.00237470,0.00237470,0.00237470,0.00000000,2022-05-13 23:22:42.513275+00:00
2019-09-30 23:57:00+00:00,0.00237470,0.00237470,0.00237470,0.00237470,0.00000000,2022-05-13 23:22:42.513275+00:00
2019-09-30 23:58:00+00:00,0.00237470,0.00237470,0.00237470,0.00237470,0.00000000,2022-05-13 23:22:42.513275+00:00


Where`volume = 0`, data from columns `open`, `high`, `low`, `close` is exactly the same from previous row where `volume != 0`. It could mean that `volume = 0` rows are `NaNs` at the source, so it could be the way exchange handles missing data.

In [86]:
print(percentage(ccxt_binance_DOGE, ccxt_binance_DOGE.loc[ccxt_binance_DOGE['volume'] == 0]))

73.3


# CCXT w/o Extractor

In [28]:
ccxt_exchange = log_into_exchange('binance')

In [None]:
ccxt_df = get_all_data(ccxt_exchange, "DOGE/USDT", 1567296000000, 1569887999000)

In [34]:
ccxt_df = set_index_ts(ccxt_df)
ccxt_df.index.min(), ccxt_df.index.max(), ccxt_df.shape

(Timestamp('2019-09-01 00:00:00+0000', tz='UTC'),
 Timestamp('2019-10-01 08:19:00+0000', tz='UTC'),
 (432500, 5))

In [35]:
ccxt_df = ccxt_df.loc[ccxt_df.index.month == 9]

In [36]:
ccxt_df.isna().value_counts()

open   high   low    close  volume
False  False  False  False  False     429750
dtype: int64

In [38]:
ccxt_df.loc[ccxt_df['volume'] != 0]

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-09-01 00:02:00+00:00,0.00,0.00,0.00,0.00,6927.00
2019-09-01 00:05:00+00:00,0.00,0.00,0.00,0.00,85551.00
2019-09-01 00:08:00+00:00,0.00,0.00,0.00,0.00,4300.00
2019-09-01 00:24:00+00:00,0.00,0.00,0.00,0.00,192594.00
2019-09-01 00:35:00+00:00,0.00,0.00,0.00,0.00,8844.00
...,...,...,...,...,...
2019-09-30 23:38:00+00:00,0.00,0.00,0.00,0.00,6000.00
2019-09-30 23:42:00+00:00,0.00,0.00,0.00,0.00,375.00
2019-09-30 23:44:00+00:00,0.00,0.00,0.00,0.00,8589.00
2019-09-30 23:51:00+00:00,0.00,0.00,0.00,0.00,57944.00


# Summary


|CCXT | | ||			Extractor	| | | |Client | | |
|------|--|-||-------------|-|-|-|------|-|-|
|date|Number of NaN rows %|	Total number of rows| `volume=0` %	|Number of NaN rows %|	Total number of rows| `volume=0` %| Number of NaN rows %|	Total number of rows| `volume=0` %|
|2019-09|	0          |	                   429750|	      73.22%   	|	0          |	                   43200|	      73.3%   |      0|	            43200| 73.3%|


- The huge amount of data from CCXT is duplicates. Unique values are 43200.
- Where volume = 0, data from columns open, high, low, close is exactly the same from previous row where volume != 0. It could mean that volume = 0 rows are NaNs at the source, so it could be the way exchange handles missing data.

# ftx::BTC_USDT

## Client

In [14]:
ftx_data = client.read_data(
    ["ftx::BTC_USDT"],
    config["data"]["start_ts"],
    config["data"]["end_ts"],
    config["data"]["columns"],
    config["data"]["filter_data_mode"],
)

Unnamed: 0,full_symbol,open,high,low,close,volume
2020-10-01 00:00:00+00:00,ftx::BTC_USDT,10777.0,10786.5,10777.0,10782.5,6925.7566
,...,...,...,...,...,...
2022-05-06 14:20:00+00:00,ftx::BTC_USDT,36048.0,36071.0,36018.0,36020.0,217708.5415


Unnamed: 0,col_name,dtype,num_unique,num_nans,first_elem,type(first_elem)
0,index,"datetime64[ns, UTC]",1107073 / 1107073 = 100.00%,0 / 1107073 = 0.00%,2020-10-01T00:00:00.000000000,<class 'numpy.datetime64'>
1,full_symbol,object,1 / 1107073 = 0.00%,0 / 1107073 = 0.00%,ftx::BTC_USDT,<class 'str'>
2,open,float64,75158 / 1107073 = 6.79%,0 / 1107073 = 0.00%,10777.0,<class 'numpy.float64'>
3,high,float64,74623 / 1107073 = 6.74%,0 / 1107073 = 0.00%,10786.5,<class 'numpy.float64'>
4,low,float64,74793 / 1107073 = 6.76%,0 / 1107073 = 0.00%,10777.0,<class 'numpy.float64'>
5,close,float64,75189 / 1107073 = 6.79%,0 / 1107073 = 0.00%,10782.5,<class 'numpy.float64'>
6,volume,float64,946048 / 1107073 = 85.45%,0 / 1107073 = 0.00%,6925.7566,<class 'numpy.float64'>


# df=
index=[2020-03-28 14:40:00+00:00, 2022-05-06 14:20:00+00:00]
columns=full_symbol,open,high,low,close,volume
shape=(1107073, 6)
* type=



Unnamed: 0,full_symbol,open,high,low,close,volume
2020-10-01 00:00:00+00:00,ftx::BTC_USDT,10777.0,10786.5,10777.0,10782.5,6925.7566
2020-10-01 00:01:00+00:00,ftx::BTC_USDT,10782.5,10799.5,10782.5,10798.0,31859.54975
2020-10-01 00:02:00+00:00,ftx::BTC_USDT,10798.0,10798.0,10791.5,10791.5,65.84035
,...,...,...,...,...,...
2022-05-06 14:18:00+00:00,ftx::BTC_USDT,36079.0,36113.0,36019.0,36113.0,258109.8326
2022-05-06 14:19:00+00:00,ftx::BTC_USDT,36113.0,36205.0,36040.0,36048.0,1331075.2469
2022-05-06 14:20:00+00:00,ftx::BTC_USDT,36048.0,36071.0,36018.0,36020.0,217708.5415


Unnamed: 0,full_symbol,open,high,low,close,volume
2020-03-28 14:40:00+00:00,ftx::BTC_USDT,6240.75,6241.0,6240.0,6240.0,0.0
2020-03-28 14:41:00+00:00,ftx::BTC_USDT,6240.0,6240.0,6235.75,6235.75,0.0
2020-03-28 14:42:00+00:00,ftx::BTC_USDT,6235.75,6235.75,6226.25,6226.25,0.0
,...,...,...,...,...,...
2022-05-06 14:18:00+00:00,ftx::BTC_USDT,36079.0,36113.0,36019.0,36113.0,258109.8326
2022-05-06 14:19:00+00:00,ftx::BTC_USDT,36113.0,36205.0,36040.0,36048.0,1331075.2469
2022-05-06 14:20:00+00:00,ftx::BTC_USDT,36048.0,36071.0,36018.0,36020.0,217708.5415


Unnamed: 0,full_symbol,open,high,low,close,volume
2020-03-28 14:40:00+00:00,ftx::BTC_USDT,6240.75,6241.0,6240.0,6240.0,0.0
2020-03-28 14:41:00+00:00,ftx::BTC_USDT,6240.0,6240.0,6235.75,6235.75,0.0
2020-03-28 14:42:00+00:00,ftx::BTC_USDT,6235.75,6235.75,6226.25,6226.25,0.0
,...,...,...,...,...,...
2022-05-06 14:18:00+00:00,ftx::BTC_USDT,36079.0,36113.0,36019.0,36113.0,258109.8326
2022-05-06 14:19:00+00:00,ftx::BTC_USDT,36113.0,36205.0,36040.0,36048.0,1331075.2469
2022-05-06 14:20:00+00:00,ftx::BTC_USDT,36048.0,36071.0,36018.0,36020.0,217708.5415


In [16]:
ftx_2020_04 = ftx_data.loc[(ftx_data.index.year == 2020) & (ftx_data.index.month == 4)]
ftx_2020_04_volume_0 = ftx_2020_04.loc[ftx_2020_04["volume"] == 0]
ftx_2020_04_volume_0

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-01 00:00:00+00:00,ftx::BTC_USDT,6410.0,6422.5,6410.0,6419.0,0.0
2020-04-01 00:01:00+00:00,ftx::BTC_USDT,6419.0,6421.0,6411.0,6417.0,0.0
2020-04-01 00:02:00+00:00,ftx::BTC_USDT,6417.0,6419.5,6415.5,6418.0,0.0
2020-04-01 00:03:00+00:00,ftx::BTC_USDT,6418.0,6421.0,6416.5,6418.5,0.0
2020-04-01 00:04:00+00:00,ftx::BTC_USDT,6418.5,6418.5,6413.0,6415.5,0.0
...,...,...,...,...,...,...
2020-04-30 23:52:00+00:00,ftx::BTC_USDT,8616.0,8629.5,8609.0,8629.0,0.0
2020-04-30 23:54:00+00:00,ftx::BTC_USDT,8609.5,8616.5,8589.5,8616.5,0.0
2020-04-30 23:55:00+00:00,ftx::BTC_USDT,8616.5,8616.5,8603.5,8609.0,0.0
2020-04-30 23:56:00+00:00,ftx::BTC_USDT,8609.0,8613.5,8595.0,8606.0,0.0


In [17]:
ftx_2020_04

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-01 00:00:00+00:00,ftx::BTC_USDT,6410.0,6422.5,6410.0,6419.0,0.0000
2020-04-01 00:01:00+00:00,ftx::BTC_USDT,6419.0,6421.0,6411.0,6417.0,0.0000
2020-04-01 00:02:00+00:00,ftx::BTC_USDT,6417.0,6419.5,6415.5,6418.0,0.0000
2020-04-01 00:03:00+00:00,ftx::BTC_USDT,6418.0,6421.0,6416.5,6418.5,0.0000
2020-04-01 00:04:00+00:00,ftx::BTC_USDT,6418.5,6418.5,6413.0,6415.5,0.0000
...,...,...,...,...,...,...
2020-04-30 23:55:00+00:00,ftx::BTC_USDT,8616.5,8616.5,8603.5,8609.0,0.0000
2020-04-30 23:56:00+00:00,ftx::BTC_USDT,8609.0,8613.5,8595.0,8606.0,0.0000
2020-04-30 23:57:00+00:00,ftx::BTC_USDT,8606.0,8616.5,8605.0,8607.0,0.0000
2020-04-30 23:58:00+00:00,ftx::BTC_USDT,8607.0,8641.5,8607.0,8637.5,73397.4500


In [18]:
ftx_2020_04.loc[ftx_2020_04['open'].isna()]

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [47]:
print(percentage(ftx_2020_04, ftx_2020_04_volume_0))

85.97


## Extractor

In [64]:
ccxt_ftx_BTC_exchange = imvcdeexcl.CcxtExchange("ftx")
sleep_time_in_secs = 1
start_timestamp = pd.Timestamp("2020-04-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2020-04-30 23:59:59+00:00")
ccxt_ftx_BTC = ccxt_ftx_BTC_exchange.download_ohlcv_data(
    "BTC/USDT",
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [02:03<00:00,  1.42s/it]


In [23]:
ccxt_ftx_BTC = set_index_ts(ccxt_ftx_BTC)


KeyError: 'timestamp'

In [71]:
ccxt_ftx_BTC = ccxt_ftx_BTC.loc[ccxt_ftx_BTC.index.month == 4]

In [72]:
ccxt_ftx_BTC.loc[ccxt_ftx_BTC['volume'] == 0]

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-01 00:00:00+00:00,6410.00000000,6422.50000000,6410.00000000,6419.00000000,0.00000000,2022-05-13 22:58:38.957420+00:00
2020-04-01 00:01:00+00:00,6419.00000000,6421.00000000,6411.00000000,6417.00000000,0.00000000,2022-05-13 22:58:38.957420+00:00
2020-04-01 00:02:00+00:00,6417.00000000,6419.50000000,6415.50000000,6418.00000000,0.00000000,2022-05-13 22:58:38.957420+00:00
2020-04-01 00:03:00+00:00,6418.00000000,6421.00000000,6416.50000000,6418.50000000,0.00000000,2022-05-13 22:58:38.957420+00:00
2020-04-01 00:04:00+00:00,6418.50000000,6418.50000000,6413.00000000,6415.50000000,0.00000000,2022-05-13 22:58:38.957420+00:00
...,...,...,...,...,...,...
2020-04-30 23:52:00+00:00,8616.00000000,8629.50000000,8609.00000000,8629.00000000,0.00000000,2022-05-13 23:00:41.210739+00:00
2020-04-30 23:54:00+00:00,8609.50000000,8616.50000000,8589.50000000,8616.50000000,0.00000000,2022-05-13 23:00:41.210739+00:00
2020-04-30 23:55:00+00:00,8616.50000000,8616.50000000,8603.50000000,8609.00000000,0.00000000,2022-05-13 23:00:41.210739+00:00
2020-04-30 23:56:00+00:00,8609.00000000,8613.50000000,8595.00000000,8606.00000000,0.00000000,2022-05-13 23:00:41.210739+00:00


In [73]:
ccxt_ftx_BTC

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-01 00:00:00+00:00,6410.00000000,6422.50000000,6410.00000000,6419.00000000,0.00000000,2022-05-13 22:58:38.957420+00:00
2020-04-01 00:01:00+00:00,6419.00000000,6421.00000000,6411.00000000,6417.00000000,0.00000000,2022-05-13 22:58:38.957420+00:00
2020-04-01 00:02:00+00:00,6417.00000000,6419.50000000,6415.50000000,6418.00000000,0.00000000,2022-05-13 22:58:38.957420+00:00
2020-04-01 00:03:00+00:00,6418.00000000,6421.00000000,6416.50000000,6418.50000000,0.00000000,2022-05-13 22:58:38.957420+00:00
2020-04-01 00:04:00+00:00,6418.50000000,6418.50000000,6413.00000000,6415.50000000,0.00000000,2022-05-13 22:58:38.957420+00:00
...,...,...,...,...,...,...
2020-04-30 23:55:00+00:00,8616.50000000,8616.50000000,8603.50000000,8609.00000000,0.00000000,2022-05-13 23:00:41.210739+00:00
2020-04-30 23:56:00+00:00,8609.00000000,8613.50000000,8595.00000000,8606.00000000,0.00000000,2022-05-13 23:00:41.210739+00:00
2020-04-30 23:57:00+00:00,8606.00000000,8616.50000000,8605.00000000,8607.00000000,0.00000000,2022-05-13 23:00:41.210739+00:00
2020-04-30 23:58:00+00:00,8607.00000000,8641.50000000,8607.00000000,8637.50000000,73397.45000000,2022-05-13 23:00:41.210739+00:00


In [74]:
ccxt_ftx_BTC.loc[(ccxt_ftx_BTC['high'] == 7493.50000000)
                 & (ccxt_ftx_BTC['volume'] == 0)]

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-24 00:34:00+00:00,7491.5,7493.5,7488.5,7488.5,0.0,2022-05-13 23:00:10.127422+00:00
2020-04-24 01:41:00+00:00,7492.5,7493.5,7488.0,7490.0,0.0,2022-05-13 23:00:10.127422+00:00
2020-04-24 02:04:00+00:00,7487.0,7493.5,7487.0,7493.5,0.0,2022-05-13 23:00:10.127422+00:00
2020-04-24 02:18:00+00:00,7493.5,7493.5,7490.5,7490.5,0.0,2022-05-13 23:00:10.127422+00:00
2020-04-24 15:10:00+00:00,7493.0,7493.5,7486.5,7486.5,0.0,2022-05-13 23:00:12.818858+00:00
2020-04-24 22:37:00+00:00,7493.5,7493.5,7487.5,7490.5,0.0,2022-05-13 23:00:12.818858+00:00
2020-04-25 01:28:00+00:00,7493.5,7493.5,7490.5,7491.5,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 01:38:00+00:00,7492.5,7493.5,7492.0,7492.0,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 02:49:00+00:00,7493.0,7493.5,7491.0,7491.5,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 03:03:00+00:00,7493.5,7493.5,7493.5,7493.5,0.0,2022-05-13 23:00:14.182170+00:00


In [75]:
ccxt_ftx_BTC.loc[(ccxt_ftx_BTC.index.day == 25)
                 & (ccxt_ftx_BTC.index.hour == 3)]

Unnamed: 0_level_0,open,high,low,close,volume,end_download_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-25 03:00:00+00:00,7491.0,7491.0,7491.0,7491.0,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 03:01:00+00:00,7491.0,7491.0,7491.0,7491.0,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 03:02:00+00:00,7491.0,7494.5,7491.0,7493.5,3427.5269,2022-05-13 23:00:14.182170+00:00
2020-04-25 03:03:00+00:00,7493.5,7493.5,7493.5,7493.5,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 03:04:00+00:00,7493.5,7493.5,7493.5,7493.5,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 03:05:00+00:00,7493.5,7493.5,7493.5,7493.5,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 03:06:00+00:00,7493.5,7493.5,7493.5,7493.5,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 03:07:00+00:00,7493.5,7493.5,7493.5,7493.5,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 03:08:00+00:00,7493.5,7493.5,7493.5,7493.5,0.0,2022-05-13 23:00:14.182170+00:00
2020-04-25 03:09:00+00:00,7493.5,7493.5,7493.5,7493.5,0.0,2022-05-13 23:00:14.182170+00:00


So far `ftx` doesn't have same pattern as `binance` where `volume=0` rows have values from the last non-`volume=0` row.

In [76]:
print(percentage(ccxt_ftx_BTC, ccxt_ftx_BTC.loc[ccxt_ftx_BTC['volume'] == 0]))

85.97


## CCXT w/o Extractor

In [53]:
ccxt_exchange_ftx = log_into_exchange('ftx')
ccxt_df_ftx = get_all_data(ccxt_exchange_ftx, "BTC/USDT", 1585699200000, 1588291199000)
ccxt_df_ftx = set_index_ts(ccxt_df_ftx)
ccxt_df_ftx.index.min(), ccxt_df_ftx.index.max(), ccxt_df_ftx.shape

(Timestamp('2020-04-01 00:00:00+0000', tz='UTC'),
 Timestamp('2020-05-01 08:19:00+0000', tz='UTC'),
 (432500, 5))

In [54]:
ccxt_df_ftx = ccxt_df_ftx.loc[ccxt_df_ftx.index.month == 4]

In [56]:
ccxt_df_ftx.isna().value_counts()

open   high   low    close  volume
False  False  False  False  False     429750
dtype: int64

In [58]:
len(ccxt_df_ftx.index.unique())

43200

In [59]:
ccxt_df_ftx

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-01 00:00:00+00:00,6410.00000000,6422.50000000,6410.00000000,6419.00000000,0.00000000
2020-04-01 00:01:00+00:00,6419.00000000,6421.00000000,6411.00000000,6417.00000000,0.00000000
2020-04-01 00:02:00+00:00,6417.00000000,6419.50000000,6415.50000000,6418.00000000,0.00000000
2020-04-01 00:03:00+00:00,6418.00000000,6421.00000000,6416.50000000,6418.50000000,0.00000000
2020-04-01 00:04:00+00:00,6418.50000000,6418.50000000,6413.00000000,6415.50000000,0.00000000
...,...,...,...,...,...
2020-04-30 23:55:00+00:00,8616.50000000,8616.50000000,8603.50000000,8609.00000000,0.00000000
2020-04-30 23:56:00+00:00,8609.00000000,8613.50000000,8595.00000000,8606.00000000,0.00000000
2020-04-30 23:57:00+00:00,8606.00000000,8616.50000000,8605.00000000,8607.00000000,0.00000000
2020-04-30 23:58:00+00:00,8607.00000000,8641.50000000,8607.00000000,8637.50000000,73397.45000000


In [61]:
print(percentage(ccxt_df_ftx, ccxt_df_ftx.loc[ccxt_df_ftx['volume'] == 0]))

86.09



|CCXT | | ||			Extractor	| | | |Client | | |
|------|--|-||-------------|-|-|-|------|-|-|
|date|Number of NaN rows %|	Total number of rows| `volume=0` %	|Number of NaN rows %|	Total number of rows| `volume=0` %| Number of NaN rows %|	Total number of rows| `volume=0` %|
|2019-09|	0          |	                   429750|	      86.09%   	|	0          |	                   43200|	      85.97%   |      0|	            43200| 85.97%|
