In [None]:
# Reload imported modules automatically before executing each cell, so that
# edits to the project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

# Imports

In [None]:
import logging
import os
import requests
import time

import ccxt
import matplotlib.pyplot as plt
import pandas as pd

import core.config.config_ as cconconf
import core.statistics as costatis
import helpers.hdatetime as hdateti
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hprint as hprint
import helpers.hs3 as hs3
import helpers.hsecrets as hsecret
import im_v2.ccxt.data.client as icdcl
import im_v2.ccxt.data.extract.exchange_class as imvcdeexcl

In [None]:
# Initialize logging at INFO verbosity.
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

# Log the first element of the system signature returned by `henv`.
_LOG.info("%s", henv.get_system_signature()[0])

# Apply the project's notebook settings (see `hprint.config_notebook()`).
hprint.config_notebook()

In [None]:
def get_cmtask1866_config_ccxt() -> cconconf.Config:
    """
    Build the config for the CmTask1866 analysis of CCXT data.

    The config has three sections:
    - `load`: AWS profile and S3 dir the data is read from
    - `data`: parameters of the data to read
    - `column_names`: names of the columns used when computing stats

    :return: the populated config
    """
    config = cconconf.Config()
    # Section with the loading parameters.
    config.add_subconfig("load")
    config["load"]["aws_profile"] = "ck"
    # NOTE(review): the bucket path returned here is not used, since the data
    # dir below is hard-coded. Confirm whether the data dir should be built
    # from this value instead.
    _ = hs3.get_s3_bucket_path(config["load"]["aws_profile"])
    config["load"]["data_dir"] = "s3://cryptokaizen-data/historical"
    # Section with the data parameters.
    config.add_subconfig("data")
    data_params = {
        "vendor": "CCXT",
        "data_snapshot": "latest",
        "version": "v3",
        "resample_1min": True,
        "partition_mode": "by_year_month",
        "start_ts": None,
        "end_ts": None,
        "columns": None,
        "filter_data_mode": "assert",
    }
    for param_name, param_value in data_params.items():
        config["data"][param_name] = param_value
    # Section with the column names.
    config.add_subconfig("column_names")
    column_names = {
        "full_symbol": "full_symbol",
        "close_price": "close",
    }
    for key, column_name in column_names.items():
        config["column_names"][key] = column_name
    return config

In [None]:
# Build the notebook config and print it for reference.
config = get_cmtask1866_config_ccxt()
print(config)

In [None]:
 pd.set_option("display.float_format", "{:.8f}".format)

# Functions

In [None]:
def _get_qa_stats(data: pd.DataFrame, config: cconconf.Config) -> pd.DataFrame:
    """
    Compute quality assurance stats for each full symbol in the data.

    For every full symbol the stats are:
    - `min_timestamp`, `max_timestamp`: min and max of the index
    - `NaNs %`: percentage of NaNs in the close price column
    - `volume=0 %`: percentage of rows where `volume` is 0
    - `bad data %`: sum of the two percentages above

    :param data: data with the full symbol and close price columns named in
        `config`, and a `volume` column
    :param config: config storing the column names under `column_names`
    :return: stats with one row per full symbol
    """
    per_symbol_stats = []
    grouped = data.groupby(config["column_names"]["full_symbol"])
    for symbol, rows in grouped:
        # Collect the stats of the current full symbol in a named series.
        stats = pd.Series(dtype="object", name=symbol)
        stats["min_timestamp"] = rows.index.min()
        stats["max_timestamp"] = rows.index.max()
        close_prices = rows[config["column_names"]["close_price"]]
        stats["NaNs %"] = 100 * costatis.compute_frac_nan(close_prices)
        zero_volume_rows = rows[rows["volume"] == 0]
        stats["volume=0 %"] = 100 * (zero_volume_rows.shape[0] / rows.shape[0])
        stats["bad data %"] = stats["NaNs %"] + stats["volume=0 %"]
        per_symbol_stats.append(stats)
    # The concatenation yields one column per full symbol, hence the transpose.
    return pd.concat(per_symbol_stats, axis=1).T


def _get_qa_stats_by_year_month(
    data: pd.DataFrame, config: cconconf.Config
) -> pd.DataFrame:
    """
    Get quality assurance stats per full symbol, year, and month.

    The input dataframe is not modified: the `year` and `month` grouping keys
    are added to a copy of it.

    :param data: data with a datetime index, the full symbol and close price
        columns named in `config`, and a `volume` column
    :param config: config storing the column names under `column_names`
    :return: `NaNs %`, `volume=0 %` and `bad data %` indexed by full symbol,
        year, and month
    """
    full_symbol_col = config["column_names"]["full_symbol"]
    close_price_col = config["column_names"]["close_price"]
    # `assign()` returns a new dataframe, so the caller's data does not get
    # the extra `year` and `month` columns.
    data = data.assign(year=data.index.year, month=data.index.month)
    #
    res_stats = []
    columns_to_groupby = [full_symbol_col, "year", "month"]
    for index, symbol_data in data.groupby(columns_to_groupby):
        #
        full_symbol, year, month = index
        # Get stats for a full symbol and add them to overall stats.
        symbol_stats = pd.Series(dtype="object", name=full_symbol)
        symbol_stats["year"] = year
        symbol_stats["month"] = month
        symbol_stats["NaNs %"] = 100 * (
            costatis.compute_frac_nan(symbol_data[close_price_col])
        )
        symbol_stats["volume=0 %"] = 100 * (
            symbol_data[symbol_data["volume"] == 0].shape[0]
            / symbol_data.shape[0]
        )
        symbol_stats["bad data %"] = (
            symbol_stats["NaNs %"] + symbol_stats["volume=0 %"]
        )
        res_stats.append(symbol_stats)
    # The concatenation yields one column per group, hence the transpose.
    res_stats_df = pd.concat(res_stats, axis=1).T
    # The values are stored as objects, so restore the integer type of the
    # keys.
    res_stats_df["year"] = res_stats_df["year"].astype(int)
    res_stats_df["month"] = res_stats_df["month"].astype(int)
    # Set index by full symbol, year, and month.
    res_stats_df = res_stats_df.set_index([res_stats_df.index, "year", "month"])
    return res_stats_df


def _plot_bad_data_stats(bad_data_stats: pd.DataFrame) -> None:
    """
    Draw one bar plot of the `bad data %` column per full symbol.

    :param bad_data_stats: stats whose first index level is the full symbol
    """
    bad_data_col_name = "bad data %"
    for full_symbol in bad_data_stats.index.get_level_values(0).unique():
        symbol_stats = bad_data_stats.loc[full_symbol]
        _ = symbol_stats.plot.bar(y=bad_data_col_name, rot=0, title=full_symbol)

In [None]:
def set_index_ts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert the `timestamp` column and use it as the index.

    Each value of the column is converted with
    `hdateti.convert_unix_epoch_to_timestamp()`. The input dataframe is left
    untouched: the conversion is applied to a copy.

    :param df: data with a `timestamp` column
    :return: new dataframe indexed by the converted `timestamp` values
    """
    timestamps = df["timestamp"].apply(hdateti.convert_unix_epoch_to_timestamp)
    # `assign()` works on a copy, so the caller's `timestamp` column is not
    # overwritten.
    return df.assign(timestamp=timestamps).set_index("timestamp")

In [None]:
def percentage(df, df_loc):
    """
    Return the length of `df_loc` as a percentage of the length of `df`.

    :param df: the whole data
    :param df_loc: the part of the data to measure
    :return: percentage rounded to 2 decimal places
    """
    scaled_part_size = 100 * len(df_loc)
    return round(scaled_part_size / len(df), 2)

def log_into_exchange(exchange) -> ccxt.Exchange:
    """
    Log into an exchange via CCXT and return the corresponding
    `ccxt.Exchange` object.

    :param exchange: name of the exchange, e.g., `binance`; used both to look
        up the credentials and to pick the exchange class from `ccxt`
    :return: exchange object built from the stored credentials, with the
        built-in rate limiter enabled
    """
    # Select credentials for provided exchange.
    credentials = hsecret.get_secret(exchange)
    # Enable the CCXT built-in rate limiter. The flag is `enableRateLimit`;
    # `rateLimit` is the delay between requests in milliseconds and must not
    # be overwritten with a boolean.
    credentials["enableRateLimit"] = True
    exchange_class = getattr(ccxt, exchange)
    # Create a CCXT Exchange class object.
    exchange_object = exchange_class(credentials)
    hdbg.dassert(
        exchange_object.checkRequiredCredentials(),
        msg="Required credentials not passed",
    )
    return exchange_object

In [None]:
def load_ccxt_data(currency_pair, since, exchange):
    """
    Fetch one batch of up to 500 1-minute OHLCV bars from an exchange.

    :param currency_pair: currency pair passed to `fetch_ohlcv()`, e.g.,
        `DOGE/USDT`
    :param since: start of the batch, passed as `since` to `fetch_ohlcv()`
    :param exchange: object providing `fetch_ohlcv()`
    :return: bars with `timestamp`, `open`, `high`, `low`, `close` and
        `volume` columns
    """
    ohlcv_columns = ["timestamp", "open", "high", "low", "close", "volume"]
    raw_bars = exchange.fetch_ohlcv(
        currency_pair, timeframe="1m", since=since, limit=500
    )
    return pd.DataFrame(raw_bars, columns=ohlcv_columns)

In [None]:
def get_all_data(exchange, currency_pair, start_timestamp, end_timestamp):
    """
    Download 1-minute OHLCV bars for a time interval, one batch per request.

    :param exchange: CCXT exchange object providing `parse_timeframe()` and
        `fetch_ohlcv()`
    :param currency_pair: currency pair to download, e.g., `DOGE/USDT`
    :param start_timestamp: start of the interval as Unix epoch in
        milliseconds
    :param end_timestamp: end of the interval as Unix epoch in milliseconds
    :return: all the downloaded batches concatenated in request order
    """
    # Number of bars requested per call: it must match the `limit` used by
    # `load_ccxt_data()`.
    bars_per_request = 500
    # `parse_timeframe()` returns seconds, while the timestamps are in
    # milliseconds. Using any smaller factor makes consecutive requests
    # overlap and produces duplicated bars.
    bar_duration_ms = exchange.parse_timeframe("1m") * 1000
    all_bars = []
    for since in range(
        start_timestamp,
        end_timestamp + bar_duration_ms,
        bar_duration_ms * bars_per_request,
    ):
        bars = load_ccxt_data(currency_pair, since, exchange)
        all_bars.append(bars)
        # Pause between requests to avoid hammering the exchange.
        time.sleep(1)
    return pd.concat(all_bars)

# CcxtHistoricalPqByTileClient

In [None]:
# Build the client for the historical data using the values from the config.
client = icdcl.CcxtHistoricalPqByTileClient(
    config["data"]["version"],
    config["data"]["resample_1min"],
    config["load"]["data_dir"],
    config["data"]["partition_mode"],
    aws_profile=config["load"]["aws_profile"],
)

In [None]:
# Get the universe from the client and display it.
universe = client.get_universe()
universe

# Binance::DOGE_USDT

In [None]:
# Read the `binance::DOGE_USDT` data with the interval, columns and filtering
# mode stored in the config.
binance_data = client.read_data(
    ["binance::DOGE_USDT"],
    config["data"]["start_ts"],
    config["data"]["end_ts"],
    config["data"]["columns"],
    config["data"]["filter_data_mode"],
)

In [None]:
# Select September 2019 and, within it, the rows with zero volume.
binance_2019_09 = binance_data.loc[
    (binance_data.index.year == 2019) & (binance_data.index.month == 9)
]
binance_2019_09_volume_0 = binance_2019_09.loc[binance_2019_09["volume"] == 0]

In [None]:
# Display the September 2019 data read through the client.
binance_2019_09

In [None]:
# Log the shape of the zero-volume rows and display them.
_LOG.info(binance_2019_09_volume_0.shape)
binance_2019_09_volume_0

# Extractor

In [None]:
# Create the extractor for `binance`.
ccxt_binance_DOGE_exchange = imvcdeexcl.CcxtExchange("binance")

In [None]:
# Download the `DOGE/USDT` OHLCV data for September 2019 through the
# extractor.
sleep_time_in_secs = 1
start_timestamp = pd.Timestamp("2019-09-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2019-09-30 23:59:59+00:00")
ccxt_binance_DOGE = ccxt_binance_DOGE_exchange.download_ohlcv_data(
    "DOGE/USDT",
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

In [None]:
# Convert the timestamps and use them as the index.
# NOTE: this cell cannot be re-run as is, since after the first run
# `timestamp` is the index and no longer a column.
ccxt_binance_DOGE = set_index_ts(ccxt_binance_DOGE)

In [None]:
# Keep only the rows whose index month is September.
ccxt_binance_DOGE = ccxt_binance_DOGE.loc[ccxt_binance_DOGE.index.month == 9]

In [None]:
# Display the rows with zero volume.
ccxt_binance_DOGE.loc[ccxt_binance_DOGE['volume'] == 0]

In [None]:
# Display the data downloaded through the extractor.
ccxt_binance_DOGE

Where `volume = 0`, the values in the `open`, `high`, `low`, and `close` columns are exactly the same as in the previous row where `volume != 0`. This could mean that the `volume = 0` rows are `NaNs` at the source, i.e., this may be how the exchange handles missing data.

In [None]:
# Percentage of rows with zero volume.
print(percentage(ccxt_binance_DOGE, ccxt_binance_DOGE.loc[ccxt_binance_DOGE['volume'] == 0]))

# CCXT w/o Extractor

In [None]:
# Log into `binance` directly through CCXT, i.e., without the extractor.
ccxt_exchange = log_into_exchange('binance')

In [None]:
# Download `DOGE/USDT` from 2019-09-01 00:00:00 UTC to 2019-09-30 23:59:59 UTC
# (the timestamps are Unix epochs in milliseconds).
ccxt_df = get_all_data(ccxt_exchange, "DOGE/USDT", 1567296000000, 1569887999000)

In [None]:
# Convert the timestamps, use them as the index, and display the covered
# interval together with the shape of the data.
ccxt_df = set_index_ts(ccxt_df)
ccxt_df.index.min(), ccxt_df.index.max(), ccxt_df.shape

In [None]:
# Keep only the rows whose index month is September.
ccxt_df = ccxt_df.loc[ccxt_df.index.month == 9]

In [None]:
# Count the rows for each combination of per-column NaN flags.
ccxt_df.isna().value_counts()

In [None]:
# Display the rows with non-zero volume.
ccxt_df.loc[ccxt_df['volume'] != 0]

# Summary


| |CCXT | | |Extractor | | |Client | | |
|-|------|--|-|-------------|-|-|------|-|-|
|date|Number of NaN rows %|	Total number of rows| `volume=0` %	|Number of NaN rows %|	Total number of rows| `volume=0` %| Number of NaN rows %|	Total number of rows| `volume=0` %|
|2019-09|	0          |	                   429750|	      73.22%   	|	0          |	                   43200|	      73.3%   |      0|	            43200| 73.3%|


- Most of the rows downloaded directly from CCXT are duplicates: there are only 43200 unique rows.
- Where `volume = 0`, the values in the `open`, `high`, `low`, and `close` columns are exactly the same as in the previous row where `volume != 0`. This could mean that the `volume = 0` rows are NaNs at the source, i.e., this may be how the exchange handles missing data.

# ftx::BTC_USDT

## Client

In [None]:
# Read the `ftx::BTC_USDT` data with the interval, columns and filtering mode
# stored in the config.
ftx_data = client.read_data(
    ["ftx::BTC_USDT"],
    config["data"]["start_ts"],
    config["data"]["end_ts"],
    config["data"]["columns"],
    config["data"]["filter_data_mode"],
)

In [None]:
# Select April 2020 and display its rows with zero volume.
ftx_2020_04 = ftx_data.loc[
    (ftx_data.index.year == 2020) & (ftx_data.index.month == 4)
]
ftx_2020_04_volume_0 = ftx_2020_04.loc[ftx_2020_04["volume"] == 0]
ftx_2020_04_volume_0

In [None]:
# Display the April 2020 data read through the client.
ftx_2020_04

In [None]:
# Display the rows where `open` is NaN.
ftx_2020_04.loc[ftx_2020_04['open'].isna()]

In [None]:
# Percentage of rows with zero volume.
print(percentage(ftx_2020_04, ftx_2020_04_volume_0))

## Extractor

In [None]:
# Create the extractor for `ftx` and download the `BTC/USDT` OHLCV data for
# April 2020.
ccxt_ftx_BTC_exchange = imvcdeexcl.CcxtExchange("ftx")
sleep_time_in_secs = 1
start_timestamp = pd.Timestamp("2020-04-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2020-04-30 23:59:59+00:00")
ccxt_ftx_BTC = ccxt_ftx_BTC_exchange.download_ohlcv_data(
    "BTC/USDT",
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

In [None]:
# Convert the timestamps and use them as the index.
ccxt_ftx_BTC = set_index_ts(ccxt_ftx_BTC)

In [None]:
# Keep only the rows whose index month is April.
ccxt_ftx_BTC = ccxt_ftx_BTC.loc[ccxt_ftx_BTC.index.month == 4]

In [None]:
# Display the rows with zero volume.
ccxt_ftx_BTC.loc[ccxt_ftx_BTC['volume'] == 0]

In [None]:
# Display the data downloaded through the extractor.
ccxt_ftx_BTC

In [None]:
# Display the zero-volume rows whose `high` equals 7493.5.
ccxt_ftx_BTC.loc[(ccxt_ftx_BTC['high'] == 7493.50000000)
                 & (ccxt_ftx_BTC['volume'] == 0)]

In [None]:
# Display the rows for hour 3 of day 25.
ccxt_ftx_BTC.loc[(ccxt_ftx_BTC.index.day == 25)
                 & (ccxt_ftx_BTC.index.hour == 3)]

So far `ftx` doesn't show the same pattern as `binance`, where the `volume=0` rows carry the values of the last row with non-zero volume.

In [None]:
# Percentage of rows with zero volume.
print(percentage(ccxt_ftx_BTC, ccxt_ftx_BTC.loc[ccxt_ftx_BTC['volume'] == 0]))

## CCXT w/o Extractor

In [None]:
# Log into `ftx` directly through CCXT and download `BTC/USDT` from
# 2020-04-01 00:00:00 UTC to 2020-04-30 23:59:59 UTC (the timestamps are Unix
# epochs in milliseconds).
ccxt_exchange_ftx = log_into_exchange('ftx')
ccxt_df_ftx = get_all_data(ccxt_exchange_ftx, "BTC/USDT", 1585699200000, 1588291199000)
# Convert the timestamps, use them as the index, and display the covered
# interval together with the shape of the data.
ccxt_df_ftx = set_index_ts(ccxt_df_ftx)
ccxt_df_ftx.index.min(), ccxt_df_ftx.index.max(), ccxt_df_ftx.shape

In [None]:
# Keep only the rows whose index month is April.
ccxt_df_ftx = ccxt_df_ftx.loc[ccxt_df_ftx.index.month == 4]

In [None]:
# Count the rows for each combination of per-column NaN flags.
ccxt_df_ftx.isna().value_counts()

In [None]:
# Number of unique timestamps in the index.
len(ccxt_df_ftx.index.unique())

In [None]:
# Display the data downloaded directly through CCXT.
ccxt_df_ftx

In [None]:
# Percentage of rows with zero volume.
print(percentage(ccxt_df_ftx, ccxt_df_ftx.loc[ccxt_df_ftx['volume'] == 0]))


| |CCXT | | |Extractor | | |Client | | |
|-|------|--|-|-------------|-|-|------|-|-|
|date|Number of NaN rows %|	Total number of rows| `volume=0` %	|Number of NaN rows %|	Total number of rows| `volume=0` %| Number of NaN rows %|	Total number of rows| `volume=0` %|
|2020-04|	0          |	                   429750|	      86.09%   	|	0          |	                   43200|	      85.97%   |      0|	            43200| 85.97%|


# gateio::ETH_USDT w/o `volume = 0` in data

`gateio` data has unusual statistics: some currency pairs have no `volume = 0` rows and still have lots of `NaNs`, while others have `volume = 0` rows whose amount differs from the amount of `NaNs`, i.e., unlike the pattern of the exchanges above. So we take a look at two currency pairs with different patterns.

## Client

In [None]:
# Read the `gateio::ETH_USDT` data with the interval, columns and filtering
# mode stored in the config.
gateio_data = client.read_data(
    ["gateio::ETH_USDT"],
    config["data"]["start_ts"],
    config["data"]["end_ts"],
    config["data"]["columns"],
    config["data"]["filter_data_mode"],
)

### 100% of `NaNs`

In [None]:
# Select October 2021 and display it.
gateio_data_2021_10 = gateio_data.loc[(gateio_data.index.year == 2021)
                                     & (gateio_data.index.month == 10)]
gateio_data_2021_10

In [None]:
# Count the rows for each combination of per-column NaN flags.
gateio_data_2021_10.isna().value_counts()

### 34.46% of `NaNs`

In [None]:
# Select September 2021 and display it.
gateio_data_2021_09 = gateio_data.loc[(gateio_data.index.year == 2021)
                                    & (gateio_data.index.month == 9)]
gateio_data_2021_09

In [None]:
# Count the rows for each combination of per-column NaN flags.
gateio_data_2021_09.isna().value_counts()

### No `NaNs`

In [None]:
# Display the data for December 2021.
gateio_data.loc[(gateio_data.index.year == 2021)
                                    & (gateio_data.index.month == 12)]

At first look, `NaNs` appear because of some kind of problem at the source. According to Dan's tables all currency pairs have ~34-39% of `NaNs` for the period from September to November. October data has 100% of `NaNs` for all currency pairs of `gateio` so that definitely could be a technical issue at the exchange.

## Extractor

In [None]:
# Create the extractor for `gateio`.
ccxt_gateio_ETH_exchange = imvcdeexcl.CcxtExchange("gateio")

In [None]:
# Download the `ETH/USDT` OHLCV data for September 2021 through the
# extractor.
sleep_time_in_secs = 1
start_timestamp = pd.Timestamp("2021-09-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2021-09-30 23:59:59+00:00")
ccxt_gateio_ETH = ccxt_gateio_ETH_exchange.download_ohlcv_data(
    "ETH/USDT",
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

In [None]:
# Convert the timestamps and use them as the index.
ccxt_gateio_ETH = set_index_ts(ccxt_gateio_ETH)

In [None]:
# Display the September 2021 data downloaded through the extractor.
ccxt_gateio_ETH

In [None]:
# Download the `ETH/USDT` OHLCV data for October 2021 through the extractor.
sleep_time_in_secs = 1
start_timestamp = pd.Timestamp("2021-10-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2021-10-31 23:59:59+00:00")
ccxt_gateio_ETH_10 = ccxt_gateio_ETH_exchange.download_ohlcv_data(
    "ETH/USDT",
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

In [None]:
# Display the October 2021 data downloaded through the extractor.
ccxt_gateio_ETH_10

In [None]:
# Download the `ETH/USDT` OHLCV data for December 2021 through the extractor.
sleep_time_in_secs = 1
start_timestamp = pd.Timestamp("2021-12-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2021-12-31 23:59:59+00:00")
ccxt_gateio_ETH_12 = ccxt_gateio_ETH_exchange.download_ohlcv_data(
    "ETH/USDT",
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

In [None]:
# Display the December 2021 data downloaded through the extractor.
ccxt_gateio_ETH_12

No data comes from the `Extractor` for these periods, although we have it on S3. This suggests that the exchange keeps historical data only for a limited time, i.e., the data has an expiration date.

In [None]:
# Load recent data to make sure the API and the Extractor are working.
sleep_time_in_secs = 1
start_timestamp = pd.Timestamp("2022-04-25 00:00:00+00:00")
end_timestamp = pd.Timestamp("2022-05-14 23:59:59+00:00")
ccxt_gateio_ETH_2022 = ccxt_gateio_ETH_exchange.download_ohlcv_data(
    "ETH/USDT",
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

In [None]:
# Display the recent data downloaded through the extractor.
ccxt_gateio_ETH_2022

## CCXT w/o Extractor

Take a look at one month of 2021: if it is empty, the data has a so-called expiration date.

In [None]:
# Log into `gateio` directly through CCXT and download `ETH/USDT` from
# 2021-12-01 00:00:00 UTC to 2021-12-31 23:59:59 UTC (the timestamps are Unix
# epochs in milliseconds).
# NOTE: `ccxt_exchange` and `ccxt_df` are also assigned in the Binance
# section, so this cell overwrites those objects.
ccxt_exchange = log_into_exchange('gateio')
ccxt_df = get_all_data(ccxt_exchange, "ETH/USDT", 1638316800000, 1640995199000)

In [None]:
# Display the data downloaded directly through CCXT.
ccxt_df

### Summary for `gateio` `volume != 0` data.

- The exchange has an expiration date for data: the data we have no longer exists at the source. An `end_download_timestamp` column for the data we store on S3 would be useful here to confirm or refute this statement.
- October data has 100% of NaNs for all currency pairs of gateio. Based on that, it could be a technical issue at the exchange.

# gateio::ADA_USDT with `volume = 0` in data

## Client

In [None]:
# Read the `gateio::ADA_USDT` data with the interval, columns and filtering
# mode stored in the config.
gateio_ADA_data = client.read_data(
    ["gateio::ADA_USDT"],
    config["data"]["start_ts"],
    config["data"]["end_ts"],
    config["data"]["columns"],
    config["data"]["filter_data_mode"],
)

In [None]:
# Select September 2021, a month where the amount of `volume = 0` rows
# differs from the amount of NaNs, and display it.
gateio_ADA_data_2021_09 = gateio_ADA_data.loc[(gateio_ADA_data.index.year == 2021)
                & (gateio_ADA_data.index.month == 9)]
gateio_ADA_data_2021_09

In [None]:
# Display the rows with zero volume.
gateio_ADA_data_2021_09.loc[gateio_ADA_data_2021_09["volume"] == 0]

In [None]:
# Display the last rows for hour 3 of day 5.
gateio_ADA_data_2021_09.loc[(gateio_ADA_data_2021_09.index.day == 5)
                           & (gateio_ADA_data_2021_09.index.hour == 3)].tail()

In [None]:
# `volume = 0` has the same amount as NaNs
gateio_ADA_data_2021_07 = gateio_ADA_data.loc[(gateio_ADA_data.index.year == 2021)
                & (gateio_ADA_data.index.month == 7)]
gateio_ADA_data_2021_07

In [None]:
# Display the rows with zero volume.
gateio_ADA_data_2021_07.loc[gateio_ADA_data_2021_07['volume'] == 0]

In [None]:
# Display the first 15 rows starting from 2021-07-03 09:20:00 UTC.
gateio_ADA_data_2021_07.loc[gateio_ADA_data_2021_07.index >= "2021-07-03 09:20:00+00:00"].head(15)

The pattern here: the `volume = 0` rows have all their price columns set to the `close` value of the last row with non-zero volume.

## Extractor

In [None]:
# TODO(Nina): Change name of var `ccxt_gateio_ETH_exchange`.
sleep_time_in_secs = 1
start_timestamp = pd.Timestamp("2021-07-01 00:00:00+00:00")
end_timestamp = pd.Timestamp("2021-07-31 23:59:59+00:00")
ccxt_gateio_ADA = ccxt_gateio_ETH_exchange.download_ohlcv_data(
    "ADA/USDT",
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    sleep_time_in_secs=sleep_time_in_secs,
)

In [None]:
# Display the July 2021 data downloaded through the extractor.
ccxt_gateio_ADA

I think there is no point in continuing the `gateio` analysis; alternatively, we can check the data for 2022.

## Summary for `gateio`

- Small amount of useful data according to Dan's tables for the gateio.
- Data has a pattern where `volume = 0` rows store value for all columns from column `close` of the last non-`volume = 0` row.
- Data has an expiration date: the data we have no longer exists at the source. An `end_download_timestamp` column for the data we store on S3 would be useful here to confirm or refute this statement.
- October data has 100% of NaNs for all currency pairs of gateio. Based on that, it could be a technical issue at the exchange.