# Description

This notebook checks whether the data that is missing on S3 is also missing at the source (CryptoChassis API).

# Imports

In [1]:
import logging
import os

import pandas as pd
import requests

import core.statistics as costatis
import helpers.hdatetime as hdateti
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint
import helpers.hs3 as hs3
import im_v2.crypto_chassis.data.client.crypto_chassis_clients as imvccdcccc

  from tqdm.autonotebook import tqdm


In [2]:
# Initialize logging at INFO verbosity.
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

# Log the first element of the system signature returned by `henv`.
_LOG.info("%s", henv.get_system_signature()[0])

# Apply the notebook configuration provided by `hprint`.
hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-7a838d8b-c28d-455d-99dc-66e7d27bfba1.json'
INFO  # Git
    branch_name='CMTask1945_look_at_FTX_DOGE_XRP_at_the_source'
    hash='780acfb9c'
    # Last commits:
      *   780acfb9c Nina Lee Merge branch 'master' into CMTask1945_look_at_FTX_DOGE_XRP_at_the_source (59 minutes ago) Tue May 24 19:32:56 2022  (HEAD -> CMTask1945_look_at_FTX_DOGE_XRP_at_the_source, origin/CMTask1945_look_at_FTX_DOGE_XRP_at_the_source)
      |\  
      * | 1d2a4ac4d Nina Lee fix                                                               (62 minutes ago) Tue May 24 19:30:07 2022           
      | * 9a9239503 DanilYachmenev Cm task1966 make master cross vendor qa (#1991)                   (   5 hours ago) Tue May 24 15:34:22 2022  (origin/master, origin/HEAD, master, CMTask1999_Move_get_timestamp_stats_get_bad_data_stats_from_the_cross_vendor_QA_notebook_to_a_lib)
# Machine info
    

# Functions

In [3]:
def _get_full_symbol_data_for_year_month(
    df: pd.DataFrame, full_symbol: str, year: int, month: int
) -> pd.DataFrame:
    """
    Get data for one full symbol for a specific year and month.
    """
    df = df[
        (df.index.year == year)
        & (df.index.month == month)
        & (df["full_symbol"] == full_symbol)
    ]
    df = df.round(8)
    if "knowledge_timestamp" in df.columns.to_list():
        df = df.drop(columns=["knowledge_timestamp"])
        df.index.name = "time_seconds"
    return df


def _get_qa_stats(data: pd.DataFrame, source: str) -> pd.DataFrame:
    """
    Get quality assurance stats per full symbol.
    """
    res_stats = []
    for full_symbol, symbol_data in data.groupby("full_symbol"):
        # Compute stats for a full symbol.
        symbol_stats = pd.Series(dtype="object", name=full_symbol)
        symbol_stats["source"] = source
        symbol_stats["min_timestamp"] = symbol_data.index.min()
        symbol_stats["max_timestamp"] = symbol_data.index.max()
        symbol_stats["NaNs [%]"] = 100 * (
            costatis.compute_frac_nan(symbol_data["close"])
        )
        symbol_stats["volume=0 [%]"] = 100 * (
            symbol_data[symbol_data["volume"] == 0].shape[0]
            / symbol_data.shape[0]
        )
        symbol_stats["bad data [%]"] = (
            symbol_stats["NaNs [%]"] + symbol_stats["volume=0 [%]"]
        )
        res_stats.append(symbol_stats)
    res_stats_df = pd.concat(res_stats, axis=1).T
    return res_stats_df


def _load_crypto_chassis_ohlcv(
    exchange_id: str, currency_pair: str, timeout_sec: float = 60.0
) -> pd.DataFrame:
    """
    Load data from CryptoChassis API.

    The OHLC endpoint is queried with `startTime=0` and only the first URL
    listed in the `historical` section of the response is downloaded.

    :param exchange_id: exchange name as used in the API URL, e.g. `ftx`
    :param currency_pair: currency pair as used in the API URL, e.g.
        `doge-usdt`
    :param timeout_sec: timeout in seconds for the API request
    :return: data indexed by `time_seconds` with a leading `full_symbol`
        column in the `exchange::BASE_QUOTE` format
    :raises requests.HTTPError: if the API responds with an error status
    """
    url = f"https://api.cryptochassis.com/v1/ohlc/{exchange_id}/{currency_pair}?startTime=0"
    # Without a timeout `requests` can wait indefinitely for the server.
    response = requests.get(url, timeout=timeout_sec)
    # Fail on an HTTP error here rather than on a JSON / key error below.
    response.raise_for_status()
    df = pd.read_csv(
        response.json()["historical"]["urls"][0]["url"], compression="gzip"
    )
    # Convert epoch seconds to timestamps before using them as the index.
    df["time_seconds"] = df["time_seconds"].apply(
        lambda x: hdateti.convert_unix_epoch_to_timestamp(x, unit="s")
    )
    df = df.set_index("time_seconds")
    # E.g., ("ftx", "doge-usdt") -> "ftx::DOGE_USDT".
    full_symbol = (
        f"{exchange_id.lower()}::{currency_pair.upper().replace('-', '_')}"
    )
    df.insert(0, "full_symbol", full_symbol)
    return df

# Load data from CryptoChassis API

In [4]:
# Download FTX XRP/USDT data from CryptoChassis and keep only April 2022.
source_ftx_xrp = _load_crypto_chassis_ohlcv(
    exchange_id="ftx", currency_pair="xrp-usdt"
)
source_ftx_xrp_2022_4 = _get_full_symbol_data_for_year_month(
    df=source_ftx_xrp, full_symbol="ftx::XRP_USDT", year=2022, month=4
)

In [5]:
# Download FTX DOGE/USDT data from CryptoChassis and keep only March 2022.
source_ftx_doge = _load_crypto_chassis_ohlcv(
    exchange_id="ftx", currency_pair="doge-usdt"
)
source_ftx_doge_2022_3 = _get_full_symbol_data_for_year_month(
    df=source_ftx_doge, full_symbol="ftx::DOGE_USDT", year=2022, month=3
)

# Load data with client

In [6]:
# Parameters of the historical Parquet client.
universe_version = "v1"
# NOTE(review): presumably disables resampling to 1 minute on read, so the
# data is compared as stored — confirm against the client.
resample_1min = False
# Data location: `reorg/historical.manual.pq` under the "ck" bucket path.
root_dir = os.path.join(
    hs3.get_s3_bucket_path("ck"),
    "reorg",
    "historical.manual.pq",
)
partition_mode = "by_year_month"
client = imvccdcccc.CryptoChassisHistoricalPqByTileClient(
    universe_version, resample_1min, root_dir, partition_mode, aws_profile="ck"
)

In [7]:
# No time bounds and no column selection are passed to the client.
# NOTE(review): presumably `None` means "whole available period / all
# columns" — confirm against `read_data()`.
start_ts = None
end_ts = None
columns = None
filter_data_mode = "assert"
# Both symbols under investigation are read in one call.
full_symbols = ["ftx::XRP_USDT", "ftx::DOGE_USDT"]
s3_ftx = client.read_data(
    full_symbols, start_ts, end_ts, columns, filter_data_mode
)

# Compare data

## ftx::XRP_USDT

In [8]:
# Check that S3 and the source have the same number of rows for April 2022.
s3_ftx_xrp_2022_04 = _get_full_symbol_data_for_year_month(
    df=s3_ftx, full_symbol="ftx::XRP_USDT", year=2022, month=4
)
len(s3_ftx_xrp_2022_04) == len(source_ftx_xrp_2022_4)

True

## ftx::DOGE_USDT

In [9]:
# Check that the source and S3 have the same number of rows for March 2022.
s3_ftx_doge_2022_3 = _get_full_symbol_data_for_year_month(
    df=s3_ftx, full_symbol="ftx::DOGE_USDT", year=2022, month=3
)
len(source_ftx_doge_2022_3) == len(s3_ftx_doge_2022_3)

True

In [10]:
# Number of rows with a NaN `close` in the March 2022 ftx::DOGE_USDT data from S3.
len(s3_ftx_doge_2022_3.loc[s3_ftx_doge_2022_3["close"].isna()])

0

In [11]:
# Number of rows with `volume` equal to 0 in the March 2022 ftx::DOGE_USDT data from S3.
len(s3_ftx_doge_2022_3.loc[s3_ftx_doge_2022_3["volume"] == 0])

0

### Compare non-resampled data from source and S3

In [12]:
# Compare S3 and source data element-wise and tally the distinct per-row outcomes.
(
    s3_ftx_doge_2022_3
    .eq(source_ftx_doge_2022_3, axis="columns")
    .value_counts()
)

full_symbol  open  high  low   close  volume  vwap  number_of_trades  twap
True         True  True  True  True   True    True  True              True    20248
dtype: int64

### Compare resampled data from the source and S3

In [13]:
# Resample the source data with the "T" rule and set the symbol column on the result.
source_ftx_doge_2022_3_resampled = hpandas.resample_df(
    source_ftx_doge_2022_3, "T"
).assign(full_symbol="ftx::DOGE_USDT")
# Count the rows whose `close` is NaN after resampling.
len(
    source_ftx_doge_2022_3_resampled.loc[
        source_ftx_doge_2022_3_resampled["close"].isna()
    ]
)

24392

In [14]:
# Resample the S3 data with the "T" rule and set the symbol column on the result.
s3_ftx_doge_2022_3_resampled = hpandas.resample_df(
    s3_ftx_doge_2022_3, "T"
).assign(full_symbol="ftx::DOGE_USDT")
# Count the rows whose `close` is NaN after resampling.
len(
    s3_ftx_doge_2022_3_resampled.loc[
        s3_ftx_doge_2022_3_resampled["close"].isna()
    ]
)

24392

In [15]:
# QA stats for the original and the resampled data from S3 and from the source.
s3_stats = _get_qa_stats(data=s3_ftx_doge_2022_3, source="s3")
source_stats = _get_qa_stats(
    data=source_ftx_doge_2022_3, source="CryptoChassis"
)
s3_resampled_stats = _get_qa_stats(
    data=s3_ftx_doge_2022_3_resampled, source="s3_resampled"
)
source_resampled_stats = _get_qa_stats(
    data=source_ftx_doge_2022_3_resampled, source="CryptoChassis_resampled"
)

In [16]:
# Stack the per-dataset stats into one table and display it.
stats = pd.concat(
    [
        s3_stats,
        source_stats,
        s3_resampled_stats,
        source_resampled_stats,
    ],
    axis=0,
)
stats

Unnamed: 0,source,min_timestamp,max_timestamp,NaNs [%],volume=0 [%],bad data [%]
ftx::DOGE_USDT,s3,2022-03-01 00:00:00+00:00,2022-03-31 23:59:00+00:00,0.0,0.0,0.0
ftx::DOGE_USDT,CryptoChassis,2022-03-01 00:00:00+00:00,2022-03-31 23:59:00+00:00,0.0,0.0,0.0
ftx::DOGE_USDT,s3_resampled,2022-03-01 00:00:00+00:00,2022-03-31 23:59:00+00:00,54.641577,0.0,54.641577
ftx::DOGE_USDT,CryptoChassis_resampled,2022-03-01 00:00:00+00:00,2022-03-31 23:59:00+00:00,54.641577,0.0,54.641577


The number of NaNs after resampling is the same for the S3 data and the source data, i.e. the data that appears as NaNs on S3 is also absent at the source.