In [1]:
import logging

import pandas as pd

import core.config as cconfig
import data_schema.dataset_schema_utils as dsdascut
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hio as hio
import helpers.hprint as hprint
import im_v2.common.data.client.im_raw_data_client as imvcdcimrdc
import im_v2.common.data.qa.dataset_validator as imvcdqdava
import im_v2.common.data.qa.qa_check as imvcdqqach
import im_v2.common.universe.universe as imvcounun

  from tqdm.autonotebook import tqdm


In [2]:
# Initialize logging at INFO verbosity and create the notebook-level logger.
hdbg.init_logger(verbosity=logging.INFO)
_LOG = logging.getLogger(__name__)

# Log the first element of the system signature reported by `henv`.
system_signature = henv.get_system_signature()[0]
_LOG.info("%s", system_signature)

# Run the project's notebook configuration helper.
hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-7b7d6ea8-e37e-4ed2-a74a-68f3891a5ab7.json'
[31m-----------------------------------------------------------------------------
This code is not in sync with the container:
code_version='1.10.0' != container_version='1.12.0'
-----------------------------------------------------------------------------
You need to:
- merge origin/master into your branch with `invoke git_merge_master`
- pull the latest container with `invoke docker_pull`[0m
INFO  # Git
  branch_name='CmampTask6647_Make_bid_ask_QA_more_precise'
  hash='e59affd79'
  # Last commits:
    * e59affd79 Dan      Cm task6547 investigate extra asset id in portfolio from df (#6611) (  17 hours ago) Wed Jan 3 16:03:07 2024  (HEAD -> CmampTask6647_Make_bid_ask_QA_more_precise, origin/master, origin/HEAD)
    * d8220c853 Samarth KaPatel CmTask6505_add_update_ecs_section (#6561)                         (  18 hou

In [3]:
# Name of the env var from which the config is read, when it is set.
env_var_name = "CK_DATA_RECONCILIATION_CONFIG"
config = cconfig.Config.from_env_var(env_var_name)
if config:
    # Get config from env when running the notebook via the `run_notebook.py`
    # script, e.g., in the system reconciliation flow.
    config = config.to_dict()
    # `bid_ask_accuracy` needs to be cast to int if it is defined. Look it up
    # with `.get()` so that a config without the key yields `None` instead of
    # raising `KeyError`.
    bid_ask_accuracy = config.get("bid_ask_accuracy")
    config["bid_ask_accuracy"] = (
        int(bid_ask_accuracy) if bid_ask_accuracy else None
    )
    _LOG.info("Using config from env vars")
else:
    # No config found in the env var: fall back to a hard-coded config.
    config_dict = {
        "stage": "preprod",
        "start_timestamp": "2024-01-03T18:00:00+00:00",
        "end_timestamp": "2024-01-03T18:30:00+00:00",
        "aws_profile": "ck",
        "dataset_signature": "periodic_daily.airflow.downloaded_200ms.postgres.bid_ask.futures.v7_3.ccxt.binance.v1_0_0",
        "bid_ask_accuracy": 1,
        "data_type": "bid_ask",
        "bid_ask_depth": 1,
        "bid_ask_frequency_sec": "10S",
    }
    config = cconfig.Config.from_dict(config_dict)
print(config)

stage: preprod
start_timestamp: 2024-01-03T18:00:00+00:00
end_timestamp: 2024-01-03T18:30:00+00:00
aws_profile: ck
dataset_signature: periodic_daily.airflow.downloaded_200ms.postgres.bid_ask.futures.v7_3.ccxt.binance.v1_0_0
bid_ask_accuracy: 1
data_type: bid_ask
bid_ask_depth: 1
bid_ask_frequency_sec: 10S


In [4]:
# Load the dataset schema and validate the configured dataset signature
# against it; the validation result is displayed as the cell output.
dataset_schema = dsdascut.get_dataset_schema()
dsdascut.validate_dataset_signature(config["dataset_signature"], dataset_schema)

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


True

In [5]:
# Parse the dataset signature into a dict of its named fields (e.g.
# `action_tag`, `universe`, `vendor`, `exchange_id`) and display it.
dataset_signature_as_dict = dsdascut.parse_dataset_signature_to_args(
    config["dataset_signature"], dataset_schema
)
dataset_signature_as_dict

{'download_mode': 'periodic_daily',
 'downloading_entity': 'airflow',
 'action_tag': 'downloaded_200ms',
 'data_format': 'postgres',
 'data_type': 'bid_ask',
 'asset_type': 'futures',
 'universe': 'v7_3',
 'vendor': 'ccxt',
 'exchange_id': 'binance',
 'version': 'v1_0_0'}

In [6]:
# Build the raw data reader for the configured dataset signature and stage.
raw_data_client = imvcdcimrdc.RawDataReader(
    config["dataset_signature"], stage=config["stage"]
)

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Unable to fetch DB credentials from environment variables: 
	'POSTGRES_HOST'
	Attempting env file method.
INFO  Unable to fetch DB credentials from env file: 
	
################################################################################
* Failed assertion *
'preprod' in '['local', 'dev', 'prod']'
################################################################################

	Attempting AWS SecretsManager method.
INFO  Fetching secret: preprod.im_data_db
INFO  Created preprod DB connection: 
 None
INFO  Enabled connection to the `ccxt_bid_ask_futures_raw` DB table


In [7]:
# Read the data for the configured time window, requesting bid/ask levels
# 1 through `bid_ask_depth`.
read_start = pd.Timestamp(config["start_timestamp"])
read_end = pd.Timestamp(config["end_timestamp"])
levels_to_read = list(range(1, config["bid_ask_depth"] + 1))
data = raw_data_client.read_data(
    read_start,
    read_end,
    bid_ask_levels=levels_to_read,
)

INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_futures_raw WHERE timestamp >= 1704304800000 AND timestamp <= 1704306600000 AND level IN (1) AND exchange_id = 'binance'


  df = pd.read_sql_query(query, connection)


In [9]:
# Preprocessing.
# Move the index back into regular columns.
data = data.reset_index()
# Columns kept regardless of the requested depth.
cols_to_keep = [
    "timestamp",
    "currency_pair",
    "exchange_id",
    "end_download_timestamp",
    "knowledge_timestamp",
]
# Add the size and price columns for every requested level, in the order
# bid size, ask size, bid price, ask price.
cols_to_keep.extend(
    f"{side}_{field}_l{level}"
    for level in range(1, config["bid_ask_depth"] + 1)
    for field in ("size", "price")
    for side in ("bid", "ask")
)
data = data[cols_to_keep]

In [10]:
# Preview the first rows of the preprocessed data.
data.head()

Unnamed: 0,timestamp,currency_pair,exchange_id,end_download_timestamp,knowledge_timestamp,bid_size_l1,ask_size_l1,bid_price_l1,ask_price_l1
0,1704304800110,FTM_USDT,binance,2024-01-03 18:00:00.540434+00:00,2024-01-03 18:00:01.468027+00:00,1920.0,3042.0,0.4332,0.4333
1,1704304800113,UNFI_USDT,binance,2024-01-03 18:00:00.534339+00:00,2024-01-03 18:00:01.468027+00:00,2.1,13.9,6.245,6.246
2,1704304800122,AXS_USDT,binance,2024-01-03 18:00:00.542158+00:00,2024-01-03 18:00:01.468027+00:00,74.0,163.0,7.932,7.933
3,1704304800143,DYDX_USDT,binance,2024-01-03 18:00:00.532904+00:00,2024-01-03 18:00:01.468027+00:00,5664.5,2318.8,2.676,2.677
4,1704304800199,ETH_USDT,binance,2024-01-03 18:00:00.461876+00:00,2024-01-03 18:00:01.468027+00:00,1.809,53.872,2223.71,2223.72


In [11]:
# TODO(Juraj): this behavior should be encapsulated in some utility function
# Derive the universe lookup arguments from the parsed dataset signature.
action_tag = dataset_signature_as_dict["action_tag"]
# "T" for action tags containing "1min", "S" otherwise.
data_frequency = "T" if "1min" in action_tag else "S"
vendor_name = dataset_signature_as_dict["vendor"].upper()
mode = "download"
# The universe version is passed with dots instead of underscores.
universe_tag = dataset_signature_as_dict["universe"]
version = universe_tag.replace("_", ".")
exchange_id = dataset_signature_as_dict["exchange_id"]
# Fetch the vendor universe and keep the entries for this exchange.
universe = imvcounun.get_vendor_universe(vendor_name, mode, version=version)
universe_list = universe[exchange_id]

In [12]:
# Bid ask data is aligned to the nearest grid so adjust the
# end_timestamp to avoid the corner case.
def adjust_end_timestamp(timestamp_str):
    """
    Shift a timestamp one second back.

    :param timestamp_str: timestamp in a format accepted by `pd.Timestamp`
    :return: string representation of the timestamp minus one second
    """
    one_second = pd.Timedelta(seconds=1)
    shifted_timestamp = pd.Timestamp(timestamp_str) - one_second
    return str(shifted_timestamp)

In [13]:
# Display the configured end timestamp shifted back by one second.
adjust_end_timestamp(config["end_timestamp"])

'2024-01-03 18:29:59+00:00'

In [14]:
# Build the QA checks: NaN check, full-universe check and per-symbol gap check.
nan_check = imvcdqqach.NaNChecks()
universe_check = imvcdqqach.FullUniversePresentCheck(universe_list)
# NOTE(review): the gap check receives the unadjusted `end_timestamp`;
# `adjust_end_timestamp()` is not applied here -- confirm this is intended.
gaps_check = imvcdqqach.GapsInTimeIntervalBySymbolsCheck(
    config["start_timestamp"],
    config["end_timestamp"],
    config["bid_ask_frequency_sec"],
    align=True,
)
qa_check_list = [nan_check, universe_check, gaps_check]

In [15]:
# Wrap the QA checks in a validator that operates on DataFrames.
dataset_validator = imvcdqdava.DataFrameDatasetValidator(qa_check_list)

In [16]:
# Run every QA check and record the outcome in `data_qa_outcome`.
try:
    dataset_validator.run_all_checks([data])
except Exception as qa_error:
    # Pass information about the failure of the QA back to the task that
    # invoked it, then propagate the error.
    data_qa_outcome = str(qa_error)
    raise qa_error
else:
    # No exception was raised: mark the QA as successful.
    data_qa_outcome = "SUCCESS"

INFO  Running all QA checks:
INFO  	NaNChecks: PASSED
INFO  	FullUniversePresentCheck: PASSED
INFO  	GapsInTimeIntervalBySymbolsCheck: PASSED


In [17]:
# Persist the QA outcome; this file can be read by the invoke task to find
# out if QA was successful.
outcome_path = "/app/ck_data_reconciliation_outcome.txt"
hio.to_file(outcome_path, data_qa_outcome)