# Cross dataset OHLCV Data QA

This notebook performs quality assurance (QA) of OHLCV data across two datasets.
As shown below, the notebook reads the data QA parameters from an environment variable. The intended usage
is via the invoke target `dev_scripts.lib_tasks_data_qa.run_cross_dataset_qa_notebook`.

## Imports and logging

In [1]:
import logging

import pandas as pd

import core.config as cconfig
import data_schema.dataset_schema_utils as dsdascut
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hio as hio
import helpers.hprint as hprint
import im_v2.common.data.client.im_raw_data_client as imvcdcimrdc
import im_v2.common.data.qa.dataset_validator as imvcdqdava
import im_v2.common.data.qa.qa_check as imvcdqqach
import im_v2.common.universe.universe as imvcounun

  from tqdm.autonotebook import tqdm


### Logging

In [2]:
# Initialize logging at INFO verbosity before the notebook logger is created.
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

# Log the first element of the system signature for provenance of this run.
_LOG.info("%s", henv.get_system_signature()[0])

# Call the `hprint` notebook configuration helper.
hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-99ea5ec0-d5ca-40f4-9df9-b33fd05bf48c.json'
INFO  # Git
  branch_name='CmampTask7260_Make_QA_errors_more_informative'
  hash='89043f127'
  # Last commits:
    * 89043f127 Sonya Nikiforova Cm task7225 add a toc to a notebook 2 (#7254)                     (   9 hours ago) Mon Feb 19 02:19:04 2024  (HEAD -> CmampTask7260_Make_QA_errors_more_informative, origin/master, origin/HEAD, master)
    * f0603f9bb Nina Lee CmTask6284_Add_LimitPriceComputer_and_its_kwargs_to_the_System_config_2 (#7092) (  13 hours ago) Sun Feb 18 22:46:46 2024           
    * 83cb21e36 Juraj Smeriga CmampTask7165_Document_archiving_EFS_to_S3_flow (#7257)           (  19 hours ago) Sun Feb 18 16:29:57 2024           
# Machine info
  system=Linux
  node name=b4daca3a2157
  release=5.15.0-1052-aws
  version=#57~20.04.1-Ubuntu SMP Mon Jan 15 17:04:56 UTC 2024
  machine=x86_64
  processor=x86_64

## QA parameters

To assist debugging, you can override any of the parameters after they are loaded and rerun the QA.

In [3]:
# Name of the environment variable from which the QA config is read.
env_var_name = "CK_DATA_RECONCILIATION_CONFIG"
# The next cell falls back to a hard-coded config when this value is falsy.
config = cconfig.Config.from_env_var(env_var_name)



In [4]:
# Use the config passed through the environment when present; otherwise fall
# back to a hard-coded config that is convenient for debugging.
if config:
    config = config.to_dict()
else:
    config = {
        "stage": "preprod",
        "start_timestamp": "2024-01-03T20:12:00+00:00",
        "end_timestamp": "2024-01-03T20:50:00+00:00",
        "aws_profile": "ck",
        "dataset_signature1": "realtime.airflow.downloaded_1min.postgres.ohlcv.futures.v7_3.ccxt.binance.v1_0_0",
        "dataset_signature2": "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7_3.ccxt.binance.v1_0_0",
        "bid_ask_accuracy": 1,
    }
# `bid_ask_accuracy` is optional: cast it to int when it is defined, otherwise
# store None. `.get()` avoids a KeyError when an externally supplied config
# does not contain the key at all.
bid_ask_accuracy = config.get("bid_ask_accuracy")
config["bid_ask_accuracy"] = int(bid_ask_accuracy) if bid_ask_accuracy else None
# bid_ask_depth needs to be cast to int if it is defined
# config["bid_ask_depth"] = int(config["bid_ask_depth"]) if config["bid_ask_depth"] else None
# Display the effective config as the cell output.
config

{'stage': 'preprod',
 'start_timestamp': '2024-01-03T20:12:00+00:00',
 'end_timestamp': '2024-01-03T20:50:00+00:00',
 'aws_profile': 'ck',
 'dataset_signature1': 'realtime.airflow.downloaded_1min.postgres.ohlcv.futures.v7_3.ccxt.binance.v1_0_0',
 'dataset_signature2': 'periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7_3.ccxt.binance.v1_0_0',
 'bid_ask_accuracy': 1}

### Parse dataset signature

1. Load dataset schema
2. Validate dataset signature
3. Parse dataset attributes to drive some of the QA configuration

In [5]:
# Load the dataset schema once and reuse it to validate both signatures.
dataset_schema = dsdascut.get_dataset_schema()
# NOTE(review): the return value of this first call is discarded; presumably an
# invalid signature raises or is otherwise reported by the helper - confirm.
dsdascut.validate_dataset_signature(config["dataset_signature1"], dataset_schema)
# As the last expression, this call's return value is shown as the cell output.
dsdascut.validate_dataset_signature(config["dataset_signature2"], dataset_schema)

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


True

In [6]:
# Parse the first dataset signature into its attributes using the schema.
dataset_signature_as_dict1 = dsdascut.parse_dataset_signature_to_args(
    config["dataset_signature1"], dataset_schema
)
# Display the parsed attributes.
dataset_signature_as_dict1

{'download_mode': 'realtime',
 'downloading_entity': 'airflow',
 'action_tag': 'downloaded_1min',
 'data_format': 'postgres',
 'data_type': 'ohlcv',
 'asset_type': 'futures',
 'universe': 'v7_3',
 'vendor': 'ccxt',
 'exchange_id': 'binance',
 'version': 'v1_0_0'}

In [7]:
# Parse the second dataset signature into its attributes using the schema.
dataset_signature_as_dict2 = dsdascut.parse_dataset_signature_to_args(
    config["dataset_signature2"], dataset_schema
)
# Display the parsed attributes.
dataset_signature_as_dict2

{'download_mode': 'periodic_daily',
 'downloading_entity': 'airflow',
 'action_tag': 'downloaded_1min',
 'data_format': 'parquet',
 'data_type': 'ohlcv',
 'asset_type': 'futures',
 'universe': 'v7_3',
 'vendor': 'ccxt',
 'exchange_id': 'binance',
 'version': 'v1_0_0'}

## Load Data

TODO(Juraj): At the moment, we assume that the first dataset argument is a DB dataset and the second one comes from S3, because of the small preprocessing operations needed before performing QA.

### First dataset

In [8]:
# Reader bound to the first dataset signature and the configured stage.
raw_data_client = imvcdcimrdc.RawDataReader(
    config["dataset_signature1"],
    stage=config["stage"],
)
# Load the first dataset for the configured time window.
start_timestamp = pd.Timestamp(config["start_timestamp"])
end_timestamp = pd.Timestamp(config["end_timestamp"])
data1 = raw_data_client.read_data(start_timestamp, end_timestamp)

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Unable to fetch DB credentials from environment variables: 
	'POSTGRES_HOST'
	Attempting env file method.
INFO  Unable to fetch DB credentials from env file: 
	
################################################################################
* Failed assertion *
File '/app/amp/im_v2/devops/env/preprod.im_db_config.env' doesn't exist
################################################################################

	Attempting AWS SecretsManager method.
INFO  Fetching secret: preprod.im_data_db
INFO  Created preprod DB connection: 
 None
INFO  Enabled connection to the `ccxt_ohlcv_futures` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_ohlcv_futures WHERE timestamp >= 1704312720000 AND timestamp <

  df = pd.read_sql_query(query, connection)


In [9]:
# Preview the first rows of the first dataset as loaded.
data1.head()

Unnamed: 0,id,timestamp,open,high,low,close,volume,currency_pair,exchange_id,end_download_timestamp,knowledge_timestamp
0,436454641,1704312720000,2209.34,2209.35,2207.49,2207.5,448.182,ETH_USDT,binance,2024-01-03 20:13:10.020069+00:00,2024-01-03 20:13:10.044211+00:00
1,436454642,1704312720000,42613.6,42613.7,42574.1,42574.1,76.916,BTC_USDT,binance,2024-01-03 20:13:10.020484+00:00,2024-01-03 20:13:10.044211+00:00
2,436454643,1704312720000,0.5088,0.5088,0.508,0.5081,91672.0,SAND_USDT,binance,2024-01-03 20:13:10.020835+00:00,2024-01-03 20:13:10.044211+00:00
3,436454644,1704312720000,0.6235,0.6235,0.6201,0.6201,111169.0,STORJ_USDT,binance,2024-01-03 20:13:10.021199+00:00,2024-01-03 20:13:10.044211+00:00
4,436454645,1704312720000,0.3087,0.3088,0.3084,0.3084,114903.0,GMT_USDT,binance,2024-01-03 20:13:10.021623+00:00,2024-01-03 20:13:10.044211+00:00


### Second dataset

In [10]:
# Reader bound to the second dataset signature and the configured stage.
raw_data_client = imvcdcimrdc.RawDataReader(
    config["dataset_signature2"],
    stage=config["stage"],
)
# Load the second dataset for the configured time window and replace its
# index with a fresh positional one.
start_timestamp = pd.Timestamp(config["start_timestamp"])
end_timestamp = pd.Timestamp(config["end_timestamp"])
data2 = raw_data_client.read_data(start_timestamp, end_timestamp).reset_index(
    drop=True
)

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


In [11]:
# Preview the first rows of the second dataset as loaded.
data2.head()

Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
0,1704312720000,1.452,1.452,1.45,1.451,43148.0,binance,2024-01-04 01:23:12.362895+00:00,APE_USDT,2024,1
1,1704312780000,1.451,1.453,1.45,1.452,21826.0,binance,2024-01-04 01:23:12.362895+00:00,APE_USDT,2024,1
2,1704312840000,1.451,1.453,1.45,1.453,12976.0,binance,2024-01-04 01:23:12.362895+00:00,APE_USDT,2024,1
3,1704312900000,1.452,1.456,1.452,1.456,38907.0,binance,2024-01-04 01:23:12.362895+00:00,APE_USDT,2024,1
4,1704312960000,1.455,1.458,1.455,1.457,55444.0,binance,2024-01-04 01:23:12.362895+00:00,APE_USDT,2024,1


### Preprocess raw data
- remove columns unimportant for QA
- remove duplicates

In [12]:
# Columns that take part in QA; everything else is dropped.
cols_to_keep = [
    "timestamp",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "currency_pair",
    "exchange_id",
]
# Both datasets get the same ordering so that they are comparable row by row:
# by symbol first, then by timestamp, with a fresh positional index.
sort_kwargs = {
    "by": ["currency_pair", "timestamp"],
    "ascending": True,
    "ignore_index": True,
}
data1 = data1[cols_to_keep].sort_values(**sort_kwargs)
data2 = data2[cols_to_keep].sort_values(**sort_kwargs)

In [13]:
# Drop rows that are exact duplicates across all kept columns; rows that share
# a timestamp and symbol but differ in values are kept (see the
# `DuplicateDifferingOhlcvCheck` used below).
data1 = data1.drop_duplicates()
data2 = data2.drop_duplicates()

In [14]:
# Preview the first dataset after column selection, sorting and de-duplication.
data1.head()

Unnamed: 0,timestamp,open,high,low,close,volume,currency_pair,exchange_id
0,1704312720000,1.452,1.452,1.45,1.451,43148.0,APE_USDT,binance
1,1704312780000,1.451,1.453,1.45,1.452,21826.0,APE_USDT,binance
2,1704312840000,1.451,1.453,1.45,1.453,12976.0,APE_USDT,binance
3,1704312900000,1.452,1.456,1.452,1.456,38907.0,APE_USDT,binance
4,1704312960000,1.455,1.458,1.455,1.457,55444.0,APE_USDT,binance


In [15]:
# Preview the second dataset after column selection, sorting and de-duplication.
data2.head()

Unnamed: 0,timestamp,open,high,low,close,volume,currency_pair,exchange_id
0,1704312720000,1.452,1.452,1.45,1.451,43148.0,APE_USDT,binance
1,1704312780000,1.451,1.453,1.45,1.452,21826.0,APE_USDT,binance
2,1704312840000,1.451,1.453,1.45,1.453,12976.0,APE_USDT,binance
3,1704312900000,1.452,1.456,1.452,1.456,38907.0,APE_USDT,binance
4,1704312960000,1.455,1.458,1.455,1.457,55444.0,APE_USDT,binance


## Initialize QA checks

### Single dataset checks

In [16]:
# Datasets and their parsed signatures, listed in matching order.
datasets = [data1, data2]
signatures = [dataset_signature_as_dict1, dataset_signature_as_dict2]
# Receives one list of single-dataset QA checks per signature (filled below).
qa_check_lists = []

In [17]:
for signature in signatures:
    # TODO(Juraj): this behavior should be encapsulated in some utility function
    # Derive the expected data frequency from the action tag of the signature.
    if "1min" in signature["action_tag"]:
        data_frequency = "T"
    else:
        data_frequency = "S"
    # Parameters used to look up the vendor universe for this signature.
    vendor_name = signature["vendor"].upper()
    mode = "download"
    version = signature["universe"].replace("_", ".")
    exchange_id = signature["exchange_id"]
    universe = imvcounun.get_vendor_universe(vendor_name, mode, version=version)
    # Universe entries for the exchange of this dataset.
    universe_list = universe[exchange_id]
    # Single-dataset checks, in the order in which they are reported.
    gaps_check = imvcdqqach.GapsInTimeIntervalBySymbolsCheck(
        config["start_timestamp"], config["end_timestamp"], data_frequency
    )
    qa_check_list = [
        gaps_check,
        imvcdqqach.NaNChecks(),
        imvcdqqach.OhlcvLogicalValuesCheck(),
        imvcdqqach.FullUniversePresentCheck(universe_list),
        imvcdqqach.DuplicateDifferingOhlcvCheck(),
    ]
    qa_check_lists.append(qa_check_list)

### Cross dataset checks

In [18]:
# Checks that take both datasets as input.
cross_qa_check_list = [imvcdqqach.OuterCrossOHLCVDataCheck()]

## Initialize QA validators

In [19]:
# One validator per dataset for the single-dataset checks, plus one validator
# for the checks that take both datasets.
validator_cls = imvcdqdava.DataFrameDatasetValidator
dataset_validator1 = validator_cls(qa_check_lists[0])
dataset_validator2 = validator_cls(qa_check_lists[1])
cross_dataset_validator = validator_cls(cross_qa_check_list)

## Run QA

In [22]:
# Run the single-dataset QA for each dataset with its OWN validator, then the
# cross-dataset QA. Errors are collected instead of aborting on the first
# failure (`abort_on_error=False`) so that all problems are reported together.
full_error_msgs = []
status = "SUCCESS"
# (log label, validator, datasets passed to the validator).
qa_runs = [
    ("First dataset QA:", dataset_validator1, [data1]),
    # The second dataset must be validated with `dataset_validator2`, which was
    # built from the second signature (universe, frequency).
    ("Second dataset QA:", dataset_validator2, [data2]),
    ("Cross dataset QA:", cross_dataset_validator, datasets),
]
for label, validator, validated_datasets in qa_runs:
    _LOG.info(label)
    error_msgs = validator.run_all_checks(
        validated_datasets, abort_on_error=False
    )
    if error_msgs:
        full_error_msgs.append(error_msgs)
        _LOG.info(error_msgs)
        status = "FAILED"
# The outcome is "FAILED" as soon as any of the runs reported errors.
data_qa_outcome = status
# Merge the messages of all failed runs into a single report.
full_error_msgs = "\n".join(full_error_msgs)

INFO  First dataset QA:
INFO  Running all QA checks:
INFO  	GapsInTimeIntervalBySymbolsCheck: PASSED
INFO  	NaNChecks: PASSED
INFO  	OhlcvLogicalValuesCheck: PASSED
INFO  	FullUniversePresentCheck: PASSED
INFO  	DuplicateDifferingOhlcvCheck: FAILED: Duplicate table contents:
	         timestamp    open    high     low   close  volume currency_pair exchange_id
186  1704314520000  315.72  315.94  315.53  315.71  493.17      BNB_USDT     binance
187  1704314520000  315.72  315.94  315.53  315.75  509.53      BNB_USDT     binance
INFO  Second dataset QA:
INFO  Running all QA checks:
INFO  	GapsInTimeIntervalBySymbolsCheck: PASSED
INFO  	NaNChecks: PASSED
INFO  	OhlcvLogicalValuesCheck: PASSED
INFO  	FullUniversePresentCheck: PASSED
INFO  	DuplicateDifferingOhlcvCheck: PASSED
INFO  Cross dataset QA:
INFO  Running all QA checks:
INFO  	OuterCrossOHLCVDataCheck: FAILED: Different data found:
	         timestamp  open_A  high_A   low_A  close_A  volume_A currency_pair exchange_id_A  open_B  hi

In [26]:
# Fail the notebook when any QA run reported errors, surfacing every collected
# error message in the failure.
if data_qa_outcome == "FAILED":
    hdbg.dfatal(f"QA Check unsuccessful for atleast one of the dataset, with the following errors:\n {full_error_msgs}")

AssertionError: 
################################################################################
QA Check unsuccessful for atleast one of the dataset, with the following errors:
 	DuplicateDifferingOhlcvCheck: FAILED: Duplicate table contents:
	         timestamp    open    high     low   close  volume currency_pair exchange_id
186  1704314520000  315.72  315.94  315.53  315.71  493.17      BNB_USDT     binance
187  1704314520000  315.72  315.94  315.53  315.75  509.53      BNB_USDT     binance
	OuterCrossOHLCVDataCheck: FAILED: Different data found:
	         timestamp  open_A  high_A   low_A  close_A  volume_A currency_pair exchange_id_A  open_B  high_B   low_B  close_B  volume_B exchange_id_B  QAcheck
186  1704314520000  315.72  315.94  315.53   315.71    493.17      BNB_USDT       binance  315.72  315.94  315.53   315.75    509.53       binance    False
################################################################################


In [None]:
# This can be read by the invoke task to find out if QA was successful.
# NOTE(review): the preceding cell raises when the outcome is "FAILED"; if the
# notebook executor stops at the first error, this write is skipped on failure -
# confirm that the invoke task handles a missing outcome file.
hio.to_file("/app/ck_data_reconciliation_outcome.txt", data_qa_outcome)