# Description

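This notebook demonstrates how to use the `MarketData` classes: `ImClientMarketData`, `StitchedMarketData`, and `ReplayedMarketData` (reference code for the latter is still to be added).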

In [1]:
# Automatically reload imported modules before executing each cell, so edits
# to the library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [2]:
import logging

import pandas as pd

import core.config as cconfig
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hprint as hprint
import im_v2.ccxt.data.client as icdcl
import im_v2.common.universe as ivcu
import market_data.market_data_example as mdmadaex

  from tqdm.autonotebook import tqdm


In [3]:
# Initialize logging for the notebook at INFO verbosity.
log_level = logging.INFO
hdbg.init_logger(verbosity=log_level)
_LOG = logging.getLogger(__name__)

# Log the first element of the system signature (Git and machine info).
signature_parts = henv.get_system_signature()
_LOG.info("%s", signature_parts[0])

# Apply the notebook-wide display configuration.
hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-41bef88b-2880-47fe-95e0-535cc5e2b1ce.json'
INFO  # Git
  branch_name='CmampTask6752_Merge_DAG_optimization_branches'
  hash='4c0210707'
  # Last commits:
    *   4c0210707 Sameep Pote Merge branch 'master' into CmampTask6752_Merge_DAG_optimization_branches (    3 days ago) Fri Feb 16 14:47:59 2024  (HEAD -> CmampTask6752_Merge_DAG_optimization_branches, origin/CmampTask6752_Merge_DAG_optimization_branches)
    |\  
    | * 54cee75d7 Vlad     CmampTask7097_currency_pair_has_incompatible_types_use_legacy_dataset (#7201) (    3 days ago) Fri Feb 16 14:24:22 2024           
    | * b102c0dfd Juraj Smeriga CmampTask7137_Test_market_data_with_realtime_bidask_resampled_1_min_clients (#7171) (    3 days ago) Fri Feb 16 13:55:35 2024           
# Machine info
  system=Linux
  node name=7edba6067cba
  release=5.15.0-1052-aws
  version=#57~20.04.1-Ubuntu SMP Mon Jan 15 17

# Get asset ids

In [4]:
# Universe version shared by the universe lookup and the IM client configs.
universe_version = "v7.4"
# Keyword arguments passed to `ivcu.get_vendor_universe()`.
universe_config = dict(
    vendor="CCXT",
    version=universe_version,
    mode="trade",
    as_full_symbol=True,
)

In [5]:
# Load the vendor universe and keep only a two-symbol slice for the
# demonstration.
full_symbols = ivcu.get_vendor_universe(**universe_config)[4:6]
_LOG.info("Full symbols=%s", full_symbols)

INFO  Full symbols=['binance::BNB_USDT', 'binance::BTC_USDT']


In [6]:
# Build the numerical-id -> full-symbol mapping; its keys are the asset ids.
asset_id_to_full_symbol = ivcu.build_numerical_to_string_id_mapping(
    full_symbols
)
asset_ids = list(asset_id_to_full_symbol.keys())
_LOG.info("Asset ids=%s", asset_ids)

INFO  Asset ids=[8968126878, 1467591036]


# `ImClientMarketData`

In [7]:
# Config for the `ImClientMarketData` example: the interval to load, the
# keyword arguments for the IM client, and the market data options.
# The dict literal is passed straight to `from_dict()` so the name is only
# ever bound to a `Config` object.
im_client_market_data_config = cconfig.Config().from_dict(
    {
        "start_timestamp": pd.Timestamp("2023-09-11T00:00:00", tz="UTC"),
        "end_timestamp": pd.Timestamp("2023-09-11T04:00:00", tz="UTC"),
        # Keyword arguments for `icdcl.CcxtHistoricalPqByTileClient`.
        "im_client": {
            "universe_version": universe_version,
            "root_dir": "s3://cryptokaizen-data-test/v3",
            "partition_mode": "by_year_month",
            "dataset": "ohlcv",
            "contract_type": "futures",
            "data_snapshot": "",
            "aws_profile": "ck",
            "resample_1min": False,
            "version": "v1_0_0",
            "download_universe_version": "v7_3",
            "tag": "downloaded_1min",
        },
        "ts_col_name": "timestamp",
        "columns": None,
        "column_remap": None,
        "filter_data_mode": "assert",
        "wall_clock_time": pd.Timestamp("2100-01-01 00:00:00+00:00"),
    }
)
print(im_client_market_data_config)

start_timestamp: 2023-09-11 00:00:00+00:00
end_timestamp: 2023-09-11 04:00:00+00:00
im_client: 
  universe_version: v7.4
  root_dir: s3://cryptokaizen-data-test/v3
  partition_mode: by_year_month
  dataset: ohlcv
  contract_type: futures
  data_snapshot: 
  aws_profile: ck
  resample_1min: False
  version: v1_0_0
  download_universe_version: v7_3
  tag: downloaded_1min
ts_col_name: timestamp
columns: None
column_remap: None
filter_data_mode: assert
wall_clock_time: 2100-01-01 00:00:00+00:00


In [8]:
# Short alias for the config that drives this example.
md_config = im_client_market_data_config

# Build the historical OHLCV IM client from the `im_client` config section.
ohlcv_im_client = icdcl.CcxtHistoricalPqByTileClient(**md_config["im_client"])

# Wrap the IM client into a market data object for the selected assets.
ohlcv_market_data = mdmadaex.get_HistoricalImClientMarketData_example1(
    ohlcv_im_client,
    asset_ids,
    md_config["columns"],
    md_config["column_remap"],
    wall_clock_time=md_config["wall_clock_time"],
    filter_data_mode=md_config["filter_data_mode"],
)

# Load the data for the configured interval and preview the first rows.
interval_start = md_config["start_timestamp"]
interval_end = md_config["end_timestamp"]
ohlcv_data = ohlcv_market_data.get_data_for_interval(
    interval_start,
    interval_end,
    md_config["ts_col_name"],
    asset_ids,
)
ohlcv_data.head(3)

Unnamed: 0_level_0,asset_id,full_symbol,open,high,low,close,volume,knowledge_timestamp,start_ts
end_ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-09-10 20:00:00-04:00,1467591036,binance::BTC_USDT,25829.0,25830.5,25828.4,25828.4,35.063,2023-09-11 01:23:22.496982+00:00,2023-09-10 19:59:00-04:00
2023-09-10 20:00:00-04:00,8968126878,binance::BNB_USDT,212.25,212.29,212.25,212.28,120.72,2023-09-11 01:23:28.294152+00:00,2023-09-10 19:59:00-04:00
2023-09-10 20:01:00-04:00,1467591036,binance::BTC_USDT,25828.4,25835.8,25828.1,25835.7,152.568,2023-09-12 01:23:44.163254+00:00,2023-09-10 20:00:00-04:00


# `StitchedMarketData`

In [12]:
# Config for the `StitchedMarketData` example: one section per underlying
# market data object (OHLCV, bid/ask) plus the options for the stitched one.
stitched_market_data_config = {
    # NOTE(review): the saved output of this cell shows an end timestamp of
    # 2023-05-01 04:00 and a different `root_dir`, so the output looks stale;
    # re-run the cell to confirm the intended interval.
    "start_timestamp": pd.Timestamp("2023-05-01T00:00:00", tz="UTC"),
    "end_timestamp": pd.Timestamp("2023-05-10T04:00:00", tz="UTC"),
    "ohlcv_market_data": {
        "im_client": {
            "universe_version": universe_version,
            "root_dir": "s3://cryptokaizen-unit-test/outcomes/Test_run_all_market_data_reference_notebook/v3/",
            "partition_mode": "by_year_month",
            "dataset": "ohlcv",
            "contract_type": "futures",
            "data_snapshot": "",
            "aws_profile": "ck",
            "resample_1min": False,
            "version": "v1_0_0",
            "download_universe_version": "v7_3",
            "tag": "downloaded_1min",
        },
        "ts_col_name": "timestamp",
        "columns": None,
        "column_remap": None,
        "filter_data_mode": "assert",
    },
    "bid_ask_market_data": {
        # Each key appears exactly once: a repeated key in a dict literal is
        # silently overridden by its last occurrence.
        "im_client": {
            # Universe version (the download universe version is set
            # separately via `download_universe_version`).
            "universe_version": universe_version,
            "dataset": "bid_ask",
            "contract_type": "futures",
            # Data snapshot is not applicable for data version = "v3".
            "data_snapshot": "",
            # Data currently residing in the test bucket
            "root_dir": "s3://cryptokaizen-unit-test/outcomes/Test_run_all_market_data_reference_notebook/v3/",
            "partition_mode": "by_year_month",
            # v2_0_0 is used due to addition of new column in #CmTask7224.
            "version": "v2_0_0",
            "download_universe_version": "v7",
            "tag": "resampled_1min",
            "aws_profile": "ck",
        },
        "ts_col_name": "timestamp",
        # TODO(Grisha): for some reason the current filtering mechanism filters out `asset_ids` which
        # makes it impossible to stitch the 2 market data dfs. So adding the necessary columns manually.
        # Note(Juraj): we currently resampled only top of the book so no need to filter the columns
        # "columns": cfibiask.get_bid_ask_columns_by_level(1)
        # + ["asset_id", "full_symbol", "start_ts", "knowledge_timestamp"],
        "columns": None,
        "column_remap": None,
        "filter_data_mode": "assert",
    },
    "stitched_market_data": {
        "ts_col_name": "timestamp",
        "columns": None,
        "column_remap": None,
        # TODO(Grisha): check why it fails when the mode is `assert`.
        "filter_data_mode": "warn_and_trim",
    },
}
stitched_market_data_config = cconfig.Config().from_dict(
    stitched_market_data_config
)
print(stitched_market_data_config)

start_timestamp: 2023-05-01 00:00:00+00:00
end_timestamp: 2023-05-01 04:00:00+00:00
ohlcv_market_data: 
  im_client: 
    universe_version: v7.4
    root_dir: s3://cryptokaizen-data.preprod/v3
    partition_mode: by_year_month
    dataset: ohlcv
    contract_type: futures
    data_snapshot: 
    aws_profile: ck
    resample_1min: False
    version: v1_0_0
    download_universe_version: v7_3
    tag: downloaded_1min
  ts_col_name: timestamp
  columns: None
  column_remap: None
  filter_data_mode: assert
bid_ask_market_data: 
  im_client: 
    universe_version: v7.4
    dataset: bid_ask
    contract_type: futures
    data_snapshot: 
    root_dir: s3://cryptokaizen-data.preprod/v3
    partition_mode: by_year_month
    version: v2_0_0
    download_universe_version: v7
    tag: resampled_1min
    aws_profile: ck
  ts_col_name: timestamp
  columns: None
  column_remap: None
  filter_data_mode: assert
stitched_market_data: 
  ts_col_name: timestamp
  columns: None
  column_remap: None
  filte

In [13]:
# Config section describing the OHLCV market data.
ohlcv_config = stitched_market_data_config["ohlcv_market_data"]

# Build the historical OHLCV IM client and wrap it into a market data object.
ohlcv_im_client = icdcl.CcxtHistoricalPqByTileClient(
    **ohlcv_config["im_client"]
)
ohlcv_market_data = mdmadaex.get_HistoricalImClientMarketData_example1(
    ohlcv_im_client,
    asset_ids,
    ohlcv_config["columns"],
    ohlcv_config["column_remap"],
    filter_data_mode=ohlcv_config["filter_data_mode"],
)

# Load the OHLCV data for the configured interval and preview the first rows.
ohlcv_start = stitched_market_data_config["start_timestamp"]
ohlcv_end = stitched_market_data_config["end_timestamp"]
ohlcv_data = ohlcv_market_data.get_data_for_interval(
    ohlcv_start,
    ohlcv_end,
    ohlcv_config["ts_col_name"],
    asset_ids,
)
ohlcv_data.head(3)



Unnamed: 0_level_0,asset_id,full_symbol,open,high,low,close,volume,knowledge_timestamp,start_ts
end_ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-04-30 20:00:00-04:00,1467591036,binance::BTC_USDT,29247.0,29253.2,29145.5,29223.0,2888.894,2023-05-01 01:22:07.571121+00:00,2023-04-30 19:59:00-04:00
2023-04-30 20:00:00-04:00,8968126878,binance::BNB_USDT,336.96,337.15,336.41,336.84,6341.86,2023-05-01 01:22:16.403833+00:00,2023-04-30 19:59:00-04:00
2023-04-30 20:01:00-04:00,1467591036,binance::BTC_USDT,29223.0,29245.8,29206.9,29240.7,1550.598,2023-05-02 01:22:09.764846+00:00,2023-04-30 20:00:00-04:00


In [14]:
# Config section describing the bid/ask market data.
bid_ask_config = stitched_market_data_config["bid_ask_market_data"]

# Build the historical bid/ask IM client and wrap it into a market data object.
bid_ask_im_client = icdcl.CcxtHistoricalPqByTileClient(
    **bid_ask_config["im_client"]
)
bid_ask_market_data = mdmadaex.get_HistoricalImClientMarketData_example1(
    bid_ask_im_client,
    asset_ids,
    bid_ask_config["columns"],
    bid_ask_config["column_remap"],
    filter_data_mode=bid_ask_config["filter_data_mode"],
)

In [15]:
# Config section with the options for the stitched market data.
stitched_config = stitched_market_data_config["stitched_market_data"]

# Combine the bid/ask and the OHLCV market data into a single stitched object.
stitched_mdata = mdmadaex.get_HorizontalStitchedMarketData_example1(
    bid_ask_market_data,
    ohlcv_market_data,
    asset_ids,
    stitched_config["columns"],
    stitched_config["column_remap"],
    filter_data_mode=stitched_config["filter_data_mode"],
)

# Load the stitched data for the configured interval and preview the first rows.
stitched_start = stitched_market_data_config["start_timestamp"]
stitched_end = stitched_market_data_config["end_timestamp"]
stitched_mdata_df = stitched_mdata.get_data_for_interval(
    stitched_start,
    stitched_end,
    stitched_config["ts_col_name"],
    asset_ids,
)
stitched_mdata_df.head(3)

Unnamed: 0_level_0,asset_id,full_symbol,level_1.bid_price.open,level_1.bid_size.open,level_1.ask_price.open,level_1.ask_size.open,level_1.bid_ask_midpoint.open,level_1.half_spread.open,level_1.log_size_imbalance.open,level_1.bid_price.close,level_1.bid_size.close,level_1.ask_price.close,level_1.ask_size.close,level_1.bid_ask_midpoint.close,level_1.half_spread.close,level_1.log_size_imbalance.close,level_1.bid_price.high,level_1.bid_size.max,level_1.ask_price.high,level_1.ask_size.max,level_1.bid_ask_midpoint.max,level_1.half_spread.max,level_1.log_size_imbalance.max,level_1.bid_price.low,level_1.bid_size.min,level_1.ask_price.low,level_1.ask_size.min,level_1.bid_ask_midpoint.min,level_1.half_spread.min,level_1.log_size_imbalance.min,level_1.bid_price.mean,level_1.bid_size.mean,level_1.ask_price.mean,level_1.ask_size.mean,level_1.bid_ask_midpoint.mean,level_1.half_spread.mean,level_1.log_size_imbalance.mean,level_1.bid_ask_midpoint_var.100ms,level_1.log_size_imbalance_var.100ms,start_ts,open,high,low,close,volume,knowledge_timestamp
end_ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
2023-04-30 20:00:00-04:00,1467591036,binance::BTC_USDT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-04-30 19:59:00-04:00,29247.0,29253.2,29145.5,29223.0,2888.894,2023-05-01 01:22:07.571121+00:00
2023-04-30 20:00:00-04:00,8968126878,binance::BNB_USDT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-04-30 19:59:00-04:00,336.96,337.15,336.41,336.84,6341.86,2023-05-01 01:22:16.403833+00:00
2023-04-30 20:01:00-04:00,1467591036,binance::BTC_USDT,29223.5,9.139,29223.6,0.421,29223.55,0.05,3.077673,29240.6,2.934,29240.7,10.842,29240.65,0.05,-1.307061,29245.7,28.248,29245.8,69.545,29245.75,3.4,9.682092,29210.0,0.001,29210.8,0.001,29210.4,0.05,-9.708142,29229.246311,3.819143,29229.440574,7.39173,29229.343443,0.097131,-0.40492,1752.21,1402.704628,2023-04-30 20:00:00-04:00,29223.0,29245.8,29206.9,29240.7,1550.598,2024-02-19 16:46:59.296616+00:00


# `ReplayedMarketData`

In [None]:
# TODO(Dan): Add reference code for `ReplayedMarketData`.