# Master raw data gallery

This notebook showcases locations and basic structure of raw data from:

- S3 (parquet datasets)
- IM DB (Postgres)

The secondary purpose is to provide a guide on how to use `RawDataReader`

## Specs
- This notebook:
  - is a catalog of all the datasets that exist in the our system
  - shows how to load data using our low-level functions or specific API for specific datasets
  - shows how a snippet of the data looks like (for this we want to load the minimal amount of data)
  - doesn't compute any statistics
  - should be quickly to execute, like < 1min, so we can run it in the unit tests

## Life cycle
- Any time a new dataset is added (e.g., in real-time DB, Parquet) we add some information on how to load it and how it looks like
- In general we try not to delete any data but we only add data loaders

## Monster dataset matrix spreadsheet

The gallery should match 1-to-1 with the dataset matrix

https://docs.google.com/spreadsheets/d/13Vyrxs9Eg-C6y91XIogLHi4A1_AFK7_KCF2KEnnxYv0/edit#gid=521856766

# Imports

In [1]:
import datetime
import logging

import pandas as pd

import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint
import im_v2.common.data.client.im_raw_data_client as imvcdcimrdc

  from tqdm.autonotebook import tqdm


In [2]:
hdbg.init_logger(verbosity=logging.INFO)
log_level = logging.INFO

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

INFO  > cmd='/venv/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-223cb510-05de-4297-83b9-09af6258e474.json'
INFO  # Git
  branch_name='CmampTask8922_Update_dataset_matrix_and_gallery_for_crypto.com_datasets'
  hash='3d9f7c7da'
  # Last commits:
    * 3d9f7c7da pavolrabatin removed `prod-im-db` from terraform configuration (#8938)         (   5 hours ago) Tue Jul 9 12:15:48 2024  (HEAD -> CmampTask8922_Update_dataset_matrix_and_gallery_for_crypto.com_datasets, origin/master, origin/HEAD, origin/CmampTask8922_Update_dataset_matrix_and_gallery_for_crypto.com_datasets, origin/CmTask8946_Remove_obsolete_code_from_trading_DAGs, origin/CmTask8759_Organize_Broker_TradingOps_doc_oms_order_, master)
    * f86b25654 pavolrabatin resource added (#8905)                                            (   7 hours ago) Tue Jul 9 09:50:30 2024           
    * 3d921ae5f Shayan   AlertManager optimization (#8912)                                 (  22 hours ag

# Active datasets

## realtime.airflow.resampled_1min.postgres.bid_ask.futures.v8.ccxt.binance.v1_0_0

In [3]:
signature = "realtime.airflow.resampled_1min.postgres.bid_ask.futures.v8.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Unable to fetch DB credentials from environment variables: 
	'POSTGRES_HOST'
	Attempting env file method.
INFO  Unable to fetch DB credentials from env file: 
	
################################################################################
* Failed assertion *
File '/app/amp/im_v2/devops/env/preprod.im_db_config.env' doesn't exist
################################################################################

	Attempting AWS SecretsManager method.
INFO  Fetching secret: preprod.im_data_db
INFO  Created preprod DB connection: 
 None
INFO  Enabled connection to the `ccxt_bid_ask_futures_resampled_1min` DB table
INFO  Enabled connection to the `ccxt_bid_ask_futures_resampled_1min` DB table
INFO  Exec

  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp,ask_price_close,ask_price_high,ask_price_low,ask_price_mean,ask_price_open,ask_size_close,ask_size_max,ask_size_mean,ask_size_min,ask_size_open,bid_price_close,bid_price_high,bid_price_low,bid_price_mean,bid_price_open,bid_size_close,bid_size_max,bid_size_mean,bid_size_min,bid_size_open,bid_ask_midpoint_open,half_spread_open,log_size_imbalance_open,bid_ask_midpoint_close,half_spread_close,log_size_imbalance_close,bid_ask_midpoint_min,half_spread_min,log_size_imbalance_min,bid_ask_midpoint_max,half_spread_max,log_size_imbalance_max,bid_ask_midpoint_mean,half_spread_mean,log_size_imbalance_mean,bid_ask_midpoint_var_100ms,bid_ask_midpoint_autocovar_100ms,log_size_imbalance_var_100ms,log_size_imbalance_autocovar_100ms
0.0,1707326160000,APE_USDT,binance,1,,2024-02-07 17:16:01.044261+00:00,1.3705,1.3706,1.3694,1.370104,1.3702,151.0,1365.0,207.435065,4.0,358.0,1.3704,1.3705,1.3693,1.370003,1.3701,769.0,3280.0,667.461039,36.0,354.0,,,,,,,,,,,,,,,,,,,
1.0,1707326160000,AVAX_USDT,binance,1,,2024-02-07 17:16:01.044261+00:00,35.023,35.029,34.974,35.005584,35.015,29.0,402.0,49.987013,1.0,7.0,35.022,35.028,34.973,35.004539,35.014,109.0,427.0,94.857143,1.0,99.0,,,,,,,,,,,,,,,,,,,
2.0,1707326160000,AXS_USDT,binance,1,,2024-02-07 17:16:01.044261+00:00,7.081,7.082,7.079,7.080197,7.079,101.0,1558.0,575.223684,10.0,100.0,7.08,7.081,7.078,7.079197,7.078,213.0,1584.0,384.493421,69.0,69.0,,,,,,,,,,,,,,,,,,,
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7.0,1707326160000,CTK_USDT,binance,1,,2024-02-07 17:16:01.044261+00:00,0.6273,0.628,0.627,0.627445,0.6278,22.0,7872.0,2159.821192,16.0,857.0,0.6271,0.6278,0.6269,0.627314,0.6276,101.0,1762.0,397.357616,16.0,1302.0,,,,,,,,,,,,,,,,,,,
8.0,1707326160000,DOGE_USDT,binance,1,,2024-02-07 17:16:01.044261+00:00,0.07946,0.07947,0.07946,0.079466,0.07947,5153503.0,5548917.0,2366407.155844,60133.0,323984.0,0.07945,0.07946,0.07945,0.079456,0.07946,621673.0,738487.0,379849.584416,41907.0,614888.0,,,,,,,,,,,,,,,,,,,
9.0,1707326160000,DOT_USDT,binance,1,,2024-02-07 17:16:01.044261+00:00,6.87,6.871,6.863,6.867229,6.864,1889.0,3865.9,1477.835294,106.5,1747.8,6.869,6.87,6.862,6.866229,6.863,73.3,3367.5,1395.09085,73.3,1976.3,,,,,,,,,,,,,,,,,,,


INFO  None


## realtime.airflow.downloaded_200ms.postgres.bid_ask.futures.v8.ccxt.binance.v1_0_0

In [4]:
signature = "realtime.airflow.downloaded_200ms.postgres.bid_ask.futures.v8.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_bid_ask_futures_raw` DB table
INFO  Enabled connection to the `ccxt_bid_ask_futures_raw` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_futures_raw WHERE exchange_id = 'binance' LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1711689552058,14.569,3572.63,39.939,3572.64,ETH_USDT,binance,1,2024-03-29 05:19:12.228817+00:00,2024-03-29 05:19:12.681645+00:00
1.0,1711689552012,0.487,70441.1,3.077,70441.2,BTC_USDT,binance,1,2024-03-29 05:19:12.230280+00:00,2024-03-29 05:19:12.681645+00:00
2.0,1711689551961,6459.0,0.703,4362.0,0.7031,SAND_USDT,binance,1,2024-03-29 05:19:12.231349+00:00,2024-03-29 05:19:12.681645+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1711689552041,3.0,2.0383,341.0,2.0384,APE_USDT,binance,1,2024-03-29 05:19:12.234426+00:00,2024-03-29 05:19:12.681645+00:00
8.0,1711689552061,775.0,0.9971,23973.0,0.9972,MATIC_USDT,binance,1,2024-03-29 05:19:12.234926+00:00,2024-03-29 05:19:12.681645+00:00
9.0,1711689552002,1210.7,3.504,3219.8,3.505,DYDX_USDT,binance,1,2024-03-29 05:19:12.235282+00:00,2024-03-29 05:19:12.681645+00:00


INFO  None


## realtime.airflow.downloaded_200ms.postgres.bid_ask.futures.v8.binance.binance.v1_0_0

In [5]:
signature = "realtime.airflow.downloaded_200ms.postgres.bid_ask.futures.v8.binance.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `binance_bid_ask_futures_raw` DB table
INFO  Enabled connection to the `binance_bid_ask_futures_raw` DB table
INFO  Executing query: 
	SELECT * FROM binance_bid_ask_futures_raw WHERE exchange_id = 'binance' ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1715888594810,0.109,55.23,7.974,55.24,COMP_USDT,binance,1,2024-05-16 19:43:14.979093+00:00,2024-05-16 19:43:14.992051+00:00
1.0,1715888594807,2907.0,0.5492,2561.0,0.5493,SEI_USDT,binance,1,2024-05-16 19:43:14.977667+00:00,2024-05-16 19:43:14.992051+00:00
2.0,1715888594807,5.0,159.237,123.0,159.238,SOL_USDT,binance,1,2024-05-16 19:43:14.978062+00:00,2024-05-16 19:43:14.992051+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1715888594798,521.0,0.3354,7665.0,0.3355,CHR_USDT,binance,1,2024-05-16 19:43:14.978995+00:00,2024-05-16 19:43:14.992051+00:00
8.0,1715888594791,2.0,6.467,797.0,6.468,RUNE_USDT,binance,1,2024-05-16 19:43:14.977835+00:00,2024-05-16 19:43:14.992051+00:00
9.0,1715888594786,378.0,0.3793,10202.0,0.3794,1INCH_USDT,binance,1,2024-05-16 19:43:14.979225+00:00,2024-05-16 19:43:14.992051+00:00


INFO  None


## realtime.airflow.resampled_1min.postgres.bid_ask.futures.v8.binance.binance.v1_0_0

In [6]:
signature = "realtime.airflow.resampled_1min.postgres.bid_ask.futures.v8.binance.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `binance_bid_ask_futures_resampled_1min` DB table
INFO  Enabled connection to the `binance_bid_ask_futures_resampled_1min` DB table
INFO  Executing query: 
	SELECT * FROM binance_bid_ask_futures_resampled_1min WHERE exchange_id = 'binance' ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size_open,bid_size_close,bid_size_min,bid_size_max,bid_size_mean,bid_price_open,bid_price_close,bid_price_high,bid_price_low,bid_price_mean,ask_size_open,ask_size_close,ask_size_min,ask_size_max,ask_size_mean,ask_price_open,ask_price_close,ask_price_high,ask_price_low,ask_price_mean,bid_ask_midpoint_open,half_spread_open,log_size_imbalance_open,bid_ask_midpoint_close,half_spread_close,log_size_imbalance_close,bid_ask_midpoint_min,half_spread_min,log_size_imbalance_min,bid_ask_midpoint_max,half_spread_max,log_size_imbalance_max,bid_ask_midpoint_mean,half_spread_mean,log_size_imbalance_mean,bid_ask_midpoint_var_100ms,bid_ask_midpoint_autocovar_100ms,log_size_imbalance_var_100ms,log_size_imbalance_autocovar_100ms,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1715888580000,646.0,440.4,87.4,13842.4,3303.668447,0.4947,0.495,0.4951,0.4946,0.494913,4635.6,5814.2,87.4,7981.8,4087.821536,0.4948,0.4951,0.4952,0.4947,0.495013,0.49475,0.00005,-1.970721,0.49505,0.00005,-2.580375,0.49465,0.00005,-4.514424,0.49515,0.00005,5.064996,0.494963,0.00005,-1.080507,0.0,0.0,3298.716855,3189.009827,ZRX_USDT,binance,1,2024-05-16 19:43:07.323688+00:00,2024-05-16 19:43:07.417691+00:00
1.0,1715888580000,11392.0,169869.0,11392.0,378240.0,209576.008403,0.02309,0.02309,0.0231,0.02308,0.023087,143692.0,29713.0,1342.0,405076.0,45770.117647,0.0231,0.0231,0.02311,0.02309,0.023097,0.023095,0.000005,-2.534761,0.023095,0.000005,1.743443,0.023085,0.000005,-3.571163,0.023105,0.000005,5.419823,0.023092,0.000005,1.340125,0.0,0.0,2312.307205,2234.818069,ZIL_USDT,binance,1,2024-05-16 19:43:07.323688+00:00,2024-05-16 19:43:07.417691+00:00
2.0,1715888580000,142.0,60.0,6.0,315.0,118.144876,1.6222,1.6231,1.6232,1.6216,1.622619,106.0,151.0,4.0,1056.0,159.540636,1.6226,1.6234,1.6238,1.6218,1.623007,1.6224,0.0002,0.292388,1.62325,0.00015,-0.922935,1.6217,0.00005,-5.016333,1.6235,0.00035,3.520461,1.622813,0.000194,-0.240552,0.000001,0.0,1279.487642,1221.724396,ZETA_USDT,binance,1,2024-05-16 19:43:07.323688+00:00,2024-05-16 19:43:07.417691+00:00
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7.0,1715888580000,0.9,31.0,0.6,80.0,18.323986,9.356,9.355,9.357,9.354,9.355757,12.7,36.7,1.0,36.7,20.540541,9.358,9.358,9.358,9.355,9.356988,9.357,0.001,-2.646963,9.3565,0.0015,-0.16879,9.3545,0.0005,-4.037186,9.3575,0.0015,3.32569,9.356372,0.000616,-0.240438,0.000007,0.0,1410.622218,1341.699652,XVS_USDT,binance,1,2024-05-16 19:43:07.323688+00:00,2024-05-16 19:43:07.417691+00:00
8.0,1715888580000,302028.0,26761.0,3605.0,421352.0,41696.846416,0.005886,0.005881,0.005886,0.005877,0.005882,135405.0,19169.0,1276.0,291028.0,36328.090444,0.005887,0.005882,0.005887,0.005878,0.005883,0.005886,0.000001,0.802249,0.005881,0.000001,0.333651,0.005878,0.0,-2.819428,0.005886,0.000001,3.612037,0.005882,0.000001,0.082351,0.0,0.0,991.568682,868.036853,XVG_USDT,binance,1,2024-05-16 19:43:07.323688+00:00,2024-05-16 19:43:07.417691+00:00
9.0,1715888580000,16919.6,18563.0,818.6,30852.6,10217.447466,0.917,0.917,0.918,0.916,0.917316,10159.4,1751.5,1291.5,62122.8,22197.377703,0.918,0.918,0.919,0.917,0.918319,0.9175,0.0005,0.510073,0.9175,0.0005,2.360698,0.9165,0.0005,-4.089123,0.9185,0.001,3.173417,0.917818,0.000502,-0.619486,0.000005,0.0,2333.741311,2280.146302,XTZ_USDT,binance,1,2024-05-16 19:43:07.323688+00:00,2024-05-16 19:43:07.417691+00:00


INFO  None


## periodic_daily.airflow.archived_200ms.parquet.bid_ask.futures.v8.ccxt.binance.v1_0_0

In [7]:
# TODO(Juraj): #CmTask8309

## periodic_daily.airflow.resampled_1min.parquet.bid_ask.futures.v8.ccxt.binance.v1_0_0

In [8]:
# TODO(Juraj): #CmTask8309

## periodic_daily.airflow.archived_200ms.parquet.bid_ask.futures.v8.binance.binance.v1_0_0

In [9]:
signature = "periodic_daily.airflow.archived_200ms.parquet.bid_ask.futures.v8.binance.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
start_timestamp = pd.Timestamp("2024-05-16T00:00:00+00:00")
end_timestamp = start_timestamp + pd.Timedelta(minutes=2)
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=["BTC_USDT", "ETH_USDT"]
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Before from_parquet


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,exchange_id,level,end_download_timestamp,knowledge_timestamp,currency_pair,year,month,day
2024-05-16 00:00:00.143000+00:00,1715817600143,4.022,66175.4,3.215,66175.5,binance,1,2024-05-16 00:00:00.337472+00:00,2024-05-16 00:00:00.350773+00:00,BTC_USDT,2024,5,16
2024-05-16 00:00:00.365000+00:00,1715817600365,2.226,66175.4,3.755,66175.5,binance,1,2024-05-16 00:00:00.538837+00:00,2024-05-16 00:00:00.552395+00:00,BTC_USDT,2024,5,16
2024-05-16 00:00:00.570000+00:00,1715817600570,2.623,66175.4,3.555,66175.5,binance,1,2024-05-16 00:00:00.740175+00:00,2024-05-16 00:00:00.754108+00:00,BTC_USDT,2024,5,16
,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-16 00:01:56.289000+00:00,1715817716289,54.277,3031.67,41.898,3031.68,binance,1,2024-05-16 00:01:56.450604+00:00,2024-05-16 00:01:56.475475+00:00,ETH_USDT,2024,5,16
2024-05-16 00:01:56.437000+00:00,1715817716437,54.274,3031.67,42.601,3031.68,binance,1,2024-05-16 00:01:56.651671+00:00,2024-05-16 00:01:56.673926+00:00,ETH_USDT,2024,5,16
2024-05-16 00:01:56.544000+00:00,1715817716544,52.984,3031.67,42.731,3031.68,binance,1,2024-05-16 00:01:56.853605+00:00,2024-05-16 00:01:56.874993+00:00,ETH_USDT,2024,5,16


INFO  None


## realtime.airflow.downloaded_1min.postgres.ohlcv.futures.v8.ccxt.binance.v1_0_0

In [10]:
signature = "realtime.airflow.downloaded_1min.postgres.ohlcv.futures.v8.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_ohlcv_futures` DB table
INFO  Enabled connection to the `ccxt_ohlcv_futures` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_ohlcv_futures WHERE exchange_id = 'binance' ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,open,high,low,close,volume,currency_pair,exchange_id,end_download_timestamp,knowledge_timestamp
0.0,1715888520000,0.02167,0.02168,0.02166,0.02168,210396.0,USTC_USDT,binance,2024-05-16 19:43:10.928690+00:00,2024-05-16 19:43:10.992352+00:00
1.0,1715888520000,0.0868,0.0868,0.08674,0.08677,176463.0,LOOM_USDT,binance,2024-05-16 19:43:10.927322+00:00,2024-05-16 19:43:10.992352+00:00
2.0,1715888520000,5.038,5.04,5.037,5.04,822.9,GAS_USDT,binance,2024-05-16 19:43:10.925938+00:00,2024-05-16 19:43:10.992352+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1715888520000,81.54,81.58,81.52,81.58,42.0,ILV_USDT,binance,2024-05-16 19:43:10.918907+00:00,2024-05-16 19:43:10.992352+00:00
8.0,1715888520000,1.0688,1.0689,1.0679,1.0688,2629.0,TWT_USDT,binance,2024-05-16 19:43:10.917531+00:00,2024-05-16 19:43:10.992352+00:00
9.0,1715888520000,4.5453,4.5496,4.5436,4.5467,12282.0,JTO_USDT,binance,2024-05-16 19:43:10.916163+00:00,2024-05-16 19:43:10.992352+00:00


INFO  None


## periodic_daily.airflow.downloaded_all.parquet.trades.futures.v8.binance.binance.v2_0_0

In [11]:
signature = "periodic_daily.airflow.downloaded_all.parquet.trades.futures.v8.binance.binance.v2_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
start_timestamp = pd.Timestamp("2024-01-01T00:00:00+00:00")
end_timestamp = start_timestamp + pd.Timedelta(minutes=2)
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=["BTC_USDT"]
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Before from_parquet


Unnamed: 0,timestamp,price,amount,is_buyer_maker,id,quote_qty,exchange_id,knowledge_timestamp,currency_pair,year,month,day
2024-01-01 00:00:00.006000+00:00,1704067200006,42314.0,0.033,False,4426785098,1396.362,binance,2024-04-10 15:40:39.807031+00:00,BTC_USDT,2024,1,1
2024-01-01 00:00:00.006000+00:00,1704067200006,42314.0,0.215,False,4426785099,9097.51,binance,2024-04-10 15:40:39.807031+00:00,BTC_USDT,2024,1,1
2024-01-01 00:00:00.022000+00:00,1704067200022,42314.0,0.1,False,4426785100,4231.4,binance,2024-04-10 15:40:39.807031+00:00,BTC_USDT,2024,1,1
,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-01 00:01:59.554000+00:00,1704067319554,42350.5,0.006,False,4426790290,254.103,binance,2024-04-18 01:10:16.793311+00:00,BTC_USDT,2024,1,1
2024-01-01 00:01:59.780000+00:00,1704067319780,42350.5,0.059,False,4426790291,2498.6795,binance,2024-04-18 01:10:16.793311+00:00,BTC_USDT,2024,1,1
2024-01-01 00:01:59.828000+00:00,1704067319828,42350.4,0.015,True,4426790292,635.256,binance,2024-04-18 01:10:16.793311+00:00,BTC_USDT,2024,1,1


INFO  None


In [12]:
## TODO(Sonaal): Add info about bid/ask historical data

## realtime.airflow.downloaded_200ms.postgres.bid_ask.spot.v7_3.ccxt.binance.v1_0_0

In [13]:
signature = "realtime.airflow.downloaded_200ms.postgres.bid_ask.spot.v7_3.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_bid_ask_spot_raw` DB table
INFO  Enabled connection to the `ccxt_bid_ask_spot_raw` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_spot_raw WHERE exchange_id = 'binance' ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1715888752320,1111.0,0.5261,39.0,0.5263,STORJ_USDT,binance,1,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00
1.0,1715888752320,2928.0,0.526,441.0,0.5264,STORJ_USDT,binance,2,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00
2.0,1715888752320,3898.0,0.5259,8131.0,0.5265,STORJ_USDT,binance,3,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1715888752320,21095.0,0.5254,4867.0,0.527,STORJ_USDT,binance,8,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00
8.0,1715888752320,8413.0,0.5253,3264.0,0.5271,STORJ_USDT,binance,9,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00
9.0,1715888752320,16450.0,0.5252,2099.0,0.5272,STORJ_USDT,binance,10,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00


INFO  None


## periodic_daily.airflow.archived_200ms.postgres.bid_ask.spot.v7.ccxt.binance.v1_0_0

In [14]:
signature = "periodic_daily.airflow.archived_200ms.postgres.bid_ask.spot.v7.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_bid_ask_spot_raw` DB table
INFO  Enabled connection to the `ccxt_bid_ask_spot_raw` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_spot_raw WHERE exchange_id = 'binance' ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1715888752320,1111.0,0.5261,39.0,0.5263,STORJ_USDT,binance,1,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00
1.0,1715888752320,2928.0,0.526,441.0,0.5264,STORJ_USDT,binance,2,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00
2.0,1715888752320,3898.0,0.5259,8131.0,0.5265,STORJ_USDT,binance,3,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1715888752320,21095.0,0.5254,4867.0,0.527,STORJ_USDT,binance,8,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00
8.0,1715888752320,8413.0,0.5253,3264.0,0.5271,STORJ_USDT,binance,9,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00
9.0,1715888752320,16450.0,0.5252,2099.0,0.5272,STORJ_USDT,binance,10,2024-05-16 19:45:52.457091+00:00,2024-05-16 19:45:52.483667+00:00


INFO  None


## realtime.airflow.downloaded_200ms.postgres.bid_ask.spot.v7_3.ccxt.okx.v1_0_0

In [15]:
signature = (
    "realtime.airflow.downloaded_200ms.postgres.bid_ask.spot.v7_3.ccxt.okx.v1_0_0"
)
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_bid_ask_spot_raw` DB table
INFO  Enabled connection to the `ccxt_bid_ask_spot_raw` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_spot_raw WHERE exchange_id = 'okx' ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1715888790109,3011.472,4.863,678.777,4.882,WLD_USDT,okx,10,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00
1.0,1715888790109,900.853,4.864,5162.821,4.881,WLD_USDT,okx,9,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00
2.0,1715888790109,5114.216,4.865,817.85,4.88,WLD_USDT,okx,8,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1715888790109,438.143,4.87,953.924,4.875,WLD_USDT,okx,3,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00
8.0,1715888790109,168.851,4.871,118.686,4.874,WLD_USDT,okx,2,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00
9.0,1715888790109,101.229,4.872,54.147,4.873,WLD_USDT,okx,1,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00


INFO  None


## periodic_daily.airflow.archived_200ms.postgres.bid_ask.spot.v7_3.ccxt.okx.v1_0_0

In [16]:
signature = "periodic_daily.airflow.archived_200ms.postgres.bid_ask.spot.v7_3.ccxt.okx.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_bid_ask_spot_raw` DB table
INFO  Enabled connection to the `ccxt_bid_ask_spot_raw` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_spot_raw WHERE exchange_id = 'okx' ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1715888790109,3011.472,4.863,678.777,4.882,WLD_USDT,okx,10,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00
1.0,1715888790109,900.853,4.864,5162.821,4.881,WLD_USDT,okx,9,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00
2.0,1715888790109,5114.216,4.865,817.85,4.88,WLD_USDT,okx,8,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1715888790109,438.143,4.87,953.924,4.875,WLD_USDT,okx,3,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00
8.0,1715888790109,168.851,4.871,118.686,4.874,WLD_USDT,okx,2,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00
9.0,1715888790109,101.229,4.872,54.147,4.873,WLD_USDT,okx,1,2024-05-16 19:46:30.481910+00:00,2024-05-16 19:46:30.585210+00:00


INFO  None


## realtime.airflow.downloaded_200ms.postgres.bid_ask.futures.v7_4.ccxt.cryptocom.v1_0_0

In [17]:
signature = "realtime.airflow.downloaded_200ms.postgres.bid_ask.futures.v7_4.ccxt.cryptocom.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_bid_ask_futures_raw` DB table
INFO  Enabled connection to the `ccxt_bid_ask_futures_raw` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_futures_raw WHERE exchange_id = 'cryptocom' ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1715888797183,9.37,159.39,8.91,159.72,SOL_USD,cryptocom,10,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00
1.0,1715888797183,0.8,159.4,8.95,159.71,SOL_USD,cryptocom,9,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00
2.0,1715888797183,83.47,159.41,9.0,159.7,SOL_USD,cryptocom,8,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1715888797183,1.5,159.47,4.05,159.59,SOL_USD,cryptocom,3,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00
8.0,1715888797183,2.69,159.48,3.5,159.58,SOL_USD,cryptocom,2,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00
9.0,1715888797183,2.0,159.52,37.24,159.57,SOL_USD,cryptocom,1,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00


INFO  None


## periodic_daily.airflow.archived_200ms.postgres.bid_ask.futures.v7_4.ccxt.cryptocom.v1_0_0

In [18]:
signature = "periodic_daily.airflow.archived_200ms.postgres.bid_ask.futures.v7_4.ccxt.cryptocom.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_bid_ask_futures_raw` DB table
INFO  Enabled connection to the `ccxt_bid_ask_futures_raw` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_futures_raw WHERE exchange_id = 'cryptocom' ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1715888797183,9.37,159.39,8.91,159.72,SOL_USD,cryptocom,10,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00
1.0,1715888797183,0.8,159.4,8.95,159.71,SOL_USD,cryptocom,9,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00
2.0,1715888797183,83.47,159.41,9.0,159.7,SOL_USD,cryptocom,8,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1715888797183,1.5,159.47,4.05,159.59,SOL_USD,cryptocom,3,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00
8.0,1715888797183,2.69,159.48,3.5,159.58,SOL_USD,cryptocom,2,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00
9.0,1715888797183,2.0,159.52,37.24,159.57,SOL_USD,cryptocom,1,2024-05-16 19:46:37.323794+00:00,2024-05-16 19:46:37.378304+00:00


INFO  None


## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v7_3.ccxt.binance.v1_0_0

In [19]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v7_3.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data.preprod/v3/periodic_daily/airflow/downloaded_1min/parquet/ohlcv/spot/v7_3/ccxt/binance/v1_0_0`


Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
2024-05-15 00:00:00+00:00,1715731200000,0.6319,0.6342,0.6315,0.6338,1010.9,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5
2024-05-15 00:01:00+00:00,1715731260000,0.6337,0.6345,0.6333,0.6336,1779.8,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5
2024-05-15 00:02:00+00:00,1715731320000,0.6337,0.6349,0.6331,0.6339,3290.5,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5
,...,...,...,...,...,...,...,...,...,...,...
2024-05-15 00:07:00+00:00,1715731620000,0.634,0.6342,0.6337,0.634,914.0,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5
2024-05-15 00:08:00+00:00,1715731680000,0.6339,0.6342,0.6339,0.6342,155.8,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5
2024-05-15 00:09:00+00:00,1715731740000,0.6339,0.6343,0.6337,0.6343,2469.2,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5


INFO  None


## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7_3.ccxt.binance.v1_0_0

In [20]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v7_3.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data.preprod/v3/periodic_daily/airflow/downloaded_1min/parquet/ohlcv/spot/v7_3/ccxt/binance/v1_0_0`


Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
2024-05-15 00:00:00+00:00,1715731200000,0.6319,0.6342,0.6315,0.6338,1010.9,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5
2024-05-15 00:01:00+00:00,1715731260000,0.6337,0.6345,0.6333,0.6336,1779.8,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5
2024-05-15 00:02:00+00:00,1715731320000,0.6337,0.6349,0.6331,0.6339,3290.5,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5
,...,...,...,...,...,...,...,...,...,...,...
2024-05-15 00:07:00+00:00,1715731620000,0.634,0.6342,0.6337,0.634,914.0,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5
2024-05-15 00:08:00+00:00,1715731680000,0.6339,0.6342,0.6339,0.6342,155.8,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5
2024-05-15 00:09:00+00:00,1715731740000,0.6339,0.6343,0.6337,0.6343,2469.2,binance,2024-05-16 01:24:33.668966+00:00,CTK_USDT,2024,5


INFO  None


## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v8.ccxt.binance.v1_0_0

In [21]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v8.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
start_timestamp = pd.Timestamp("2024-01-01T00:00:00+00:00")
end_timestamp = start_timestamp + pd.Timedelta(minutes=5)
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=["BTC_USDT", "ETH_USDT"]
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Before from_parquet


Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
2024-01-01 00:00:00+00:00,1704067200000,42314.0,42335.8,42289.6,42331.9,289.641,binance,2024-02-27 20:28:37.170290+00:00,BTC_USDT,2024,1
2024-01-01 00:01:00+00:00,1704067260000,42331.9,42353.1,42331.8,42350.4,202.444,binance,2024-02-27 20:28:37.170290+00:00,BTC_USDT,2024,1
2024-01-01 00:02:00+00:00,1704067320000,42350.4,42370.8,42349.6,42360.2,271.521,binance,2024-02-27 20:28:37.170290+00:00,BTC_USDT,2024,1
,...,...,...,...,...,...,...,...,...,...,...
2024-01-01 00:03:00+00:00,1704067380000,2286.3,2288.44,2286.29,2288.35,2191.911,binance,2024-02-27 21:23:12.203301+00:00,ETH_USDT,2024,1
2024-01-01 00:04:00+00:00,1704067440000,2288.36,2289.92,2288.36,2289.92,1946.412,binance,2024-02-27 21:23:12.203301+00:00,ETH_USDT,2024,1
2024-01-01 00:05:00+00:00,1704067500000,2289.92,2291.93,2288.77,2291.45,3691.669,binance,2024-02-27 21:23:12.203301+00:00,ETH_USDT,2024,1


INFO  None


## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7_4.ccxt.cryptocom.v1_0_0

In [22]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7_4.ccxt.cryptocom.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data.preprod/v3/periodic_daily/airflow/downloaded_1min/parquet/ohlcv/futures/v7_4/ccxt/cryptocom/v1_0_0`


Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
2024-05-15 00:00:00+00:00,1715731200000,141.96,142.08,141.94,142.06,15.93,cryptocom,2024-05-16 01:24:46.466080+00:00,SOL_USD,2024,5
2024-05-15 00:01:00+00:00,1715731260000,142.03,142.11,142.02,142.07,40.14,cryptocom,2024-05-16 01:24:46.466080+00:00,SOL_USD,2024,5
2024-05-15 00:02:00+00:00,1715731320000,142.09,142.26,142.09,142.18,65.22,cryptocom,2024-05-16 01:24:46.466080+00:00,SOL_USD,2024,5
,...,...,...,...,...,...,...,...,...,...,...
2024-05-15 00:07:00+00:00,1715731620000,142.32,142.35,142.27,142.27,3.6,cryptocom,2024-05-16 01:24:46.466080+00:00,SOL_USD,2024,5
2024-05-15 00:08:00+00:00,1715731680000,142.28,142.4,142.25,142.25,11.34,cryptocom,2024-05-16 01:24:46.466080+00:00,SOL_USD,2024,5
2024-05-15 00:09:00+00:00,1715731740000,142.21,142.32,142.21,142.32,8.79,cryptocom,2024-05-16 01:24:46.466080+00:00,SOL_USD,2024,5


INFO  None


## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v8.ccxt.okx.v1_0_0

In [4]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v8.ccxt.okx.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
start_timestamp = pd.Timestamp("2024-01-01T00:00:00+00:00")
end_timestamp = start_timestamp + pd.Timedelta(minutes=5)
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=["BTC_USDT", "ETH_USDT"]
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
2024-01-01 00:00:00+00:00,1704067200000,42297.7,42310.0,42266.1,42308.0,89.86,okx,2024-02-21 21:17:19.094084+00:00,BTC_USDT,2024,1
2024-01-01 00:01:00+00:00,1704067260000,42308.0,42329.7,42307.9,42327.5,83.71,okx,2024-02-21 21:17:19.094084+00:00,BTC_USDT,2024,1
2024-01-01 00:02:00+00:00,1704067320000,42327.6,42345.1,42325.0,42334.4,82.67,okx,2024-02-21 21:17:19.094084+00:00,BTC_USDT,2024,1
,...,...,...,...,...,...,...,...,...,...,...
2024-01-01 00:03:00+00:00,1704067380000,2285.06,2286.95,2285.06,2286.95,671.3,okx,2024-02-21 21:21:00.192712+00:00,ETH_USDT,2024,1
2024-01-01 00:04:00+00:00,1704067440000,2286.95,2288.76,2286.95,2288.69,1308.8,okx,2024-02-21 21:21:00.192712+00:00,ETH_USDT,2024,1
2024-01-01 00:05:00+00:00,1704067500000,2288.68,2290.83,2287.52,2290.24,1706.2,okx,2024-02-21 21:21:00.192712+00:00,ETH_USDT,2024,1


INFO  None


## periodic_daily.airflow.resampled_1min.parquet.bid_ask.futures.v8.ccxt.binance.v2_0_0

In [5]:
signature = "periodic_daily.airflow.resampled_1min.parquet.bid_ask.futures.v8.ccxt.binance.v2_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod", add_suffix="tokyo")
start_timestamp = pd.Timestamp("2024-06-11T00:00:00+00:00")
end_timestamp = start_timestamp + pd.Timedelta(minutes=5)
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=["XMR_USDT"]
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


Unnamed: 0_level_0,timestamp,level_1.bid_price.open,level_1.bid_size.open,level_1.ask_price.open,level_1.ask_size.open,level_1.bid_ask_midpoint.open,level_1.half_spread.open,level_1.log_size_imbalance.open,level_1.bid_price.close,level_1.bid_size.close,level_1.ask_price.close,level_1.ask_size.close,level_1.bid_ask_midpoint.close,level_1.half_spread.close,level_1.log_size_imbalance.close,level_1.bid_price.high,level_1.bid_size.max,level_1.ask_price.high,level_1.ask_size.max,level_1.bid_ask_midpoint.max,level_1.half_spread.max,level_1.log_size_imbalance.max,level_1.bid_price.low,level_1.bid_size.min,level_1.ask_price.low,level_1.ask_size.min,level_1.bid_ask_midpoint.min,level_1.half_spread.min,level_1.log_size_imbalance.min,level_1.bid_price.mean,level_1.bid_size.mean,level_1.ask_price.mean,level_1.ask_size.mean,level_1.bid_ask_midpoint.mean,level_1.half_spread.mean,level_1.log_size_imbalance.mean,level_1.bid_ask_midpoint_var.100ms,level_1.bid_ask_midpoint_autocovar.100ms,level_1.log_size_imbalance_var.100ms,level_1.log_size_imbalance_autocovar.100ms,exchange_id,knowledge_timestamp,currency_pair,year,month
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
2024-06-11 00:01:00+00:00,1718064060000,178.5,0.111,178.54,0.512,178.52,0.02,-1.528794,178.87,0.089,178.91,0.098,178.89,0.02,-0.096331,179.01,5.163,179.07,26.815,179.04,0.065,5.027383,178.49,0.002,178.5,0.015,178.495,0.005,-5.833348,178.804365,0.519182,178.846522,1.2021,178.825443,0.021079,-0.23116,0.09265,-0.00015,1915.053525,1637.803137,binance,2024-06-12 04:41:22.098656+00:00,XMR_USDT,2024,6
2024-06-11 00:02:00+00:00,1718064120000,178.87,1.141,178.92,0.158,178.895,0.025,1.977065,178.9,0.839,178.93,0.105,178.915,0.015,2.07825,179.04,33.944,179.09,7.352,179.055,0.035,4.32322,178.87,0.01,178.91,0.028,178.89,0.005,-4.21138,178.92955,0.499678,178.972067,0.475495,178.950808,0.021258,-0.142474,0.033975,0.0001,1849.311905,1612.953513,binance,2024-06-12 04:41:22.098656+00:00,XMR_USDT,2024,6
2024-06-11 00:03:00+00:00,1718064180000,178.9,0.839,178.93,0.105,178.915,0.015,2.07825,179.04,0.032,179.05,0.319,179.045,0.005,-2.299455,179.07,2.589,179.11,14.083,179.085,0.035,4.038994,178.9,0.011,178.93,0.013,178.915,0.005,-4.801783,178.97585,0.47459,179.009217,0.682882,178.992533,0.016683,-0.075147,0.02035,-0.00015,1731.799097,1456.013402,binance,2024-06-12 04:41:22.098656+00:00,XMR_USDT,2024,6
2024-06-11 00:04:00+00:00,1718064240000,179.03,0.126,179.08,5.835,179.055,0.025,-3.835348,178.97,1.144,178.98,0.752,178.975,0.005,0.41955,179.07,2.562,179.11,13.094,179.09,0.03,4.046554,178.91,0.018,178.92,0.013,178.915,0.005,-4.795239,179.0192,0.627035,179.04805,2.26452,179.033625,0.014425,-0.395573,0.02165,0.0002,2928.749314,2541.065562,binance,2024-06-12 04:41:22.098656+00:00,XMR_USDT,2024,6
2024-06-11 00:05:00+00:00,1718064300000,178.97,1.144,178.98,0.752,178.975,0.005,0.41955,179.1,0.418,179.11,8.643,179.105,0.005,-3.029024,179.1,1.374,179.11,10.873,179.105,0.025,3.171085,178.96,0.001,178.98,0.028,178.975,0.005,-6.43615,179.048683,0.449267,179.064583,2.718245,179.056633,0.00795,-0.982251,0.01185,0.0,3100.14698,2894.218125,binance,2024-06-12 04:41:22.098656+00:00,XMR_USDT,2024,6
2024-06-11 00:00:00+00:00,1718064000000,178.44,0.8,178.47,0.101,178.455,0.015,2.069491,178.5,0.111,178.54,0.512,178.52,0.02,-1.528794,178.56,2.209,178.6,1.333,178.58,0.025,4.717382,178.44,0.057,178.47,0.008,178.455,0.005,-3.152136,178.512718,0.476685,178.547517,0.434013,178.530117,0.017399,0.308731,0.006325,-5e-05,2067.222832,1969.132652,binance,2024-06-11 04:41:53.168048+00:00,XMR_USDT,2024,6


INFO  None


In [None]:
# ## periodic_daily.airflow.downloaded_200ms.postgres.bid_ask.futures.v7_5.ccxt.cryptocom.v1_0_0
# TODO(Juraj): The downloader for this dataset runs in Tokyo, we do not have cross-region RDS access available, should
# it be added?

## periodic_daily.airflow.archived_200ms.parquet.bid_ask.futures.v7_5.ccxt.cryptocom.v1_0_0

In [3]:
signature = "periodic_daily.airflow.archived_200ms.parquet.bid_ask.futures.v7_5.ccxt.cryptocom.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod", add_suffix="tokyo")
start_timestamp = pd.Timestamp("2024-07-09T06:00:00+00:00")
end_timestamp = start_timestamp + pd.Timedelta(minutes=1)
data = reader.read_data(
    start_timestamp, end_timestamp
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,exchange_id,level,end_download_timestamp,knowledge_timestamp,currency_pair,year,month,day
2024-07-09 06:00:12.809000+00:00,1720504812809,0.236,57271.4,0.378,57271.5,cryptocom,1,2024-07-09 06:00:12.901517+00:00,2024-07-09 06:00:12.914780+00:00,BTC_USD,2024,7,9
2024-07-09 06:00:13.011000+00:00,1720504813011,0.236,57271.4,0.8196,57271.5,cryptocom,1,2024-07-09 06:00:13.103480+00:00,2024-07-09 06:00:13.115856+00:00,BTC_USD,2024,7,9
2024-07-09 06:00:13.313000+00:00,1720504813313,0.236,57271.4,0.789,57271.5,cryptocom,1,2024-07-09 06:00:13.317016+00:00,2024-07-09 06:00:13.332579+00:00,BTC_USD,2024,7,9
,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-09 06:00:12.394000+00:00,1720504812394,1.6024,3067.35,0.0324,3067.61,cryptocom,1,2024-07-09 06:00:12.408426+00:00,2024-07-09 06:00:12.422987+00:00,ETH_USD,2024,7,9
2024-07-09 06:00:12.596000+00:00,1720504812596,1.6808,3067.27,4.5,3067.5,cryptocom,1,2024-07-09 06:00:12.611365+00:00,2024-07-09 06:00:12.628991+00:00,ETH_USD,2024,7,9
2024-07-09 06:00:12.898000+00:00,1720504812898,0.4201,3067.39,0.0323,3067.49,cryptocom,1,2024-07-09 06:00:12.901371+00:00,2024-07-09 06:00:12.914780+00:00,ETH_USD,2024,7,9


INFO  None


## periodic_daily.airflow.resampled_1min.parquet.bid_ask.futures.v7_5.ccxt.cryptocom.v2_0_0

In [6]:
signature = "periodic_daily.airflow.resampled_1min.parquet.bid_ask.futures.v7_5.ccxt.cryptocom.v2_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod", add_suffix="tokyo")
start_timestamp = pd.Timestamp("2024-07-08T20:00:00+00:00")
end_timestamp = start_timestamp + pd.Timedelta(minutes=5)
data = reader.read_data(
    start_timestamp, end_timestamp
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


Unnamed: 0,timestamp,level_1.bid_price.open,level_1.bid_size.open,level_1.ask_price.open,level_1.ask_size.open,level_1.bid_ask_midpoint.open,level_1.half_spread.open,level_1.log_size_imbalance.open,level_1.bid_price.close,level_1.bid_size.close,level_1.ask_price.close,level_1.ask_size.close,level_1.bid_ask_midpoint.close,level_1.half_spread.close,level_1.log_size_imbalance.close,level_1.bid_price.high,level_1.bid_size.max,level_1.ask_price.high,level_1.ask_size.max,level_1.bid_ask_midpoint.max,level_1.half_spread.max,level_1.log_size_imbalance.max,level_1.bid_price.low,level_1.bid_size.min,level_1.ask_price.low,level_1.ask_size.min,level_1.bid_ask_midpoint.min,level_1.half_spread.min,level_1.log_size_imbalance.min,level_1.bid_price.mean,level_1.bid_size.mean,level_1.ask_price.mean,level_1.ask_size.mean,level_1.bid_ask_midpoint.mean,level_1.half_spread.mean,level_1.log_size_imbalance.mean,level_1.bid_ask_midpoint_var.100ms,level_1.bid_ask_midpoint_autocovar.100ms,level_1.log_size_imbalance_var.100ms,level_1.log_size_imbalance_autocovar.100ms,exchange_id,knowledge_timestamp,currency_pair,year,month
2024-07-08 20:00:00+00:00,1720468800000,56475.8,0.101,56483.3,0.4435,56479.55,3.75,-1.479577,56466.0,0.152,56468.4,0.152,56467.2,1.2,0.0,56510.6,0.9557,56516.4,0.6056,56513.5,5.35,3.667353,56450.0,0.0057,56456.9,0.0027,56453.85,0.05,-2.416853,56485.787167,0.125292,56488.508667,0.149978,56487.147917,1.36075,-0.03603,2003.6875,0.0,712.734912,464.278884,cryptocom,2024-07-09 04:22:35.212411+00:00,BTC_USD,2024,7
2024-07-08 20:01:00+00:00,1720468860000,56466.0,0.152,56468.3,0.1418,56467.15,1.15,0.069463,56453.8,0.0735,56461.2,0.0491,56457.5,3.7,0.403426,56487.6,0.1969,56491.9,0.2242,56489.4,6.2,3.809949,56428.1,0.0042,56430.0,0.0027,56429.25,0.05,-3.191276,56453.558833,0.075335,56458.254333,0.070159,56455.906583,2.34775,0.177464,2378.395,0.0,782.276003,606.965937,cryptocom,2024-07-09 04:22:35.212411+00:00,BTC_USD,2024,7
2024-07-08 20:02:00+00:00,1720468920000,56453.8,0.0735,56461.2,0.0491,56457.5,3.7,0.403426,56425.5,0.1057,56430.7,0.1297,56428.1,2.6,-0.204619,56467.3,0.243,56475.1,0.3361,56470.5,5.5,2.571849,56411.1,0.0037,56415.3,0.0027,56413.2,0.05,-3.857024,56430.5495,0.063856,56436.955667,0.07561,56433.752583,3.203083,-0.113241,637.945,0.0,662.704235,470.1478,cryptocom,2024-07-09 04:22:35.212411+00:00,BTC_USD,2024,7
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-08 20:03:00+00:00,1720468980000,3001.62,0.7,3001.8,8.2,3001.71,0.09,-2.460809,3003.47,2.9,3003.48,4.9,3003.475,0.005,-0.524524,3003.96,6.2,3004.08,14.984,3003.99,0.235,2.688848,3001.62,0.0173,3001.74,0.333,3001.68,0.005,-5.646284,3002.765133,2.455578,3002.865383,3.272069,3002.815258,0.050125,-0.377865,4.459375,0.0,1162.261698,923.014995,cryptocom,2024-07-09 04:22:35.212411+00:00,ETH_USD,2024,7
2024-07-08 20:04:00+00:00,1720469040000,3003.47,2.9,3003.48,4.9,3003.475,0.005,-0.524524,2999.43,6.1988,2999.64,7.7,2999.535,0.105,-0.216865,3003.47,10.2,3003.48,14.3,3003.475,0.235,3.578531,2999.12,0.23,2999.37,0.0335,2999.245,0.005,-4.129936,3000.6655,2.496191,3000.81255,4.095613,3000.739025,0.073525,-0.423312,4.24915,-0.0072,1150.490304,978.441419,cryptocom,2024-07-09 04:22:35.212411+00:00,ETH_USD,2024,7
2024-07-08 20:05:00+00:00,1720469100000,2999.43,6.1988,2999.58,2.2,2999.505,0.075,1.035898,2998.44,0.4,2998.5,1.3,2998.47,0.03,-1.178655,3000.31,9.8273,3000.66,16.214,3000.485,0.33,4.831294,2997.19,0.0026,2997.3,0.0334,2997.245,0.005,-7.18425,2999.193817,2.423717,2999.313817,3.926643,2999.253817,0.06,-0.391764,4.577625,0.0,1813.302223,1414.070233,cryptocom,2024-07-09 04:22:35.212411+00:00,ETH_USD,2024,7


INFO  None


# Semi-Active datasets

 - not currently downloaded but could be revived

## periodic_daily.airflow.archived_200ms.parquet.bid_ask.spot.v7_4.ccxt.kraken.v1_0_0

In [24]:
signature = "periodic_daily.airflow.archived_200ms.parquet.bid_ask.spot.v7_4.ccxt.kraken.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
start_timestamp = pd.Timestamp("2024-01-01T00:00:00+00:00")
end_timestamp = start_timestamp + pd.Timedelta(minutes=1)
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=["BTC_USDT", "ETH_USDT"]
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Before from_parquet


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,exchange_id,level,end_download_timestamp,knowledge_timestamp,currency_pair,year,month,day
2024-01-01 00:00:00.153000+00:00,1704067200153,0.0712,42289.1,0.036078,42289.2,kraken,1,2024-01-01 00:00:00.252015+00:00,2024-01-01 00:00:01.750584+00:00,BTC_USDT,2024,1,1
2024-01-01 00:00:00.293000+00:00,1704067200293,0.17413,42289.1,0.036078,42289.2,kraken,1,2024-01-01 00:00:00.453643+00:00,2024-01-01 00:00:01.750584+00:00,BTC_USDT,2024,1,1
2024-01-01 00:00:00.293000+00:00,1704067200293,0.0712,42284.8,0.014123,42289.3,kraken,2,2024-01-01 00:00:00.453643+00:00,2024-01-01 00:00:01.750584+00:00,BTC_USDT,2024,1,1
,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-01 00:00:56.609000+00:00,1704067256609,0.013656,2281.72,0.5906,2282.32,kraken,4,2024-01-01 00:00:56.739928+00:00,2024-01-01 00:01:00.823556+00:00,ETH_USDT,2024,1,1
2024-01-01 00:00:56.609000+00:00,1704067256609,1.087427,2281.7,1.0,2282.43,kraken,5,2024-01-01 00:00:56.739928+00:00,2024-01-01 00:01:00.823556+00:00,ETH_USDT,2024,1,1
2024-01-01 00:00:56.609000+00:00,1704067256609,0.042971,2281.47,0.755004,2282.44,kraken,6,2024-01-01 00:00:56.739928+00:00,2024-01-01 00:01:00.823556+00:00,ETH_USDT,2024,1,1


INFO  None


## periodic_daily.airflow.downloaded_all.parquet.trades.spot.v7_4.ccxt.kraken.v1_0_0

In [25]:
signature = "periodic_daily.airflow.downloaded_all.parquet.trades.spot.v7_4.ccxt.kraken.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
start_timestamp = pd.Timestamp("2024-01-01T00:00:00+00:00")
end_timestamp = start_timestamp + pd.Timedelta(minutes=1)
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=["BTC_USDT", "ETH_USDT"]
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Before from_parquet


Unnamed: 0,timestamp,symbol,side,price,amount,exchange_id,knowledge_timestamp,currency_pair,year,month,day
2024-01-01 00:00:00.006000+00:00,1704067200006,BTC/USDT,buy,42289.2,0.013922,kraken,2024-01-02 02:21:51.994445+00:00,BTC_USDT,2024,1,1
2024-01-01 00:00:02.738000+00:00,1704067202738,BTC/USDT,buy,42289.2,0.0001,kraken,2024-01-02 02:21:51.994445+00:00,BTC_USDT,2024,1,1
2024-01-01 00:00:40.338000+00:00,1704067240338,BTC/USDT,buy,42293.3,0.0001,kraken,2024-01-02 02:21:51.994445+00:00,BTC_USDT,2024,1,1
,...,...,...,...,...,...,...,...,...,...,...
2024-01-01 00:00:12.209000+00:00,1704067212209,ETH/USDT,sell,2281.4,0.126,kraken,2024-01-02 02:21:44.902187+00:00,ETH_USDT,2024,1,1
2024-01-01 00:00:56.768000+00:00,1704067256768,ETH/USDT,buy,2282.17,0.056,kraken,2024-01-02 02:21:44.902187+00:00,ETH_USDT,2024,1,1
2024-01-01 00:00:58.088000+00:00,1704067258088,ETH/USDT,buy,2282.19,0.001702,kraken,2024-01-02 02:21:44.902187+00:00,ETH_USDT,2024,1,1


INFO  None


# Legacy datasets - datasets not actively maintained

## periodic_daily.airflow.downloaded_1min.csv.ohlcv.futures.v7.ccxt.binance.v1_0_0

In [None]:
#  the dataset reside under previous, deprecated schema:
# s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/

# signature = "periodic_daily.airflow.downloaded_1min.csv.ohlcv.futures.v7.ccxt.binance.v1_0_0"
# reader = imvcdcimrdc.RawDataReader(signature)
# data = reader.read_data_head()
# _LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7.ccxt.binance.v1_0_0

In [None]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v7.ccxt.binance.v1_0_0

In [None]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v7.ccxt.binanceus.v1_0_0

In [None]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v7.ccxt.binanceus.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v3.crypto_chassis.binance.v1_0_0

In [None]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v3.crypto_chassis.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1sec.parquet.bid_ask.futures.v3.crypto_chassis.binance.v1_0_0

In [None]:
signature = "periodic_daily.airflow.downloaded_1sec.parquet.bid_ask.futures.v3.crypto_chassis.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.resampled_1min.parquet.bid_ask.futures.v3.crypto_chassis.binance.v1_0_0

In [None]:
signature = "periodic_daily.airflow.resampled_1min.parquet.bid_ask.futures.v3.crypto_chassis.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1sec.parquet.bid_ask.spot.v3.crypto_chassis.binance.v1_0_0

In [None]:
# The dataset reside under previous schema:
# s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/

# TODO(Juraj): Spot bid ask spot data are not collected currently
# signature = "periodic_daily.airflow.downloaded_1sec.parquet.bid_ask.spot.v3.crypto_chassis.binance.v1_0_0"
# reader = imvcdcimrdc.RawDataReader(signature)
# data = reader.read_data_head()
# _LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.resampled_1min.parquet.bid_ask.spot.v3.crypto_chassis.binance.v1_0_0

In [None]:
# The dataset reside under previous schema:
# s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/

# TODO(Juraj): Spot bid ask spot data are not collected currently
# signature = "periodic_daily.airflow.downloaded_1sec.parquet.bid_ask.spot.v3.crypto_chassis.binance.v1_0_0"
# reader = imvcdcimrdc.RawDataReader(signature)
# data = reader.read_data_head()
# _LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v3.crypto_chassis.binance.v1_0_0

In [None]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v3.crypto_chassis.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1sec.parquet.trades.futures.v3_1.crypto_chassis.binance.v1_0_0

In [None]:
signature = "periodic_daily.airflow.downloaded_1sec.parquet.trades.futures.v3_1.crypto_chassis.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7_3.ccxt.okx.v1_0_0

In [None]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7_3.ccxt.okx.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## bulk.airflow.downloaded_1min.parquet.ohlcv.futures.v7_5.ccxt.binance.v1_0_0

_This dataset is in the test stage only_

In [None]:
signature = (
    "bulk.airflow.downloaded_1min.parquet.ohlcv.futures.v7_5.ccxt.binance.v1_0_0"
)
reader = imvcdcimrdc.RawDataReader(signature, stage="test")
# 4 months of data is available.
start_timestamp = pd.Timestamp("2023-02-01T00:00:00+00:00")
end_timestamp = pd.Timestamp("2023-06-01T00:00:00+00:00")
binance_ohlcv_data = reader.read_data(start_timestamp, end_timestamp)
_LOG.log(
    log_level, hpandas.df_to_str(binance_ohlcv_data.head(), log_level=log_level)
)

# RawDataReader Guide

## Loading parquet data with filters

TODO(Juraj): Support for filtering by level for parquet bid/ask datasets will be added once
#3694 is finished.

In [None]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7.ccxt.binance.v1_0_0"
start_timestamp = pd.Timestamp(
    datetime.datetime.utcnow() - datetime.timedelta(minutes=10, days=2), tz="UTC"
)
end_timestamp = start_timestamp + datetime.timedelta(minutes=10)
currency_pairs = ["BTC_USDT", "ETH_USDT"]
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=currency_pairs
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

In [None]:
# This works with stage preprod.
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7_6.ccxt.okx.v1_0_0"
start_timestamp = pd.Timestamp(
    datetime.datetime.utcnow() - datetime.timedelta(minutes=10, days=1), tz="UTC"
)
end_timestamp = start_timestamp + datetime.timedelta(minutes=10)
currency_pairs = ["BTC_USDT", "ETH_USDT"]
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod")
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=currency_pairs
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## Loading postgres data with filters

In [None]:
signature = "realtime.airflow.resampled_1min.postgres.bid_ask.futures.v7.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
start_timestamp = pd.Timestamp(
    datetime.datetime.utcnow() - datetime.timedelta(minutes=10, days=2), tz="UTC"
)
end_timestamp = start_timestamp + datetime.timedelta(minutes=10)
currency_pairs = ["BTC_USDT", "ETH_USDT"]
bid_ask_levels = [1, 2]
data = reader.read_data(
    start_timestamp,
    end_timestamp,
    currency_pairs=currency_pairs,
    bid_ask_levels=bid_ask_levels,
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))