# Description


This notebook showcases locations and basic structure of raw data from:

- S3 (parquet datasets)
- IM DB (Postgres)

The secondary purpose is to provide a guide on how to use `RawDataReader`

## Specs
- This notebook:
  - is a catalog of all the datasets that exist in the our system
  - shows how to load data using our low-level functions or specific API for specific datasets
  - shows how a snippet of the data looks like (for this we want to load the minimal amount of data)
  - doesn't compute any statistics
  - should be quickly to execute, like < 1min, so we can run it in the unit tests

## Life cycle
- Any time a new dataset is added (e.g., in real-time DB, Parquet) we add some information on how to load it and how it looks like
- In general we try not to delete any data but we only add data loaders

## Monster dataset matrix spreadsheet

The gallery should match 1-to-1 with the dataset matrix

https://docs.google.com/spreadsheets/d/1aN2TBTtDqX5itnlG70lS2otkKCHKPN2yE_Hu3JPhPVo/edit#gid=0

# Imports

In [8]:
import datetime
import logging

import pandas as pd

import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint
import im_v2.common.data.client.im_raw_data_client as imvcdcimrdc

  from tqdm.autonotebook import tqdm


In [9]:
hdbg.init_logger(verbosity=logging.INFO)
log_level = logging.INFO

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-eb8b7f2a-0eb1-4cf1-9963-17d2311d0811.json'
[31m-----------------------------------------------------------------------------
This code is not in sync with the container:
code_version='1.4.3' != container_version='1.4.0'
-----------------------------------------------------------------------------
You need to:
- merge origin/master into your branch with `invoke git_merge_master`
- pull the latest container with `invoke docker_pull`[0m
INFO  # Git
  branch_name='CmTask4582_Download_larger_universe_'
  hash='5dcc451d5'
  # Last commits:
    * 5dcc451d5 vlady    CmTask4582: refresh notebook with the downloaded data             (   6 hours ago) Wed Jun 21 07:41:31 2023  (HEAD -> CmTask4582_Download_larger_universe_, origin/CmTask4582_Download_larger_universe_)
    * 844105035 vlady    CmTask4582: fix for empty dataframes                              (  20 hours ag

# Realtime (the DB data)

## realtime.airflow.resampled_1min.postgres.bid_ask.futures.v7.ccxt.binance.v1_0_0

In [3]:
signature = "realtime.airflow.resampled_1min.postgres.bid_ask.futures.v7.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Unable to fetch DB credentials from environment variables: 
	'POSTGRES_HOST'
	Attempting env file method.
INFO  Unable to fetch DB credentials from env file: 
	
################################################################################
* Failed assertion *
File '/app/im_v2/devops/env/prod.im_db_config.env' doesn't exist
################################################################################

	Attempting AWS SecretsManager method.
INFO  Fetching secret: prod.im_data_db.read_only
INFO  Created prod DB connection: 
 <connection object at 0x7fba6d044540; dsn: 'user=ck_prod_read_only password=xxx dbname=prod.im_data_db host=prod-im-db.cpox8ul7pzan.eu-north-1.rds.amazonaws.com port=5432', closed: 0>
INFO  Enabled connection to the `ccxt_bid_ask_futures_resampled_1min` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_futures

  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1677781260000,7179614.666667,0.132744,8115892.016667,0.133639,OGN_USDT,binance,5,,2023-03-02 18:21:20.032763+00:00
1.0,1677781260000,5673264.283333,0.133036,2742931.5,0.133335,OGN_USDT,binance,2,,2023-03-02 18:21:20.032763+00:00
2.0,1677781260000,8089900.95,0.132436,6092994.583333,0.133924,OGN_USDT,binance,8,,2023-03-02 18:21:20.032763+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1677781260000,5434347.6,0.132363,4094748.766667,0.134059,OGN_USDT,binance,9,,2023-03-02 18:21:20.032763+00:00
8.0,1677781260000,5087017.95,0.132546,7678864.266667,0.133864,OGN_USDT,binance,7,,2023-03-02 18:21:20.032763+00:00
9.0,1677781260000,2569287.3,0.133133,1201681.983333,0.133266,OGN_USDT,binance,1,,2023-03-02 18:21:20.032763+00:00


INFO  None


## realtime.airflow.downloaded_200ms.postgres.bid_ask.futures.v7.ccxt.binance.v1_0_0

In [4]:
signature = "realtime.airflow.downloaded_200ms.postgres.bid_ask.futures.v7.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_bid_ask_futures_raw` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_futures_raw ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,currency_pair,exchange_id,level,end_download_timestamp,knowledge_timestamp
0.0,1677781350714,475036.9,0.3752,314720.6,0.3771,XRP_USDT,binance,10,2023-03-02 18:22:30.843817+00:00,2023-03-02 18:22:30.863121+00:00
1.0,1677781350714,740329.7,0.3753,455004.2,0.377,XRP_USDT,binance,9,2023-03-02 18:22:30.843817+00:00,2023-03-02 18:22:30.863121+00:00
2.0,1677781350714,537731.6,0.3754,729732.8,0.3769,XRP_USDT,binance,8,2023-03-02 18:22:30.843817+00:00,2023-03-02 18:22:30.863121+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1677781350714,596392.2,0.3759,602532.8,0.3764,XRP_USDT,binance,3,2023-03-02 18:22:30.843817+00:00,2023-03-02 18:22:30.863121+00:00
8.0,1677781350714,350655.3,0.376,394053.9,0.3763,XRP_USDT,binance,2,2023-03-02 18:22:30.843817+00:00,2023-03-02 18:22:30.863121+00:00
9.0,1677781350714,160719.6,0.3761,193269.7,0.3762,XRP_USDT,binance,1,2023-03-02 18:22:30.843817+00:00,2023-03-02 18:22:30.863121+00:00


INFO  None


## realtime.airflow.downloaded_1min.postgres.ohlcv.futures.v7.ccxt.binance.v1_0_0

In [5]:
signature = "realtime.airflow.downloaded_1min.postgres.ohlcv.futures.v7.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_ohlcv_futures` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_ohlcv_futures ORDER BY timestamp DESC LIMIT 10


  df = pd.read_sql_query(query, connection)


Unnamed: 0,timestamp,open,high,low,close,volume,currency_pair,exchange_id,end_download_timestamp,knowledge_timestamp
0.0,1677781260000,0.376,0.3762,0.376,0.3762,116084.2,XRP_USDT,binance,2023-03-02 18:22:05.028864+00:00,2023-03-02 18:22:05.051758+00:00
1.0,1677781260000,2.513,2.5136,2.5123,2.5134,4258.1,WAVES_USDT,binance,2023-03-02 18:22:05.032623+00:00,2023-03-02 18:22:05.051758+00:00
2.0,1677781260000,5.883,5.884,5.882,5.883,306.0,UNFI_USDT,binance,2023-03-02 18:22:05.028285+00:00,2023-03-02 18:22:05.051758+00:00
,...,...,...,...,...,...,...,...,...,...
7.0,1677781260000,0.1332,0.1332,0.1332,0.1332,842.0,OGN_USDT,binance,2023-03-02 18:22:05.033008+00:00,2023-03-02 18:22:05.051758+00:00
8.0,1677781260000,2.217,2.218,2.216,2.218,24802.0,NEAR_USDT,binance,2023-03-02 18:22:05.029520+00:00,2023-03-02 18:22:05.051758+00:00
9.0,1677781260000,1.199,1.1994,1.1987,1.1993,162305.0,MATIC_USDT,binance,2023-03-02 18:22:05.027694+00:00,2023-03-02 18:22:05.051758+00:00


INFO  None


# Historical (data updated daily)

## periodic_daily.airflow.downloaded_1min.csv.ohlcv.futures.v7.ccxt.binance.v1_0_0

In [6]:
#  the dataset reside under previous, deprecated schema:
# s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/

# signature = "periodic_daily.airflow.downloaded_1min.csv.ohlcv.futures.v7.ccxt.binance.v1_0_0"
# reader = imvcdcimrdc.RawDataReader(signature)
# data = reader.read_data_head()
# _LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7.ccxt.binance.v1_0_0

In [7]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data/v3/periodic_daily/airflow/downloaded_1min/parquet/ohlcv/futures/v7/ccxt/binance/v1_0_0`


Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
2023-03-01 00:00:00+00:00,1677628800000,0.7996,0.7999,0.7985,0.7999,14156.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3
2023-03-01 00:01:00+00:00,1677628860000,0.7999,0.8009,0.7996,0.8009,20682.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3
2023-03-01 00:02:00+00:00,1677628920000,0.8008,0.8017,0.8007,0.8016,51016.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3
,...,...,...,...,...,...,...,...,...,...,...
2023-03-01 00:07:00+00:00,1677629220000,0.7985,0.7985,0.7982,0.7984,4306.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3
2023-03-01 00:08:00+00:00,1677629280000,0.7985,0.7995,0.7982,0.7985,12840.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3
2023-03-01 00:09:00+00:00,1677629340000,0.7985,0.7989,0.7981,0.7981,12686.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3


INFO  None


## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v7.ccxt.binance.v1_0_0

In [8]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data/v3/periodic_daily/airflow/downloaded_1min/parquet/ohlcv/futures/v7/ccxt/binance/v1_0_0`


Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
2023-03-01 00:00:00+00:00,1677628800000,0.7996,0.7999,0.7985,0.7999,14156.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3
2023-03-01 00:01:00+00:00,1677628860000,0.7999,0.8009,0.7996,0.8009,20682.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3
2023-03-01 00:02:00+00:00,1677628920000,0.8008,0.8017,0.8007,0.8016,51016.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3
,...,...,...,...,...,...,...,...,...,...,...
2023-03-01 00:07:00+00:00,1677629220000,0.7985,0.7985,0.7982,0.7984,4306.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3
2023-03-01 00:08:00+00:00,1677629280000,0.7985,0.7995,0.7982,0.7985,12840.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3
2023-03-01 00:09:00+00:00,1677629340000,0.7985,0.7989,0.7981,0.7981,12686.0,binance,2023-03-02 00:17:38.803525+00:00,CTK_USDT,2023,3


INFO  None


## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v7.ccxt.binanceus.v1_0_0

In [9]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v7.ccxt.binanceus.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data/v3/periodic_daily/airflow/downloaded_1min/parquet/ohlcv/spot/v7/ccxt/binanceus/v1_0_0`


Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
2023-03-01 00:00:00+00:00,1677628800000,0.4246,0.4246,0.4246,0.4246,0.0,binanceus,2023-03-02 00:17:23.567986+00:00,STORJ_USDT,2023,3
2023-03-01 00:01:00+00:00,1677628860000,0.4246,0.4246,0.4246,0.4246,0.0,binanceus,2023-03-02 00:17:23.567986+00:00,STORJ_USDT,2023,3
2023-03-01 00:02:00+00:00,1677628920000,0.4246,0.4246,0.4246,0.4246,0.0,binanceus,2023-03-02 00:17:23.567986+00:00,STORJ_USDT,2023,3
,...,...,...,...,...,...,...,...,...,...,...
2023-03-01 00:07:00+00:00,1677629220000,0.4246,0.4246,0.4246,0.4246,0.0,binanceus,2023-03-02 00:17:23.567986+00:00,STORJ_USDT,2023,3
2023-03-01 00:08:00+00:00,1677629280000,0.4246,0.4246,0.4246,0.4246,0.0,binanceus,2023-03-02 00:17:23.567986+00:00,STORJ_USDT,2023,3
2023-03-01 00:09:00+00:00,1677629340000,0.4237,0.4237,0.4237,0.4237,78.66,binanceus,2023-03-02 00:17:23.567986+00:00,STORJ_USDT,2023,3


INFO  None


## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v3.crypto_chassis.binance.v1_0_0

In [10]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v3.crypto_chassis.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data/v3/periodic_daily/airflow/downloaded_1min/parquet/ohlcv/futures/v3/crypto_chassis/binance/v1_0_0`


Unnamed: 0,timestamp,open,high,low,close,volume,vwap,number_of_trades,twap,exchange_id,knowledge_timestamp,currency_pair,year,month
2023-03-01 00:00:00+00:00,1677628800,0.3761,0.3762,0.3758,0.3759,1940393.5,0.37596,152,0.375929,binance,2023-03-02 00:17:09.917800+00:00,XRP_USDT,2023,3
2023-03-01 00:01:00+00:00,1677628860,0.3758,0.3761,0.3757,0.3759,1025109.2,0.375924,126,0.375914,binance,2023-03-02 00:17:09.917800+00:00,XRP_USDT,2023,3
2023-03-01 00:02:00+00:00,1677628920,0.376,0.376,0.3758,0.3759,800839.5,0.375923,84,0.375885,binance,2023-03-02 00:17:09.917800+00:00,XRP_USDT,2023,3
,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-01 00:07:00+00:00,1677629220,0.3755,0.3756,0.3752,0.3754,836788.0,0.375344,84,0.37536,binance,2023-03-02 00:17:09.917800+00:00,XRP_USDT,2023,3
2023-03-01 00:08:00+00:00,1677629280,0.3754,0.3756,0.3751,0.3752,1099763.6,0.375405,73,0.375284,binance,2023-03-02 00:17:09.917800+00:00,XRP_USDT,2023,3
2023-03-01 00:09:00+00:00,1677629340,0.3751,0.3753,0.3751,0.3752,490412.7,0.375221,76,0.375217,binance,2023-03-02 00:17:09.917800+00:00,XRP_USDT,2023,3


INFO  None


## periodic_daily.airflow.downloaded_1sec.parquet.bid_ask.futures.v3.crypto_chassis.binance.v1_0_0

In [11]:
signature = "periodic_daily.airflow.downloaded_1sec.parquet.bid_ask.futures.v3.crypto_chassis.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data/v3/periodic_daily/airflow/downloaded_1sec/parquet/bid_ask/futures/v3/crypto_chassis/binance/v1_0_0`


Unnamed: 0,timestamp,bid_price_l1,bid_size_l1,bid_price_l2,bid_size_l2,bid_price_l3,bid_size_l3,bid_price_l4,bid_size_l4,bid_price_l5,bid_size_l5,bid_price_l6,bid_size_l6,bid_price_l7,bid_size_l7,bid_price_l8,bid_size_l8,bid_price_l9,bid_size_l9,bid_price_l10,bid_size_l10,ask_price_l1,ask_size_l1,ask_price_l2,ask_size_l2,ask_price_l3,ask_size_l3,ask_price_l4,ask_size_l4,ask_price_l5,ask_size_l5,ask_price_l6,ask_size_l6,ask_price_l7,ask_size_l7,ask_price_l8,ask_size_l8,ask_price_l9,ask_size_l9,ask_price_l10,ask_size_l10,exchange_id,knowledge_timestamp,currency_pair,year,month
2023-03-01 00:00:00+00:00,1677628800,0.376,145052.2,0.3759,201517.9,0.3758,404410.6,0.3757,408462.3,0.3756,359647.6,0.3755,187863.6,0.3754,501934.7,0.3753,569477.3,0.3752,213139.8,0.3751,504338.0,0.3761,79712.6,0.3762,219496.1,0.3763,529168.6,0.3764,318085.5,0.3765,261150.6,0.3766,225826.6,0.3767,434430.2,0.3768,146106.9,0.3769,533330.4,0.377,343512.0,binance,2023-03-02 11:03:05.314310+00:00,XRP_USDT,2023,3
2023-03-01 00:00:01+00:00,1677628801,0.376,208527.4,0.3759,188222.6,0.3758,335850.9,0.3757,350274.0,0.3756,359647.8,0.3755,189061.9,0.3754,503384.2,0.3753,578191.4,0.3752,211287.5,0.3751,435225.3,0.3761,79750.3,0.3762,219496.1,0.3763,529172.0,0.3764,306113.9,0.3765,261150.6,0.3766,225826.6,0.3767,434430.2,0.3768,146106.9,0.3769,533330.4,0.377,343512.0,binance,2023-03-02 11:03:05.314310+00:00,XRP_USDT,2023,3
2023-03-01 00:00:02+00:00,1677628802,0.376,211187.2,0.3759,192213.3,0.3758,395066.5,0.3757,350274.0,0.3756,359647.8,0.3755,189061.9,0.3754,503384.2,0.3753,578191.4,0.3752,211287.5,0.3751,435225.3,0.3761,81642.4,0.3762,222154.0,0.3763,526512.8,0.3764,279521.7,0.3765,261150.6,0.3766,225826.6,0.3767,434430.2,0.3768,146106.9,0.3769,533330.4,0.377,343512.0,binance,2023-03-02 11:03:05.314310+00:00,XRP_USDT,2023,3
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-01 00:00:09+00:00,1677628809,0.376,149271.4,0.3759,160078.1,0.3758,343263.6,0.3757,274449.0,0.3756,408740.9,0.3755,181540.9,0.3754,483017.6,0.3753,351670.4,0.3752,200834.4,0.3751,348856.2,0.3761,143390.8,0.3762,273839.6,0.3763,378209.2,0.3764,307040.1,0.3765,289302.1,0.3766,252498.1,0.3767,427880.1,0.3768,132699.6,0.3769,306694.9,0.377,178243.3,binance,2023-03-02 11:03:05.314310+00:00,XRP_USDT,2023,3
2023-03-01 00:00:10+00:00,1677628810,0.3759,158199.2,0.3758,238228.6,0.3757,315491.9,0.3756,414360.8,0.3755,213190.8,0.3754,493971.4,0.3753,420783.8,0.3752,182510.1,0.3751,362127.2,0.375,294224.2,0.376,60741.0,0.3761,200317.6,0.3762,301629.5,0.3763,425585.3,0.3764,234639.8,0.3765,291357.6,0.3766,257134.1,0.3767,431414.1,0.3768,135358.2,0.3769,308104.9,binance,2023-03-02 11:03:05.314310+00:00,XRP_USDT,2023,3
2023-03-01 00:00:11+00:00,1677628811,0.3758,84797.0,0.3757,154853.3,0.3756,263440.2,0.3755,234608.3,0.3754,247260.9,0.3753,223310.6,0.3752,251507.7,0.3751,722704.8,0.375,307289.7,0.3749,372233.2,0.3759,166703.6,0.376,307981.7,0.3761,276470.3,0.3762,344489.6,0.3763,271749.4,0.3764,147592.1,0.3765,250531.2,0.3766,194480.0,0.3767,428577.8,0.3768,137195.8,binance,2023-03-02 11:03:05.314310+00:00,XRP_USDT,2023,3


INFO  None


## periodic_daily.airflow.resampled_1min.parquet.bid_ask.futures.v3.crypto_chassis.binance.v1_0_0

In [12]:
signature = "periodic_daily.airflow.resampled_1min.parquet.bid_ask.futures.v3.crypto_chassis.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data/v3/periodic_daily/airflow/resampled_1min/parquet/bid_ask/futures/v3/crypto_chassis/binance/v1_0_0`


Unnamed: 0,timestamp,bid_price_l1,bid_size_l1,ask_price_l1,ask_size_l1,bid_price_l2,bid_size_l2,ask_price_l2,ask_size_l2,bid_price_l3,bid_size_l3,ask_price_l3,ask_size_l3,bid_price_l4,bid_size_l4,ask_price_l4,ask_size_l4,bid_price_l5,bid_size_l5,ask_price_l5,ask_size_l5,bid_price_l6,bid_size_l6,ask_price_l6,ask_size_l6,bid_price_l7,bid_size_l7,ask_price_l7,ask_size_l7,bid_price_l8,bid_size_l8,ask_price_l8,ask_size_l8,bid_price_l9,bid_size_l9,ask_price_l9,ask_size_l9,bid_price_l10,bid_size_l10,ask_price_l10,ask_size_l10,exchange_id,knowledge_timestamp,currency_pair,year,month
2023-03-01 00:00:00+00:00,1677628800,0.080762,11970578.0,0.080775,4264628.0,0.080741,21680081.0,0.080788,14486424.0,0.080732,36731642.0,0.080787,40555075.0,0.080721,40128277.0,0.080796,46316428.0,0.080714,48638996.0,0.080807,39950222.0,0.080712,38733772.0,0.080822,65483739.0,0.080691,27627845.0,0.080829,96289825.0,0.080683,29564972.0,0.080835,35562800.0,0.080675,35744474.0,0.080846,30596061.0,0.080666,34860538.0,0.080855,25892712.0,binance,2023-03-01 11:05:44.320499+00:00,DOGE_USDT,2023,3
2023-03-01 00:01:00+00:00,1677628860,0.080817,6947634.0,0.080827,6022351.0,0.080803,21168120.0,0.080843,10295530.0,0.080796,46840873.0,0.080846,34981834.0,0.080793,42256331.0,0.08086,34963790.0,0.080777,52474935.0,0.080867,32592869.0,0.080766,34399870.0,0.08088,56070236.0,0.080756,29702742.0,0.080889,82003634.0,0.080748,21226078.0,0.080896,48224980.0,0.08074,25031541.0,0.080906,25784819.0,0.080726,23706863.0,0.080916,29044185.0,binance,2023-03-02 11:06:08.610534+00:00,DOGE_USDT,2023,3
2023-03-01 00:02:00+00:00,1677628920,0.080774,5048737.0,0.080798,6842320.0,0.080768,24208172.0,0.080808,14056312.0,0.080763,58651850.0,0.080813,39817246.0,0.080756,48501635.0,0.080822,48000058.0,0.080741,51383139.0,0.080832,39967577.0,0.080731,34033765.0,0.080848,50496741.0,0.080723,35633300.0,0.080856,90311559.0,0.080709,38669283.0,0.080861,60978832.0,0.080706,29034828.0,0.080872,37670805.0,0.080694,28272731.0,0.080883,39173015.0,binance,2023-03-02 11:06:08.610534+00:00,DOGE_USDT,2023,3
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-01 00:07:00+00:00,1677629220,0.080784,8738968.0,0.080782,8816495.0,0.080768,13548333.0,0.080788,26795458.0,0.08075,31924453.0,0.080801,49638639.0,0.080744,40337866.0,0.08081,49087841.0,0.080736,49834541.0,0.080826,52412742.0,0.080724,42778000.0,0.080834,72650525.0,0.08071,33859267.0,0.080841,91287703.0,0.080698,36290982.0,0.080854,46566208.0,0.080691,34726869.0,0.080856,46690178.0,0.080682,31265522.0,0.080867,41220460.0,binance,2023-03-02 11:06:08.610534+00:00,DOGE_USDT,2023,3
2023-03-01 00:08:00+00:00,1677629280,0.080686,4714743.0,0.080691,9698304.0,0.08067,12557255.0,0.080703,24970759.0,0.08066,31807041.0,0.080713,50225361.0,0.08065,37543661.0,0.080722,49822156.0,0.080641,48192926.0,0.080733,50680090.0,0.080632,43741027.0,0.080745,69175789.0,0.080624,32860789.0,0.080751,84230775.0,0.080611,35074007.0,0.080759,51239137.0,0.080604,33664393.0,0.080772,46372005.0,0.080592,32656690.0,0.080782,39483224.0,binance,2023-03-02 11:06:08.610534+00:00,DOGE_USDT,2023,3
2023-03-01 00:09:00+00:00,1677629340,0.080602,4279626.0,0.080605,7834810.0,0.080593,15199284.0,0.08062,17542403.0,0.08058,35004928.0,0.080633,44212132.0,0.080568,37496609.0,0.08064,52925343.0,0.08056,45153548.0,0.080653,47267226.0,0.080547,37387443.0,0.080657,62661697.0,0.08054,30606147.0,0.080668,94971729.0,0.080525,39204918.0,0.080678,49789352.0,0.080521,29821008.0,0.080683,44049248.0,0.080512,28928942.0,0.080698,30399450.0,binance,2023-03-02 11:06:08.610534+00:00,DOGE_USDT,2023,3


INFO  None


## periodic_daily.airflow.downloaded_1sec.parquet.bid_ask.spot.v3.crypto_chassis.binance.v1_0_0

In [13]:
# The dataset reside under previous schema:
# s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/

# TODO(Juraj): Spot bid ask spot data are not collected currently
# signature = "periodic_daily.airflow.downloaded_1sec.parquet.bid_ask.spot.v3.crypto_chassis.binance.v1_0_0"
# reader = imvcdcimrdc.RawDataReader(signature)
# data = reader.read_data_head()
# _LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.resampled_1min.parquet.bid_ask.spot.v3.crypto_chassis.binance.v1_0_0

In [14]:
# The dataset reside under previous schema:
# s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/

# TODO(Juraj): Spot bid ask spot data are not collected currently
# signature = "periodic_daily.airflow.downloaded_1sec.parquet.bid_ask.spot.v3.crypto_chassis.binance.v1_0_0"
# reader = imvcdcimrdc.RawDataReader(signature)
# data = reader.read_data_head()
# _LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v3.crypto_chassis.binance.v1_0_0

In [15]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.spot.v3.crypto_chassis.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data/v3/periodic_daily/airflow/downloaded_1min/parquet/ohlcv/spot/v3/crypto_chassis/binance/v1_0_0`


Unnamed: 0,timestamp,open,high,low,close,volume,vwap,number_of_trades,twap,exchange_id,knowledge_timestamp,currency_pair,year,month
2022-10-01 00:00:00+00:00,1664582400,0.4346,0.4347,0.4341,0.4341,28292.6,0.434331,31,0.434419,binance,2022-10-02 00:16:43.552325+00:00,ADA_USDT,2022,10
2022-10-01 00:01:00+00:00,1664582460,0.434,0.4341,0.4339,0.4341,51737.3,0.433961,15,0.43394,binance,2022-10-02 00:16:43.552325+00:00,ADA_USDT,2022,10
2022-10-01 00:02:00+00:00,1664582520,0.4341,0.4343,0.4341,0.4342,38364.8,0.43419,11,0.434191,binance,2022-10-02 00:16:43.552325+00:00,ADA_USDT,2022,10
,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-01 00:07:00+00:00,1664582820,0.4352,0.4353,0.4352,0.4352,9646.3,0.435264,9,0.435256,binance,2022-10-02 00:16:43.552325+00:00,ADA_USDT,2022,10
2022-10-01 00:08:00+00:00,1664582880,0.4353,0.4355,0.4353,0.4354,41853.8,0.435395,27,0.435448,binance,2022-10-02 00:16:43.552325+00:00,ADA_USDT,2022,10
2022-10-01 00:09:00+00:00,1664582940,0.4354,0.4358,0.4354,0.4356,251143.6,0.435609,31,0.435597,binance,2022-10-02 00:16:43.552325+00:00,ADA_USDT,2022,10


INFO  None


## periodic_daily.airflow.downloaded_1sec.parquet.trades.futures.v3_1.crypto_chassis.binance.v1_0_0

In [16]:
signature = "periodic_daily.airflow.downloaded_1sec.parquet.trades.futures.v3_1.crypto_chassis.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data/v3/periodic_daily/airflow/downloaded_1sec/parquet/trades/futures/v3_1/crypto_chassis/binance/v1_0_0`


Unnamed: 0,timestamp,price,size,is_buyer_maker,exchange_id,knowledge_timestamp,currency_pair,year,month
2023-03-01 23:09:37+00:00,1677712177,0.3831,0.5,0,binance,2023-03-02 10:35:24.019240+00:00,XRP_USDT,2023,3
2023-03-01 23:09:37+00:00,1677712177,0.3831,15.3,0,binance,2023-03-02 10:35:24.019240+00:00,XRP_USDT,2023,3
2023-03-01 23:09:37+00:00,1677712177,0.3831,0.3,0,binance,2023-03-02 10:35:24.019240+00:00,XRP_USDT,2023,3
,...,...,...,...,...,...,...,...,...
2023-03-01 23:09:42+00:00,1677712182,0.3831,0.1,0,binance,2023-03-02 10:35:24.019240+00:00,XRP_USDT,2023,3
2023-03-01 23:09:43+00:00,1677712183,0.3831,0.1,0,binance,2023-03-02 10:35:24.019240+00:00,XRP_USDT,2023,3
2023-03-01 23:09:43+00:00,1677712183,0.3831,0.6,0,binance,2023-03-02 10:35:24.019240+00:00,XRP_USDT,2023,3


INFO  None


## periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7_3.ccxt.okx.v1_0_0

In [17]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7_3.ccxt.okx.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data_head()
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading the data from `s3://cryptokaizen-data/v3/periodic_daily/airflow/downloaded_1min/parquet/ohlcv/futures/v7_3/ccxt/okx/v1_0_0`


Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
2023-03-01 15:00:00+00:00,1677682800000,22.659,22.68,22.59,22.63,3854.328155,okx,2023-03-02 00:17:24.696974+00:00,SOL_USDT,2023,3
2023-03-01 15:01:00+00:00,1677682860000,22.65,22.741,22.639,22.692,604.633494,okx,2023-03-02 00:17:24.696974+00:00,SOL_USDT,2023,3
2023-03-01 15:02:00+00:00,1677682920000,22.68,22.729,22.651,22.701,437.3072,okx,2023-03-02 00:17:24.696974+00:00,SOL_USDT,2023,3
,...,...,...,...,...,...,...,...,...,...,...
2023-03-01 15:07:00+00:00,1677683220000,22.573,22.607,22.563,22.594,314.446202,okx,2023-03-02 00:17:24.696974+00:00,SOL_USDT,2023,3
2023-03-01 15:08:00+00:00,1677683280000,22.581,22.6,22.555,22.6,537.739643,okx,2023-03-02 00:17:24.696974+00:00,SOL_USDT,2023,3
2023-03-01 15:09:00+00:00,1677683340000,22.602,22.609,22.562,22.569,196.007398,okx,2023-03-02 00:17:24.696974+00:00,SOL_USDT,2023,3


INFO  None


## bulk.airflow.downloaded_1min.parquet.ohlcv.futures.v7_5.ccxt.binance.v1_0_0

_This dataset is in the test stage only_

In [11]:
signature = "bulk.airflow.downloaded_1min.parquet.ohlcv.futures.v7_5.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="test")
# 4 months of data is available.
start_timestamp = pd.Timestamp("2023-02-01T00:00:00+00:00")
end_timestamp = pd.Timestamp("2023-06-01T00:00:00+00:00")
binance_ohlcv_data = reader.read_data(start_timestamp, end_timestamp)
_LOG.log(log_level, hpandas.df_to_str(binance_ohlcv_data.head(), log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


Unnamed: 0_level_0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-02-01 00:00:00+00:00,1675209600000,0.5141,0.5143,0.5137,0.5142,59009.0,binance,2023-06-20 17:19:02.877891+00:00,1INCH_USDT,2023,2
2023-02-01 00:01:00+00:00,1675209660000,0.5142,0.5143,0.514,0.514,26966.0,binance,2023-06-20 17:19:02.877891+00:00,1INCH_USDT,2023,2
2023-02-01 00:02:00+00:00,1675209720000,0.5141,0.5157,0.5141,0.5154,49452.0,binance,2023-06-20 17:19:02.877891+00:00,1INCH_USDT,2023,2
2023-02-01 00:03:00+00:00,1675209780000,0.5154,0.5162,0.5153,0.5157,60371.0,binance,2023-06-20 17:19:02.877891+00:00,1INCH_USDT,2023,2
2023-02-01 00:04:00+00:00,1675209840000,0.5155,0.5157,0.5147,0.5148,24003.0,binance,2023-06-20 17:19:02.877891+00:00,1INCH_USDT,2023,2


INFO  None


# Archived data (data transferred from IM DB to postgres)

TODO(Juraj): #CmTask3376 Update once the support for archive data has been added to the `RawDataReader`

- So far only single dataset stored in s3://cryptokaizen-data/db_archive/prod/ccxt_bid_ask_futures_raw/timestamp/
   - can be retrieved using `hparquet.from_parquet`
   - be aware of the large footprint of the dataset

# RawDataReader Guide

## Loading parquet data with filters

TODO(Juraj): Support for filtering by level for parquet bid/ask datasets will be added once
#3694 is finished.

In [18]:
signature = "periodic_daily.airflow.downloaded_1min.parquet.ohlcv.futures.v7.ccxt.binance.v1_0_0"
start_timestamp = pd.Timestamp(
    datetime.datetime.utcnow() - datetime.timedelta(minutes=10, days=2), tz="UTC"
)
end_timestamp = start_timestamp + datetime.timedelta(minutes=10)
currency_pairs = ["BTC_USDT", "ETH_USDT"]
reader = imvcdcimrdc.RawDataReader(signature)
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=currency_pairs
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


Unnamed: 0,timestamp,open,high,low,close,volume,exchange_id,knowledge_timestamp,currency_pair,year,month
2023-02-28 18:14:00+00:00,1677608040000,23513.2,23515.0,23502.1,23502.1,113.371,binance,2023-03-01 00:17:08.407622+00:00,BTC_USDT,2023,2
2023-02-28 18:15:00+00:00,1677608100000,23502.1,23502.2,23473.1,23480.0,814.729,binance,2023-03-01 00:17:08.407622+00:00,BTC_USDT,2023,2
2023-02-28 18:16:00+00:00,1677608160000,23479.9,23481.4,23462.2,23472.1,670.173,binance,2023-03-01 00:17:08.407622+00:00,BTC_USDT,2023,2
,...,...,...,...,...,...,...,...,...,...,...
2023-02-28 18:21:00+00:00,1677608460000,1639.5,1639.66,1638.84,1639.33,1733.123,binance,2023-03-01 00:17:07.177332+00:00,ETH_USDT,2023,2
2023-02-28 18:22:00+00:00,1677608520000,1639.33,1640.09,1639.32,1640.08,1965.509,binance,2023-03-01 00:17:07.177332+00:00,ETH_USDT,2023,2
2023-02-28 18:23:00+00:00,1677608580000,1640.09,1640.58,1639.87,1640.57,2212.749,binance,2023-03-01 00:17:07.177332+00:00,ETH_USDT,2023,2


INFO  None


## Loading postgres data with filters

In [19]:
signature = "realtime.airflow.resampled_1min.postgres.bid_ask.futures.v7.ccxt.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature)
start_timestamp = pd.Timestamp(
    datetime.datetime.utcnow() - datetime.timedelta(minutes=10, days=2), tz="UTC"
)
end_timestamp = start_timestamp + datetime.timedelta(minutes=10)
currency_pairs = ["BTC_USDT", "ETH_USDT"]
bid_ask_levels = [1, 2]
data = reader.read_data(
    start_timestamp,
    end_timestamp,
    currency_pairs=currency_pairs,
    bid_ask_levels=bid_ask_levels,
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Enabled connection to the `ccxt_bid_ask_futures_resampled_1min` DB table
INFO  Executing query: 
	SELECT * FROM ccxt_bid_ask_futures_resampled_1min WHERE timestamp >= 1677608028319 AND timestamp <= 1677608628319 AND currency_pair IN ('BTC_USDT', 'ETH_USDT') AND level IN (1, 2)


  df = pd.read_sql_query(query, connection)


Unnamed: 0,currency_pair,exchange_id,end_download_timestamp,knowledge_timestamp,bid_size_l1,bid_size_l2,bid_price_l1,bid_price_l2,ask_size_l1,ask_size_l2,ask_price_l1,ask_price_l2
1677608040000.0,BTC_USDT,binance,,2023-02-28 18:14:23.282809+00:00,1070.850667,67.409933,23512.204814,23512.434733,1462.470483,112.091333,23512.11375,23512.583785
1677608040000.0,ETH_USDT,binance,,2023-02-28 18:14:23.282809+00:00,4021.40005,411.259333,1642.866815,1642.851185,6783.787717,597.0741,1642.861594,1642.825523
1677608100000.0,BTC_USDT,binance,,2023-02-28 18:15:23.210084+00:00,1004.723067,82.7194,23511.473195,23509.909508,1274.354083,176.919633,23509.471753,23509.677931
,...,...,...,...,...,...,...,...,...,...,...,...
1677608520000.0,ETH_USDT,binance,,2023-02-28 18:22:24.179459+00:00,5151.6178,2599.250817,1639.244919,1639.286374,4792.972583,393.588817,1639.312817,1639.392715
1677608580000.0,BTC_USDT,binance,,2023-02-28 18:23:23.906101+00:00,995.892383,97.64825,23470.033445,23470.762329,905.937433,76.54555,23470.681552,23471.022601
1677608580000.0,ETH_USDT,binance,,2023-02-28 18:23:23.906101+00:00,6111.852533,532.559533,1639.624245,1639.662132,5792.204867,370.59395,1639.609131,1639.838361


INFO  None
