# Description


This notebook showcases the locations and basic structure of raw data from:

- S3 (parquet datasets)
- DB (PostgreSQL)

# Imports

In [None]:
import logging

import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hparquet as hparque
import helpers.hprint as hprint
from im_v2.common.notebooks.master_raw_data_gallery_lib import *

In [None]:
# One log level drives both the logger setup and the data previews in the
# cells below.
log_level = logging.INFO
hdbg.init_logger(verbosity=log_level)

_LOG = logging.getLogger(__name__)

# Log the first element of the system signature.
_LOG.info("%s", henv.get_system_signature()[0])

# Apply the notebook settings provided by `hprint`.
hprint.config_notebook()

# Realtime (the DB data)

## real_time.airflow.csv.ohlcv.futures.1_min.ccxt.binance

In [None]:
# Get the real time data from DB; no time bounds are passed (`start_ts` and
# `end_ts` are both None).
ccxt_rt = get_raw_data_from_db(
    "ccxt_ohlcv_futures", "binance", start_ts=None, end_ts=None
)
# Pass the row count as a logging argument (lazy formatting) instead of
# pre-building the message with an f-string.
_LOG.info("%d rows overall", len(ccxt_rt))
# Log a string preview of the dataframe at the notebook's log level.
_LOG.log(log_level, hpandas.df_to_str(ccxt_rt, log_level=log_level))

# Historical (data updated daily)

## historical.daily.parquet.ohlcv.futures.1_min.ccxt.binance

In [None]:
# Location of the daily OHLCV futures dataset (CCXT, Binance) on S3.
s3_path = "s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/ohlcv-futures/ccxt/binance"
# Load daily data from S3 parquet using the "ck" AWS profile.
ccxt_futures_daily = hparque.from_parquet(s3_path, aws_profile="ck")
# Pass the row count as a logging argument (lazy formatting) instead of
# pre-building the message with an f-string.
_LOG.info("%d rows overall", len(ccxt_futures_daily))
# Log a string preview of the dataframe at the notebook's log level.
_LOG.log(log_level, hpandas.df_to_str(ccxt_futures_daily, log_level=log_level))

## historical.daily.parquet.bid_ask.futures.1_sec.crypto_chassis.binance

The amount of data is too large to process all at once, so the data is loaded separately for each month.

In [None]:
# Location of the 1-second bid/ask futures dataset (CryptoChassis, Binance)
# on S3.
s3_path = "s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/bid_ask-futures/crypto_chassis/binance"
# Time interval to process, as "YYYYMMDD-HHMMSS" strings.
start_ts, end_ts = "20220627-000000", "20221130-000000"
# NOTE(review): the meaning of the last positional argument (3) is defined by
# the helper in `master_raw_data_gallery_lib` and is not visible here; confirm.
process_s3_data_in_chunks(start_ts, end_ts, s3_path, 3)

## historical.daily.parquet.bid_ask.futures.1_min.crypto_chassis.binance

In [None]:
# Location of the bid/ask futures dataset resampled to 1 minute
# (CryptoChassis, Binance) on S3.
s3_path = "s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/bid_ask-futures/crypto_chassis.resampled_1min/binance"
# Load daily data from S3 parquet using the "ck" AWS profile.
cc_ba_futures_resampled = hparque.from_parquet(s3_path, aws_profile="ck")
# Pass the row count as a logging argument (lazy formatting) instead of
# pre-building the message with an f-string.
_LOG.info("%d rows overall", len(cc_ba_futures_resampled))
# Log a string preview of the dataframe at the notebook's log level.
_LOG.log(
    log_level, hpandas.df_to_str(cc_ba_futures_resampled, log_level=log_level)
)

## historical.daily.parquet.bid_ask.spot.1_sec.crypto_chassis.binance

In [None]:
# Location of the 1-second bid/ask spot dataset (CryptoChassis, Binance) on S3.
s3_path = "s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/bid_ask/crypto_chassis/binance"
# Time interval to process, as "YYYYMMDD-HHMMSS" strings.
start_ts, end_ts = "20220501-000000", "20221130-000000"
# NOTE(review): the meaning of the last positional argument (3) is defined by
# the helper in `master_raw_data_gallery_lib` and is not visible here; confirm.
process_s3_data_in_chunks(start_ts, end_ts, s3_path, 3)

## historical.daily.parquet.bid_ask.spot.1_min.crypto_chassis.binance

In [None]:
# Location of the bid/ask spot dataset resampled to 1 minute
# (CryptoChassis, Binance) on S3.
s3_path = "s3://cryptokaizen-data/reorg/daily_staged.airflow.pq/bid_ask/crypto_chassis.resampled_1min/binance"
# Load daily data from S3 parquet using the "ck" AWS profile.
cc_ba_spot_resampled = hparque.from_parquet(s3_path, aws_profile="ck")
# Pass the row count as a logging argument (lazy formatting) instead of
# pre-building the message with an f-string.
_LOG.info("%d rows overall", len(cc_ba_spot_resampled))
# Log a string preview of the dataframe at the notebook's log level.
_LOG.log(log_level, hpandas.df_to_str(cc_ba_spot_resampled, log_level=log_level))