In [1]:
# %pip install dask  # one-time setup; %pip installs into the running kernel's environment

In [1]:
from pathlib import Path
from ace_rl.core.cube_builder import CubeBuilderConfig, build_raw_cube

# Processed Binance data, addressed relative to the notebook's working directory.
root = Path("../../../work/processed/binance").expanduser()
data_dir = root / "data"

cfg = CubeBuilderConfig(
    symbols=["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT"],
    # Inputs (one store per symbol) and the optional location of the combined cube.
    per_symbol_root=data_dir / "zarr_per_symbol",
    output_path=data_dir / "zarr_cube_intersection",
    # No metadata files exist yet -> None; the builder infers calendar and feature order.
    calendar_path=None,
    feature_order_path=None,
    # Important: the blocks are cut to the time range shared by all symbols
    # (this mode is already the default).
    calendar_mode="range_intersection",
    use_dask=False,
    debug_timing=True,
)

cube = build_raw_cube(cfg)
print(cube)


[CubeBuilder] start build for 4 symbols (use_dask=False)
[CubeBuilder] loaded BTCUSDT ((3507322, 9)) in 2.06s


  raw_times = np.asarray(group["time"][:], dtype="datetime64[ns]")


[CubeBuilder] loaded ETHUSDT ((3507344, 9)) in 2.07s
[CubeBuilder] loaded BNBUSDT ((3507365, 9)) in 2.15s
[CubeBuilder] loaded SOLUSDT ((2662968, 9)) in 1.73s
[CubeBuilder] resolved calendar (2662909 entries) and feature order (9 features) in 3.85s
[CubeBuilder] reindexed BTCUSDT -> (1, 2662909, 9) in 1.75s
[CubeBuilder] reindexed ETHUSDT -> (1, 2662909, 9) in 1.23s
[CubeBuilder] reindexed BNBUSDT -> (1, 2662909, 9) in 1.16s
[CubeBuilder] reindexed SOLUSDT -> (1, 2662909, 9) in 0.58s
[CubeBuilder] concatenated values -> (2662909, 4, 9) in 4.44s
[CubeBuilder] generated mask in 0.34s
[CubeBuilder] applied chunking {'time': 16384} in 10.44s
[CubeBuilder] wrote cube to ../../../work/processed/binance/data/zarr_cube_intersection in 2.47s
[CubeBuilder] total build_raw_cube elapsed 34.29s
<xarray.Dataset> Size: 501MB
Dimensions:  (symbol: 4, time: 2662909, feature: 9)
Coordinates:
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 

In [2]:
from pathlib import Path
import xarray as xr

# Re-open the cube from its on-disk zarr store.
root = Path("../../../work/processed/binance").expanduser()
cube_store = root / "data" / "zarr_cube_intersection"

# consolidated=True reads the consolidated metadata; flip to False if
# consolidation is ever disabled when writing.
cached_cube = xr.open_zarr(cube_store.as_posix(), consolidated=True)
print(cached_cube)


<xarray.Dataset> Size: 501MB
Dimensions:  (feature: 9, time: 2662909, symbol: 4)
Coordinates:
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    mask     (time, symbol, feature) bool 96MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
    values   (time, symbol, feature) float32 383MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
Attributes:
    calendar_source:       inferred:range_intersection
    feature_order_source:  inferred


In [3]:
from xarray.testing import assert_equal
# Quick boolean check: Dataset.equals compares dims, coords and values and
# ignores dataset attrs.
print("cube.equals(cached_cube):", cube.equals(cached_cube))

# Raising variant of the same comparison: assert_equal also ignores attrs
# (and names). Use xarray.testing.assert_identical when attrs must match too.
assert_equal(cube, cached_cube)
print("✅ cube and cached_cube are equal (dims, coords, values)")

cube.equals(cached_cube): True
✅ cube and cached_cube are identical


In [4]:
from xarray.testing import assert_identical
# Strictest comparison: unlike assert_equal, assert_identical also requires
# matching names and attrs. It raises AssertionError on any difference and
# prints nothing on success.
assert_identical(cube, cached_cube)


In [3]:
# Show the concrete type of the object returned by build_raw_cube.
print(type(cube))

<class 'xarray.core.dataset.Dataset'>


In [4]:
# Dataset-level attr; presumably records how the builder resolved the time
# calendar (verify against cube_builder).
cube.attrs["calendar_source"]

'inferred:range_intersection'

In [5]:
# Dataset-level attr; presumably records where the feature ordering came from
# (verify against cube_builder).
cube.attrs["feature_order_source"]

'inferred'

In [6]:
# Print the dataset summary, the symbol and feature labels, and the "values"
# array, each followed by a blank line.
for section in (
    cube,
    cube.coords.get("symbol").values,
    cube.coords.get("feature").values,
    cube.data_vars.get("values"),
):
    print(section)
    print()

<xarray.Dataset> Size: 501MB
Dimensions:  (symbol: 4, time: 2662909, feature: 9)
Coordinates:
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
Data variables:
    values   (time, symbol, feature) float32 383MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
    mask     (time, symbol, feature) bool 96MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
Attributes:
    calendar_source:       inferred:range_intersection
    feature_order_source:  inferred

['BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT']

['Open' 'High' 'Low' 'Close' 'Volume' 'QuoteVolume' 'Trades'
 'TakerBuyBase' 'TakerBuyQuote']

<xarray.DataArray 'values' (time: 2662909, symbol: 4, feature: 9)> Size: 383MB
dask.array<xarray-values, shape=(2662909, 4, 9), dtype=float32, chunksize=(16384, 4, 9), chunktype=numpy.ndarray>
Coordinates:
  * symbol   (sym

In [None]:
from tabulate import tabulate

n_tail = 5  # number of trailing time steps shown per symbol

for sym in cube.coords["symbol"].values:
    # Take the last n_tail steps *before* converting to pandas: the "values"
    # array is dask-backed with millions of time steps, and converting the
    # whole per-symbol series just to display a few rows is wasted work.
    df = cube.data_vars["values"].sel(symbol=sym).tail(time=n_tail).to_pandas()
    print(f"Symbol: {sym}")
    print(tabulate(df, headers="keys", tablefmt="psql"))
    print()

Symbol: BTCUSDT
+---------------------+--------+--------+--------+---------+----------+---------------+----------+----------------+-----------------+
| time                |   Open |   High |    Low |   Close |   Volume |   QuoteVolume |   Trades |   TakerBuyBase |   TakerBuyQuote |
|---------------------+--------+--------+--------+---------+----------+---------------+----------+----------------+-----------------|
| 2025-09-04 11:48:00 | 110948 | 110948 | 110914 |  110918 |  1.93779 |        214944 |     1026 |        0.76295 |         84624.2 |
| 2025-09-04 11:49:00 | 110918 | 110973 | 110918 |  110973 |  6.53812 |        725402 |      795 |        4.55666 |        505564   |
| 2025-09-04 11:50:00 | 110973 | 110988 | 110951 |  110951 |  4.68964 |        520440 |     1332 |        3.3058  |        366862   |
| 2025-09-04 11:51:00 | 110951 | 110951 | 110942 |  110942 |  1.43702 |        159433 |      302 |        0.16738 |         18570.1 |
| 2025-09-04 11:52:00 | 110942 | 110962 | 1109

  x = np.divide(x1, x2, out)


In [8]:
def verify_quote_volume(cube, n_tail=5):
    """Print a per-symbol sanity check of the quote-volume features.

    For every symbol, the last ``n_tail`` time steps of ``cube["values"]`` are
    laid out as a time x feature table. The stored ``QuoteVolume`` and
    ``TakerBuyQuote`` are then compared with ``Volume * Close`` and
    ``TakerBuyBase * Close``. Close is only a stand-in for the bar's traded
    price, so small non-zero relative errors are expected.

    Parameters
    ----------
    cube : xarray.Dataset
        Must provide a ``values`` variable indexed by ``time``, ``symbol`` and
        ``feature``, with the features ``Volume``, ``Close``, ``QuoteVolume``,
        ``TakerBuyBase`` and ``TakerBuyQuote``.
    n_tail : int, default 5
        Number of trailing time steps to check. ``0`` yields empty tables;
        negative values are rejected by xarray's ``tail``.

    Notes
    -----
    The ``err_*_pct`` columns hold relative errors as fractions (not multiplied
    by 100). Relies on ``tabulate`` being imported at notebook level.
    """
    report_columns = [
        "Volume", "QuoteVolume", "calc_quote", "err_quote_pct",
        "TakerBuyBase", "TakerBuyQuote", "calc_taker_quote", "err_taker_pct",
    ]

    for sym in cube.coords["symbol"].values:
        print(f"\n============{sym}============")

        # tail() is used instead of slice(-n_tail, None) because the slice form
        # selects the *entire* series when n_tail == 0. Selecting only "values"
        # also avoids dragging the boolean "mask" variable through a
        # to_dataframe()/pivot round trip.
        df = cube["values"].sel(symbol=sym).tail(time=n_tail).to_pandas()

        # Recompute the quote amounts from base volume and close price.
        df = df.assign(
            calc_quote=df["Volume"] * df["Close"],
            calc_taker_quote=df["TakerBuyBase"] * df["Close"],
        )

        # Relative error of the stored value versus the recomputed one.
        df = df.assign(
            err_quote_pct=(df["QuoteVolume"] - df["calc_quote"]) / df["QuoteVolume"],
            err_taker_pct=(df["TakerBuyQuote"] - df["calc_taker_quote"]) / df["TakerBuyQuote"],
        )

        print(tabulate(
            df[report_columns],
            headers="keys", tablefmt="psql", floatfmt=".4f"
        ))

# Run the check on the last 5 time steps of every symbol in the cube.
verify_quote_volume(cube, n_tail=5)




+---------------------+----------+---------------+--------------+-----------------+----------------+-----------------+--------------------+-----------------+
| time                |   Volume |   QuoteVolume |   calc_quote |   err_quote_pct |   TakerBuyBase |   TakerBuyQuote |   calc_taker_quote |   err_taker_pct |
|---------------------+----------+---------------+--------------+-----------------+----------------+-----------------+--------------------+-----------------|
| 2025-09-04 11:48:00 |   1.9378 |   214943.8750 |  214934.9531 |          0.0000 |         0.7630 |      84624.2500 |         84624.5547 |         -0.0000 |
| 2025-09-04 11:49:00 |   6.5381 |   725402.0000 |  725552.8125 |         -0.0002 |         4.5567 |     505563.8125 |        505664.9062 |         -0.0002 |
| 2025-09-04 11:50:00 |   4.6896 |   520440.4688 |  520322.1562 |          0.0002 |         3.3058 |     366862.4375 |        366783.1562 |          0.0002 |
| 2025-09-04 11:51:00 |   1.4370 |   159433.2969 |  

In [9]:
# Bare last expression: the notebook renders the dataset's rich repr.
cube

Unnamed: 0,Array,Chunk
Bytes,365.69 MiB,2.25 MiB
Shape,"(2662909, 4, 9)","(16384, 4, 9)"
Dask graph,163 chunks in 1 graph layer,163 chunks in 1 graph layer
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 365.69 MiB 2.25 MiB Shape (2662909, 4, 9) (16384, 4, 9) Dask graph 163 chunks in 1 graph layer Data type float32 numpy.ndarray",9  4  2662909,

Unnamed: 0,Array,Chunk
Bytes,365.69 MiB,2.25 MiB
Shape,"(2662909, 4, 9)","(16384, 4, 9)"
Dask graph,163 chunks in 1 graph layer,163 chunks in 1 graph layer
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,91.42 MiB,576.00 kiB
Shape,"(2662909, 4, 9)","(16384, 4, 9)"
Dask graph,163 chunks in 1 graph layer,163 chunks in 1 graph layer
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 91.42 MiB 576.00 kiB Shape (2662909, 4, 9) (16384, 4, 9) Dask graph 163 chunks in 1 graph layer Data type bool numpy.ndarray",9  4  2662909,

Unnamed: 0,Array,Chunk
Bytes,91.42 MiB,576.00 kiB
Shape,"(2662909, 4, 9)","(16384, 4, 9)"
Dask graph,163 chunks in 1 graph layer,163 chunks in 1 graph layer
Data type,bool numpy.ndarray,bool numpy.ndarray


In [10]:
# Look up the "values" entry in the dataset's variables mapping
# (.get returns None instead of raising if the key is missing).
cube.variables.get("values")

Unnamed: 0,Array,Chunk
Bytes,365.69 MiB,2.25 MiB
Shape,"(2662909, 4, 9)","(16384, 4, 9)"
Dask graph,163 chunks in 1 graph layer,163 chunks in 1 graph layer
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 365.69 MiB 2.25 MiB Shape (2662909, 4, 9) (16384, 4, 9) Dask graph 163 chunks in 1 graph layer Data type float32 numpy.ndarray",9  4  2662909,

Unnamed: 0,Array,Chunk
Bytes,365.69 MiB,2.25 MiB
Shape,"(2662909, 4, 9)","(16384, 4, 9)"
Dask graph,163 chunks in 1 graph layer,163 chunks in 1 graph layer
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [11]:
from pathlib import Path
import xarray as xr
import yaml
import hashlib
import json
import numpy as np
from typing import Tuple

from pathlib import Path
from ace_rl.core.cube_builder import CubeBuilderConfig, build_raw_cube
from ace_rl.core.cube_feature import (
    make_name, _wrap_single_feature, save_cube, load_cube
)

In [12]:
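# NOTE(review): commented-out draft kept for reference; later cells import `make_return` from ace_rl.core.cube_feature instead.
# def make_return(cube: xr.Dataset, lookback: int = 1) -> Tuple[str, xr.Dataset]: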
#     close = cube["values"].sel(feature="Close")
#     ret = np.log(close / close.shift(time=lookback))
#     # print dim ret to debug 
#     print(ret)
#     print(ret.dims, ret.shape)  # ('time', 'symbol') (1000, 10)

#     params = {"lookback": lookback, "method": "log_return"}
#     name = make_name("ret", f"lag{lookback}", params)

#     ret_da = (
#         ret
#         .expand_dims(feature=[f"return_lag{lookback}"])
#         .transpose("time", "symbol", "feature")
#     )

#     print("\n",ret_da, "\n")

#     ds = xr.Dataset(
#         {
#             "values": (
#                 ("time", "symbol", "feature"),
#                 ret_da.data
#             )
#         },
#         coords={
#             "time": cube.coords["time"],
#             "symbol": cube.coords["symbol"],
#             "feature": [f"return_lag{lookback}"],
#         },
#         attrs={"params": params},
#     )
#     return name, ds

In [13]:
# name_ret5, ds_ret5 = make_return(cube, lookback=5)
# print(f"[Feature] Generated: {name_ret5}")
# print(ds_ret5)

In [16]:
from pathlib import Path
from ace_rl.core.cube_builder import CubeBuilderConfig, build_raw_cube
from ace_rl.core.cube_feature import (
    make_return, make_volatility, make_forward_return,
    save_cube, load_cube
)

# Root folder shared by the raw / features / labels cubes. It has to be the
# same directory the raw cube was written to, so it carries the same
# "../../../" prefix as the build cell. The bare "work/processed/..." path
# resolved against the notebook's working directory and silently started a
# second data tree.
root = Path("../../../work/processed/binance/data")

# `cube` must already be in memory (built with build_raw_cube from the stores
# under root / "zarr_per_symbol").

# 1️⃣ Log return with a 5-step lookback
name_ret5, ds_ret5 = make_return(cube, lookback=5)
print(f"[Feature] Generated: {name_ret5}")
save_cube(ds_ret5, root, "features", name_ret5)

print(ds_ret5)

[Feature] Generated: ret_lag5_bb88
<xarray.Dataset> Size: 64MB
Dimensions:  (feature: 1, symbol: 4, time: 2662909)
Coordinates:
  * feature  (feature) object 8B 'return_lag5'
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    values   (time, symbol, feature) float32 43MB dask.array<chunksize=(16384, 4, 1), meta=np.ndarray>
Attributes:
    params:   {'lookback': 5, 'method': 'log_return'}


In [17]:
from pathlib import Path
from ace_rl.core.cube_builder import CubeBuilderConfig, build_raw_cube
from ace_rl.core.cube_feature import (
    make_return, make_volatility, make_forward_return,
    save_cube, load_cube
)

# Root folder shared by the raw / features / labels cubes. It has to be the
# same directory the raw cube was written to, so it carries the same
# "../../../" prefix as the build cell. The bare "work/processed/..." path
# resolved against the notebook's working directory and silently started a
# second data tree.
root = Path("../../../work/processed/binance/data")

# `cube` must already be in memory (built with build_raw_cube from the stores
# under root / "zarr_per_symbol").

# 1️⃣ Log return with a 5-step lookback
name_ret5, ds_ret5 = make_return(cube, lookback=5)
print(f"[Feature] Generated: {name_ret5}")
save_cube(ds_ret5, root, "features", name_ret5)

# 2️⃣ Volatility with window=30
name_vol30, ds_vol30 = make_volatility(cube, window=30)
print(f"[Feature] Generated: {name_vol30}")
save_cube(ds_vol30, root, "features", name_vol30)

# 3️⃣ Forward return with horizon=10, stored under "labels" rather than "features"
name_lbl10, ds_lbl10 = make_forward_return(cube, horizon=10)
print(f"[Label] Generated: {name_lbl10}")
save_cube(ds_lbl10, root, "labels", name_lbl10)

# 🔄 Reload one saved feature and inspect it
print("\n=== Load back one feature ===")
ds_loaded = load_cube(root, "features", name_ret5)
print(ds_loaded)
print("Attrs:", ds_loaded.attrs)


[Feature] Generated: ret_lag5_bb88
[Feature] Generated: vol_win30_8cd5
[Label] Generated: label_fwd10_9859

=== Load back one feature ===
<xarray.Dataset> Size: 64MB
Dimensions:  (feature: 1, symbol: 4, time: 2662909)
Coordinates:
  * feature  (feature) object 8B 'return_lag5'
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    values   (time, symbol, feature) float32 43MB dask.array<chunksize=(16384, 4, 1), meta=np.ndarray>
Attributes:
    params:   {'lookback': 5, 'method': 'log_return'}
Attrs: {'params': {'lookback': 5, 'method': 'log_return'}}
