In [1]:
from pathlib import Path
import xarray as xr

# Root of the processed Binance artefacts, given relative to the working directory.
root = Path("../../../work/processed/binance").expanduser()

# Open the intersection cube from its Zarr store.
cached_cube = xr.open_zarr(
    root.joinpath("data", "zarr_cube_intersection").as_posix(),
    consolidated=True,  # switch to False if consolidated metadata is ever disabled
)
cube = cached_cube  # short alias for the same Dataset object
print(cached_cube)


<xarray.Dataset> Size: 501MB
Dimensions:  (feature: 9, time: 2662909, symbol: 4)
Coordinates:
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    mask     (time, symbol, feature) bool 96MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
    values   (time, symbol, feature) float32 383MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
Attributes:
    calendar_source:       inferred:range_intersection
    feature_order_source:  inferred


In [2]:
# Dataset overview, followed by a blank separator line.
print(cube)
print()
# Full coordinate values (the Dataset repr above abbreviates them).
for coord_name in ("symbol", "feature"):
    print(cube.coords.get(coord_name).values)
    print()
# Repr of the "values" data variable.
print(cube.data_vars.get("values"))
print()

<xarray.Dataset> Size: 501MB
Dimensions:  (feature: 9, time: 2662909, symbol: 4)
Coordinates:
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    mask     (time, symbol, feature) bool 96MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
    values   (time, symbol, feature) float32 383MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
Attributes:
    calendar_source:       inferred:range_intersection
    feature_order_source:  inferred

['BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT']

['Open' 'High' 'Low' 'Close' 'Volume' 'QuoteVolume' 'Trades'
 'TakerBuyBase' 'TakerBuyQuote']

<xarray.DataArray 'values' (time: 2662909, symbol: 4, feature: 9)> Size: 383MB
dask.array<open_dataset-values, shape=(2662909, 4, 9), dtype=float32, chunksize=(16384, 4, 9), chunktype=numpy.ndarray>
Coordinates:
  * feature

In [5]:
import xarray as xr
import numpy as np
import pandas as pd

# === CONFIG ===
LABEL_ZARR = "../../../work/processed/binance/data/zarr_cube_label"
GROUP      = "labels"
# Tolerance for classifying returns: r > eps is positive, |r| <= eps is zero, r < -eps is negative.
RET_EPS    = 5e-5

# === LOAD ===
ds_lbl = xr.open_zarr(LABEL_ZARR, group=GROUP)
values = ds_lbl["values"]            # dims: (time, symbol, label)
# Coordinate values as plain Python strings, so they can be used directly with .sel().
labels = [str(name) for name in ds_lbl["label"].values]
symbols = [str(name) for name in ds_lbl["symbol"].values]
time = ds_lbl["time"].values
print(f"Loaded labels cube: values{tuple(values.shape)} | symbols={symbols[:8]} ... | n_labels={len(labels)}")

# === HELPERS ===
def _counts_float(arr: np.ndarray, eps: float = 0.0):
    """Count positive / zero / negative / NaN entries of a float array.

    Parameters
    ----------
    arr : np.ndarray
        Float array (documented as 1-D); NaN entries are allowed.
    eps : float, default 0.0
        Tolerance around zero: values with ``|x| <= eps`` are counted as zero.

    Returns
    -------
    dict
        Integer counts under the keys ``total`` (``arr.size``), ``nan``,
        ``pos`` (``x > eps``), ``zero`` (``|x| <= eps``) and ``neg`` (``x < -eps``).
        The sign counts are taken over the non-NaN entries only.
    """
    is_nan = np.isnan(arr)
    valid = arr[~is_nan]
    # For eps == 0 these comparisons reduce to the plain > 0, == 0 and < 0 tests.
    n_pos = int(np.count_nonzero(valid > eps))
    n_zero = int(np.count_nonzero(np.abs(valid) <= eps))
    n_neg = int(np.count_nonzero(valid < -eps))
    return {
        "total": int(arr.size),
        "nan": int(np.count_nonzero(is_nan)),
        "pos": n_pos,
        "zero": n_zero,
        "neg": n_neg,
    }

def summarize_all(ret_eps: float = 0.0) -> pd.DataFrame:
    """Summarize every label, pooling all symbols and timestamps together.

    Reads the notebook-level ``labels`` and ``values`` and delegates the
    counting to ``_counts_float``.

    Parameters
    ----------
    ret_eps : float, default 0.0
        Zero tolerance passed to ``_counts_float`` for every label whose name
        does not start with ``"tb_label__"``; those labels always use 0.0.

    Returns
    -------
    pd.DataFrame
        One row per label, sorted by ``label`` with a fresh index, holding
        ``label``, ``coverage_pct`` (percentage of non-NaN entries, rounded to
        two decimals) and the counts returned by ``_counts_float``.
    """
    def _row_for(lab):
        flat = values.sel(label=lab).values.ravel()
        # "tb_label__*" labels are counted with eps=0; everything else uses ret_eps.
        threshold = 0.0 if lab.startswith("tb_label__") else ret_eps
        counts = _counts_float(flat, eps=threshold)
        n_total = counts["total"]
        # max(1, ...) guards the division when the slice is empty.
        coverage = 100.0 * (n_total - counts["nan"]) / max(1, n_total)
        return dict(label=lab, coverage_pct=round(coverage, 2), **counts)

    summary = pd.DataFrame([_row_for(lab) for lab in labels])
    return summary.sort_values(["label"]).reset_index(drop=True)

def summarize_by_symbol(ret_eps: float = 0.0) -> pd.DataFrame:
    """Summarize every (symbol, label) pair separately.

    Reads the notebook-level ``symbols``, ``labels`` and ``values``, prints one
    progress line per symbol, and delegates the counting to ``_counts_float``.

    Parameters
    ----------
    ret_eps : float, default 0.0
        Zero tolerance passed to ``_counts_float`` for every label whose name
        does not start with ``"tb_label__"``; those labels always use 0.0.

    Returns
    -------
    pd.DataFrame
        One row per (symbol, label), sorted by ``symbol`` then ``label`` with a
        fresh index, holding ``symbol``, ``label``, ``coverage_pct`` (percentage
        of non-NaN entries, rounded to two decimals) and the counts returned by
        ``_counts_float``.
    """
    records = []
    for sym in symbols:
        print(f"  - Summarizing symbol {sym} ...")
        per_symbol = values.sel(symbol=sym)
        for lab in labels:
            flat = per_symbol.sel(label=lab).values.ravel()
            # "tb_label__*" labels are counted with eps=0; everything else uses ret_eps.
            threshold = 0.0 if lab.startswith("tb_label__") else ret_eps
            counts = _counts_float(flat, eps=threshold)
            n_total = counts["total"]
            # max(1, ...) guards the division when the slice is empty.
            coverage = 100.0 * (n_total - counts["nan"]) / max(1, n_total)
            records.append(
                dict(symbol=sym, label=lab, coverage_pct=round(coverage, 2), **counts)
            )
    table = pd.DataFrame(records)
    return table.sort_values(["symbol", "label"]).reset_index(drop=True)

# === RUN SUMMARY ===
# Per-label counts pooled over every symbol and timestamp.
df_all = summarize_all(ret_eps=RET_EPS)
display(df_all)

# (optional) the same breakdown per symbol:
df_sym = summarize_by_symbol(ret_eps=RET_EPS)
display(df_sym)

# === QUICK DESCRIPTIVE STATS for returns (mean/std/quantiles) ===
def describe_returns(ret_eps: float = 0.0, qs=(0.01,0.05,0.5,0.95,0.99)) -> pd.DataFrame:
    """Descriptive statistics for every ``ret_log__H*`` label.

    Reads the notebook-level ``labels`` and ``values``. Each matching label is
    flattened over all remaining dimensions and its NaN entries are dropped
    before the statistics are computed; labels with no non-NaN data are skipped.

    Parameters
    ----------
    ret_eps : float, default 0.0
        Tolerance around zero used for the share columns: ``pos_share`` is the
        fraction with ``x > ret_eps``, ``zero_share`` with ``|x| <= ret_eps``
        and ``neg_share`` with ``x < -ret_eps``.
    qs : iterable of float, default (0.01, 0.05, 0.5, 0.95, 0.99)
        Quantile levels in [0, 1] passed to ``np.quantile``. Each level ``q``
        produces a column named ``"q" + format(100*q, "g")``, e.g. ``q1``,
        ``q50``, ``q2.5`` or ``q99.9``.

    Returns
    -------
    pd.DataFrame
        One row per described label, sorted by ``label``, with the columns
        ``label``, ``mean``, ``std``, ``pos_share``, ``zero_share``,
        ``neg_share`` and one column per quantile level. When nothing matches,
        an empty frame with those columns is returned.
    """
    qs = tuple(qs)
    # ':g' rounds to 6 significant digits instead of truncating, so 0.29 maps to
    # "q29" (100*0.29 is 28.999... in floating point) and sub-percent levels such
    # as 0.001 / 0.005 get distinct names rather than both collapsing to "q0".
    q_cols = [f"q{100 * qv:g}" for qv in qs]
    # dict.fromkeys drops duplicate names (repeated levels) while keeping order.
    columns = list(dict.fromkeys(
        ["label", "mean", "std", "pos_share", "zero_share", "neg_share", *q_cols]
    ))

    rows = []
    for lab in labels:
        if not lab.startswith("ret_log__H"):
            continue
        # Cast to Python float (float64) before computing the statistics.
        arr = values.sel(label=lab).values.astype(float).ravel()
        v = arr[~np.isnan(arr)]
        if v.size == 0:
            continue
        stats = dict(
            label=lab,
            mean=float(v.mean()),
            std=float(v.std()),
            pos_share=float((v > ret_eps).mean()),
            zero_share=float((np.abs(v) <= ret_eps).mean()),
            neg_share=float((v < -ret_eps).mean()),
        )
        stats.update(zip(q_cols, map(float, np.quantile(v, q=qs))))
        rows.append(stats)
    # Passing the columns explicitly keeps sort_values("label") valid when rows is empty.
    return pd.DataFrame(rows, columns=columns).sort_values("label")

# Descriptive statistics for the ret_log__H* labels, using the same RET_EPS tolerance as the summaries above.
df_desc = describe_returns(ret_eps=RET_EPS)
display(df_desc)


Loaded labels cube: values(2662909, 4, 8) | symbols=['BTCUSDT', 'ETHUSDT', 'BNBUSDT', 'SOLUSDT'] ... | n_labels=8


Unnamed: 0,label,coverage_pct,total,nan,pos,zero,neg
0,ret_log__H1,100.0,10651636,10,4693535,1286088,4672003
1,ret_log__H15,100.0,10651636,66,5218926,305884,5126760
2,ret_log__H5,100.0,10651636,26,5085791,539004,5026815
3,ret_log__H60,100.0,10651636,246,5330086,158016,5163288
4,tb_exit_ret__H60,100.0,10651636,54,4649308,133035,5869239
5,tb_label__H60__pct,100.0,10651636,0,4659498,74889,5917249
6,tb_label__H60__vol,100.0,10651636,0,5282482,1077,5368077
7,tb_t_hit__H60,100.0,10651636,0,10651636,0,0


  - Summarizing symbol BTCUSDT ...
  - Summarizing symbol ETHUSDT ...
  - Summarizing symbol BNBUSDT ...
  - Summarizing symbol SOLUSDT ...


Unnamed: 0,symbol,label,coverage_pct,total,nan,pos,zero,neg
0,BNBUSDT,ret_log__H1,100.0,2662909,3,1117598,451009,1094299
1,BNBUSDT,ret_log__H15,100.0,2662909,17,1292967,127975,1241950
2,BNBUSDT,ret_log__H5,100.0,2662909,7,1242420,218424,1202058
3,BNBUSDT,ret_log__H60,100.0,2662909,62,1334565,66374,1261908
4,BNBUSDT,tb_exit_ret__H60,100.0,2662909,16,1182048,58657,1422188
5,BNBUSDT,tb_label__H60__pct,100.0,2662909,0,1181276,52666,1428967
6,BNBUSDT,tb_label__H60__vol,100.0,2662909,0,1313411,767,1348731
7,BNBUSDT,tb_t_hit__H60,100.0,2662909,0,2662909,0,0
8,BTCUSDT,ret_log__H1,100.0,2662909,3,1163656,332891,1166359
9,BTCUSDT,ret_log__H15,100.0,2662909,17,1304451,65813,1292628


Unnamed: 0,label,mean,std,pos_share,zero_share,neg_share,q1,q5,q50,q95,q99
0,ret_log__H1,1e-06,0.001316,0.44064,0.120741,0.438619,-0.003663,-0.001719,0.0,0.001716,0.003686
1,ret_log__H15,1.8e-05,0.004955,0.489968,0.028717,0.481315,-0.014223,-0.006579,0.0,0.006541,0.014167
2,ret_log__H5,6e-06,0.00291,0.477467,0.050603,0.47193,-0.008173,-0.003808,0.0,0.003793,0.008175
3,ret_log__H60,7.1e-05,0.009695,0.500412,0.014835,0.484753,-0.028342,-0.013283,5.5e-05,0.013254,0.028352
