In [1]:
# ! pip install dask

In [2]:
from pathlib import Path
from ace_rl.core.cube_builder import CubeBuilderConfig, build_raw_cube

root = Path("../../../work/processed/binance").expanduser()
cfg = CubeBuilderConfig(
    per_symbol_root=root / "data" / "zarr_per_symbol",
    # chưa có meta -> None; builder sẽ suy luận
    calendar_path=None,
    feature_order_path=None,
    symbols=["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT"],
    use_dask=False,
    # Quan trọng: cắt block theo giao thời gian
    calendar_mode="range_intersection",  # mặc định đã là mode này
    output_path=root / "data" / "zarr_cube_intersection",  # tuỳ chọn
)

cube = build_raw_cube(cfg)
print(cube)


  raw_times = np.asarray(group["time"][:], dtype="datetime64[ns]")


<xarray.Dataset> Size: 501MB
Dimensions:  (symbol: 4, time: 2662909, feature: 9)
Coordinates:
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
Data variables:
    values   (time, symbol, feature) float32 383MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
    mask     (time, symbol, feature) bool 96MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
Attributes:
    calendar_source:       inferred:range_intersection
    feature_order_source:  inferred


In [3]:
print(type(cube))

<class 'xarray.core.dataset.Dataset'>


In [5]:
print(cube)

<xarray.Dataset> Size: 501MB
Dimensions:  (symbol: 4, time: 2662909, feature: 9)
Coordinates:
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
Data variables:
    values   (time, symbol, feature) float32 383MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
    mask     (time, symbol, feature) bool 96MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
Attributes:
    calendar_source:       inferred:range_intersection
    feature_order_source:  inferred


In [6]:
cube
print(cube.dims, cube.sizes)
print(cube.coords)
print(cube.data_vars)


Coordinates:
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
Data variables:
    values   (time, symbol, feature) float32 383MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
    mask     (time, symbol, feature) bool 96MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>


In [9]:
print(cube["values"].isel(symbol=0, feature=0).to_series().head())
print("\n\n")
print(cube["mask"].isel(symbol=0, feature=0).to_series().value_counts())


time
2020-08-11 06:00:00    11854.559570
2020-08-11 06:01:00    11850.000000
2020-08-11 06:02:00    11843.839844
2020-08-11 06:03:00    11850.129883
2020-08-11 06:04:00    11849.019531
Name: values, dtype: float32



mask
True     2662908
False          1
Name: count, dtype: int64


In [12]:
import numpy as np
import pandas as pd
import xarray as xr

# Ép mask ra hẳn bộ nhớ (≈96 MB, ổn)
mask_dense = cube["mask"].compute()

# Giữ lại các ô mask=False và stack thành một chiều "hit"
bad_mask = (
    mask_dense
    .where(~mask_dense, drop=True)
    .stack(hit=("time", "symbol", "feature"))
)

if bad_mask.size == 0:
    print("No mask=False entries found.")
else:
    idx = bad_mask.indexes["hit"]            # pandas.MultiIndex
    idx_df = idx.to_frame(index=False)       # vài chục dòng, rất nhỏ

    # Lấy giá trị thật ở cùng vị trí
    bad_vals = cube["values"].sel(
        time=xr.DataArray(idx_df["time"].to_numpy(), dims="hit"),
        symbol=xr.DataArray(idx_df["symbol"].to_numpy(), dims="hit"),
        feature=xr.DataArray(idx_df["feature"].to_numpy(), dims="hit"),
    ).compute()

    for (t, s, f), val in zip(idx, bad_vals.values):
        print(f"{t} / {s} / {f} -> value={val}")


2021-04-25 04:01:00 / BTCUSDT / Open -> value=nan
2021-04-25 04:01:00 / BTCUSDT / High -> value=nan
2021-04-25 04:01:00 / BTCUSDT / Low -> value=nan
2021-04-25 04:01:00 / BTCUSDT / Close -> value=nan
2021-04-25 04:01:00 / BTCUSDT / Volume -> value=nan
2021-04-25 04:01:00 / BTCUSDT / QuoteVolume -> value=nan
2021-04-25 04:01:00 / BTCUSDT / Trades -> value=nan
2021-04-25 04:01:00 / BTCUSDT / TakerBuyBase -> value=nan
2021-04-25 04:01:00 / BTCUSDT / TakerBuyQuote -> value=nan
2021-04-25 04:01:00 / ETHUSDT / Open -> value=nan
2021-04-25 04:01:00 / ETHUSDT / High -> value=nan
2021-04-25 04:01:00 / ETHUSDT / Low -> value=nan
2021-04-25 04:01:00 / ETHUSDT / Close -> value=nan
2021-04-25 04:01:00 / ETHUSDT / Volume -> value=nan
2021-04-25 04:01:00 / ETHUSDT / QuoteVolume -> value=nan
2021-04-25 04:01:00 / ETHUSDT / Trades -> value=nan
2021-04-25 04:01:00 / ETHUSDT / TakerBuyBase -> value=nan
2021-04-25 04:01:00 / ETHUSDT / TakerBuyQuote -> value=nan
2021-04-25 04:01:00 / BNBUSDT / Open -> valu

In [13]:
from pathlib import Path
import pandas as pd

BASE = Path("/home/kylh/phd/tw_fin_rl/work/data/binance/spot/1m")
symbols = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT"]
target_ts = pd.Timestamp("2021-04-25 04:01:00", tz="UTC")

for sym in symbols:
    path = BASE / f"{sym}_1m_2019-01-01_2025-09-04.csv"
    reader = pd.read_csv(
        path,
        usecols=[
            "open_time", "open", "high", "low", "close",
            "volume", "quote_asset_volume",
            "number_of_trades", "taker_buy_base_asset_volume",
            "taker_buy_quote_asset_volume",
        ],
        parse_dates=["open_time"],
        chunksize=100_000,
    )

    matching = [
        chunk.loc[chunk["open_time"] == target_ts]
        for chunk in reader
        if (chunk["open_time"] == target_ts).any()
    ]
    if matching:
        row = pd.concat(matching)
        print(f"\n{sym} @ {target_ts}:")
        print(row.to_string(index=False))
    else:
        print(f"\n{sym} @ {target_ts}: không tìm thấy candle.")



BTCUSDT @ 2021-04-25 04:01:00+00:00: không tìm thấy candle.

ETHUSDT @ 2021-04-25 04:01:00+00:00: không tìm thấy candle.

BNBUSDT @ 2021-04-25 04:01:00+00:00: không tìm thấy candle.

SOLUSDT @ 2021-04-25 04:01:00+00:00:
                open_time    open    high     low   close  volume  quote_asset_volume  number_of_trades  taker_buy_base_asset_volume  taker_buy_quote_asset_volume
2021-04-25 04:01:00+00:00 41.2849 41.2849 41.2849 41.2849     0.0                 0.0                 0                          0.0                           0.0


In [14]:
import xarray as xr

# Bóc riêng Volume và mask tương ứng
vol = cube["values"].sel(feature="Volume")
vol_mask = cube["mask"].sel(feature="Volume")

# Volume hợp lệ nhưng bằng 0
zero_hits = ((vol == 0) & vol_mask).compute()

# Giữ lại những ô True và gom về một chiều "hit"
zero_hits = zero_hits.where(zero_hits, drop=True).stack(hit=("time", "symbol"))

if zero_hits.size == 0:
    print("Không có candle nào volume=0.")
else:
    idx = zero_hits.indexes["hit"].to_frame(index=False)
    print(idx)  # mỗi dòng: time & symbol có volume=0


                     time   symbol
0     2020-08-11 06:01:00  BTCUSDT
1     2020-08-11 06:01:00  ETHUSDT
2     2020-08-11 06:01:00  BNBUSDT
3     2020-08-11 06:01:00  SOLUSDT
4     2020-08-11 06:02:00  BTCUSDT
...                   ...      ...
89327 2023-10-15 03:44:00  SOLUSDT
89328 2023-10-15 04:24:00  BTCUSDT
89329 2023-10-15 04:24:00  ETHUSDT
89330 2023-10-15 04:24:00  BNBUSDT
89331 2023-10-15 04:24:00  SOLUSDT

[89332 rows x 2 columns]


In [15]:
# Volume hợp lệ rồi mới xét bằng 0
vol = cube["values"].sel(feature="Volume")
vol_mask = cube["mask"].sel(feature="Volume")

zero_flags = ((vol == 0) & vol_mask)

# Đếm theo symbol (boolean → int rồi cộng)
zero_counts = zero_flags.sum(dim="time").compute()

# In gọn dưới dạng series
print(zero_counts.to_series().astype(int))


symbol
BTCUSDT      153
ETHUSDT      153
BNBUSDT      153
SOLUSDT    22333
dtype: int64


In [17]:
vol = cube["values"].sel(feature="Volume")
vol_mask = cube["mask"].sel(feature="Volume")
zero = ((vol == 0) & vol_mask)
counts = zero.sum(dim="time").compute()
totals = vol_mask.sum(dim="time").compute()
pct = (counts / totals * 100).round(3)
print(pd.DataFrame({"count": counts.to_series().astype(int),
                    "pct": pct.to_series()}))

zero_sol = zero.sel(symbol="SOLUSDT").compute()
zero_sol = zero_sol.where(zero_sol, drop=True)
counts_by_month = zero_sol.groupby("time.month").count()
counts_by_year  = zero_sol.groupby("time.year").count()
print(counts_by_year)
print(counts_by_month)



         count    pct
symbol               
BTCUSDT    153  0.006
ETHUSDT    153  0.006
BNBUSDT    153  0.006
SOLUSDT  22333  0.839
<xarray.DataArray (year: 4)> Size: 32B
array([21584,   482,    31,   236])
Coordinates:
    symbol   <U7 28B 'SOLUSDT'
    feature  <U13 52B 'Volume'
  * year     (year) int64 32B 2020 2021 2022 2023
<xarray.DataArray (month: 11)> Size: 88B
array([ 347,  116,   92,    2,    9,   22, 2942, 2196, 5726, 5959, 4922])
Coordinates:
    symbol   <U7 28B 'SOLUSDT'
    feature  <U13 52B 'Volume'
  * month    (month) int64 88B 1 2 3 4 5 6 8 9 10 11 12


In [18]:
import pandas as pd
import numpy as np
import xarray as xr

# Volume slice cho SOL
vol_sol = cube["values"].sel(symbol="SOLUSDT", feature="Volume")
mask_sol = cube["mask"].sel(symbol="SOLUSDT", feature="Volume")

# Tìm timestamp volume=0 (mà mask vẫn True)
zero_sol = ((vol_sol == 0) & mask_sol).compute()
zero_times = zero_sol.where(zero_sol, drop=True).indexes["time"]

# Lấy vài sample (đổi n_samples theo ý bạn)
n_samples = 5
sample_times = pd.Index(np.random.choice(zero_times, size=n_samples, replace=False)).sort_values()

# In full candle tại các timestamp này
sample = cube.sel(symbol="SOLUSDT", time=sample_times).compute()
print(sample["values"].to_dataframe().unstack("feature"))


                      symbol                                                  \
feature                 Open     High      Low    Close   Volume QuoteVolume   
time                                                                           
2020-08-24 05:59:00  SOLUSDT  SOLUSDT  SOLUSDT  SOLUSDT  SOLUSDT     SOLUSDT   
2020-10-21 08:54:00  SOLUSDT  SOLUSDT  SOLUSDT  SOLUSDT  SOLUSDT     SOLUSDT   
2020-10-31 22:36:00  SOLUSDT  SOLUSDT  SOLUSDT  SOLUSDT  SOLUSDT     SOLUSDT   
2020-11-10 22:20:00  SOLUSDT  SOLUSDT  SOLUSDT  SOLUSDT  SOLUSDT     SOLUSDT   
2020-11-13 19:28:00  SOLUSDT  SOLUSDT  SOLUSDT  SOLUSDT  SOLUSDT     SOLUSDT   

                                                         values          \
feature               Trades TakerBuyBase TakerBuyQuote    Open    High   
time                                                                      
2020-08-24 05:59:00  SOLUSDT      SOLUSDT       SOLUSDT  3.3125  3.3125   
2020-10-21 08:54:00  SOLUSDT      SOLUSDT       SOLUSDT  1.

In [19]:
import numpy as np
import pandas as pd
import xarray as xr
from prettytable import PrettyTable  # pip install prettytable nếu chưa có

# Volume slice cho SOL
vol_sol = cube["values"].sel(symbol="SOLUSDT", feature="Volume")
mask_sol = cube["mask"].sel(symbol="SOLUSDT", feature="Volume")

# Tìm timestamp volume=0 (mask vẫn True)
zero_flags = ((vol_sol == 0) & mask_sol).compute()
zero_times = zero_flags.where(zero_flags, drop=True).indexes["time"]

# Lấy vài mẫu (tuỳ chỉnh n_samples)
n_samples = 5
sample_times = pd.Index(
    np.random.choice(zero_times, size=min(n_samples, len(zero_times)), replace=False)
).sort_values()

# Lấy full candle tại các timestamp này
sample = (
    cube.sel(symbol="SOLUSDT", time=sample_times)
    .compute()["values"]
    .to_dataframe()
    .reset_index()
    .pivot(index="time", columns="feature", values="values")
)

# In thành PrettyTable
table = PrettyTable()
table.field_names = ["time"] + sample.columns.tolist()
for ts, row in sample.iterrows():
    table.add_row([ts] + [f"{val:.6g}" if pd.notna(val) else "NaN" for val in row])

print(table)


+---------------------+--------+--------+--------+--------+-------------+--------------+---------------+--------+--------+
|         time        | Close  |  High  |  Low   |  Open  | QuoteVolume | TakerBuyBase | TakerBuyQuote | Trades | Volume |
+---------------------+--------+--------+--------+--------+-------------+--------------+---------------+--------+--------+
| 2020-08-21 19:45:00 | 3.0538 | 3.0538 | 3.0538 | 3.0538 |      0      |      0       |       0       |   0    |   0    |
| 2020-09-26 19:49:00 | 3.0513 | 3.0513 | 3.0513 | 3.0513 |      0      |      0       |       0       |   0    |   0    |
| 2020-10-24 01:59:00 | 1.944  | 1.944  | 1.944  | 1.944  |      0      |      0       |       0       |   0    |   0    |
| 2020-12-07 08:54:00 | 1.847  | 1.847  | 1.847  | 1.847  |      0      |      0       |       0       |   0    |   0    |
| 2020-12-13 01:50:00 | 1.4789 | 1.4789 | 1.4789 | 1.4789 |      0      |      0       |       0       |   0    |   0    |
+---------------

In [20]:
import pandas as pd
import xarray as xr
from prettytable import PrettyTable

# Lấy slice Volume + mask cho SOL
vol_sol = cube["values"].sel(symbol="SOLUSDT", feature="Volume")
mask_sol = cube["mask"].sel(symbol="SOLUSDT", feature="Volume")

# Tìm mọi timestamp volume=0 (mask vẫn True)
zero_flags = ((vol_sol == 0) & mask_sol).compute()
zero_times = zero_flags.where(zero_flags, drop=True).indexes["time"]

# Lấy toàn bộ candle tại các timestamp này, sắp xếp theo thời gian
sample = (
    cube.sel(symbol="SOLUSDT", time=zero_times)
    .compute()["values"]
    .to_dataframe()
    .reset_index()
    .pivot(index="time", columns="feature", values="values")
    .sort_index()
)

batch_size = 20  # số dòng mỗi bảng
columns = ["time"] + sample.columns.tolist()

for start in range(0, len(sample), batch_size):
    chunk = sample.iloc[start:start + batch_size]

    table = PrettyTable()
    table.field_names = columns

    for ts, row in chunk.iterrows():
        table.add_row([ts] + [f"{val:.6g}" if pd.notna(val) else "NaN" for val in row])

    print(f"Rows {start + 1} – {start + len(chunk)} / {len(sample)}")
    print(table)
    print()  # ngăn cách giữa các “trang”


Rows 1 – 20 / 22333
+---------------------+--------+--------+--------+--------+-------------+--------------+---------------+--------+--------+
|         time        | Close  |  High  |  Low   |  Open  | QuoteVolume | TakerBuyBase | TakerBuyQuote | Trades | Volume |
+---------------------+--------+--------+--------+--------+-------------+--------------+---------------+--------+--------+
| 2020-08-11 06:01:00 |  2.85  |  2.85  |  2.85  |  2.85  |      0      |      0       |       0       |   0    |   0    |
| 2020-08-11 06:02:00 |  2.85  |  2.85  |  2.85  |  2.85  |      0      |      0       |       0       |   0    |   0    |
| 2020-08-11 06:09:00 | 3.1846 | 3.1846 | 3.1846 | 3.1846 |      0      |      0       |       0       |   0    |   0    |
| 2020-08-11 06:10:00 | 3.1846 | 3.1846 | 3.1846 | 3.1846 |      0      |      0       |       0       |   0    |   0    |
| 2020-08-11 06:11:00 | 3.1846 | 3.1846 | 3.1846 | 3.1846 |      0      |      0       |       0       |   0    |   0  

In [21]:
import pandas as pd
import xarray as xr

# Các feature giá cần kiểm tra
price_feats = ["Open", "High", "Low", "Close"]

# Trích lát giá và mask chỉ cho nhóm feature này
prices = cube["values"].sel(feature=price_feats)
mask = cube["mask"].sel(feature=price_feats)

# Cờ “giá = 0 và mask=True”
zero_prices = ((prices == 0) & mask)

# Tổng hợp theo symbol -> dataframe (số candle giá=0 cho từng feature)
counts = zero_prices.sum(dim="time").compute().to_pandas().astype(int)

# Nếu muốn tổng hợp chung (có ít nhất 1 giá = 0 trên candle), dùng “any” trên feature
any_zero = zero_prices.any(dim="feature")
counts_any = any_zero.sum(dim="time").compute().to_series().astype(int)

print("Số candle giá=0 theo từng feature:")
print(counts)

print("\nSố candle có bất kỳ giá = 0:")
print(counts_any)


Số candle giá=0 theo từng feature:
feature  Open  High  Low  Close
symbol                         
BTCUSDT     0     0    0      0
ETHUSDT     0     0    0      0
BNBUSDT     0     0    0      0
SOLUSDT     0     0    0      0

Số candle có bất kỳ giá = 0:
symbol
BTCUSDT    0
ETHUSDT    0
BNBUSDT    0
SOLUSDT    0
dtype: int64


In [23]:
import pandas as pd
import xarray as xr

price_feats = ["Open", "High", "Low", "Close"]

# Lấy lát giá và mask tương ứng
mask_price = cube["mask"].sel(feature=price_feats)
values_price = cube["values"].sel(feature=price_feats)

# Candle bị “giá không xác định” nếu mask=False (giá không hữu hạn)
invalid_flags = ~mask_price

# Nếu bạn muốn chắc chắn lọc cả giá = NaN/inf dù mask lỡ sai:
# invalid_flags = (~mask_price) | ~xr.ufuncs.isfinite(values_price)

# 1) Đếm theo symbol và feature
invalid_counts = invalid_flags.sum(dim="time").compute().to_pandas().astype(int)
print("Số candle giá không xác định theo từng feature:")
print(invalid_counts)

# 2) Đếm số candle có ít nhất một giá không xác định
invalid_any = invalid_flags.any(dim="feature")
invalid_any_counts = invalid_any.sum(dim="time").compute().to_series().astype(int)
print("\nSố candle có ít nhất một giá không xác định:")
print(invalid_any_counts)

# 3) Nếu muốn xem cụ thể timestamp/symbol bị lỗi:
invalid_any = invalid_flags.any(dim="feature").compute()  # ép sang DataArray thường

bad_points = (
    invalid_any.where(invalid_any, drop=True)
    .stack(hit=("time", "symbol"))
)

idx = bad_points.indexes["hit"].to_frame(index=False)
print(idx.head())



Số candle giá không xác định theo từng feature:
feature  Open  High  Low  Close
symbol                         
BTCUSDT     1     1    1      1
ETHUSDT     1     1    1      1
BNBUSDT     1     1    1      1
SOLUSDT     0     0    0      0

Số candle có ít nhất một giá không xác định:
symbol
BTCUSDT    1
ETHUSDT    1
BNBUSDT    1
SOLUSDT    0
Name: mask, dtype: int64
                 time   symbol
0 2021-04-25 04:01:00  BTCUSDT
1 2021-04-25 04:01:00  ETHUSDT
2 2021-04-25 04:01:00  BNBUSDT
