In [1]:
from pathlib import Path
import xarray as xr

# Root of the processed Binance data, given relative to the working directory.
root = Path("../../../work/processed/binance").expanduser()

# Location of the cached raw cube (a Zarr store) under the data root.
cube_store = root / "data" / "zarr_cube_intersection"

# consolidated=True: flip to False if you ever disable consolidation.
cached_cube = xr.open_zarr(cube_store.as_posix(), consolidated=True)
print(cached_cube)

# Short alias used by the cells that follow.
cube = cached_cube


<xarray.Dataset> Size: 501MB
Dimensions:  (feature: 9, time: 2662909, symbol: 4)
Coordinates:
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    mask     (time, symbol, feature) bool 96MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
    values   (time, symbol, feature) float32 383MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
Attributes:
    calendar_source:       inferred:range_intersection
    feature_order_source:  inferred


In [2]:
# Dump the dataset, its symbol labels, its feature labels and the `values`
# variable, each followed by one blank line.
print(cube, end="\n\n")
print(cube.coords.get("symbol").values, end="\n\n")
print(cube.coords.get("feature").values, end="\n\n")
print(cube.data_vars.get("values"), end="\n\n")

<xarray.Dataset> Size: 501MB
Dimensions:  (feature: 9, time: 2662909, symbol: 4)
Coordinates:
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    mask     (time, symbol, feature) bool 96MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
    values   (time, symbol, feature) float32 383MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
Attributes:
    calendar_source:       inferred:range_intersection
    feature_order_source:  inferred

['BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT']

['Open' 'High' 'Low' 'Close' 'Volume' 'QuoteVolume' 'Trades'
 'TakerBuyBase' 'TakerBuyQuote']

<xarray.DataArray 'values' (time: 2662909, symbol: 4, feature: 9)> Size: 383MB
dask.array<open_dataset-values, shape=(2662909, 4, 9), dtype=float32, chunksize=(16384, 4, 9), chunktype=numpy.ndarray>
Coordinates:
  * feature

In [3]:
import xarray as xr

# Assumes the raw cube is already bound to `cube` by an earlier cell.
values = cube["values"]

# 1) NaN mask; nothing is read until .compute() is called.
nan_mask = values.isnull()

# 2) Total number of NaN cells in the whole cube (all features).
total_nan = int(nan_mask.sum().compute())
print("Tổng số ô NaN:", total_nan)

# 3) Per-(time, symbol) flag "at least one feature is NaN".
#    Materialised ONCE and reused below: the original evaluated this same
#    expression twice, paying for an extra full pass over the cube.
nan_time_symbol = nan_mask.any("feature").compute()

#    NaN per symbol: number of time steps at which any feature is NaN.
nan_by_symbol = nan_time_symbol.sum("time")
print("NaN theo symbol:\n", nan_by_symbol)

# 4) List of (time, symbol) pairs that contain a NaN.
nan_df = nan_time_symbol.to_dataframe(name="has_nan").query("has_nan")
print("Các time/symbol có NaN:\n", nan_df)

# 5) Drill down to the individual features that are NaN at those timestamps.
nan_times = nan_df.index.get_level_values("time").unique()
with_feature = nan_mask.sel(time=nan_times).compute()
feature_df = with_feature.to_dataframe(name="is_nan").query("is_nan")
print("Chi tiết feature bị NaN:\n", feature_df)


Tổng số ô NaN: 27
NaN theo symbol:
 <xarray.DataArray 'values' (symbol: 4)> Size: 32B
array([1, 1, 1, 0])
Coordinates:
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
Các time/symbol có NaN:
                              has_nan
time                symbol          
2021-04-25 04:01:00 BTCUSDT     True
                    ETHUSDT     True
                    BNBUSDT     True
Chi tiết feature bị NaN:
                                            is_nan
time                symbol  feature              
2021-04-25 04:01:00 BTCUSDT Open             True
                            High             True
                            Low              True
                            Close            True
                            Volume           True
                            QuoteVolume      True
                            Trades           True
                            TakerBuyBase     True
                            TakerBuyQuote    True
                    ET

In [4]:
from ace_rl.core.candle_feat_ops import (
    make_atr, make_rsi,
    make_ema_close, make_ema_ratio, 
    make_return,
    make_taker_buy_ratio,
    make_volatility,
    make_volume_ratio,
    make_zscore
)

In [5]:
from pathlib import Path
import xarray as xr
from ace_rl.core.cube_feature import load_cube

# NOTE(review): absolute, machine-specific path that also rebinds `root`;
# the raw-cube cell uses a relative root instead. Confirm both point at the
# same directory and consider sharing a single definition.
root = Path("/home/kylh/phd/tw_fin_rl/work/processed/binance")

# Names of the stored feature cubes to load.
feature_names = [
    "liq_volratio_w60_0895",
    "mom_rsi_14_a289",
    "of_taker_buy_ratio_967f",
    "ret_lag5_bb88",
    "risk_atr_14_1981",
    "trend_emar_20_264a",
    "vol_win60_4b89",
    "z_Close_w240_1f15",
]

# Load every feature cube, keyed by its stored name (insertion order kept).
feature_cubes = {}
for name in feature_names:
    feature_cubes[name] = load_cube(root, "features", name)

# Debug dump: one summary line plus the full dataset repr per feature.
for name, ds in feature_cubes.items():
    print(f"Feature: {name}, shape: {ds['values'].shape}, coords: {ds.coords}")
    print(ds)
    print()

# Optional: stitch the per-feature `values` arrays into one dataset along the
# `feature` dimension.
value_arrays = [ds["values"] for ds in feature_cubes.values()]
combined = xr.concat(value_arrays, dim="feature").to_dataset(name="values")


Feature: liq_volratio_w60_0895, shape: (2662909, 4, 1), coords: Coordinates:
  * feature  (feature) <U12 48B 'vol_ratio_60'
  * symbol   (symbol) <U7 112B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
<xarray.Dataset> Size: 64MB
Dimensions:  (feature: 1, symbol: 4, time: 2662909)
Coordinates:
  * feature  (feature) <U12 48B 'vol_ratio_60'
  * symbol   (symbol) <U7 112B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    values   (time, symbol, feature) float32 43MB dask.array<chunksize=(16384, 4, 1), meta=np.ndarray>
Attributes:
    params:   {'method': 'vol/ma', 'window': 60}

Feature: mom_rsi_14_a289, shape: (2662909, 4, 1), coords: Coordinates:
  * feature  (feature) <U6 24B 'rsi_14'
  * symbol   (symbol) <U7 112B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-

In [6]:
print(combined)

# Label arrays of the stitched dataset: symbols first, then features.
for coord_name in ("symbol", "feature"):
    print(combined.coords.get(coord_name).values)

<xarray.Dataset> Size: 362MB
Dimensions:  (feature: 8, symbol: 4, time: 2662909)
Coordinates:
  * feature  (feature) <U15 480B 'vol_ratio_60' 'rsi_14' ... 'z_Close_240'
  * symbol   (symbol) <U7 112B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    values   (time, symbol, feature) float32 341MB dask.array<chunksize=(16384, 4, 1), meta=np.ndarray>
['BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT']
['vol_ratio_60' 'rsi_14' 'taker_buy_ratio' 'return_lag5' 'atr_14'
 'ema_ratio_20' 'volatility_60' 'z_Close_240']


In [8]:
import pandas as pd

# Assumes `feature_cubes` (feature name -> dataset) was built by an earlier cell.
nan_reports = {}

for name, ds in feature_cubes.items():
    vals = ds["values"]

    # Materialise the boolean NaN mask once per cube. The original ran three
    # separate .compute() passes over the same data (sum, any+sum, any);
    # everything below is now derived from this single in-memory mask.
    nan_mask = vals.isnull().compute()

    # Total number of NaN cells in this cube.
    total_nan = int(nan_mask.sum())

    # Per-(time, symbol) flag "at least one feature is NaN".
    nan_time_symbol = nan_mask.any("feature")

    # Number of flagged time steps per symbol, as a pandas Series.
    nan_by_symbol = nan_time_symbol.sum("time").to_series()

    # Flat table of the (time, symbol) pairs that contain a NaN.
    nan_df = nan_time_symbol.to_dataframe(name="has_nan").query("has_nan").reset_index()

    nan_reports[name] = dict(
        total_nan=total_nan,
        nan_by_symbol=nan_by_symbol,
        nan_df=nan_df,
    )

    print(f"=== {name} ===")
    print("Tổng ô NaN:", total_nan)
    print("NaN theo symbol:")
    print(nan_by_symbol)
    print("Các time/symbol có NaN (đầu tiên vài dòng):")
    print(nan_df.head())
    print()

# (optional) If the raw cube is loaded in a variable named raw_cube:
# raw_nan = raw_cube["values"].isnull().any("feature").compute()
# for name, report in nan_reports.items():
#     merged = (
#         report["nan_df"]
#         .merge(
#             raw_nan.to_dataframe(name="raw_has_nan").reset_index(),
#             on=["time", "symbol"],
#             how="left",
#         )
#     )
#     mismatches = merged.query("raw_has_nan != True")
#     print(name, "mismatch vs raw:", len(mismatches))


=== liq_volratio_w60_0895 ===
Tổng ô NaN: 416
NaN theo symbol:
symbol
BTCUSDT    119
ETHUSDT    119
BNBUSDT    119
SOLUSDT     59
Name: values, dtype: int64
Các time/symbol có NaN (đầu tiên vài dòng):
                 time   symbol  has_nan
0 2020-08-11 06:00:00  BTCUSDT     True
1 2020-08-11 06:00:00  ETHUSDT     True
2 2020-08-11 06:00:00  BNBUSDT     True
3 2020-08-11 06:00:00  SOLUSDT     True
4 2020-08-11 06:01:00  BTCUSDT     True

=== mom_rsi_14_a289 ===
Tổng ô NaN: 4
NaN theo symbol:
symbol
BTCUSDT    1
ETHUSDT    1
BNBUSDT    1
SOLUSDT    1
Name: values, dtype: int64
Các time/symbol có NaN (đầu tiên vài dòng):
                 time   symbol  has_nan
0 2020-08-11 06:00:00  BTCUSDT     True
1 2020-08-11 06:00:00  ETHUSDT     True
2 2020-08-11 06:00:00  BNBUSDT     True
3 2020-08-11 06:00:00  SOLUSDT     True

=== of_taker_buy_ratio_967f ===
Tổng ô NaN: 3
NaN theo symbol:
symbol
BTCUSDT    1
ETHUSDT    1
BNBUSDT    1
SOLUSDT    0
Name: values, dtype: int64
Các time/symbol có NaN 

In [9]:
import pandas as pd

# Timestamp of the candle to zoom in on; the raw-cube NaN scan output reports
# NaN features at this minute.
TARGET = pd.Timestamp("2021-04-25 04:01:00")
LOOK_AROUND = pd.Timedelta("45min")      # half-width of the zoom window; adjust to widen / narrow it

def split_nan_windows(ds, target=None, look_around=None):
    """Split the NaN positions of a feature cube into warm-up and mid-run sets.

    Parameters
    ----------
    ds
        Dataset exposing a ``values`` variable with ``time``, ``symbol`` and
        ``feature`` dimensions. The warm-up cutoff is taken from the first
        feature only, so the split is meaningful for single-feature cubes.
    target : optional
        Centre of the zoom window. Defaults to the module-level ``TARGET``.
    look_around : optional
        Half-width of the zoom window. Defaults to the module-level
        ``LOOK_AROUND``.

    Returns
    -------
    tuple of three DataFrames ``(warmup, midrun, around_target)``
        ``warmup``: NaN rows strictly before the symbol's first valid time
        (columns ``time``, ``symbol``, ``warmup_cutoff``, ``is_warmup``).
        ``midrun``: all remaining NaN rows (columns ``time``, ``symbol``).
        ``around_target``: the ``midrun`` rows whose time lies within
        ``target +/- look_around`` (bounds inclusive).
    """
    # Resolve the defaults at call time so edits to the globals are honoured.
    if target is None:
        target = TARGET
    if look_around is None:
        look_around = LOOK_AROUND

    # Materialise the NaN mask once; the original scanned the cube twice
    # (once for notnull().idxmax(), once for isnull().any()).
    is_nan = ds["values"].isnull().compute()

    # (time, symbol) validity of the first feature.
    is_valid = ~is_nan.isel(feature=0)

    # First timestamp at which each symbol becomes valid (end of warm-up).
    # NOTE(review): for a symbol with no valid value at all, idxmax over an
    # all-False column presumably yields the first timestamp, which would put
    # every NaN of that symbol in `midrun` - confirm this is intended.
    first_valid = is_valid.idxmax("time")
    warmup_cutoff = dict(zip(first_valid["symbol"].values, first_valid.values))

    # Every (time, symbol) position where any feature is NaN.
    nan_df = (
        is_nan.any("feature")
        .to_dataframe(name="has_nan")
        .query("has_nan")
        .reset_index()
        .drop(columns="has_nan")
    )

    # Label each NaN as warm-up (before the symbol's cutoff) or mid-run.
    nan_df["warmup_cutoff"] = nan_df["symbol"].map(warmup_cutoff)
    nan_df["is_warmup"] = nan_df["time"] < nan_df["warmup_cutoff"]

    warmup = nan_df.query("is_warmup").copy()
    midrun = nan_df.query("~is_warmup").drop(columns=["is_warmup", "warmup_cutoff"])

    # Zoom in on the mid-run NaNs around the target candle.
    in_window = midrun["time"].between(target - look_around, target + look_around)
    around_target = midrun[in_window]

    return warmup, midrun, around_target


# Build and print the warm-up / mid-run NaN breakdown for every feature cube.
reports = {}
for name, ds in feature_cubes.items():
    warmup_df, midrun_df, around_df = split_nan_windows(ds)
    reports[name] = dict(warmup=warmup_df, midrun=midrun_df, around=around_df)

    print(f"=== {name} ===")
    print(f"NaN warm-up (per symbol):\n{warmup_df.groupby('symbol').size()}")
    print(f"NaN giữa chừng: {len(midrun_df)} rows")
    if around_df.empty:
        # Format the message from TARGET instead of repeating the date as a
        # literal, so it cannot go stale when TARGET is changed.
        print(f"  (không thấy NaN quanh {TARGET:%Y-%m-%d %H:%M})")
    else:
        print("  NaN quanh candle missing:")
        print(around_df.sort_values('time'))
    print()


=== liq_volratio_w60_0895 ===
NaN warm-up (per symbol):
symbol
BNBUSDT    59
BTCUSDT    59
ETHUSDT    59
SOLUSDT    59
dtype: int64
NaN giữa chừng: 180 rows
  NaN quanh candle missing:
                   time   symbol
236 2021-04-25 04:01:00  BTCUSDT
237 2021-04-25 04:01:00  ETHUSDT
238 2021-04-25 04:01:00  BNBUSDT

=== mom_rsi_14_a289 ===
NaN warm-up (per symbol):
symbol
BNBUSDT    1
BTCUSDT    1
ETHUSDT    1
SOLUSDT    1
dtype: int64
NaN giữa chừng: 0 rows
  (không thấy NaN quanh 2021-04-25 04:01)

=== of_taker_buy_ratio_967f ===
NaN warm-up (per symbol):
Series([], dtype: int64)
NaN giữa chừng: 3 rows
  NaN quanh candle missing:
                 time   symbol
0 2021-04-25 04:01:00  BTCUSDT
1 2021-04-25 04:01:00  ETHUSDT
2 2021-04-25 04:01:00  BNBUSDT

=== ret_lag5_bb88 ===
NaN warm-up (per symbol):
symbol
BNBUSDT    5
BTCUSDT    5
ETHUSDT    5
SOLUSDT    5
dtype: int64
NaN giữa chừng: 6 rows
  NaN quanh candle missing:
                  time   symbol
20 2021-04-25 04:01:00  BTCUSDT
