In [1]:
from pathlib import Path
import xarray as xr

# Processed Binance data, addressed relative to the notebook's working directory.
root = Path("../../../work/processed/binance").expanduser()
cube_store = root / "data" / "zarr_cube_intersection"

# consolidated=True reads the store's consolidated metadata; switch it to
# False if consolidation is ever disabled when the store is written.
cached_cube = xr.open_zarr(cube_store.as_posix(), consolidated=True)
print(cached_cube)

# `cube` is just a second name for the same Dataset object (no copy is made).
cube = cached_cube

print('\n==========\n', cube)


<xarray.Dataset> Size: 501MB
Dimensions:  (feature: 9, time: 2662909, symbol: 4)
Coordinates:
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    mask     (time, symbol, feature) bool 96MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
    values   (time, symbol, feature) float32 383MB dask.array<chunksize=(16384, 4, 9), meta=np.ndarray>
Attributes:
    calendar_source:       inferred:range_intersection
    feature_order_source:  inferred

 <xarray.Dataset> Size: 501MB
Dimensions:  (feature: 9, time: 2662909, symbol: 4)
Coordinates:
  * feature  (feature) <U13 468B 'Open' 'High' ... 'TakerBuyQuote'
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    mask     (time, symbol, f

In [2]:
import xarray as xr
import numpy as np
import pandas as pd

# === CONFIG ===
LABEL_ZARR = "../../../work/processed/binance/data/zarr_cube_label"
GROUP      = "labels"
# Epsilon for bucketing returns: r > eps is positive, |r| <= eps is zero,
# r < -eps is negative.
RET_EPS    = 5e-5

# === LOAD ===
ds_lbl = xr.open_zarr(LABEL_ZARR, group=GROUP)
values = ds_lbl["values"]            # dims: (time, symbol, label)

# Coordinate entries are converted to plain Python strings for printing/lookup.
labels = [str(lbl) for lbl in ds_lbl["label"].values]
symbols = [str(sym) for sym in ds_lbl["symbol"].values]
time = ds_lbl["time"].values

# One-line summary (only the first 8 symbols are shown), then the full repr.
print(f"Loaded labels cube: values{tuple(values.shape)} | symbols={symbols[:8]} ... | n_labels={len(labels)}")

print('\n==========\n', ds_lbl)


Loaded labels cube: values(2662909, 4, 8) | symbols=['BTCUSDT', 'ETHUSDT', 'BNBUSDT', 'SOLUSDT'] ... | n_labels=8

 <xarray.Dataset> Size: 447MB
Dimensions:  (label: 8, time: 2662909, symbol: 4)
Coordinates:
  * label    (label) object 64B 'ret_log__H1' 'ret_log__H15' ... 'tb_t_hit__H60'
  * symbol   (symbol) object 32B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    mask     (time, symbol, label) bool 85MB dask.array<chunksize=(16384, 4, 8), meta=np.ndarray>
    values   (time, symbol, label) float32 341MB dask.array<chunksize=(16384, 4, 8), meta=np.ndarray>
Attributes:
    horizons:           [1, 5, 15, 60]
    label_builder:      returns + triple_barrier(strict_nan)
    strict_nan_window:  True
    tb_cfg_pct:         {"up_pct": 0.01, "dn_pct": 0.005, "H": 60, "neutral":...
    tb_cfg_vol:         {"enabled": true, "up": 2.0, "dn": 2.0, "H": 60, "win...


In [3]:
from pathlib import Path
import xarray as xr
from ace_rl.core.cube_feature import load_cube

root = Path("../../../work/processed/binance")
feature_names = [
    "liq_volratio_w60_0895",
    "mom_rsi_14_a289",
    "of_taker_buy_ratio_967f",
    "ret_lag5_bb88",
    "risk_atr_14_1981",
    "trend_emar_20_264a",
    "vol_win60_4b89",
    "z_Close_w240_1f15",
]

# Load each feature cube, keyed by its name; insertion order follows
# feature_names.
feature_cubes = {}
for name in feature_names:
    feature_cubes[name] = load_cube(root, "features", name)

# Debug dump: shape and coordinates of every cube, followed by its full repr.
for name, ds in feature_cubes.items():
    print(f"Feature: {name}, shape: {ds['values'].shape}, coords: {ds.coords}")
    print(ds)
    print()

# Optional: stitch the per-feature "values" arrays into a single dataset by
# concatenating them along the "feature" dimension.
value_arrays = [cube_ds["values"] for cube_ds in feature_cubes.values()]
combined = xr.concat(value_arrays, dim="feature").to_dataset(name="values")

print("Combined dataset:")
print('\n==========\n', combined)

Feature: liq_volratio_w60_0895, shape: (2662909, 4, 1), coords: Coordinates:
  * feature  (feature) <U12 48B 'vol_ratio_60'
  * symbol   (symbol) <U7 112B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
<xarray.Dataset> Size: 64MB
Dimensions:  (feature: 1, symbol: 4, time: 2662909)
Coordinates:
  * feature  (feature) <U12 48B 'vol_ratio_60'
  * symbol   (symbol) <U7 112B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-09-04T11...
Data variables:
    values   (time, symbol, feature) float32 43MB dask.array<chunksize=(16384, 4, 1), meta=np.ndarray>
Attributes:
    params:   {'method': 'vol/ma', 'window': 60}

Feature: mom_rsi_14_a289, shape: (2662909, 4, 1), coords: Coordinates:
  * feature  (feature) <U6 24B 'rsi_14'
  * symbol   (symbol) <U7 112B 'BTCUSDT' 'ETHUSDT' 'BNBUSDT' 'SOLUSDT'
  * time     (time) datetime64[ns] 21MB 2020-08-11T06:00:00 ... 2025-

In [4]:
import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path
from ace_rl.core.cube_feature import load_cube

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# --- 1. Load & merge features ---
root = Path("../../../work/processed/binance")
feature_names = [
    "liq_volratio_w60_0895",
    "mom_rsi_14_a289",
    "of_taker_buy_ratio_967f",
    "ret_lag5_bb88",
    "risk_atr_14_1981",
    "trend_emar_20_264a",
    "vol_win60_4b89",
    "z_Close_w240_1f15",
]

feature_arrays = []
for name in feature_names:
    ds = load_cube(root, "features", name)
    arr = ds["values"]
    # Normalise the feature label (spaces -> underscores) so it is a clean
    # column name. This does not by itself guarantee uniqueness: duplicate
    # labels make the unstack below raise.
    arr = arr.assign_coords(
        feature=[str(arr.coords["feature"].values[0]).replace(" ", "_")]
    )
    feature_arrays.append(arr)

feature_xr = xr.concat(feature_arrays, dim="feature")

# Wide frame: one row per (time, symbol), one column per feature.
# Unstacking the Series directly yields the same table as
# to_dataframe().reset_index().pivot(...) without first materialising the
# long-format frame (object-dtype symbol/feature columns for every cell).
feature_df = feature_xr.to_series().unstack("feature")

# --- 2. Pick the main label (e.g. forward return with H=5) ---
LABEL_ZARR = "../../../work/processed/binance/data/zarr_cube_label"
TARGET_LABEL = "ret_log__H5"      # switch to another label if needed

ds_lbl = xr.open_zarr(LABEL_ZARR, group="labels")
target_df = (
    ds_lbl["values"]
    # drop=True removes the scalar `label` coordinate left behind by the
    # selection. Without it, to_dataframe() emits a constant string column
    # "label" that would end up in the model's feature matrix.
    .sel(label=TARGET_LABEL, drop=True)
    .to_dataframe(name="target")      # indexed by (time, symbol)
)

# --- 3. Join features + label, drop NaN ---
dataset_df = (
    feature_df
    .join(target_df, how="inner")
    .dropna()
    .reset_index()
    .sort_values("time")
)

# Take the feature columns from the feature frame itself, so that nothing
# contributed by the label side can ever be mistaken for a feature.
feature_cols = list(feature_df.columns)

print(f"Bộ dữ liệu sau khi ghép: {dataset_df.shape[0]} hàng, {len(feature_cols)} feature.")

if dataset_df.empty:
    # Fail with a clear message instead of an IndexError at the cuts below.
    raise ValueError(
        "No rows left after joining features with the target and dropping NaN."
    )

# --- 4. Split into chronological periods ---
# Each cut is the last timestamp (inclusive) of its period, expressed as a
# fraction of the distinct timestamps; everything after test_cut is "unseen".
TRAIN_END_FRAC = 0.60
VALID_END_FRAC = 0.80
TEST_END_FRAC = 0.90

unique_times = pd.Index(dataset_df["time"].unique()).sort_values()
n_times = len(unique_times)

train_cut = unique_times[int(n_times * TRAIN_END_FRAC)]
valid_cut = unique_times[int(n_times * VALID_END_FRAC)]
test_cut  = unique_times[int(n_times * TEST_END_FRAC)]

def assign_period(ts):
    """Return the name of the period that timestamp ``ts`` falls into.

    ``ts`` is compared against the module-level ``train_cut``, ``valid_cut``
    and ``test_cut`` in that order; the first bound with ``ts <= bound``
    decides the period (bounds are inclusive). A timestamp beyond every
    bound is labelled ``"unseen"``.
    """
    ordered_bounds = (
        ("train", train_cut),
        ("valid", valid_cut),
        ("test", test_cut),
    )
    for period, upper in ordered_bounds:
        if ts <= upper:
            return period
    return "unseen"

# Label every row with its period. Same rule as assign_period (the first cut
# with time <= cut wins, otherwise "unseen"), but evaluated column-wise
# instead of through one Python call per row.
times = dataset_df["time"]
dataset_df["period"] = np.select(
    [
        (times <= train_cut).to_numpy(),
        (times <= valid_cut).to_numpy(),
        (times <= test_cut).to_numpy(),
    ],
    ["train", "valid", "test"],
    default="unseen",
)

# Chronological order of the periods, used for reporting below.
PERIOD_ORDER = ["train", "valid", "test", "unseen"]

split_dfs = {name: df for name, df in dataset_df.groupby("period")}
for name in PERIOD_ORDER:
    print(f"{name:>7}: {len(split_dfs.get(name, []))} samples")

# --- 5. Fit a simple baseline model ---
# Only the train split is used for fitting (scaler statistics included).
X_train = split_dfs["train"][feature_cols].values
y_train = split_dfs["train"]["target"].values

model = make_pipeline(
    StandardScaler(),
    RidgeCV(alphas=np.logspace(-3, 3, 13))
)
model.fit(X_train, y_train)

# --- 6. Evaluate on every split ---
results = []
pred_col = f"pred_{TARGET_LABEL}"

# Targets with |y| <= eps are treated as flat and left out of the
# directional-accuracy score. Loop-invariant, so it is set once here.
eps = 5e-5

for name in PERIOD_ORDER:
    df = split_dfs.get(name)
    if df is None:
        # A period can be absent when no rows fall into it.
        continue

    X = df[feature_cols].values
    y = df["target"].values
    preds = model.predict(X)
    mse = mean_squared_error(y, preds)
    mae = mean_absolute_error(y, preds)

    mask = np.abs(y) > eps
    directional = float((np.sign(preds[mask]) == np.sign(y[mask])).mean()) if mask.any() else np.nan

    results.append(dict(period=name, mse=mse, mae=mae, dir_acc=directional))
    # Write the predictions back onto the matching rows of the full frame.
    dataset_df.loc[df.index, pred_col] = preds

# Rows follow PERIOD_ORDER (train -> valid -> test -> unseen) rather than
# alphabetical order, so the table reads chronologically.
results_df = pd.DataFrame(results).set_index("period")
display(results_df)

# (optional) quick look at a few prediction rows
dataset_df[["time", "symbol", "target", pred_col, "period"]].head()


ModuleNotFoundError: No module named 'sklearn'