In [9]:
import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path
from ace_rl.core.cube_feature import load_cube

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [10]:


# --- 1. Load & merge features ---
root = Path("../../../work/processed/binance")
feature_names = [
    "liq_volratio_w60_0895",
    "mom_rsi_14_a289",
    "of_taker_buy_ratio_967f",
    "ret_lag5_bb88",
    "risk_atr_14_1981",
    "trend_emar_20_264a",
    "vol_win60_4b89",
    "z_Close_w240_1f15",
]


def _load_feature_array(name):
    """Load one feature cube's "values" array with a whitespace-free feature label.

    Only the first entry of the cube's "feature" coordinate is read, so this
    presumably expects one feature per cube -- TODO confirm against load_cube.
    """
    values = load_cube(root, "features", name)["values"]
    # Spaces in the label are replaced with underscores to get clean column names.
    clean_label = str(values.coords["feature"].values[0]).replace(" ", "_")
    return values.assign_coords(feature=[clean_label])


feature_arrays = [_load_feature_array(name) for name in feature_names]
feature_xr = xr.concat(feature_arrays, dim="feature")

# Flatten to a long table, then pivot to one row per (time, symbol)
# with one column per feature.
feature_df = (
    feature_xr.to_dataset(name="value")
    .to_dataframe()
    .reset_index()
    .pivot(index=["time", "symbol"], columns="feature", values="value")
)

# --- 2. Load the primary label (e.g. forward return, H=5) ---
LABEL_ZARR = "../../../work/processed/binance/data/zarr_cube_label"
TARGET_LABEL = "ret_log__H5"

ds_lbl = xr.open_zarr(LABEL_ZARR, group="labels")
label_values = ds_lbl["values"]
# Select the single target label and drop the leftover non-index coordinates.
target_da = label_values.sel(label=TARGET_LABEL).reset_coords(drop=True)
# Alternative: target_da = ds_lbl["values"].sel(label=TARGET_LABEL).squeeze(drop=True)

# Re-index on (time, symbol) so the target can be joined onto the feature table.
target_df = (
    target_da.to_dataframe(name="target")
    .reset_index()
    .set_index(["time", "symbol"])
)


# --- 3. Join features + label, drop rows with any NaN ---
# Inner join keeps only (time, symbol) pairs present in both tables;
# the result is flattened and ordered by time.
dataset_df = (
    feature_df.join(target_df, how="inner")
    .dropna()
    .reset_index()
    .sort_values("time")
)

# Every column that is not a key or the target is treated as a model feature.
non_feature_cols = ("time", "symbol", "target")
feature_cols = [col for col in dataset_df.columns if col not in non_feature_cols]

n_rows = dataset_df.shape[0]
print(f"Bộ dữ liệu sau khi ghép: {n_rows} hàng, {len(feature_cols)} feature.")


Bộ dữ liệu sau khi ghép: 10649930 hàng, 8 feature.


In [11]:

# --- 4. Split into periods chronologically ---
# Cut points are taken at the 60% / 80% / 90% positions of the sorted unique
# timestamps; each cut is the LAST timestamp belonging to its period.
# NOTE(review): no gap is left between periods; if the target looks ahead
# (TARGET_LABEL suggests a 5-step horizon), rows next to a cut may overlap the
# following period -- confirm whether an embargo is needed.
unique_times = pd.Index(dataset_df["time"].unique()).sort_values()
n_times = len(unique_times)

train_cut, valid_cut, test_cut = (
    unique_times[int(n_times * fraction)] for fraction in (0.60, 0.80, 0.90)
)

print(f"Train: {train_cut} | Valid: {valid_cut} | Test: {test_cut}")

Train: 2023-08-26 20:42:00 | Valid: 2024-08-30 16:15:00 | Test: 2025-03-03 14:01:00


In [12]:
def assign_period(ts):
    """Return the period name ("train", "valid", "test" or "unseen") for `ts`.

    `ts` is compared against the module-level cut points `train_cut`,
    `valid_cut` and `test_cut`, in that order; each cut is inclusive.
    Anything that is not `<=` any cut falls through to "unseen".
    """
    return (
        "train" if ts <= train_cut
        else "valid" if ts <= valid_cut
        else "test" if ts <= test_cut
        else "unseen"
    )

# Label every row with its period in a single vectorised pass instead of
# calling assign_period() once per row (one Python call per row is slow on
# millions of rows). The conditions are checked in order with the same inclusive
# `<=` thresholds as assign_period(), so the resulting labels are identical;
# rows matching no condition get "unseen".
row_times = dataset_df["time"]
dataset_df["period"] = np.select(
    [
        (row_times <= train_cut).to_numpy(),
        (row_times <= valid_cut).to_numpy(),
        (row_times <= test_cut).to_numpy(),
    ],
    ["train", "valid", "test"],
    default="unseen",
)

# One sub-frame per period; each keeps dataset_df's original row index.
split_dfs = {name: df for name, df in dataset_df.groupby("period")}
for name in ["train", "valid", "test", "unseen"]:
    # A period with no rows is reported as 0 samples rather than raising KeyError.
    print(f"{name:>7}: {len(split_dfs.get(name, []))} samples")

  train: 6389670 samples
  valid: 2130132 samples
   test: 1065064 samples
 unseen: 1065064 samples


In [13]:
# --- 5. Train a simple model ---
# Feature matrix and target vector taken from the training period only.
train_df = split_dfs["train"]
X_train = train_df[feature_cols].values
y_train = train_df["target"].values

In [14]:
# Sanity check: list any training feature column whose dtype is not numeric.
# An empty result means every feature column is numeric.
train_features = split_dfs["train"][feature_cols]
bad_cols = train_features.select_dtypes(exclude=[np.number])
for report in (bad_cols.dtypes, bad_cols.head()):
    print(report)


Series([], dtype: object)
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [15]:
def _count_str_cells(col):
    """Return how many entries of the column `col` are Python `str` instances.

    Columns with a numeric dtype cannot hold `str` objects, so they are
    reported as 0 without scanning; only other columns are checked per element.
    """
    if pd.api.types.is_numeric_dtype(col):
        return 0
    return int(col.map(lambda v: isinstance(v, str)).sum())


# Per-feature count of string-valued cells in the training split, largest first.
# Replaces DataFrame.applymap (deprecated in recent pandas), which ran a Python
# lambda on every cell.
(split_dfs["train"][feature_cols]
    .apply(_count_str_cells)
    .sort_values(ascending=False)
    .head())


  .applymap(lambda v: isinstance(v, str))


atr_14             0
ema_ratio_20       0
return_lag5        0
rsi_14             0
taker_buy_ratio    0
dtype: int64

In [16]:

# Standardise the features, then fit a ridge regression whose penalty is chosen
# by RidgeCV from 13 log-spaced candidates between 1e-3 and 1e3.
alpha_grid = np.logspace(-3, 3, 13)
model = make_pipeline(StandardScaler(), RidgeCV(alphas=alpha_grid))
model.fit(X_train, y_train)

0,1,2
,steps,"[('standardscaler', ...), ('ridgecv', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alphas,array([1.0000...00000000e+03])
,fit_intercept,True
,scoring,
,cv,
,gcv_mode,
,store_cv_results,False
,alpha_per_target,False


In [17]:





# --- 6. Evaluate on each split ---
results = []
pred_col = f"pred_{TARGET_LABEL}"
# Targets with |y| <= eps are left out of the directional-accuracy calculation.
eps = 5e-5

for name, df in split_dfs.items():
    y = df["target"].values
    preds = model.predict(df[feature_cols].values)

    mse = mean_squared_error(y, preds)
    mae = mean_absolute_error(y, preds)

    # Share of rows (among those with a non-negligible target) where the
    # prediction and the target have the same sign; NaN if no such rows exist.
    mask = np.abs(y) > eps
    if mask.any():
        directional = float((np.sign(preds[mask]) == np.sign(y[mask])).mean())
    else:
        directional = np.nan

    results.append({"period": name, "mse": mse, "mae": mae, "dir_acc": directional})
    # Write the predictions back onto the full table, aligned by row index.
    dataset_df.loc[df.index, pred_col] = preds

results_df = pd.DataFrame(results).set_index("period").sort_index()
display(results_df)

# (optional) quick look at a few prediction rows
dataset_df[["time", "symbol", "target", pred_col, "period"]].head()


Unnamed: 0_level_0,mse,mae,dir_acc
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,5e-06,0.001411,0.492363
train,1.1e-05,0.001909,0.495681
unseen,4e-06,0.001235,0.494418
valid,5e-06,0.001343,0.489351


Unnamed: 0,time,symbol,target,pred_ret_log__H5,period
0,2020-08-11 10:00:00,BNBUSDT,0.0043,0.000214,train
1,2020-08-11 10:00:00,BTCUSDT,0.002517,8.7e-05,train
2,2020-08-11 10:00:00,ETHUSDT,0.003932,5.6e-05,train
3,2020-08-11 10:00:00,SOLUSDT,0.005937,0.000749,train
4,2020-08-11 10:01:00,BNBUSDT,0.002168,0.00019,train
