In [2]:
import filegetter.filegetter as fgt
import pandas as pd
df = fgt.get("binance", "swap", "btc-usdt", "trade","20250318", machine="AWS-JP1")
dg = fgt.get("binance", "swap", "btc-usdt", "depth5","20250318", machine="AWS-JP1")
trades_df=df
quotes_df=dg


loading... AWS-JP1_BINANCE_SWAP_BTC-USDT_TRADE_2025_03_18.hdf 
loading... AWS-JP1_BINANCE_SWAP_BTC-USDT_DEPTH5_2025_03_18.hdf 


# MA

In [3]:
import pandas as pd, numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ------------ 0. 载入&预处理 ------------------------------------------------
# trade_df 应至少包含列: ts (ms), v (signed volume)
trades_df["datetime"] = pd.to_datetime(trades_df["ts"], unit="ms")
trades = (trades_df.set_index("datetime")
                     .sort_index())

# 只关心绝对成交量
trades["vol"] = trades["v"].abs()

# ------------ 1. 秒级聚合 ---------------------------------------------------
vol_1s = trades["vol"].resample("1s").sum().fillna(0)   # (Series, 1s freq)

# ------------ 2. 构造特征 & 标签 -------------------------------------------
# 过去 120 秒滚动平均 (含当前秒)
pred_vol = vol_1s.rolling(window=120, min_periods=1).mean()

# 未来 2 秒成交量 (窗口 = 当前秒 + 下一秒)
true_vol = vol_1s.rolling(window=2, min_periods=2).sum().shift(-1)

# 对齐并去掉尾部 NaN
df_eval = pd.DataFrame({"pred": pred_vol, "true": true_vol}).dropna()

# ------------ 3. 评估指标 ---------------------------------------------------
mae  = mean_absolute_error(df_eval["true"], df_eval["pred"])
rmse = mean_squared_error(df_eval["true"], df_eval["pred"], squared=False)
mape = (np.abs(df_eval["true"]-df_eval["pred"]) / df_eval["true"].replace(0, np.nan)).mean()
r2   = r2_score(df_eval["true"], df_eval["pred"])

print(f"样本点数      : {len(df_eval):,}")
print(f"MAE           : {mae:,.4f}")
print(f"RMSE          : {rmse:,.4f}")
print(f"MAPE (mean)   : {mape:,.2%}")
print(f"R²            : {r2:,.4f}")


样本点数      : 86,394
MAE           : 3.4147
RMSE          : 13.1007
MAPE (mean)   : 403.59%
R²            : 0.0751


# EWMA

In [None]:
import pandas as pd, numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ------------ 0. 载入&预处理 ------------------------------------------------
# trade_df 应至少包含列: ts (ms), v (signed volume)
trades_df["datetime"] = pd.to_datetime(trades_df["ts"], unit="ms")
trades = (trades_df.set_index("datetime")
                     .sort_index())

# 只关心绝对成交量
trades["vol"] = trades["v"].abs()

# ------------ 1. 秒级聚合 ---------------------------------------------------
vol_1s = trades["vol"].resample("1s").sum().fillna(0)   # (Series, 1s freq)

# ------------ 2. 构造特征 & 标签 -------------------------------------------
pred_vol = vol_1s.ewm(halflife=30, adjust=False).mean()

# 未来 2 秒成交量 (窗口 = 当前秒 + 下一秒)
true_vol = vol_1s.rolling(window=2, min_periods=2).sum().shift(-1)

# 对齐并去掉尾部 NaN
df_eval = pd.DataFrame({"pred": pred_vol, "true": true_vol}).dropna()

# ------------ 3. 评估指标 ---------------------------------------------------
mae  = mean_absolute_error(df_eval["true"], df_eval["pred"])
rmse = mean_squared_error(df_eval["true"], df_eval["pred"], squared=False)
mape = (np.abs(df_eval["true"]-df_eval["pred"]) / df_eval["true"].replace(0, np.nan)).mean()
r2   = r2_score(df_eval["true"], df_eval["pred"])

print(f"样本点数      : {len(df_eval):,}")
print(f"MAE           : {mae:,.4f}")
print(f"RMSE          : {rmse:,.4f}")
print(f"MAPE (mean)   : {mape:,.2%}")
print(f"R²            : {r2:,.4f}")


样本点数      : 86,394
MAE           : 3.3337
RMSE          : 12.9123
MAPE (mean)   : 379.44%
R²            : 0.1015


# 对数线性回归

In [5]:
import numpy as np, pandas as pd
from scipy.stats import linregress

# ---- 0. 计算特征、标签（对数空间） ---------------------------------
logv = np.log1p(vol_1s)                      # ln(1 + v)
X = logv.rolling(120, min_periods=120).mean()          # >=120 样本时才出值
y = logv.rolling(2,   min_periods=2  ).sum().shift(-1) # 未来2秒

# ---- 1. 一次性过滤所有非法值 ----------------------------------------
df_xy = pd.DataFrame({"X": X, "y": y}).dropna()        # 去掉 NaN
df_xy = df_xy[np.isfinite(df_xy["X"]) & np.isfinite(df_xy["y"])]  # 去 ±inf

# 至少要 2 个样本才能拟合
if len(df_xy) < 2:
    raise ValueError("Not enough valid points for regression")

# ---- 2. 线性回归（y = a * X + b） -----------------------------------
slope, intercept, r_value, *_ = linregress(df_xy["X"], df_xy["y"])

# ---- 3. 还原预测值 ---------------------------------------------------
pred_log = slope * X + intercept            # Series，与原时间轴对齐
pred_vol = np.expm1(pred_log)               # 逆 log1p

# ---- 4. 与真实值对齐评估 -------------------------------------------
true_vol = np.expm1(y)                      # 把标签也还原
df_eval = pd.DataFrame({"pred": pred_vol, "true": true_vol}).dropna()

mae  = (df_eval["pred"] - df_eval["true"]).abs().mean()
rmse = np.sqrt(((df_eval["pred"] - df_eval["true"])**2).mean())
print(f"n = {len(df_eval):,},   MAE = {mae:.3f},   RMSE = {rmse:.3f}")


n = 86,275,   MAE = 23.543,   RMSE = 591.512


## 简单Hawkes

In [2]:
import numpy as np, pandas as pd
from tick.hawkes import HawkesExpKern
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------------- 0. 准备逐笔事件序列 -----------------
trades_df["datetime"] = pd.to_datetime(trades_df["ts"], unit="ms")
trades = trades_df.set_index("datetime").sort_index()

# 仅统计绝对成交量；若想分方向，可拆成正/负两条 Hawkes
trades["vol"] = trades["v"].abs()

# 事件时间（秒）——重复一条事件 vol 次会过大，这里用“每笔=1 手”近似
event_times = trades.index.view("int64") / 1e9         # ndarray[float64]

# ---------------- 1. 拟合单指数 Hawkes -----------------
beta = 2.0                 # 衰减率 (s⁻¹)，≈0.5 s 的半衰期，可调
hawkes = HawkesExpKern(decays=[[beta]], max_iter=80)
hawkes.fit([event_times])

mu     = hawkes.baseline[0]          # 基线 μ
alpha  = hawkes.adjacency[0, 0]      # 自激 α
print(f"Fit result: μ={mu:.4f}, α={alpha:.4f}, β={beta}")

# ---------------- 2. 生成秒级特征网格 -----------------
sec_series = trades["vol"].resample("1s").sum().fillna(0)
t0   = sec_series.index[0].value / 1e9
grid = (sec_series.index.view("int64") / 1e9) - t0      # 秒坐标

# ---------------- 3. 计算未来 2 s 期望量 ----------------
# analytic integral:  Λ(t,Δ)=μΔ + α/β * (1-e^{-βΔ}) * Σ_j e^{-β(t - t_j)}
Δ = 2.0
kernel_int = alpha / beta * (1 - np.exp(-beta * Δ))

# rolling pointer式累积和，O(N+M)
intensity_sum = np.zeros_like(grid)
k = 0
for i, t in enumerate(grid):
    while k < len(event_times) and event_times[k] < t - 10:   # 10 s 斩尾
        k += 1
    # 对剩余事件做向量化 dot
    dt = t - event_times[k:]
    valid = dt[dt > 0]
    if len(valid):
        intensity_sum[i] = np.exp(-beta * valid).sum()

pred_cnt = mu * Δ + kernel_int * intensity_sum        # 预测未来 2 s 成交数

# ---------------- 4. 构造真实标签 --------------------
true_cnt = (sec_series.rolling(2).sum().shift(-1))    # 当前秒+下一秒
df_eval = pd.DataFrame({"pred": pred_cnt,
                        "true": true_cnt.values}).dropna()

# ---------------- 5. 评估 ----------------------------
mae  = mean_absolute_error(df_eval["true"], df_eval["pred"])
rmse = mean_squared_error(df_eval["true"], df_eval["pred"], squared=False)
r2   = r2_score(df_eval["true"], df_eval["pred"])
print(f"n={len(df_eval):,}, MAE={mae:.3f}, RMSE={rmse:.3f}, R²={r2:.4f}")


Fit result: μ=0.0000, α=0.9943, β=2.0
n=86,394, MAE=4.280, RMSE=14.279, R²=-0.0987


# 事件按真实手数重复+双指数核+FFT 卷积

In [3]:
# ===============================================================
# Hawkes benchmark — predict 2-second volume
# (1-sec grid, dual-decay λ, FFT convolution)
# ===============================================================
import numpy as np, pandas as pd
from scipy.signal import fftconvolve
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tick.hawkes import HawkesExpKern

# ---------------- 用户可调超参 ---------------------------------
DT_FE     = 1.0          # λ(t) 采样步长（秒）
PRED_WIN  = 2.0          # 预测窗口 Δ = 2 s
BETAS     = [2.0, 0.2]   # 双衰减 (β=2 s⁻¹ ≈0.5 s, β=0.2 s⁻¹ ≈5 s)
LOOKBACK  = 20           # λ(t) 只回看最近 20 s 事件
VOL_CLIP  = 50           # 每笔 volume > VOL_CLIP 时仅按 VOL_CLIP 次重复，防爆内存
# ---------------------------------------------------------------

# ---------- 0. 载入 & 逐笔事件 → 秒坐标 -------------------------
trades_df["datetime"] = pd.to_datetime(trades_df["ts"], unit="ms")
trades = trades_df.set_index("datetime").sort_index()
trades["vol_int"] = trades["v"].abs().astype(int).clip(upper=VOL_CLIP)

evt_times = np.repeat(
    trades.index.view("int64") / 1e9,
    trades["vol_int"]                     # 重复 volume_i 次
)
t0 = evt_times[0]
evt_times -= t0                           # 归零秒坐标

# ---------- 1. 拟合双指数核 Hawkes ------------------------------
hawkes = HawkesExpKern(decays=[BETAS], max_iter=80)
hawkes.fit([evt_times])

mu        = float(hawkes.baseline[0])
alpha_tot = float(hawkes.adjacency[0, 0])   # tick 把多核 α 求和放这里
β1, β2    = BETAS
print(f"μ={mu:.4f}, α_total={alpha_tot:.4f}, β1={β1}, β2={β2}")

# ---------- 2. 生成 1-s 网格 & 计算 λ(t) ------------------------
grid = np.arange(0, evt_times[-1] + DT_FE, DT_FE)

def hawkes_lambda(events, grid, μ, α, β1, β2, look):
    lam = np.empty_like(grid)
    k = 0
    for i, t in enumerate(grid):
        while k < len(events) and events[k] < t - look:
            k += 1
        acc = 0.0
        for j in range(k, len(events)):
            dt = t - events[j]
            if dt <= 0.0:
                break
            acc += α * (np.exp(-β1*dt) + np.exp(-β2*dt))  # α 被平均分到两核
        lam[i] = μ + 0.5 * acc             # 0.5× 是因为α被拆到两核
    return lam

lam = hawkes_lambda(evt_times, grid, mu, alpha_tot, β1, β2, LOOKBACK)

# ---------- 3. λ(t) ⇒ 未来 2 s 期望量 (FFT 卷积) ---------------
kernel_len = int(PRED_WIN / DT_FE) + 1          # 2 s / 1 s = 2 → 长度 3
kernel = np.ones(kernel_len)
pred_cnt = fftconvolve(lam, kernel, mode="same") * DT_FE   # 积分近似求和

# ---------- 4. 构造真实 2 s 成交量 -----------------------------
true_sec = trades["vol_int"].resample(f"{int(DT_FE)}s").sum().fillna(0)
# 重新索引到 grid（确保长度一致）
true_sec = true_sec.reindex(pd.to_datetime((grid+t0)*1e9), fill_value=0)
true_cnt = true_sec.rolling(kernel_len).sum().shift(-(kernel_len-1))

# ---------- 5. 评估 --------------------------------------------
df_eval = pd.DataFrame({"pred": pred_cnt, "true": true_cnt}).dropna()

mae  = mean_absolute_error(df_eval["true"], df_eval["pred"])
rmse = mean_squared_error(df_eval["true"], df_eval["pred"], squared=False)
r2   = r2_score(df_eval["true"], df_eval["pred"])

print(f"Samples : {len(df_eval):,}")
print(f"MAE     : {mae:.3f}")
print(f"RMSE    : {rmse:.3f}")
print(f"R²      : {r2:.4f}")


μ=0.0212, α_total=0.9499, β1=2.0, β2=0.2
Samples : 86,390
MAE     : 3.363
RMSE    : 10.744
R²      : 0.0000
