In [None]:
import os
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Multiply, Permute
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import joblib

from tensorflow.keras.layers import Lambda

# ==========================================================
# 0. REPRODUCIBILITY
# ==========================================================
SEED = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only influences subprocesses - confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Seed every random number generator this script relies on.
for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    seed_fn(SEED)

# ==========================================================
# 1. CONFIG
# ==========================================================
# Number of consecutive rows of `df` fed to the LSTM as one input sequence.
WINDOW = 48                 # number of 4h windows for the sequence
# Number of funding observations after the window that form the target.
FUTURE_PERIODS = 1          # predict next 4h funding
# Mini-batch size and maximum epoch count passed to `rnn_model.fit`.
BATCH_SIZE = 64
EPOCHS = 60

# Spike handling hyperparams
# Multiplier on the largest |funding_delta_1| inside a window when building
# the per-sample weights.
SPIKE_FACTOR_LGBM = 5000.0  
# Upper bound applied to each per-sample weight.
SPIKE_WEIGHT_CLIP = 50.0
# Multiplier on |y_true| inside `spike_loss` (extra penalty on large targets).
SPIKE_LOSS_GAMMA = 40.0
# Multiplier on the |funding_delta_1| input channel used to boost the
# attention logits.
ATTN_SPIKE_ALPHA = 40.0

# ==========================================================
# 2. LOAD FUNDING + 5-MIN SPOT/FUTURES DATA
# ==========================================================
# Funding-rate history. The timestamp column is parsed explicitly with
# pd.to_datetime instead of read_csv's `date_parser=` argument, which is
# deprecated in pandas 2.x and emits a FutureWarning.
funding = pd.read_csv(
    r"D:\Homework\QF634\project\TAOUSDT_funding_rate_20200101_20251130.csv",
)
funding["fundingDateTime"] = pd.to_datetime(
    funding["fundingDateTime"], format="%Y-%m-%d %H:%M:%S.%f"
)
funding = funding.sort_values("fundingDateTime")
funding = funding.rename(columns={"fundingDateTime": "timestamp", "fundingRate": "funding_rate"})
funding = funding.drop(columns=["symbol", "formattedFundingDateTime"])
funding.set_index("timestamp", inplace=True)

# 5-minute spot bars (JSON lines), ordered and indexed by timestamp.
spot = pd.read_json(
    r"D:\Homework\QF634\project\Data\raw_historical_price\TAOUSDT_5m_binance_spot_historical_data.json",
    lines=True,
).sort_values("timestamp").set_index("timestamp")

# 5-minute futures bars (JSON lines), ordered and indexed by timestamp.
future = pd.read_json(
    r"D:\Homework\QF634\project\Data\raw_historical_price\TAOUSDT_5m_binance_futures_historical_data.json",
    lines=True,
).sort_values("timestamp").set_index("timestamp")

# Make sure all three indexes are datetime-typed so they can be compared
# and merged against each other.
spot.index = pd.to_datetime(spot.index)
future.index = pd.to_datetime(future.index)
funding.index = pd.to_datetime(funding.index)

# ==========================================================
# 3. REMOVE LAST 15 MINUTES BEFORE EACH FUNDING TIMESTAMP
# ==========================================================
fund_times = funding.index.sort_values()

# Bars closer than this to the upcoming funding timestamp are discarded.
_PRE_FUNDING_GAP = pd.Timedelta(minutes=15)


def _bars_between(bars, start, stop):
    """Return the rows of *bars* whose index lies in the half-open range [start, stop)."""
    in_range = (bars.index >= start) & (bars.index < stop)
    return bars.loc[in_range]


# Both dicts are keyed by the funding timestamp that closes the window.
clean_spot_windows = {}
clean_future_windows = {}

for t_prev, t_curr in zip(fund_times[:-1], fund_times[1:]):
    window_end = t_curr - _PRE_FUNDING_GAP  # remove final 15m
    clean_spot_windows[t_curr] = _bars_between(spot, t_prev, window_end)
    clean_future_windows[t_curr] = _bars_between(future, t_prev, window_end)

# ==========================================================
# 4. MANUAL RESAMPLING (OHLCV)
# ==========================================================
# (output suffix, source column) pairs that are aggregated by summation.
_SUMMED_COLUMNS = (
    ("volume", "volume"),
    ("quote_volume", "quote_asset_volume"),
    ("trades", "number_of_trades"),
    ("taker_buy_base", "taker_buy_base_asset_volume"),
    ("taker_buy_quote", "taker_buy_quote_asset_volume"),
)


def _aggregate_bars(bars, prefix, stamp):
    """Collapse one window of 5-minute bars into a single OHLCV row.

    An empty window yields NaN for the four price fields and 0.0 for every
    summed field. The returned Series is named *stamp* so it becomes the row
    label once the rows are stacked into a DataFrame.
    """
    if len(bars):
        row = {
            f"{prefix}_open": bars["open"].iloc[0],
            f"{prefix}_high": bars["high"].max(),
            f"{prefix}_low": bars["low"].min(),
            f"{prefix}_close": bars["close"].iloc[-1],
        }
        for suffix, column in _SUMMED_COLUMNS:
            row[f"{prefix}_{suffix}"] = bars[column].sum()
    else:
        row = {f"{prefix}_{field}": np.nan for field in ("open", "high", "low", "close")}
        for suffix, _ in _SUMMED_COLUMNS:
            row[f"{prefix}_{suffix}"] = 0.0
    return pd.Series(row, name=stamp)


# The first funding timestamp is skipped: it has no preceding window.
spot_rows = [_aggregate_bars(clean_spot_windows[t], "spot", t) for t in fund_times[1:]]
future_rows = [_aggregate_bars(clean_future_windows[t], "fut", t) for t in fund_times[1:]]

spot_8h = pd.DataFrame(spot_rows)
future_8h = pd.DataFrame(future_rows)

# ==========================================================
# 5. MERGE ALL 4H DATA
# ==========================================================
# Left-join the aggregated spot rows, then the futures rows, onto the funding
# index so every funding observation is kept.
df = funding.merge(spot_8h, how="left", left_index=True, right_index=True)
df = df.merge(future_8h, how="left", left_index=True, right_index=True)

# ==========================================================
# 6. FEATURE ENGINEERING
# ==========================================================
def pct_change(series, periods=1):
    """Return the fractional change of *series* over *periods* rows."""
    return series.pct_change(periods=periods)

def max_drawdown(arr):
    """Return the most negative relative drop of *arr* from its running maximum.

    The result is 0 when the values never fall below an earlier peak.
    """
    running_peak = np.maximum.accumulate(arr)
    relative_drop = (arr - running_peak) / running_peak
    return relative_drop.min()

# Price-derived features, built identically for the spot and futures legs:
# 1-step return, 24-step return volatility, 24-step momentum and the
# 120-step rolling maximum drawdown of the close.
for _leg in ("spot", "fut"):
    _close = df[f"{_leg}_close"]
    df[f"{_leg}_ret_1"] = pct_change(_close)
    df[f"{_leg}_vol_24"] = df[f"{_leg}_ret_1"].rolling(24).std()
    df[f"{_leg}_mom_24"] = _close / _close.shift(24) - 1
    # raw=True hands each window to max_drawdown as a plain ndarray, which
    # avoids building a Series per window.
    df[f"{_leg}_maxdd_120"] = _close.rolling(120).apply(max_drawdown, raw=True)

# Futures-minus-spot spread and its 1-step relative change.
df["spread_close"] = df["fut_close"] - df["spot_close"]
df["spread_ret_1"] = df["spread_close"].pct_change()

# Funding-rate dynamics.
df["funding_delta_1"] = df["funding_rate"].diff()
df["funding_vol_24"] = df["funding_rate"].pct_change().rolling(24).std()
df["funding_mom_24"] = df["funding_rate"] / df["funding_rate"].shift(24) - 1

# Slope of a straight-line fit through the last 6 funding rates.
df["funding_slope_6"] = df["funding_rate"].rolling(6).apply(
    lambda x: np.polyfit(np.arange(6), x, 1)[0], raw=True
)
df["funding_mean_6"]  = df["funding_rate"].rolling(6).mean()

# Forward-fill gaps, then back-fill the leading NaNs left by the rolling
# windows. DataFrame.ffill()/bfill() replace the deprecated
# fillna(method=...) spelling.
df = df.ffill().bfill()
# Ratios against a zero denominator produce +/-inf; neutralise them.
df = df.replace([np.inf, -np.inf], 0)

# ==========================================================
# 7. BUILD TARGET & SEQUENCES
# ==========================================================
feature_cols = [
    "spot_close", "spot_ret_1", "spot_vol_24", "spot_mom_24", "spot_maxdd_120",
    "fut_close", "fut_ret_1", "fut_vol_24", "fut_mom_24", "fut_maxdd_120",
    "spread_close", "spread_ret_1",
    "funding_delta_1", "funding_vol_24", "funding_mom_24",
    "funding_slope_6", "funding_mean_6",
]

# Sample i uses rows [i, i+WINDOW) as its sequence and rows
# [i+WINDOW, i+WINDOW+FUTURE_PERIODS) as its target. The "+ 1" keeps the last
# sample whose target still fits inside df.
n_samples = len(df) - WINDOW - FUTURE_PERIODS + 1
if n_samples <= 0:
    raise ValueError(
        f"Need at least {WINDOW + FUTURE_PERIODS} rows to build one sample, got {len(df)}"
    )

funding_values = df["funding_rate"].to_numpy()
seq_values = df[feature_cols].to_numpy()

# The tabular snapshot is taken at the target row i+WINDOW. Every funding_*
# feature on that row is computed from funding_rate[i+WINDOW], i.e. from the
# label itself, so those columns are lagged by one row to avoid leaking the
# target. Price columns stay unlagged: they are built only from bars that
# precede the funding timestamp.
funding_feature_cols = [c for c in feature_cols if c.startswith("funding_")]
tab_frame = df[feature_cols].copy()
tab_frame[funding_feature_cols] = tab_frame[funding_feature_cols].shift(1)
tab_values = tab_frame.to_numpy()

y_multi = np.array(
    [funding_values[i + WINDOW:i + WINDOW + FUTURE_PERIODS] for i in range(n_samples)],
    dtype=np.float32,
)

seq_X = np.array(
    [seq_values[i:i + WINDOW] for i in range(n_samples)],
    dtype=np.float32,
)
tab_X = np.array(
    [tab_values[i + WINDOW] for i in range(n_samples)],
    dtype=np.float32,
)

# ==========================================================
# 8. SPIKE WEIGHTS
# ==========================================================
# A sample's weight grows with the largest absolute funding change seen
# inside its input window and is capped at SPIKE_WEIGHT_CLIP.
fd = df["funding_delta_1"].values

weights = np.array(
    [
        min(1 + SPIKE_FACTOR_LGBM * np.max(np.abs(fd[i:i + WINDOW])), SPIKE_WEIGHT_CLIP)
        for i in range(n_samples)
    ],
    dtype=np.float32,
)

# ==========================================================
# 9. TRAIN / VAL / TEST SPLITS
# ==========================================================
N = len(seq_X)
train_end, val_end = int(0.8*N), int(0.9*N)


def _chrono_split(arr):
    """Cut *arr* into train / validation / test slices in time order (80/10/10)."""
    return arr[:train_end], arr[train_end:val_end], arr[val_end:]


X_seq_train, X_seq_val, X_seq_test = _chrono_split(seq_X)
X_tab_train, X_tab_val, X_tab_test = _chrono_split(tab_X)
y_train, y_val, y_test = _chrono_split(y_multi)
w_train, w_val, w_test = _chrono_split(weights)

# ==========================================================
# 10. SCALING
# ==========================================================
# Tabular scaler: statistics come from the training split only and are then
# applied unchanged to all three splits.
tab_scaler = StandardScaler().fit(X_tab_train)
X_tab_train_s, X_tab_val_s, X_tab_test_s = (
    tab_scaler.transform(part) for part in (X_tab_train, X_tab_val, X_tab_test)
)

# Sequence scaler: fitted per feature on every training time step, with the
# (sample, step) axes flattened into rows.
_n_seq_features = seq_X.shape[2]
seq_scaler = StandardScaler().fit(X_seq_train.reshape(-1, _n_seq_features))

def scale_seq(x):
    """Apply `seq_scaler` to a 3-D array feature-wise and return it in its original shape."""
    n_rows, n_steps, n_feats = x.shape[0], x.shape[1], x.shape[2]
    flat = x.reshape(-1, n_feats)
    return seq_scaler.transform(flat).reshape(n_rows, n_steps, n_feats)

X_seq_train_s, X_seq_val_s, X_seq_test_s = (
    scale_seq(part) for part in (X_seq_train, X_seq_val, X_seq_test)
)

# Target scaler, fitted on the training targets only.
y_scaler = StandardScaler().fit(y_train)
y_train_s, y_val_s = (y_scaler.transform(part) for part in (y_train, y_val))

# LGB part (sum target): collapse the horizon axis to one value per sample.
y_train_sum, y_val_sum, y_test_sum = (
    part.sum(axis=1) for part in (y_train, y_val, y_test)
)

# ==========================================================
# 11. TRAIN LGBM
# ==========================================================
# Spike-weighted datasets for the gradient-boosted baseline.
lgb_train = lgb.Dataset(X_tab_train_s, label=y_train_sum, weight=w_train)
lgb_val = lgb.Dataset(X_tab_val_s, label=y_val_sum, weight=w_val)

params = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.04,
    num_leaves=64,
    seed=SEED,
)

# Stop after 50 rounds without improvement; log every 100 rounds.
_lgb_callbacks = [lgb.early_stopping(50), lgb.log_evaluation(100)]

lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_train, lgb_val],
    callbacks=_lgb_callbacks,
)

# Baseline predictions for every split (reused for the residual targets).
pred_lgb_train, pred_lgb_val, pred_lgb_test = (
    lgb_model.predict(part) for part in (X_tab_train_s, X_tab_val_s, X_tab_test_s)
)

# ==========================================================
# 12. RESIDUAL TARGET FOR RNN
# ==========================================================
# Bring the LightGBM predictions onto the scaled-target axis, then let the
# RNN learn what the baseline missed.
lgb_train_scaled, lgb_val_scaled = (
    y_scaler.transform(pred.reshape(-1,1)) for pred in (pred_lgb_train, pred_lgb_val)
)

res_train = y_train_s - lgb_train_scaled
res_val = y_val_s - lgb_val_scaled

# ==========================================================
# 13. SPIKE-AWARE ATTENTION LSTM
# ==========================================================
# Position of the funding-delta channel inside each sequence time step.
SPIKE_IDX = feature_cols.index("funding_delta_1")

def spike_loss(y_true, y_pred):
    """Mean squared error in which each element is weighted by 1 + SPIKE_LOSS_GAMMA * |y_true|."""
    emphasis = 1 + SPIKE_LOSS_GAMMA * tf.abs(y_true)
    squared_error = tf.square(y_true - y_pred)
    return tf.reduce_mean(emphasis * squared_error)


# Hidden size of the LSTM; the attention weights are broadcast to this width.
LSTM_UNITS = 128

inp = Input(shape=(WINDOW, seq_X.shape[2]))

# ---- LSTM ----
lstm = LSTM(LSTM_UNITS, return_sequences=True)(inp)

# Raw TensorFlow ops cannot be applied to symbolic Keras tensors directly,
# so every tf.* call below is wrapped in a Lambda layer.

# ---- Attention scores ----
att_logits = Dense(1, activation="tanh")(lstm)                     # (B, T, 1)
att_logits = Lambda(lambda x: tf.squeeze(x, axis=-1))(att_logits) # (B, T)

# ---- Extract spike magnitude ----
spikes = Lambda(lambda x: tf.abs(x[:, :, SPIKE_IDX]))(inp)        # (B, T)

# ---- Boost attention by spike magnitude ----
boost = Lambda(lambda s: 1.0 + ATTN_SPIKE_ALPHA * s)(spikes)      # (B, T)
att_logits = Lambda(lambda ab: ab[0] * ab[1])([att_logits, boost])

# ---- Softmax attention ----
att = tf.keras.layers.Activation("softmax")(att_logits)           # (B, T)

# ---- Expand attention to match LSTM hidden dim ----
# RepeatVector is not among the names imported from tensorflow.keras.layers,
# so it is referenced through the tf.keras namespace to avoid a NameError.
att_exp = tf.keras.layers.RepeatVector(LSTM_UNITS)(att)           # (B, 128, T)
att_exp = Permute([2,1])(att_exp)                                 # (B, T, 128)

# ---- Weighted sum context ----
context = Multiply()([lstm, att_exp])                             # (B, T, 128)
context = Lambda(lambda x: tf.reduce_sum(x, axis=1))(context)     # (B, 128)

# ---- Dense layers ----
dense = Dense(64, activation="relu")(context)
dense = Dropout(0.2)(dense)
out = Dense(1)(dense)

# ---- Build model ----
rnn_model = Model(inp, out)
rnn_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss=spike_loss)

# The first positional parameter of both callbacks is `monitor`, not
# `patience`; passing the patience positionally would make them watch a
# non-existent metric. Every argument is therefore given by keyword.
rnn_callbacks = [
    # Stop after 10 epochs without val_loss improvement and roll back to the
    # best weights seen.
    EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True),
    # Halve the learning rate after 5 epochs without val_loss improvement.
    ReduceLROnPlateau(monitor="val_loss", patience=5, factor=0.5),
]

rnn_model.fit(
    X_seq_train_s, res_train,
    validation_data=(X_seq_val_s, res_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=rnn_callbacks,
    verbose=2
)

# ==========================================================
# 14. HYBRID PREDICTION (CORRECTED)
# ==========================================================
def hybrid_predict(X_seq, pred_lgb):
    """Combine the LightGBM baseline with the RNN residual correction.

    *pred_lgb* is moved onto the scaled-target axis, the RNN output for
    *X_seq* is added to it, and the sum is mapped back to the original
    target units. Returns a flat 1-D array.
    """
    baseline_scaled = y_scaler.transform(pred_lgb.reshape(-1,1))
    residual_scaled = rnn_model.predict(X_seq)
    combined = y_scaler.inverse_transform(baseline_scaled + residual_scaled)
    return combined.ravel()

pred_hybrid = hybrid_predict(X_seq_test_s, pred_lgb_test)

# ==========================================================
# 15. EVALUATION
# ==========================================================
def metrics(true, pred, name):
    """Print RMSE and MAE of *pred* against *true*, labelled with *name*."""
    mse = mean_squared_error(true, pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true, pred)
    print(f"{name} RMSE={rmse:.8f}, MAE={mae:.8f}")

# Score the baseline and the hybrid on the held-out test split.
for _label, _preds in (("LGBM", pred_lgb_test), ("Hybrid RNN+LGBM", pred_hybrid)):
    metrics(y_test_sum, _preds, _label)

# Overlay both prediction series on the actual funding rate.
plt.figure(figsize=(14,4))
plt.plot(y_test_sum, label="Actual", color="black")
for _label, _preds in (("LGBM", pred_lgb_test), ("Hybrid", pred_hybrid)):
    plt.plot(_preds, label=_label, alpha=0.7)
plt.legend()
plt.show()

# ==========================================================
# 16. SAVE MODELS & SCALERS
# ==========================================================
# Persist both models first, then the three fitted scalers.
lgb_model.save_model("lgb_model.txt")
rnn_model.save("rnn_attention_spike_model.h5")

for _scaler, _path in (
    (tab_scaler, "tab_scaler.pkl"),
    (seq_scaler, "seq_scaler.pkl"),
    (y_scaler, "y_scaler.pkl"),
):
    joblib.dump(_scaler, _path)

print("All models saved.")


  funding = pd.read_csv(
  df = df.fillna(method="ffill").fillna(method="bfill")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000630 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4187
[LightGBM] [Info] Number of data points in the train set: 2828, number of used features: 17
[LightGBM] [Info] Start training from score 0.000022
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 1.85675e-05	valid_1's rmse: 9.53269e-06
[200]	training's rmse: 1.36524e-05	valid_1's rmse: 9.47307e-06
Early stopping, best iteration is:
[154]	training's rmse: 1.54581e-05	valid_1's rmse: 9.39264e-06


ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.ops`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```
