In [194]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, classification_report, f1_score
)

# Try to use XGBoost; if not available, fallback to RandomForest
use_xgb = True
try:
    from xgboost import XGBClassifier
except Exception:
    use_xgb = False
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [60]:
import numpy as np
print(np.__version__)
pd.options.display.max_rows = None

1.26.4


In [189]:
df = pd.read_csv("../Downloads/st_export.csv").iloc[:,1:]
for i in range(len(df)):
    df.iloc[i,-3]=df.iloc[i,-3].split("GMT")[0].strip()
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [190]:
a=np.random.randint(len(df))
df.iloc[a:a+5,:]

Unnamed: 0,klineacc,spread,spreadper,x,vwap,deviation,ratio,term,sigma,e,h,asset,timestamp,gap,gaplimit
72166,18294646,-2277414.0,-117.692345,-0.739499,1.572469,-0.035084,2.176924,0.301871,0.024692,-0.93373,-0.732328,SUI,2025-11-20 03:27:59,5m,60
72167,101103832,-3996358.0,-63.699516,-0.813051,2.077507,-0.018102,1.636995,0.493582,0.014593,-1.002307,-0.762561,XRP,2025-11-20 03:27:59,5m,60
72168,95749136,-26318410.0,-296.169312,-0.666268,134.315186,-0.022746,3.961693,0.238122,0.017215,-1.246502,-0.8473,SOL,2025-11-20 03:27:59,5m,60
72169,12786416,-129667.2,-7.432905,-0.864047,4.844911,0.012609,1.074329,0.672308,0.025574,0.356106,0.34178,ICP,2025-11-20 03:39:19,1m,300
72170,169758016,3958168.0,51.434204,0.834458,619.839111,0.012714,0.485658,0.724087,0.021389,0.209042,0.20605,ZEC,2025-11-20 03:39:19,1m,300


In [188]:
ndf = pd.read_csv("../Downloads/st_export3.csv").iloc[:,1:]
for i in range(len(ndf)):
    ndf.iloc[i,-3]=ndf.iloc[i,-3].split("GMT")[0].strip()
ndf['timestamp'] = pd.to_datetime(ndf['timestamp'])
a=np.random.randint(len(ndf))
ndf.iloc[a:a+5,:]

Unnamed: 0,klineacc,spread,spreadper,x,vwap,deviation,ratio,term,sigma,e,h,asset,timestamp,gap,gaplimit
3340,48334904,-2195232.0,-24.973526,-0.80597,89567.703125,-0.000588,1.249735,0.844876,0.001175,-0.528642,-0.484342,BTC,2025-12-07 14:06:07,3m,100
3341,17542022,-12747070.0,-89.029434,-0.562048,133.019684,-0.001652,1.890294,0.531837,0.002218,-0.748742,-0.634398,SOL,2025-12-07 14:06:07,3m,100
3342,31555670,-227765.7,-2.231067,-0.872986,3049.097656,-0.001508,1.022311,0.564068,0.001143,-0.760481,-0.64136,ETH,2025-12-07 14:06:07,3m,100
3343,11513714,-4263340.0,-82.287003,-0.634807,2.053715,-0.002101,1.82287,1.084755,0.00471,-0.881923,-0.707382,XRP,2025-12-07 14:06:07,3m,100
3344,13765689,-1061777.0,-45.582344,-0.774624,339.793945,0.012643,1.455823,0.692376,0.010148,1.255781,0.849897,ZEC,2025-12-07 14:09:28,5m,60


In [191]:
assets_to_train=[i for i in ndf["asset"].unique().tolist() if i in df["asset"].unique().tolist()]
print(assets_to_train)

['SOL', 'BNB', 'BTC', 'XRP', 'ETH', 'ZEC', 'SUI', 'EGLD', 'AAVE', 'BCH', 'LINK', 'DOT', 'TRB', 'LTC', 'ORDI', 'TON', 'PAXG', 'INJ', 'WBTC', 'ZRO', 'ETC', 'ORCA', 'CAKE', 'AR', 'UNI', 'COMP', 'DASH', 'AVAX', 'TAO', 'FIL', 'RENDER', 'TRUMP', 'NEAR', 'GIGGLE', 'ATOM', 'QNT', 'PENDLE', 'ZEN', 'APT', 'ICP', 'NMR']


In [213]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# -------------------------------------------------------------------
# Make SAFE internal copies (df stays df, ndf stays ndf)
# -------------------------------------------------------------------
train_df = df.copy()
future_df = ndf.copy()

# -------------------------------------------------------------------
# Timestamp cleanup
# -------------------------------------------------------------------
for d in [train_df, future_df]:
    d.iloc[:, -3] = d.iloc[:, -3].astype(str).str.split("GMT").str[0].str.strip()
    d['timestamp'] = pd.to_datetime(d['timestamp'])

lags = [1,2,3]
base_features = ['vwap','e','h','x','spread','spreadper','ratio','sigma','deviation']

def add_lags(df_in):
    df_out = df_in.sort_values('timestamp').copy()
    for lag in lags:
        for f in base_features:
            df_out[f"{f}_lag{lag}"] = df_out.groupby("asset")[f].shift(lag)
    return df_out

train_lagged = add_lags(train_df)
future_lagged = add_lags(future_df)

train_lagged = train_lagged.dropna()

predictions = []
metrics = []

for asset in assets_to_train:
    sub = train_lagged[train_lagged["asset"] == asset].sort_values("timestamp")
    if len(sub) < 30:
        print(f"Skipping {asset} (too little data)")
        continue

    feature_cols = [c for c in sub.columns if "lag" in c]

    X = sub[feature_cols]
    y = sub["vwap"]

    split = int(len(X)*0.8)
    X_train, X_val = X.iloc[:split], X.iloc[split:]
    y_train, y_val = y.iloc[:split], y.iloc[split:]

    # =====================================================
    # Main fix: HistGradientBoostingRegressor handles NaN.
    # =====================================================
    model = HistGradientBoostingRegressor(
        max_depth=4,
        learning_rate=0.05,
        max_iter=300
    )

    model.fit(X_train, y_train)

    # Validation metrics
    y_pred = model.predict(X_val)
    metrics.append({
        "asset": asset,
        "R2": r2_score(y_val, y_pred),
        "MAE": mean_absolute_error(y_val, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_val, y_pred))
    })

    # Predictions for new data
    new_sub = future_lagged[future_lagged["asset"] == asset].sort_values("timestamp")
    if new_sub.empty:
        continue

    new_sub["prediction"] = model.predict(new_sub[feature_cols])
    predictions.append(new_sub[["asset","timestamp","prediction"]])

# Final outputs
pred_df = pd.concat(predictions).reset_index(drop=True)
metrics_df = pd.DataFrame(metrics).set_index("asset")

Python(1887) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Skipping EGLD (too little data)


In [214]:
print(metrics_df.sort_values('R2', ascending=False))

               R2         MAE        RMSE
asset                                    
XRP      0.963978    0.005013    0.013099
FIL      0.962063    0.020734    0.039323
BTC      0.956198  199.020438  498.495734
SOL      0.954861    0.520708    1.228089
ETH      0.948974    9.750855   27.745067
BNB      0.946116    3.055720    6.978202
LINK     0.931703    0.111574    0.215440
NEAR     0.926329    0.026739    0.049117
ICP      0.922538    0.085308    0.157760
SUI      0.920552    0.017227    0.030301
GIGGLE   0.920315    2.863577    4.879095
AVAX     0.874721    0.137658    0.210788
ZEN      0.859808    0.365244    0.730618
LTC      0.858565    1.450574    1.996265
CAKE     0.852020    0.010022    0.013735
UNI      0.820363    0.155576    0.235801
PENDLE   0.772881    0.016107    0.020631
INJ      0.771541    0.061737    0.073638
DASH     0.730259    3.847777    6.684932
TRB      0.599198    0.940410    1.241431
ZEC      0.597745   25.921409   34.381240
ETC      0.459696    0.133775    0

In [218]:
c=[[len(df[df["asset"]==i]),i] for i in assets_to_train]

In [223]:
c = sorted(c, key=lambda x: x[0],reverse=True)
print(c)

[[10193, 'BTC'], [10172, 'ETH'], [9534, 'SOL'], [9523, 'ZEC'], [9512, 'BNB'], [9398, 'XRP'], [4828, 'SUI'], [3691, 'ICP'], [3305, 'DASH'], [3203, 'GIGGLE'], [3054, 'LTC'], [2943, 'LINK'], [2886, 'ZEN'], [2448, 'UNI'], [2402, 'NEAR'], [2344, 'TAO'], [2145, 'FIL'], [1917, 'TRUMP'], [1759, 'AVAX'], [1365, 'PAXG'], [1211, 'BCH'], [795, 'AAVE'], [660, 'AR'], [566, 'DOT'], [479, 'ETC'], [408, 'WBTC'], [358, 'CAKE'], [358, 'RENDER'], [347, 'ORDI'], [325, 'INJ'], [310, 'APT'], [287, 'TON'], [287, 'PENDLE'], [264, 'ATOM'], [249, 'TRB'], [204, 'NMR'], [190, 'QNT'], [184, 'ZRO'], [124, 'ORCA'], [81, 'COMP'], [27, 'EGLD']]


In [227]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
import warnings

warnings.filterwarnings('ignore')

# --- Copies to avoid touching originals ---
df_copy = df.copy()
ndf_copy = ndf.copy()

# --- Clean timestamps ---
for dataset in [df_copy, ndf_copy]:
    dataset.iloc[:, -3] = dataset.iloc[:, -3].astype(str).str.split("GMT").str[0].str.strip()
    dataset['timestamp'] = pd.to_datetime(dataset['timestamp'])

# --- Major assets focus ---
major_assets = ['XRP', 'FIL', 'BTC', 'SOL', 'ETH', 'BNB', 'LINK']

# --- Feature engineering ---
lags = [1,2,3]
base_features = ['vwap','e','h','x','spread','spreadper','ratio','sigma','deviation']

def create_lag_features(df_input, features, lags):
    df_temp = df_input.sort_values('timestamp').copy()
    for lag in lags:
        for f in features:
            df_temp[f'{f}_lag{lag}'] = df_temp.groupby('asset')[f].shift(lag)
    return df_temp

def add_volatility(df_input, window=5):
    df_temp = df_input.copy()
    df_temp['log_ret'] = np.log(df_temp['vwap']).diff()
    df_temp['volatility'] = df_temp.groupby('asset')['log_ret'].rolling(window).std().reset_index(0,drop=True)
    return df_temp

def add_market_features(df_input, target_assets):
    df_temp = df_input.copy()
    df_temp['market_vwap_mean'] = df_temp.groupby('timestamp')['vwap'].transform('mean')
    btc_df = df_temp[df_temp['asset']=='BTC'][['timestamp','vwap']].rename(columns={'vwap':'btc_vwap'})
    df_temp = df_temp.merge(btc_df, on='timestamp', how='left')
    return df_temp

# Apply feature engineering
df_feat = create_lag_features(df_copy, base_features, lags)
df_feat = add_volatility(df_feat)
df_feat = add_market_features(df_feat, major_assets)

ndf_feat = create_lag_features(ndf_copy, base_features, lags)
ndf_feat = add_volatility(ndf_feat)
ndf_feat = add_market_features(ndf_feat, major_assets)

# --- Fill NaNs (forward/backward) ---
df_feat = df_feat.groupby('asset').apply(lambda x: x.ffill().bfill()).reset_index(drop=True)
ndf_feat = ndf_feat.groupby('asset').apply(lambda x: x.ffill().bfill()).reset_index(drop=True)

# --- Ensemble function ---
def ensemble_predict(models, X):
    preds = np.column_stack([m.predict(X) for m in models])
    return preds.mean(axis=1)

# --- Per-asset model training & prediction ---
predictions_list = []
metrics_list = []

for asset in major_assets:
    asset_df = df_feat[df_feat['asset']==asset].sort_values('timestamp')
    if len(asset_df) < 20:
        continue

    feature_cols = [c for c in asset_df.columns if 'lag' in c or c in ['volatility','market_vwap_mean','btc_vwap']]
    X = asset_df[feature_cols]
    y = asset_df['vwap']

    split_idx = int(len(X) * 0.8)
    X_train, X_val = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_val = y.iloc[:split_idx], y.iloc[split_idx:]

    # --- Models ---
    models = [
        RandomForestRegressor(n_estimators=150, max_depth=5, random_state=42),
        GradientBoostingRegressor(n_estimators=150, learning_rate=0.05, max_depth=3, random_state=42),
        HuberRegressor(epsilon=1.35, max_iter=1000)
    ]

    for m in models:
        m.fit(X_train, y_train)

    # --- Validation ---
    y_val_pred = ensemble_predict(models, X_val)
    r2 = r2_score(y_val, y_val_pred)
    mae = mean_absolute_error(y_val, y_val_pred)
    rmse = mean_squared_error(y_val, y_val_pred)
    metrics_list.append({'asset': asset, 'R2': r2, 'MAE': mae, 'RMSE': rmse})

    # --- Predict on new data ---
    new_asset_df = ndf_feat[ndf_feat['asset']==asset].sort_values('timestamp')
    if new_asset_df.empty:
        continue
    X_new = new_asset_df[feature_cols]
    y_new_pred = ensemble_predict(models, X_new)
    new_asset_df['prediction'] = y_new_pred
    predictions_list.append(new_asset_df[['asset','timestamp','prediction']])

# --- Combine predictions and metrics ---
pred_df = pd.concat(predictions_list).reset_index(drop=True)
metrics_df = pd.DataFrame(metrics_list).set_index('asset')


=== Sample Predictions ===
  asset           timestamp  prediction
0   XRP 2025-12-05 17:33:39    2.132515
1   XRP 2025-12-05 17:43:45    2.119914
2   XRP 2025-12-05 17:50:11    2.118827
3   XRP 2025-12-05 17:55:23    2.118026
4   XRP 2025-12-05 17:56:33    2.117814

=== Validation Metrics ===
             R2        MAE          RMSE
asset                                   
BTC    0.998180  84.785181  10332.365924
ETH    0.966907  13.246449    498.547643
BNB    0.904818   7.120979     85.995523
SOL    0.871798   1.507114      4.285748
XRP    0.868555   0.018367      0.000626
FIL    0.825408   0.073279      0.007118
LINK   0.733196   0.342271      0.181485


In [248]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings('ignore')

# === keep originals untouched ===
df_copy = df.copy()
ndf_copy = ndf.copy()

# === basic timestamp cleanup ===
for d in (df_copy, ndf_copy):
    d.iloc[:, -3] = d.iloc[:, -3].astype(str).str.split("GMT").str[0].str.strip()
    d['timestamp'] = pd.to_datetime(d['timestamp'])

# === major assets ===
major_assets = ['XRP','FIL','BTC','SOL','ETH','BNB','LINK']

# === FE: lags, vol, market ===
LAGS = [1,2,3]
BASE = ['vwap','e','h','x','spread','spreadper','ratio','sigma','deviation']

def create_lags(df_in):
    df = df_in.sort_values('timestamp').copy()
    for lag in LAGS:
        for c in BASE:
            if c in df.columns:
                df[f"{c}_lag{lag}"] = df.groupby('asset')[c].shift(lag)
    return df

def add_vol(df_in, win=5):
    df = df_in.copy()
    # safe log: avoid log(0) by clipping
    df['vwap_safe'] = df['vwap'].replace(0, np.nan).ffill().bfill()
    df['log_ret'] = np.log(df['vwap_safe']).diff()
    df['volatility'] = df.groupby('asset')['log_ret'].rolling(win, min_periods=1).std().reset_index(0,drop=True)
    df.drop(columns=['vwap_safe','log_ret'], inplace=True)
    return df

# Important: fill per-asset BEFORE market-feature merge
df_feat = create_lags(df_copy)
ndf_feat = create_lags(ndf_copy)

# per-asset ffill/bfill to remove trailing NaNs from shift windows
df_feat = df_feat.groupby('asset').apply(lambda x: x.ffill().bfill()).reset_index(drop=True)
ndf_feat = ndf_feat.groupby('asset').apply(lambda x: x.ffill().bfill()).reset_index(drop=True)

# now add volatility (safe) and then add market features
df_feat = add_vol(df_feat)
ndf_feat = add_vol(ndf_feat)

# market features: we can safely compute after per-asset fills
def add_market(df_in):
    df = df_in.copy()
    df['market_vwap_mean'] = df.groupby('timestamp')['vwap'].transform('mean')
    # btc_vwap aligned by timestamp (BTC exists in dataset)
    btc = df[df['asset']=='BTC'][['timestamp','vwap']].rename(columns={'vwap':'btc_vwap'})
    df = df.merge(btc, on='timestamp', how='left')
    return df

df_feat = add_market(df_feat)
ndf_feat = add_market(ndf_feat)

# drop any remaining NaNs before training (rare after ffill/bfill)
train_pool = df_feat.dropna().reset_index(drop=True)

# === helper: ensemble average ===
def ensemble_predict(models, X, huber_scaled_idx=None):
    # if huber_scaled_idx provided, models list must correspond: [RF, GBT, Huber]
    preds = []
    for i, m in enumerate(models):
        preds.append(m.predict(X))
    return np.column_stack(preds).mean(axis=1)

# === train per-asset ensemble (RF + GBT + Huber-with-scaling) ===
predictions = []
metrics = []

for asset in major_assets:
    sub = train_pool[train_pool['asset']==asset].sort_values('timestamp')
    if len(sub) < 30:
        print(f"[SKIP] {asset} (rows={len(sub)})")
        continue

    # feature set
    feat_cols = [c for c in sub.columns if ('lag' in c) or c in ('volatility','market_vwap_mean','btc_vwap')]
    X = sub[feat_cols].astype(float)
    y = sub['vwap'].astype(float)

    # time split
    split = int(0.8 * len(X))
    X_tr, X_val = X.iloc[:split], X.iloc[split:]
    y_tr, y_val = y.iloc[:split], y.iloc[split:]

    # scale only for Huber
    scaler = StandardScaler().fit(X_tr)
    X_tr_scaled = scaler.transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # models: fit RF and GBT on raw X; fit Huber on scaled X
    rf = RandomForestRegressor(n_estimators=150, max_depth=5, random_state=42)
    gbt = GradientBoostingRegressor(n_estimators=150, learning_rate=0.05, max_depth=3, random_state=42)
    huber = HuberRegressor(epsilon=1.35, max_iter=1000)

    rf.fit(X_tr, y_tr)
    gbt.fit(X_tr, y_tr)
    huber.fit(X_tr_scaled, y_tr)

    # validation preds: combine (for huber use scaled)
    preds_rf = rf.predict(X_val)
    preds_gbt = gbt.predict(X_val)
    preds_huber = huber.predict(X_val_scaled)
    y_val_pred = np.column_stack([preds_rf, preds_gbt, preds_huber]).mean(axis=1)

    # metrics (correct RMSE)
    mae = mean_absolute_error(y_val, y_val_pred)
    mse = mean_squared_error(y_val, y_val_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val, y_val_pred)

    metrics.append({'asset': asset, 'R2': r2, 'MAE': mae, 'RMSE': rmse})

    # prepare NDF rows for this asset; fill then predict
    nsub = ndf_feat[ndf_feat['asset']==asset].sort_values('timestamp')
    if nsub.empty:
        continue
    X_new = nsub[feat_cols].astype(float)
    # ensure same fill strategy (already applied), but guard any leftover NaN
    X_new = X_new.ffill().bfill().values

    preds_rf_new = rf.predict(X_new)
    preds_gbt_new = gbt.predict(X_new)
    preds_huber_new = huber.predict(scaler.transform(X_new))
    y_new_pred = np.column_stack([preds_rf_new, preds_gbt_new, preds_huber_new]).mean(axis=1)

    nsub = nsub.copy()
    nsub['prediction'] = y_new_pred
    predictions.append(nsub[['asset','timestamp','prediction']])

# combine outputs
pred_df = pd.concat(predictions, ignore_index=True) if predictions else pd.DataFrame(columns=['asset','timestamp','prediction'])
metrics_df = pd.DataFrame(metrics).set_index('asset').sort_values('R2', ascending=False)

# quick printing
print("\nMetrics:")
print(metrics_df)
print("\nSample predictions (head):")
print(pred_df.groupby('asset').head(1))


Metrics:
             R2        MAE       RMSE
asset                                
BTC    0.998347  82.452163  96.866560
FIL    0.966262   0.019160   0.036881
ETH    0.964747  13.279683  23.107242
XRP    0.959648   0.006072   0.013939
SOL    0.932874   0.883113   1.511126
BNB    0.907427   6.990046   9.172206
LINK   0.886231   0.216945   0.281686

Sample predictions (head):
     asset           timestamp    prediction
0      XRP 2025-12-05 17:33:39      2.062858
626    FIL 2025-12-05 22:42:12      1.583203
648    BTC 2025-12-05 17:33:39  91991.911929
1487   SOL 2025-12-05 17:33:39    138.686910
2323   ETH 2025-12-05 17:33:39   3133.188540
3157   BNB 2025-12-05 17:33:39    901.481374
3604  LINK 2025-12-05 22:42:12     13.402036


In [241]:
# Suppose ndf is your DataFrame and you are looking for previous ETH entry before index 5146
target_index = 3156
asset = 'ETH'

# Filter rows before the target index
prev_rows = pred_df.loc[:target_index-1]  # all rows up to but not including 5146

# Further filter by the asset
prev_asset_rows = prev_rows[prev_rows['asset'] == asset]

if not prev_asset_rows.empty:
    last_entry_before = prev_asset_rows.iloc[-1]  # last occurrence before target_index
    print(last_entry_before)
else:
    print(f"No previous {asset} entry before index {target_index}")

asset                         ETH
timestamp     2025-12-08 15:30:25
prediction            3118.150709
Name: 3155, dtype: object


# omg on new

In [257]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings

warnings.filterwarnings('ignore')

# --- Copy dataset ---
df_copy = df.copy()

# --- Clean timestamps ---
df_copy.iloc[:, -3] = df_copy.iloc[:, -3].astype(str).str.split("GMT").str[0].str.strip()
df_copy['timestamp'] = pd.to_datetime(df_copy['timestamp'])

# --- Major assets & prediction horizons ---
major_assets = ['XRP', 'FIL', 'BTC', 'SOL', 'ETH', 'BNB', 'LINK','ZEC','PAXG','LTC','BCH','AAVE','UNI','CAKE']
future_minutes = [1,3,5,15,30,60]  # prediction horizons in minutes

# --- Feature engineering ---
lags = [1,2,3]
base_features = ['vwap','e','h','x','spread','spreadper','ratio','sigma','deviation','klineacc','term']

def create_lag_features(df_input, features, lags):
    df_temp = df_input.sort_values('timestamp').copy()
    for lag in lags:
        for f in features:
            df_temp[f'{f}_lag{lag}'] = df_temp.groupby('asset')[f].shift(lag)
    return df_temp

def add_volatility(df_input, window=5):
    df_temp = df_input.copy()
    df_temp['log_ret'] = np.log(df_temp['vwap']).diff()
    df_temp['volatility'] = df_temp.groupby('asset')['log_ret'].rolling(window).std().reset_index(0, drop=True)
    return df_temp

def add_market_features(df_input):
    df_temp = df_input.copy()
    df_temp['market_vwap_mean'] = df_temp.groupby('timestamp')['vwap'].transform('mean')
    return df_temp

# Apply feature engineering
df_feat = create_lag_features(df_copy, base_features, lags)
df_feat = add_volatility(df_feat)
df_feat = add_market_features(df_feat)

# Fill NaNs
df_feat = df_feat.groupby('asset').apply(lambda x: x.ffill().bfill()).reset_index(drop=True)

# --- Multi-horizon target columns ---
for m in future_minutes:
    df_feat[f'vwap_fut_{m}m'] = df_feat.groupby('asset')['vwap'].shift(-m)

df_feat = df_feat.dropna(subset=[f'vwap_fut_{m}m' for m in future_minutes])

# --- Features used for modeling ---
feature_cols = [c for c in df_feat.columns if 'lag' in c or c in ['volatility','market_vwap_mean']]

# --- Train per asset using a single model for all horizons ---
results_list = []
metrics_list = []

for asset in major_assets:
    asset_df = df_feat[df_feat['asset']==asset].sort_values('timestamp')
    if len(asset_df) < 20:
        continue

    X = asset_df[feature_cols]
    y = asset_df[[f'vwap_fut_{m}m' for m in future_minutes]]

    # Split train/val
    split_idx = int(len(X) * 0.8)
    X_train, X_val = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_val = y.iloc[:split_idx], y.iloc[split_idx:]

    # Train multi-output RandomForest
    model = RandomForestRegressor(n_estimators=200, max_depth=7, n_jobs=-1, random_state=42)
    model.fit(X_train, y_train)

    # Predictions
    preds = pd.DataFrame(model.predict(X_val), columns=[f'pred_{m}m' for m in future_minutes])
    
    # Reset indices for safe timestamp alignment
    X_val_reset = X_val.reset_index(drop=True)
    asset_df_reset = asset_df.reset_index(drop=True)
    preds['timestamp'] = asset_df_reset.loc[X_val_reset.index, 'timestamp'].values
    preds['asset'] = asset

    # Metrics per horizon
    for m in future_minutes:
        r2 = r2_score(y_val[f'vwap_fut_{m}m'], preds[f'pred_{m}m'])
        mae = mean_absolute_error(y_val[f'vwap_fut_{m}m'], preds[f'pred_{m}m'])
        rmse = np.sqrt(mean_squared_error(y_val[f'vwap_fut_{m}m'], preds[f'pred_{m}m']))
        metrics_list.append({
            'asset': asset,
            'horizon': f'{m}m',
            'R2': r2,
            'MAE': mae,
            'RMSE': rmse
        })

    results_list.append(preds)

# --- Combine results ---
pred_df = pd.concat(results_list).reset_index(drop=True)
metrics_df = pd.DataFrame(metrics_list)

In [258]:
timestamp=[]
print("\n=== Sample Predictions ===")
for i in major_assets:
    a=pred_df[pred_df["asset"]==i].head(5)
    b=pred_df[pred_df["asset"]==i].tail(5)
    print(pred_df[pred_df["asset"]==i].head(5))
    for jk in range(len(a)):
        timestamp.append([a.iloc[jk]["asset"],a.iloc[jk]["timestamp"]])
    for jk in range(len(b)):
        timestamp.append([b.iloc[jk]["asset"],b.iloc[jk]["timestamp"]])
    print(pred_df[pred_df["asset"]==i].tail(5))

print("\n=== Validation Metrics ===")
print(metrics_df.sort_values('R2', ascending=False))

timestamp = sorted(timestamp, key=lambda x: x[1],reverse=False)
for k in timestamp:
    print(k)


=== Sample Predictions ===
    pred_1m   pred_3m   pred_5m  pred_15m  pred_30m  pred_60m  \
0  2.214187  2.213134  2.212034  2.206871  2.201346  2.193591   
1  2.213858  2.212754  2.211713  2.206452  2.200961  2.193390   
2  2.213858  2.212754  2.211713  2.206452  2.200961  2.193390   
3  2.192553  2.194256  2.194835  2.197277  2.201014  2.207701   
4  2.197667  2.197245  2.198194  2.198771  2.200463  2.203339   

            timestamp asset  
0 2025-11-02 18:03:35   XRP  
1 2025-11-02 18:09:02   XRP  
2 2025-11-02 18:14:28   XRP  
3 2025-11-02 18:19:54   XRP  
4 2025-11-02 18:25:21   XRP  
       pred_1m   pred_3m   pred_5m  pred_15m  pred_30m  pred_60m  \
1863  2.097938  2.097550  2.097074  2.095004  2.094477  2.081361   
1864  2.094091  2.093738  2.093460  2.092587  2.091363  2.081147   
1865  2.100250  2.099859  2.099535  2.097481  2.097402  2.085267   
1866  2.098594  2.098156  2.097850  2.096894  2.095651  2.082988   
1867  2.092675  2.092313  2.092027  2.091181  2.090294  2.081

# new

In [261]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error,
    accuracy_score, f1_score
)
import warnings
warnings.filterwarnings('ignore')


# ===============================
# 1. CLEAN + FEATURE ENGINEERING
# ===============================

df_copy = df.copy()

# Fix timestamp
df_copy.iloc[:, -3] = df_copy.iloc[:, -3].astype(str).str.split("GMT").str[0].str.strip()
df_copy["timestamp"] = pd.to_datetime(df_copy["timestamp"])

major_assets = [
    'XRP','FIL','BTC','SOL','ETH','BNB','LINK','ZEC','PAXG',
    'LTC','BCH','AAVE','UNI','CAKE'
]

future_minutes = [1,3,5,15,30,60]

lags = [1,2,3]
base_features = ['vwap','e','h','x','spread','spreadper','ratio','sigma','deviation','klineacc','term']


def create_lags(df_in):
    df_temp = df_in.sort_values("timestamp").copy()
    for lag in lags:
        for f in base_features:
            df_temp[f"{f}_lag{lag}"] = df_temp.groupby("asset")[f].shift(lag)
    return df_temp

def add_vol(df_in):
    df_temp = df_in.copy()
    df_temp["log_ret"] = np.log(df_temp["vwap"]).diff()
    df_temp["volatility"] = (
        df_temp.groupby("asset")["log_ret"]
        .rolling(5).std().reset_index(0, drop=True)
    )
    return df_temp

def add_market(df_in):
    df_temp = df_in.copy()
    df_temp["market_vwap_mean"] = df_temp.groupby("timestamp")["vwap"].transform("mean")
    return df_temp


df_feat = create_lags(df_copy)
df_feat = add_vol(df_feat)
df_feat = add_market(df_feat)

# forward/back fill per asset
df_feat = df_feat.groupby("asset").apply(lambda x: x.ffill().bfill()).reset_index(drop=True)


# ===============================
# 2. FUTURE TARGETS + DIRECTION
# ===============================

for m in future_minutes:
    df_feat[f"vwap_fut_{m}m"] = df_feat.groupby("asset")["vwap"].shift(-m)
    df_feat[f"dir_{m}m"] = (df_feat[f"vwap_fut_{m}m"] > df_feat["vwap"]).astype(int)

df_feat = df_feat.dropna(subset=[f"vwap_fut_{m}m" for m in future_minutes])


feature_cols = [
    c for c in df_feat.columns
    if ("lag" in c) or (c in ["volatility", "market_vwap_mean"])
]


# ===============================
# 3. MODEL TRAINING & EVALUATION
# ===============================

results_list = []
reg_metrics = []
cls_metrics = []

for asset in major_assets:

    asset_df = df_feat[df_feat["asset"] == asset].sort_values("timestamp")
    if len(asset_df) < 50:
        continue

    X = asset_df[feature_cols]
    y_reg = asset_df[[f"vwap_fut_{m}m" for m in future_minutes]]
    y_cls = asset_df[[f"dir_{m}m" for m in future_minutes]]

    split_idx = int(len(X) * 0.8)
    X_train, X_val = X.iloc[:split_idx], X.iloc[split_idx:]
    y_reg_train, y_reg_val = y_reg.iloc[:split_idx], y_reg.iloc[split_idx:]
    y_cls_train, y_cls_val = y_cls.iloc[:split_idx], y_cls.iloc[split_idx:]

    # -----------------------------
    # Regression: RandomForest
    # -----------------------------
    reg = RandomForestRegressor(
        n_estimators=300,
        max_depth=9,
        min_samples_split=4,
        n_jobs=-1,
        random_state=42
    )
    reg.fit(X_train, y_reg_train)

    reg_pred = pd.DataFrame(
        reg.predict(X_val),
        columns=[f"pred_{m}m" for m in future_minutes],
        index=X_val.index
    )

    # -----------------------------
    # Classification: Direction
    # -----------------------------
    cls = RandomForestClassifier(
        n_estimators=400,
        max_depth=8,
        n_jobs=-1,
        class_weight="balanced",
        random_state=42
    )
    cls.fit(X_train, y_cls_train)

    cls_pred = pd.DataFrame(
        cls.predict(X_val),
        columns=[f"pred_dir_{m}m" for m in future_minutes],
        index=X_val.index
    )

    # -----------------------------
    # Metrics: Regression + Classif
    # -----------------------------
    for m in future_minutes:

        # Regression evaluations
        r2 = r2_score(y_reg_val[f"vwap_fut_{m}m"], reg_pred[f"pred_{m}m"])
        mae = mean_absolute_error(y_reg_val[f"vwap_fut_{m}m"], reg_pred[f"pred_{m}m"])
        rmse = np.sqrt(mean_squared_error(y_reg_val[f"vwap_fut_{m}m"], reg_pred[f"pred_{m}m"]))

        reg_metrics.append({
            "asset": asset,
            "horizon": f"{m}m",
            "R2": r2,
            "MAE": mae,
            "RMSE": rmse
        })

        # Classification evaluations
        acc = accuracy_score(y_cls_val[f"dir_{m}m"], cls_pred[f"pred_dir_{m}m"])
        f1 = f1_score(y_cls_val[f"dir_{m}m"], cls_pred[f"pred_dir_{m}m"])

        cls_metrics.append({
            "asset": asset,
            "horizon": f"{m}m",
            "Accuracy": acc,
            "F1": f1
        })

    # -----------------------------
    # Save predictions cleanly
    # -----------------------------
    out = pd.concat([reg_pred, cls_pred], axis=1)
    out["asset"] = asset
    
    # FIXED TIMESTAMP ALIGNMENT
    ts = asset_df["timestamp"].values          # raw numpy timestamps
    out["timestamp"] = ts[split_idx:]          # take the validation slice only
    
    results_list.append(out)


pred_df = pd.concat(results_list).reset_index(drop=True)
reg_metrics_df = pd.DataFrame(reg_metrics)
cls_metrics_df = pd.DataFrame(cls_metrics)

In [266]:
timestamps = []

for asset in major_assets:
    asset_df = pred_df[pred_df["asset"] == asset]

    head5 = asset_df.head(5)
    tail5 = asset_df.tail(5)

    print(f"\n--- {asset} FIRST 5 ---")
    print(head5)

    print(f"\n--- {asset} LAST 5 ---")
    print(tail5)

    # collect timestamps
    for _, row in head5.iterrows():
        timestamps.append([asset, row["timestamp"]])
    for _, row in tail5.iterrows():
        timestamps.append([asset, row["timestamp"]])

# sort timestamps chronologically
timestamps = sorted(timestamps, key=lambda x: x[1])

print("\n=== Cross-Asset Timestamp Sample ===")
for t in timestamps:
    print(t)

print("\n=== Regression Metrics (R2 sorted) ===")
if "R2" in reg_metrics_df.columns:
    print(reg_metrics_df.sort_values("R2", ascending=False))
else:
    print(reg_metrics_df)

print("\n=== Classification Metrics ===")
print(cls_metrics_df)



--- XRP FIRST 5 ---
   reg_pred_ret_1m  reg_pred_ret_3m  reg_pred_ret_5m  reg_pred_ret_15m  \
0        -0.000853        -0.004594        -0.002222         -0.004831   
1        -0.000789        -0.002594        -0.005095         -0.004935   
2         0.023778         0.017161         0.019443          0.025958   
3        -0.001157        -0.005750        -0.001548         -0.003535   
4        -0.000105        -0.003008        -0.001420         -0.004543   

   reg_pred_ret_30m  reg_pred_ret_60m  reg_pred_price_1m  reg_pred_price_3m  \
0         -0.008515         -0.012406           2.210029           2.201778   
1         -0.009158         -0.012007           2.209423           2.205440   
2          0.022559          0.009536           2.212727           2.198132   
3         -0.005295         -0.008334           2.207459           2.197343   
4         -0.007847         -0.003901           2.206520           2.200123   

   reg_pred_price_5m  reg_pred_price_15m  reg_pred_price_30

In [None]:
"""
Multi-asset, multi-horizon forecasting pipeline
- XGBoost regressors for next-n-minute returns (separate model per horizon)
- XGBoost classifiers for direction (balanced training)
- Return-based targets (log returns)
- Cross-asset z-score normalization at each timestamp
- Asset-relative transforms (price / asset-median)
- Robust feature engineering (lags, rolling stats)
- Chronological train/validation split, safe timestamp alignment
- Final outputs: pred_df (predictions + timestamp + asset), reg_metrics_df, cls_metrics_df

Requirements:
pip install xgboost scikit-learn pandas numpy
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error,
    accuracy_score, f1_score, confusion_matrix
)
from sklearn.utils import resample
from xgboost import XGBRegressor, XGBClassifier
import warnings
warnings.filterwarnings("ignore")
RNG = 42

# ---------- USER: point your df here ----------
# df is expected to have at least:
# ['asset','timestamp','vwap'] plus other numeric cols (e,h,x,spread,spreadper,ratio,sigma,deviation,...)
# timestamp should be parseable by pd.to_datetime
# If your 'timestamp' column is in different position, the script relies on column name 'timestamp'.
# Example usage: df = pd.read_parquet('your_data.parquet')
# ------------------------------------------------

def safe_timestamp_fix(df):
    df = df.copy()
    # previous scripts trimmed timezone strings; keep that behavior if needed
    if df['timestamp'].dtype == object:
        df['timestamp'] = df['timestamp'].astype(str).str.split("GMT").str[0].str.strip()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df

def basic_cleaning(df):
    df = df.copy()
    df = safe_timestamp_fix(df)
    # Keep only numeric features + asset + timestamp
    # If fields are strings with commas, try to coerce
    for c in df.columns:
        if c not in ['asset','timestamp'] and df[c].dtype == object:
            # try to convert
            try:
                df[c] = pd.to_numeric(df[c].str.replace(',',''), errors='coerce')
            except Exception:
                pass
    return df

def cross_asset_zscore(df, cols):
    # for each timestamp, compute zscore across assets for specified cols
    df = df.copy()
    # pivot-like groupby
    for c in cols:
        # compute mean/std across assets at each timestamp
        grp = df.groupby('timestamp')[c]
        mean = grp.transform('mean')
        std = grp.transform('std').replace(0, np.nan)
        df[f'{c}_ts_z'] = (df[c] - mean) / std
        # fill na -> 0 (if single-asset timestamp)
        df[f'{c}_ts_z'] = df[f'{c}_ts_z'].fillna(0)
    return df

def asset_relative_transform(df, cols):
    # normalize by asset median (a simple robust transform)
    df = df.copy()
    medians = df.groupby('asset')[cols].transform('median')
    # avoid division by zero
    medians = medians.replace(0, np.nan).fillna(1.0)
    for c in cols:
        df[f'{c}_asset_rel'] = df[c] / medians[c]
    return df

def feature_engineering(df, lags=(1,2,3), rolling_windows=(3,5,10)):
    df = df.sort_values(['asset','timestamp']).copy()
    base_num = [c for c in df.columns if c not in ['asset','timestamp'] and np.issubdtype(df[c].dtype, np.number)]
    # create log prices and returns
    df['log_vwap'] = np.log(df['vwap'].replace(0, np.nan)).fillna(method='ffill').fillna(0)
    df['vwap_ret_1'] = df.groupby('asset')['log_vwap'].diff()   # log return 1 row
    # lag features
    for lag in lags:
        df[f'vwap_ret_lag{lag}'] = df.groupby('asset')['vwap_ret_1'].shift(lag)
        for c in base_num:
            df[f'{c}_lag{lag}'] = df.groupby('asset')[c].shift(lag)
    # rolling stats on returns
    for w in rolling_windows:
        df[f'vol_{w}'] = df.groupby('asset')['vwap_ret_1'].rolling(window=w, min_periods=1).std().reset_index(0,drop=True)
        df[f'meanret_{w}'] = df.groupby('asset')['vwap_ret_1'].rolling(window=w, min_periods=1).mean().reset_index(0,drop=True)
    # cross-series lagged ratios (vwap / btc_vwap at same timestamp) not necessary if you do asset-relative transforms
    # forward/backfill where appropriate per-asset to avoid entire rows dropping
    df = df.groupby('asset').apply(lambda g: g.ffill().bfill()).reset_index(drop=True)
    return df

def create_targets(df, horizons_minutes=[1,3,5,15,30,60], price_col='vwap'):
    df = df.copy()
    # We'll use log-return horizon targets: log(vwap_t+h) - log(vwap_t)
    df['log_vwap'] = np.log(df[price_col].replace(0, np.nan))
    for h in horizons_minutes:
        df[f'logvwap_fut_{h}m'] = df.groupby('asset')['log_vwap'].shift(-h)
        df[f'ret_fut_{h}m'] = df[f'logvwap_fut_{h}m'] - df['log_vwap']
        # classification direction (+1 if up, 0 if down or same)
        df[f'dir_{h}m'] = (df[f'ret_fut_{h}m'] > 0).astype(int)
    # drop rows without any future (so we don't create misaligned val indices)
    df = df.dropna(subset=[f'ret_fut_{h}m' for h in horizons_minutes])
    return df

def safe_train_val_split(asset_df, feature_cols, target_cols, val_fraction=0.2):
    n = len(asset_df)
    cut = int(n * (1 - val_fraction))
    X_train = asset_df[feature_cols].iloc[:cut].reset_index(drop=True)
    X_val = asset_df[feature_cols].iloc[cut:].reset_index(drop=True)
    y_train = asset_df[target_cols].iloc[:cut].reset_index(drop=True)
    y_val = asset_df[target_cols].iloc[cut:].reset_index(drop=True)
    meta_val = asset_df[['timestamp']].iloc[cut:].reset_index(drop=True)
    return X_train, X_val, y_train, y_val, meta_val

def balance_binary_train(X, y_binary):
    # simple up-sample minority class to balance
    df_train = pd.concat([X.reset_index(drop=True), y_binary.reset_index(drop=True)], axis=1)
    cls_col = y_binary.name
    # split
    df_0 = df_train[df_train[cls_col] == 0]
    df_1 = df_train[df_train[cls_col] == 1]
    if len(df_0) == 0 or len(df_1) == 0:
        # nothing to do
        return X, y_binary
    if len(df_0) > len(df_1):
        df_1_up = resample(df_1, replace=True, n_samples=len(df_0), random_state=RNG)
        df_bal = pd.concat([df_0, df_1_up]).sample(frac=1, random_state=RNG).reset_index(drop=True)
    else:
        df_0_up = resample(df_0, replace=True, n_samples=len(df_1), random_state=RNG)
        df_bal = pd.concat([df_1, df_0_up]).sample(frac=1, random_state=RNG).reset_index(drop=True)
    Xb = df_bal.drop(cls_col, axis=1)
    yb = df_bal[cls_col]
    return Xb, yb

# ----------------------------
# MAIN pipeline function
# ----------------------------
def train_pipeline(df,
                   major_assets=None,
                   horizons=[1,3,5,15,30,60],
                   numeric_base_cols=None,
                   xgb_params_reg=None,
                   xgb_params_clf=None,
                   verbose=True):
    """
    Returns:
      pred_df: concatenated predictions per-asset on validation folds
      reg_metrics_df, cls_metrics_df: evaluation tables
    """
    if major_assets is None:
        major_assets = df['asset'].unique().tolist()

    # Basic cleaning
    df = basic_cleaning(df)

    # Decide numeric columns if not provided
    if numeric_base_cols is None:
        numeric_base_cols = [c for c in df.columns if c not in ['asset','timestamp','vwap'] and np.issubdtype(df[c].dtype, np.number)]

    # Cross-asset normalization for vwap and the numeric columns (creates *_ts_z)
    cross_cols = ['vwap'] + numeric_base_cols
    df = cross_asset_zscore(df, cross_cols)

    # Asset-relative transforms (create *_asset_rel)
    df = asset_relative_transform(df, ['vwap'])

    # Feature engineering (lags + rolling)
    df = feature_engineering(df, lags=(1,2,3), rolling_windows=(3,5))

    # Create targets (log-return based) and directions
    df = create_targets(df, horizons_minutes=horizons, price_col='vwap')

    # choose features: use engineered columns plus cross-asset z-scores and asset_rel
    candidate_features = [c for c in df.columns if (
        'lag' in c or
        c.startswith('vol_') or
        c.startswith('meanret_') or
        c.endswith('_ts_z') or
        c.endswith('_asset_rel') or
        c in ['vwap_ret_1','log_vwap']
    )]
    # dedupe while preserving order
    seen = set(); feature_cols = [x for x in candidate_features if not (x in seen or seen.add(x))]

    if verbose:
        print(f"[info] using {len(feature_cols)} features (example head): {feature_cols[:8]}")

    # default xgboost params (light but reasonable)
    if xgb_params_reg is None:
        xgb_params_reg = dict(n_estimators=120, max_depth=6, learning_rate=0.08, n_jobs=-1, random_state=RNG, verbosity=0)
    if xgb_params_clf is None:
        xgb_params_clf = dict(n_estimators=120, max_depth=5, learning_rate=0.08, n_jobs=-1, random_state=RNG, verbosity=0, use_label_encoder=False)

    results = []
    reg_metrics = []
    cls_metrics = []

    for asset in major_assets:
        asset_df = df[df['asset']==asset].sort_values('timestamp').reset_index(drop=True)
        if len(asset_df) < 40:
            if verbose:
                print(f"[skip] {asset} (too few rows: {len(asset_df)})")
            continue

        # Train/Val chronological split
        X_train, X_val, y_reg_train_df, y_reg_val_df, val_meta = safe_train_val_split(asset_df, feature_cols, [f'ret_fut_{h}m' for h in horizons], val_fraction=0.2)

        # Per-horizon models (regression + classification)
        preds_reg = pd.DataFrame(index=range(len(X_val)))
        preds_clf = pd.DataFrame(index=range(len(X_val)))

        for h in horizons:
            target_reg_col = f'ret_fut_{h}m'
            target_cls_col = f'dir_{h}m'

            # Regression: train XGBRegressor on log-return target (y values are small)
            model_reg = XGBRegressor(**xgb_params_reg)
            y_train_reg = asset_df[target_reg_col].iloc[:len(X_train)].reset_index(drop=True)
            y_val_reg = asset_df[target_reg_col].iloc[len(X_train):].reset_index(drop=True)
            # Fit
            model_reg.fit(X_train, y_train_reg, eval_set=[(X_val, y_val_reg)])
            y_pred_reg = model_reg.predict(X_val)
            preds_reg[f'pred_ret_{h}m'] = y_pred_reg

            # Metrics (convert back to price-scale only if user wants; here we report on return-scale)
            r2 = r2_score(y_val_reg, y_pred_reg)
            mae = mean_absolute_error(y_val_reg, y_pred_reg)
            rmse = np.sqrt(mean_squared_error(y_val_reg, y_pred_reg))
            reg_metrics.append({'asset': asset, 'horizon': f'{h}m', 'R2': r2, 'MAE': mae, 'RMSE': rmse})

            # Classification: balance training classes before fitting
            cls_model = XGBClassifier(**xgb_params_clf)
            y_train_cls = asset_df[target_cls_col].iloc[:len(X_train)].reset_index(drop=True)
            y_val_cls = asset_df[target_cls_col].iloc[len(X_train):].reset_index(drop=True)

            Xb, yb = balance_binary_train(X_train, y_train_cls.rename(target_cls_col))
            # If balance returned unchanged (e.g., all same class), train on original
            if len(yb.unique()) == 1:
                Xb, yb = X_train, y_train_cls

            cls_model.fit(Xb, yb, eval_set=[(X_val, y_val_cls)])
            y_pred_cls = cls_model.predict(X_val)
            preds_clf[f'pred_dir_{h}m'] = y_pred_cls

            # Classification metrics
            acc = accuracy_score(y_val_cls, y_pred_cls)
            # handle case where one class absent in val (f1_score will error otherwise) -> use zero
            try:
                f1 = f1_score(y_val_cls, y_pred_cls)
            except Exception:
                f1 = np.nan
            cls_metrics.append({'asset': asset, 'horizon': f'{h}m', 'Accuracy': acc, 'F1': f1})

        # Pack predictions, convert return preds back to predicted future price if desired:
        # predicted price = current_price * exp(pred_ret)
        # we stored log returns, so:
        current_price_val = asset_df['vwap'].iloc[len(X_train):].reset_index(drop=True)
        for h in horizons:
            preds_reg[f'pred_price_{h}m'] = current_price_val * np.exp(preds_reg[f'pred_ret_{h}m'])

        out = pd.concat([preds_reg.add_prefix('reg_'), preds_clf.add_prefix('clf_')], axis=1)
        out['asset'] = asset
        out['timestamp'] = val_meta['timestamp'].values  # safe alignment: val_meta length == X_val length

        results.append(out)

    # combine
    if len(results) == 0:
        raise RuntimeError("No assets trained - check data / major_assets list / minimal rows per asset")

    pred_df = pd.concat(results).reset_index(drop=True)
    reg_metrics_df = pd.DataFrame(reg_metrics).sort_values(['asset','horizon']).reset_index(drop=True)
    cls_metrics_df = pd.DataFrame(cls_metrics).sort_values(['asset','horizon']).reset_index(drop=True)
    return pred_df, reg_metrics_df, cls_metrics_df

# ----------------------------
# Example run (user executes)
# ----------------------------
if __name__ == '__main__':
    # df must exist in the environment already. Example:
    # df = pd.read_parquet("ndf_sample.parquet")
    # adjust major_assets list to the assets you want
    try:
        pred_df, reg_metrics_df, cls_metrics_df = train_pipeline(df, major_assets=[
            'XRP','FIL','BTC','SOL','ETH','BNB','LINK','ZEC','PAXG','LTC','BCH','AAVE','UNI','CAKE'
        ])
    except Exception as e:
        print("[error] pipeline failed:", e)
        raise

    # Show compact summaries
    print("\n=== Regression metrics (top few rows) ===")
    print(reg_metrics_df.sort_values('R2', ascending=False).head(20))
    print("\n=== Classification metrics (top few rows) ===")
    print(cls_metrics_df.sort_values('F1', ascending=False).head(20))

    # Save or inspect
    # pred_df.to_parquet('preds.parquet')
    # reg_metrics_df.to_csv('reg_metrics.csv', index=False)
    # cls_metrics_df.to_csv('cls_metrics.csv', index=False)

    # quick example: compute directional accuracy aggregated
    dir_acc = cls_metrics_df.groupby('horizon')['Accuracy'].mean().reset_index()
    print("\n=== Mean directional accuracy by horizon ===")
    print(dir_acc)


[info] using 104 features (example head): ['vwap_ts_z', 'klineacc_ts_z', 'spread_ts_z', 'spreadper_ts_z', 'x_ts_z', 'deviation_ts_z', 'ratio_ts_z', 'term_ts_z']
[0]	validation_0-rmse:0.00781
[1]	validation_0-rmse:0.00756
[2]	validation_0-rmse:0.00734
[3]	validation_0-rmse:0.00715
[4]	validation_0-rmse:0.00697
[5]	validation_0-rmse:0.00683
[6]	validation_0-rmse:0.00670
[7]	validation_0-rmse:0.00659
[8]	validation_0-rmse:0.00652
[9]	validation_0-rmse:0.00646
[10]	validation_0-rmse:0.00639
[11]	validation_0-rmse:0.00635
[12]	validation_0-rmse:0.00631
[13]	validation_0-rmse:0.00626
[14]	validation_0-rmse:0.00622
[15]	validation_0-rmse:0.00618
[16]	validation_0-rmse:0.00615
[17]	validation_0-rmse:0.00613
[18]	validation_0-rmse:0.00610
[19]	validation_0-rmse:0.00608
[20]	validation_0-rmse:0.00606
[21]	validation_0-rmse:0.00604
[22]	validation_0-rmse:0.00603
[23]	validation_0-rmse:0.00602
[24]	validation_0-rmse:0.00601
[25]	validation_0-rmse:0.00601
[26]	validation_0-rmse:0.00600
[27]	validat

In [267]:
timestamps = []

for asset in major_assets:
    asset_df = pred_df[pred_df["asset"] == asset]

    head5 = asset_df.head(5)
    tail5 = asset_df.tail(5)

    print(f"\n--- {asset} FIRST 5 ---")
    print(head5)

    print(f"\n--- {asset} LAST 5 ---")
    print(tail5)

    # collect timestamps
    for _, row in head5.iterrows():
        timestamps.append([asset, row["timestamp"]])
    for _, row in tail5.iterrows():
        timestamps.append([asset, row["timestamp"]])

# sort timestamps chronologically
timestamps = sorted(timestamps, key=lambda x: x[1])

print("\n=== Cross-Asset Timestamp Sample ===")
for t in timestamps:
    print(t)

print("\n=== Regression Metrics (R2 sorted) ===")
if "R2" in reg_metrics_df.columns:
    print(reg_metrics_df.sort_values("R2", ascending=False))
else:
    print(reg_metrics_df)

print("\n=== Classification Metrics ===")
print(cls_metrics_df)



--- XRP FIRST 5 ---
   reg_pred_ret_1m  reg_pred_ret_3m  reg_pred_ret_5m  reg_pred_ret_15m  \
0        -0.000853        -0.004594        -0.002222         -0.004831   
1        -0.000789        -0.002594        -0.005095         -0.004935   
2         0.023778         0.017161         0.019443          0.025958   
3        -0.001157        -0.005750        -0.001548         -0.003535   
4        -0.000105        -0.003008        -0.001420         -0.004543   

   reg_pred_ret_30m  reg_pred_ret_60m  reg_pred_price_1m  reg_pred_price_3m  \
0         -0.008515         -0.012406           2.210029           2.201778   
1         -0.009158         -0.012007           2.209423           2.205440   
2          0.022559          0.009536           2.212727           2.198132   
3         -0.005295         -0.008334           2.207459           2.197343   
4         -0.007847         -0.003901           2.206520           2.200123   

   reg_pred_price_5m  reg_pred_price_15m  reg_pred_price_30