In [194]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, classification_report, f1_score
)

# XGBoost is treated as an optional dependency: `use_xgb` records whether the
# import worked (the original note names RandomForest as the fallback).
try:
    from xgboost import XGBClassifier
except Exception:
    # Any import-time failure simply switches the flag off.
    use_xgb = False
else:
    use_xgb = True
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [60]:
import numpy as np
print(np.__version__)  # record the NumPy version used for this run
# Remove the cap on how many rows pandas renders when a frame is displayed.
# NOTE(review): with no cap, displaying a whole frame prints every row;
# prefer .head()/.sample() for large frames.
pd.options.display.max_rows = None

1.26.4


In [189]:
# Load the first export and drop the leading column of the CSV.
df = pd.read_csv("../Downloads/st_export.csv").iloc[:, 1:]

# Keep only the text before "GMT" in each raw timestamp, then parse it.
# Done as one vectorised pass on the named column: the earlier per-row
# `.iloc[i, -3]` loop paid one indexer call per row, addressed the column by
# position, and raised AttributeError as soon as a value was missing.
df['timestamp'] = pd.to_datetime(
    df['timestamp'].astype(str).str.split("GMT").str[0].str.strip()
)

In [190]:
# Spot-check: pick a random start row and show up to five consecutive rows.
a = np.random.randint(0, len(df))
df.iloc[a:a + 5]

Unnamed: 0,klineacc,spread,spreadper,x,vwap,deviation,ratio,term,sigma,e,h,asset,timestamp,gap,gaplimit
72166,18294646,-2277414.0,-117.692345,-0.739499,1.572469,-0.035084,2.176924,0.301871,0.024692,-0.93373,-0.732328,SUI,2025-11-20 03:27:59,5m,60
72167,101103832,-3996358.0,-63.699516,-0.813051,2.077507,-0.018102,1.636995,0.493582,0.014593,-1.002307,-0.762561,XRP,2025-11-20 03:27:59,5m,60
72168,95749136,-26318410.0,-296.169312,-0.666268,134.315186,-0.022746,3.961693,0.238122,0.017215,-1.246502,-0.8473,SOL,2025-11-20 03:27:59,5m,60
72169,12786416,-129667.2,-7.432905,-0.864047,4.844911,0.012609,1.074329,0.672308,0.025574,0.356106,0.34178,ICP,2025-11-20 03:39:19,1m,300
72170,169758016,3958168.0,51.434204,0.834458,619.839111,0.012714,0.485658,0.724087,0.021389,0.209042,0.20605,ZEC,2025-11-20 03:39:19,1m,300


In [188]:
# Load the second export and drop the leading column of the CSV.
ndf = pd.read_csv("../Downloads/st_export3.csv").iloc[:, 1:]

# Keep only the text before "GMT" in each raw timestamp, then parse it.
# One vectorised pass on the named column replaces the per-row
# `.iloc[i, -3]` loop, which was slow, position-dependent, and raised
# AttributeError on a missing value.
ndf['timestamp'] = pd.to_datetime(
    ndf['timestamp'].astype(str).str.split("GMT").str[0].str.strip()
)

# Spot-check: show up to five consecutive rows from a random start position.
a = np.random.randint(len(ndf))
ndf.iloc[a:a+5, :]

Unnamed: 0,klineacc,spread,spreadper,x,vwap,deviation,ratio,term,sigma,e,h,asset,timestamp,gap,gaplimit
3340,48334904,-2195232.0,-24.973526,-0.80597,89567.703125,-0.000588,1.249735,0.844876,0.001175,-0.528642,-0.484342,BTC,2025-12-07 14:06:07,3m,100
3341,17542022,-12747070.0,-89.029434,-0.562048,133.019684,-0.001652,1.890294,0.531837,0.002218,-0.748742,-0.634398,SOL,2025-12-07 14:06:07,3m,100
3342,31555670,-227765.7,-2.231067,-0.872986,3049.097656,-0.001508,1.022311,0.564068,0.001143,-0.760481,-0.64136,ETH,2025-12-07 14:06:07,3m,100
3343,11513714,-4263340.0,-82.287003,-0.634807,2.053715,-0.002101,1.82287,1.084755,0.00471,-0.881923,-0.707382,XRP,2025-12-07 14:06:07,3m,100
3344,13765689,-1061777.0,-45.582344,-0.774624,339.793945,0.012643,1.455823,0.692376,0.010148,1.255781,0.849897,ZEC,2025-12-07 14:09:28,5m,60


In [191]:
# Assets that occur in both exports, in the order they first appear in `ndf`.
# The `df` assets are collected into a set once; previously the unique list
# was rebuilt and scanned linearly for every candidate asset.
known_assets = set(df["asset"].unique().tolist())
assets_to_train = [
    asset for asset in ndf["asset"].unique().tolist() if asset in known_assets
]
print(assets_to_train)

['SOL', 'BNB', 'BTC', 'XRP', 'ETH', 'ZEC', 'SUI', 'EGLD', 'AAVE', 'BCH', 'LINK', 'DOT', 'TRB', 'LTC', 'ORDI', 'TON', 'PAXG', 'INJ', 'WBTC', 'ZRO', 'ETC', 'ORCA', 'CAKE', 'AR', 'UNI', 'COMP', 'DASH', 'AVAX', 'TAO', 'FIL', 'RENDER', 'TRUMP', 'NEAR', 'GIGGLE', 'ATOM', 'QNT', 'PENDLE', 'ZEN', 'APT', 'ICP', 'NMR']


In [213]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# -------------------------------------------------------------------
# Make SAFE internal copies (df stays df, ndf stays ndf)
# -------------------------------------------------------------------
train_df = df.copy()
future_df = ndf.copy()

# -------------------------------------------------------------------
# Timestamp cleanup
# -------------------------------------------------------------------
# Strip anything from "GMT" onwards and parse, in a single assignment on the
# named column. The earlier version wrote the intermediate strings into
# column position -3 and then re-read 'timestamp' by name, which only works
# while the column sits at that position and relies on pandas coercing the
# strings into whatever dtype the column already has.
for d in [train_df, future_df]:
    d['timestamp'] = pd.to_datetime(
        d['timestamp'].astype(str).str.split("GMT").str[0].str.strip()
    )

# Default lag offsets (in rows, per asset) and the columns that get lagged.
lags = [1,2,3]
base_features = ['vwap','e','h','x','spread','spreadper','ratio','sigma','deviation']

def add_lags(df_in, lag_steps=None, features=None, group_col="asset"):
    """Return a timestamp-sorted copy of ``df_in`` with per-group lag columns.

    For every lag ``k`` and every feature ``f`` a column ``f"{f}_lag{k}"`` is
    added, holding the value of ``f`` from ``k`` rows earlier within the same
    ``group_col`` group. The first ``k`` rows of each group have no earlier
    value and are left as NaN.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain 'timestamp', ``group_col`` and every feature column.
        It is not modified.
    lag_steps : iterable of int, optional
        Row offsets to generate. Defaults to the module-level ``lags``.
    features : iterable of str, optional
        Columns to lag. Defaults to the module-level ``base_features``.
    group_col : str, default "asset"
        Column whose values define the independent series.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df_in`` with the lag columns appended, ordered by
        lag first and feature second.
    """
    lag_steps = lags if lag_steps is None else lag_steps
    features = base_features if features is None else features

    # A stable sort keeps rows that share a timestamp in their input order,
    # so the row that counts as "previous" does not depend on the sort
    # algorithm's handling of ties.
    df_out = df_in.sort_values('timestamp', kind="stable").copy()

    # Grouping depends only on group_col, so build it once for all columns.
    grouped = df_out.groupby(group_col)
    for lag in lag_steps:
        for f in features:
            df_out[f"{f}_lag{lag}"] = grouped[f].shift(lag)
    return df_out

train_lagged = add_lags(train_df)
future_lagged = add_lags(future_df)

# dropna() without a subset removes every training row that has a missing
# value in any column, which includes the first rows of each asset whose lag
# columns are still empty.
train_lagged = train_lagged.dropna()

# The model inputs are the lag columns. They are identical for every asset,
# so the list is built once instead of on every loop iteration.
feature_cols = [c for c in train_lagged.columns if "lag" in c]

predictions = []
metrics = []

for asset in assets_to_train:
    sub = train_lagged[train_lagged["asset"] == asset].sort_values("timestamp")
    if len(sub) < 30:
        print(f"Skipping {asset} (too little data)")
        continue

    X = sub[feature_cols]
    y = sub["vwap"]

    # Chronological split: the earliest 80% of rows train the model and the
    # most recent 20% are held out for validation.
    split = int(len(X)*0.8)
    X_train, X_val = X.iloc[:split], X.iloc[split:]
    y_train, y_val = y.iloc[:split], y.iloc[split:]

    # =====================================================
    # Main fix: HistGradientBoostingRegressor handles NaN.
    # =====================================================
    model = HistGradientBoostingRegressor(
        max_depth=4,
        learning_rate=0.05,
        max_iter=300,
        random_state=42,  # fixed seed so reruns give the same model
    )

    model.fit(X_train, y_train)

    # Validation metrics
    y_pred = model.predict(X_val)
    metrics.append({
        "asset": asset,
        "R2": r2_score(y_val, y_pred),
        "MAE": mean_absolute_error(y_val, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_val, y_pred))
    })

    # Predictions for new data
    new_sub = future_lagged[future_lagged["asset"] == asset].sort_values("timestamp")
    if new_sub.empty:
        continue

    # assign() returns a new frame, so no column is written into a filtered
    # slice of future_lagged (the chained-assignment pattern).
    new_sub = new_sub.assign(prediction=model.predict(new_sub[feature_cols]))
    predictions.append(new_sub[["asset","timestamp","prediction"]])

# Final outputs
# Both results fall back to empty, correctly-labelled frames when no asset
# was trained or predicted; pd.concat([]) and set_index on a column-less
# frame would otherwise raise.
if predictions:
    pred_df = pd.concat(predictions).reset_index(drop=True)
else:
    pred_df = pd.DataFrame(columns=["asset", "timestamp", "prediction"])
metrics_df = pd.DataFrame(
    metrics, columns=["asset", "R2", "MAE", "RMSE"]
).set_index("asset")

Python(1887) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Skipping EGLD (too little data)


In [214]:
# Per-asset validation metrics, ordered from highest to lowest R2.
print(metrics_df.sort_values(by="R2", ascending=False))

               R2         MAE        RMSE
asset                                    
XRP      0.963978    0.005013    0.013099
FIL      0.962063    0.020734    0.039323
BTC      0.956198  199.020438  498.495734
SOL      0.954861    0.520708    1.228089
ETH      0.948974    9.750855   27.745067
BNB      0.946116    3.055720    6.978202
LINK     0.931703    0.111574    0.215440
NEAR     0.926329    0.026739    0.049117
ICP      0.922538    0.085308    0.157760
SUI      0.920552    0.017227    0.030301
GIGGLE   0.920315    2.863577    4.879095
AVAX     0.874721    0.137658    0.210788
ZEN      0.859808    0.365244    0.730618
LTC      0.858565    1.450574    1.996265
CAKE     0.852020    0.010022    0.013735
UNI      0.820363    0.155576    0.235801
PENDLE   0.772881    0.016107    0.020631
INJ      0.771541    0.061737    0.073638
DASH     0.730259    3.847777    6.684932
TRB      0.599198    0.940410    1.241431
ZEC      0.597745   25.921409   34.381240
ETC      0.459696    0.133775    0

In [218]:
# [row_count, asset] pairs for every asset selected for training.
# Count all assets in one pass and look each one up, instead of building a
# boolean mask over the whole of `df` once per asset. Assets with no rows
# count as 0; int() keeps the entries plain Python ints as before.
asset_counts = df["asset"].value_counts()
c = [[int(asset_counts.get(i, 0)), i] for i in assets_to_train]

In [223]:
# Order the [row_count, asset] pairs from most rows to fewest and show them.
c = sorted(c, key=lambda pair: pair[0], reverse=True)
print(c)

[[10193, 'BTC'], [10172, 'ETH'], [9534, 'SOL'], [9523, 'ZEC'], [9512, 'BNB'], [9398, 'XRP'], [4828, 'SUI'], [3691, 'ICP'], [3305, 'DASH'], [3203, 'GIGGLE'], [3054, 'LTC'], [2943, 'LINK'], [2886, 'ZEN'], [2448, 'UNI'], [2402, 'NEAR'], [2344, 'TAO'], [2145, 'FIL'], [1917, 'TRUMP'], [1759, 'AVAX'], [1365, 'PAXG'], [1211, 'BCH'], [795, 'AAVE'], [660, 'AR'], [566, 'DOT'], [479, 'ETC'], [408, 'WBTC'], [358, 'CAKE'], [358, 'RENDER'], [347, 'ORDI'], [325, 'INJ'], [310, 'APT'], [287, 'TON'], [287, 'PENDLE'], [264, 'ATOM'], [249, 'TRB'], [204, 'NMR'], [190, 'QNT'], [184, 'ZRO'], [124, 'ORCA'], [81, 'COMP'], [27, 'EGLD']]
