In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, classification_report, f1_score
)

# Try to use XGBoost; if not available, fallback to RandomForest
use_xgb = True
try:
    from xgboost import XGBClassifier
except Exception:
    use_xgb = False
from sklearn.ensemble import RandomForestClassifier

In [3]:
import numpy as np
print(np.__version__)

1.26.4


In [4]:
df = pd.read_csv("../Downloads/st_export.csv").iloc[:,1:]
for i in range(len(df)):
    df.iloc[i,-3]=df.iloc[i,-3].split("GMT")[0].strip()
df['timestamp'] = pd.to_datetime(df['timestamp'])
a=np.random.randint(len(df))
df.iloc[a:a+5,:]

Unnamed: 0,klineacc,spread,spreadper,x,vwap,deviation,ratio,term,sigma,e,h,asset,timestamp,gap,gaplimit
73626,52695384,3042226.0,44.480713,0.785304,687.678223,-0.021156,0.555193,0.522323,0.010241,-0.599078,-0.536393,ZEC,2025-11-20 15:28:45,30m,10
73627,31215158,17062410.0,54.070522,0.345923,1.316586,-0.023991,0.459295,1.691584,0.022114,-0.842885,-0.687334,ASTER,2025-11-20 15:28:45,30m,10
73628,25991740,-245844.4,-6.046437,-0.865974,907.929871,-0.004295,1.060464,0.836181,0.00317,-1.201415,-0.834086,BNB,2025-11-20 15:28:45,30m,10
73629,277278464,-2309446.0,-16.749805,-0.869361,92433.15625,-0.005876,1.167498,0.872728,0.003298,-1.815278,-0.948365,BTC,2025-11-20 15:28:45,30m,10
73630,88408616,1329105.0,9.652183,0.851108,143.534195,-0.003582,0.903478,0.311454,0.003434,-0.293535,-0.285385,SOL,2025-11-20 15:29:10,3m,100


In [5]:
len(df)

110170

In [8]:
selected_assets = ['BTC','ETH','SOL','BNB','DOT','AAVE','BCH','PAXG']
df_copy = df[df['asset'].isin(selected_assets)].copy()

feature_cols = ['klineacc','spread','spreadper','x','vwap','deviation','ratio','term','sigma','e','h']
wide_frames = []
for col in feature_cols:
    w = df_copy.pivot_table(index='timestamp', columns='asset', values=col, aggfunc='mean')
    w.columns = [f"{col}__{a}" for a in w.columns]
    wide_frames.append(w)

wide_df = pd.concat(wide_frames, axis=1).sort_index()

# -----------------------------
# 2) BTC returns + volatility
# -----------------------------
btc_vwap_col = "vwap__BTC"
wide_df['btc_return'] = wide_df[btc_vwap_col].pct_change(fill_method=None)

# Rolling volatility (std of returns)
wide_df['btc_volatility'] = wide_df['btc_return'].rolling(window=10).std()

# -----------------------------
# 3) Hybrid target definition
# -----------------------------
neutral_thresh = 0.005  # 0.5% neutral zone
vol_thresh = 0.003      # volatility threshold

def hybrid_label(row):
    r = row['btc_return']
    vol = row['btc_volatility']
    if abs(r) < neutral_thresh and vol < vol_thresh:
        return 1  # Neutral
    elif r > neutral_thresh:
        return 2  # Up
    elif r < -neutral_thresh:
        return 0  # Down
    else:
        return 1  # fallback Neutral

wide_df['target'] = wide_df.apply(hybrid_label, axis=1)
model_df = wide_df.dropna()

# -----------------------------
# 4) Binary dataset (Up vs Down only)
# -----------------------------
binary_df = model_df[model_df['target'] != 1].copy()

# remap Up=2 → 1 so classes are [0,1]
binary_df['target'] = binary_df['target'].replace({2:1})

X = binary_df.drop(columns=['target'])
y = binary_df['target']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

split_idx = int(len(X_scaled)*0.8)
X_train, y_train = X_scaled[:split_idx], y[:split_idx]
X_test, y_test = X_scaled[split_idx:], y[split_idx:]

In [12]:
# -----------------------------
# 5) Train boosting model
# -----------------------------
if use_xgb:
    model = XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )
else:
    model = RandomForestClassifier(
        n_estimators=300,
        max_depth=8,
        random_state=42
    )

model.fit(X_train, y_train)

# -----------------------------
# 6) Hybrid evaluation
# -----------------------------
# Predict Up/Down only
y_pred_binary = model.predict(X_test)

# Map predictions back into full 3-class space
test_full = model_df.iloc[split_idx:].copy()
X_full = test_full.drop(columns=['target'])
X_full_scaled = scaler.transform(X_full)

pred_binary = model.predict(X_full_scaled)

# Apply hybrid rule for Neutral
final_preds = []
for i, row in test_full.iterrows():
    # Rule for Neutral
    if abs(row['btc_return']) < neutral_thresh and row['btc_volatility'] < vol_thresh:
        final_preds.append(1)  # Neutral
    else:
        # Use ML model for Up/Down
        pred_bin = model.predict([X_full_scaled[list(test_full.index).index(i)]])[0]
        # Map binary prediction back: 0=Down, 1=Up
        final_preds.append(0 if pred_bin==0 else 2)

y_true = test_full['target'].values
y_pred = np.array(final_preds)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [13]:
acc = accuracy_score(y_true, y_pred)
macro_f1 = f1_score(y_true, y_pred, average='macro')
print(f"Hybrid Accuracy: {acc:.4f} | Macro F1: {macro_f1:.4f}")

print("\nClassification report:")
print(classification_report(y_true, y_pred, digits=4))

cm = confusion_matrix(y_true, y_pred)
print("\nConfusion matrix:\n", cm)

Hybrid Accuracy: 0.9242 | Macro F1: 0.6508

Classification report:
              precision    recall  f1-score   support

           0     0.9091    1.0000    0.9524        50
           1     0.0000    0.0000    0.0000         5
           2     1.0000    1.0000    1.0000        11

    accuracy                         0.9242        66
   macro avg     0.6364    0.6667    0.6508        66
weighted avg     0.8554    0.9242    0.8882        66


Confusion matrix:
 [[50  0  0]
 [ 5  0  0]
 [ 0  0 11]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
model_df.head()

Unnamed: 0_level_0,klineacc__AAVE,klineacc__BCH,klineacc__BNB,klineacc__BTC,klineacc__DOT,klineacc__ETH,klineacc__PAXG,klineacc__SOL,spread__AAVE,spread__BCH,...,h__BCH,h__BNB,h__BTC,h__DOT,h__ETH,h__PAXG,h__SOL,btc_return,btc_volatility,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-11-03 00:50:47,71898904.0,111923688.0,1029752000.0,5156883000.0,49987076.0,3802426000.0,90314240.0,1556169000.0,611453.1,124674.820312,...,-0.713702,-0.302268,0.004399,-0.208983,-0.084277,0.024301,-0.455353,-0.004483,0.00141,1
2025-11-03 00:50:53,67705104.0,104394336.0,964045100.0,4855333000.0,45435400.0,3572898000.0,85354040.0,1452599000.0,604593.4,113126.640625,...,-0.490558,-0.143464,-0.001518,-0.093469,-0.041938,0.026746,-0.298548,0.000224,0.001427,1
2025-11-03 06:59:04,72165672.0,105352472.0,983522600.0,4999070000.0,48096308.0,3666288000.0,86443400.0,1497718000.0,175672.3,-381343.8125,...,-0.614349,-0.160859,-0.048256,-0.090859,-0.077478,0.029168,-0.232361,0.001165,0.000468,1
2025-11-03 13:01:27,81473080.0,103061232.0,1028549000.0,5191191000.0,50303940.0,4077146000.0,87227232.0,1617474000.0,-2687616.0,-204625.6875,...,-0.486674,-0.652416,-0.55426,-0.396228,-0.675461,0.063187,-0.830285,0.010085,0.003309,2
2025-11-03 18:55:47,86441984.0,103348408.0,1107766000.0,5557130000.0,55905988.0,4522788000.0,89872256.0,1775345000.0,-3168482.0,-216105.296875,...,-0.71632,-0.881051,-0.323942,-0.622146,-0.670574,0.050344,-0.908824,0.01993,0.006325,2


In [15]:
binary_df.head()

Unnamed: 0_level_0,klineacc__AAVE,klineacc__BCH,klineacc__BNB,klineacc__BTC,klineacc__DOT,klineacc__ETH,klineacc__PAXG,klineacc__SOL,spread__AAVE,spread__BCH,...,h__BCH,h__BNB,h__BTC,h__DOT,h__ETH,h__PAXG,h__SOL,btc_return,btc_volatility,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-11-03 13:01:27,81473080.0,103061232.0,1028549000.0,5191191000.0,50303940.0,4077146000.0,87227232.0,1617474000.0,-2687616.0,-204625.7,...,-0.486674,-0.652416,-0.55426,-0.396228,-0.675461,0.063187,-0.830285,0.010085,0.003309,1
2025-11-03 18:55:47,86441984.0,103348408.0,1107766000.0,5557130000.0,55905988.0,4522788000.0,89872256.0,1775345000.0,-3168482.0,-216105.3,...,-0.71632,-0.881051,-0.323942,-0.622146,-0.670574,0.050344,-0.908824,0.01993,0.006325,1
2025-11-04 00:53:19,90251424.0,97903840.0,1107663000.0,5246447000.0,60447356.0,4687129000.0,76831576.0,1828551000.0,-2681018.0,-774503.8,...,-0.711986,-0.583903,-0.734676,-0.760226,-0.846621,-0.029024,-0.91108,0.022106,0.007142,1
2025-11-04 07:07:35,99344112.0,101505216.0,1205232000.0,5962357000.0,67873792.0,5240622000.0,80943944.0,2061349000.0,-110713.9,-1104399.0,...,-0.918477,-0.833386,-0.741948,-0.974854,-0.993856,0.001986,-0.999953,0.019355,0.006169,1
2025-11-04 12:53:14,92082616.0,89428208.0,1161205000.0,5826781000.0,66509876.0,5139522000.0,72561512.0,1973943000.0,120425.1,-948775.7,...,-0.669613,-0.888514,-0.5481,-0.865125,-0.745811,-0.372268,-0.983842,0.018301,0.005754,1


In [17]:
ndf = pd.read_csv("../Downloads/st_export2.csv").iloc[:,1:]
for i in range(len(ndf)):
    ndf.iloc[i,-3]=ndf.iloc[i,-3].split("GMT")[0].strip()
ndf['timestamp'] = pd.to_datetime(ndf['timestamp'])
a=np.random.randint(len(ndf))
ndf.iloc[a:a+5,:]

Unnamed: 0,klineacc,spread,spreadper,x,vwap,deviation,ratio,term,sigma,e,h,asset,timestamp,gap,gaplimit
539,17776930,-285480.6,-14.201055,-0.849923,1.629846,-0.003771,1.142011,2.370354,0.007961,-1.282284,-0.857092,SUI,2025-12-06 00:09:21,1m,300
540,11328909,999839.0,7.114916,0.751235,13.946676,-0.007649,0.928851,1.730516,0.006647,-1.849612,-0.951709,LINK,2025-12-06 00:09:21,1m,300
541,47235616,-2238349.0,-85.644043,-0.803718,380.828613,-0.016119,1.85644,0.736135,0.01079,-2.041548,-0.966848,ZEC,2025-12-06 00:09:21,1m,300
542,82289600,-4656324.0,-30.749805,-0.793837,136.12001,-0.010285,1.307498,1.176053,0.005798,-2.727551,-0.991488,SOL,2025-12-06 00:09:21,1m,300
543,316729792,-2166490.0,-28.956581,-0.874298,3119.12793,-0.01158,1.289566,1.748224,0.004843,-5.389832,-0.999958,ETH,2025-12-06 00:09:21,1m,300
