In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, classification_report, f1_score
)

# Try to use XGBoost; if not available, fallback to RandomForest
use_xgb = True
try:
    from xgboost import XGBClassifier
except Exception:
    use_xgb = False
from sklearn.ensemble import RandomForestClassifier

In [3]:
import numpy as np
print(np.__version__)

1.26.4


In [4]:
df = pd.read_csv("../Downloads/st_export.csv").iloc[:,1:]
for i in range(len(df)):
    df.iloc[i,-3]=df.iloc[i,-3].split("GMT")[0].strip()
df['timestamp'] = pd.to_datetime(df['timestamp'])
a=np.random.randint(len(df))
df.iloc[a:a+5,:]

Unnamed: 0,klineacc,spread,spreadper,x,vwap,deviation,ratio,term,sigma,e,h,asset,timestamp,gap,gaplimit
73626,52695384,3042226.0,44.480713,0.785304,687.678223,-0.021156,0.555193,0.522323,0.010241,-0.599078,-0.536393,ZEC,2025-11-20 15:28:45,30m,10
73627,31215158,17062410.0,54.070522,0.345923,1.316586,-0.023991,0.459295,1.691584,0.022114,-0.842885,-0.687334,ASTER,2025-11-20 15:28:45,30m,10
73628,25991740,-245844.4,-6.046437,-0.865974,907.929871,-0.004295,1.060464,0.836181,0.00317,-1.201415,-0.834086,BNB,2025-11-20 15:28:45,30m,10
73629,277278464,-2309446.0,-16.749805,-0.869361,92433.15625,-0.005876,1.167498,0.872728,0.003298,-1.815278,-0.948365,BTC,2025-11-20 15:28:45,30m,10
73630,88408616,1329105.0,9.652183,0.851108,143.534195,-0.003582,0.903478,0.311454,0.003434,-0.293535,-0.285385,SOL,2025-11-20 15:29:10,3m,100


In [5]:
len(df)

110170

In [8]:
selected_assets = ['BTC','ETH','SOL','BNB','DOT','AAVE','BCH','PAXG']
df_copy = df[df['asset'].isin(selected_assets)].copy()

feature_cols = ['klineacc','spread','spreadper','x','vwap','deviation','ratio','term','sigma','e','h']
wide_frames = []
for col in feature_cols:
    w = df_copy.pivot_table(index='timestamp', columns='asset', values=col, aggfunc='mean')
    w.columns = [f"{col}__{a}" for a in w.columns]
    wide_frames.append(w)

wide_df = pd.concat(wide_frames, axis=1).sort_index()

# -----------------------------
# 2) BTC returns + volatility
# -----------------------------
btc_vwap_col = "vwap__BTC"
wide_df['btc_return'] = wide_df[btc_vwap_col].pct_change(fill_method=None)

# Rolling volatility (std of returns)
wide_df['btc_volatility'] = wide_df['btc_return'].rolling(window=10).std()

# -----------------------------
# 3) Hybrid target definition
# -----------------------------
neutral_thresh = 0.005  # 0.5% neutral zone
vol_thresh = 0.003      # volatility threshold

def hybrid_label(row):
    r = row['btc_return']
    vol = row['btc_volatility']
    if abs(r) < neutral_thresh and vol < vol_thresh:
        return 1  # Neutral
    elif r > neutral_thresh:
        return 2  # Up
    elif r < -neutral_thresh:
        return 0  # Down
    else:
        return 1  # fallback Neutral

wide_df['target'] = wide_df.apply(hybrid_label, axis=1)
model_df = wide_df.dropna()

# -----------------------------
# 4) Binary dataset (Up vs Down only)
# -----------------------------
binary_df = model_df[model_df['target'] != 1].copy()

# remap Up=2 â†’ 1 so classes are [0,1]
binary_df['target'] = binary_df['target'].replace({2:1})

X = binary_df.drop(columns=['target'])
y = binary_df['target']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

split_idx = int(len(X_scaled)*0.8)
X_train, y_train = X_scaled[:split_idx], y[:split_idx]
X_test, y_test = X_scaled[split_idx:], y[split_idx:]

In [12]:
# -----------------------------
# 5) Train boosting model
# -----------------------------
if use_xgb:
    model = XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )
else:
    model = RandomForestClassifier(
        n_estimators=300,
        max_depth=8,
        random_state=42
    )

model.fit(X_train, y_train)

# -----------------------------
# 6) Hybrid evaluation
# -----------------------------
# Predict Up/Down only
y_pred_binary = model.predict(X_test)

# Map predictions back into full 3-class space
test_full = model_df.iloc[split_idx:].copy()
X_full = test_full.drop(columns=['target'])
X_full_scaled = scaler.transform(X_full)

pred_binary = model.predict(X_full_scaled)

# Apply hybrid rule for Neutral
final_preds = []
for i, row in test_full.iterrows():
    # Rule for Neutral
    if abs(row['btc_return']) < neutral_thresh and row['btc_volatility'] < vol_thresh:
        final_preds.append(1)  # Neutral
    else:
        # Use ML model for Up/Down
        pred_bin = model.predict([X_full_scaled[list(test_full.index).index(i)]])[0]
        # Map binary prediction back: 0=Down, 1=Up
        final_preds.append(0 if pred_bin==0 else 2)

y_true = test_full['target'].values
y_pred = np.array(final_preds)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [13]:
acc = accuracy_score(y_true, y_pred)
macro_f1 = f1_score(y_true, y_pred, average='macro')
print(f"Hybrid Accuracy: {acc:.4f} | Macro F1: {macro_f1:.4f}")

print("\nClassification report:")
print(classification_report(y_true, y_pred, digits=4))

cm = confusion_matrix(y_true, y_pred)
print("\nConfusion matrix:\n", cm)

Hybrid Accuracy: 0.9242 | Macro F1: 0.6508

Classification report:
              precision    recall  f1-score   support

           0     0.9091    1.0000    0.9524        50
           1     0.0000    0.0000    0.0000         5
           2     1.0000    1.0000    1.0000        11

    accuracy                         0.9242        66
   macro avg     0.6364    0.6667    0.6508        66
weighted avg     0.8554    0.9242    0.8882        66


Confusion matrix:
 [[50  0  0]
 [ 5  0  0]
 [ 0  0 11]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
