In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, classification_report, f1_score
)

# Try to use XGBoost; if not available, fallback to RandomForest
use_xgb = True
try:
    from xgboost import XGBClassifier
except Exception:
    use_xgb = False
from sklearn.ensemble import RandomForestClassifier

In [60]:
import numpy as np
print(np.__version__)
pd.options.display.max_rows = None

1.26.4


In [4]:
df = pd.read_csv("../Downloads/st_export.csv").iloc[:,1:]
for i in range(len(df)):
    df.iloc[i,-3]=df.iloc[i,-3].split("GMT")[0].strip()
df['timestamp'] = pd.to_datetime(df['timestamp'])
a=np.random.randint(len(df))
df.iloc[a:a+5,:]

Unnamed: 0,klineacc,spread,spreadper,x,vwap,deviation,ratio,term,sigma,e,h,asset,timestamp,gap,gaplimit
73626,52695384,3042226.0,44.480713,0.785304,687.678223,-0.021156,0.555193,0.522323,0.010241,-0.599078,-0.536393,ZEC,2025-11-20 15:28:45,30m,10
73627,31215158,17062410.0,54.070522,0.345923,1.316586,-0.023991,0.459295,1.691584,0.022114,-0.842885,-0.687334,ASTER,2025-11-20 15:28:45,30m,10
73628,25991740,-245844.4,-6.046437,-0.865974,907.929871,-0.004295,1.060464,0.836181,0.00317,-1.201415,-0.834086,BNB,2025-11-20 15:28:45,30m,10
73629,277278464,-2309446.0,-16.749805,-0.869361,92433.15625,-0.005876,1.167498,0.872728,0.003298,-1.815278,-0.948365,BTC,2025-11-20 15:28:45,30m,10
73630,88408616,1329105.0,9.652183,0.851108,143.534195,-0.003582,0.903478,0.311454,0.003434,-0.293535,-0.285385,SOL,2025-11-20 15:29:10,3m,100


In [73]:
ndf = pd.read_csv("../Downloads/st_export3.csv").iloc[:,1:]
for i in range(len(ndf)):
    ndf.iloc[i,-3]=ndf.iloc[i,-3].split("GMT")[0].strip()
ndf['timestamp'] = pd.to_datetime(ndf['timestamp'])
a=np.random.randint(len(ndf))
ndf.iloc[a:a+5,:]

Unnamed: 0,klineacc,spread,spreadper,x,vwap,deviation,ratio,term,sigma,e,h,asset,timestamp,gap,gaplimit
1892,50986084,-640160.8,-51.030678,-0.857805,184.217438,-0.009649,1.510307,0.215379,0.056339,-0.055709,-0.055651,AAVE,2025-12-06 16:53:43,2h,60
1893,17448994,578771.5,74.292458,0.818204,34.680561,-0.094017,0.257075,0.13779,0.046169,-0.072134,-0.072009,COMP,2025-12-06 16:53:43,2h,60
1894,14214278,4650152.0,79.795494,0.535246,4.189442,-0.049993,0.202045,0.460276,0.040634,-0.114416,-0.11392,AR,2025-12-06 16:53:43,2h,60
1895,14604140,1322706.0,46.304005,0.748821,5.659653,-0.03881,0.53696,0.308152,0.053887,-0.119169,-0.118609,INJ,2025-12-06 16:53:43,2h,60
1896,5625011200,-923547.6,-8.39404,-0.927268,90413.90625,-0.012372,1.08394,0.305679,0.029773,-0.137681,-0.136817,BTC,2025-12-06 16:53:43,2h,60


# new

In [98]:
# =========================
# CONFIG
# =========================
SHORT_GAPS = ['1m','3m','5m','15m','30m','1h']
LABEL_MAP = {0:"Down",1:"Neutral",2:"Up"}

# Fixed thresholds for classification (VWAP return)
UP_THRESHOLD   = 0.0005   # +0.05%
DOWN_THRESHOLD = -0.0005  # -0.05%

# =========================
# FEATURE ENGINEERING
# =========================
def make_features(df):
    df = df.copy().sort_values('timestamp')
    df['return'] = df['vwap'].pct_change(fill_method=None)
    df['volatility'] = df['return'].rolling(window=10).std()
    df['delta_seconds_to_next'] = df['timestamp'].diff().shift(-1).dt.total_seconds()
    df['return_per_sec'] = df['return'] / df['delta_seconds_to_next'].replace(0,np.nan)

    # Lagged features
    for lag in [1,2,3]:
        df[f'return_lag{lag}'] = df['return'].shift(lag)
    df['volatility_5'] = df['return'].rolling(window=5).std()
    df['volatility_20'] = df['return'].rolling(window=20).std()
    df['mean_return_5'] = df['return'].rolling(window=5).mean()
    df['mean_return_20'] = df['return'].rolling(window=20).mean()

    df = df.dropna()
    return df

# =========================
# TARGET BUILDING
# =========================
def build_targets(df_feat):
    # Next VWAP return
    next_ret = df_feat['vwap'].pct_change().shift(-1)
    df_eval = df_feat.iloc[:-1].copy()
    df_eval['next_ret'] = next_ret.iloc[:-1].values

    def classify(r):
        if r >= UP_THRESHOLD:
            return 2
        elif r <= DOWN_THRESHOLD:
            return 0
        else:
            return 1

    df_eval['target'] = df_eval['next_ret'].apply(classify)
    return df_eval

# =========================
# TRAINING
# =========================
def train_short_term(df_train, feature_cols):
    df_feat = make_features(df_train)
    df_eval = build_targets(df_feat)

    X = df_eval[feature_cols]
    y = df_eval['target']

    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)

    # Class weights to balance Neutral vs Up/Down
    counts = y.value_counts().reindex([0,1,2], fill_value=0)
    inv = {c: (1.0 / (counts[c] if counts[c] > 0 else 1)) for c in [0,1,2]}
    sample_weight = y.map(inv).values

    model = XGBClassifier(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.85,
        colsample_bytree=0.85,
        random_state=42,
        eval_metric='mlogloss'
    )
    model.fit(Xs, y, sample_weight=sample_weight)
    return model, scaler, df_eval

# =========================
# EVALUATION
# =========================
def evaluate_short_term(model, scaler, df_new, feature_cols, tail_rows=20):
    df_feat = make_features(df_new)
    if df_feat.empty or len(df_feat) < 5:
        print("No/insufficient new data after features; skipping.")
        return

    df_eval = build_targets(df_feat)
    X_new = df_eval[feature_cols]
    y_true = df_eval['target']

    Xs_new = scaler.transform(X_new)
    preds = model.predict(Xs_new)

    df_eval['prediction'] = preds
    df_eval['signal'] = df_eval['prediction'].map(LABEL_MAP)
    df_eval['target_signal'] = df_eval['target'].map(LABEL_MAP)

    print("\nShort-term evaluation:")
    print(classification_report(y_true, preds, digits=4))

    print("\n=== Short-term comparison ===")
    cols = ['timestamp','return','volatility','signal','target_signal']
    print(df_eval[cols].tail(tail_rows))

In [101]:
# =========================
# MAIN EXECUTION
# =========================
btc_short_train = df[(df['asset']=='BTC') & (df['gap'].isin(SHORT_GAPS))]

feature_cols = [
    'klineacc','spread','spreadper','x','vwap','deviation',
    'ratio','term','sigma','e','h','return','volatility',
    'delta_seconds_to_next','return_per_sec',
    'return_lag1','return_lag2','return_lag3',
    'volatility_5','volatility_20','mean_return_5','mean_return_20'
]

model_short, scaler_short, train_eval = train_short_term(btc_short_train, feature_cols)

print("Short-term model trained with fixed thresholds and lagged features.")

btc_short_new = ndf[(ndf['asset']=='BTC') & (ndf['gap'].isin(SHORT_GAPS))]
evaluate_short_term(model_short, scaler_short, btc_short_new, feature_cols, tail_rows=150)

Short-term model trained with fixed thresholds and lagged features.

Short-term evaluation:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        30
           1     0.8938    1.0000    0.9439       446
           2     0.0000    0.0000    0.0000        23

    accuracy                         0.8938       499
   macro avg     0.2979    0.3333    0.3146       499
weighted avg     0.7989    0.8938    0.8437       499


=== Short-term comparison ===
               timestamp        return  volatility   signal target_signal
2489 2025-12-07 01:43:33 -4.346631e-05    0.000132  Neutral       Neutral
2495 2025-12-07 01:44:14  2.069086e-05    0.000132  Neutral       Neutral
2501 2025-12-07 01:49:35  1.623938e-04    0.000136  Neutral       Neutral
2508 2025-12-07 01:57:00 -1.181249e-04    0.000100  Neutral       Neutral
2514 2025-12-07 02:02:16  6.954460e-07    0.000087  Neutral       Neutral
2519 2025-12-07 02:09:49  2.903485e-05    0.000085  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# new new

In [114]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

ETH_SHORT_GAPS = ['1m','3m','5m','15m','30m','1h']
LABELS = {0:"Down",1:"Neutral",2:"Up"}

# -------- Features --------
def fe_eth(df):
    d = df.copy().sort_values('timestamp')
    d['ret_vwap'] = d['vwap'].pct_change()
    d['ret_per_sec'] = d['ret_vwap'] / d['timestamp'].diff().shift(-1).dt.total_seconds().replace(0, np.nan)
    d['ret_std_10'] = d['ret_vwap'].rolling(10).std()
    d = d.dropna()
    return d

# -------- Target with auto-relax --------
def build_target_adaptive(df):
    nxt = df['vwap'].pct_change().shift(-1)
    sig = df['sigma'].shift(-1)
    d = df.iloc[:-1].copy()
    d['next_ret'] = nxt.iloc[:-1].values
    d['next_sigma'] = sig.iloc[:-1].values

    # Much wider sigma multipliers
    sigma_grid = [0.60, 0.50, 0.40, 0.30, 0.25, 0.20, 0.15, 0.10, 0.08, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001]

    # Looser gates to avoid blocking borderline cases
    gate_grid = [
        dict(ratio_up=0.95, ratio_down=1.05, h_up=-0.10, h_down=0.10, x_up=-0.10, x_down=0.10),
        dict(ratio_up=0.90, ratio_down=1.10, h_up=-0.20, h_down=0.20, x_up=-0.20, x_down=0.20),
        dict(ratio_up=0.85, ratio_down=1.15, h_up=-0.30, h_down=0.30, x_up=-0.30, x_down=0.30),
        dict(ratio_up=0.80, ratio_down=1.20, h_up=-0.40, h_down=0.40, x_up=-0.40, x_down=0.40),
    ]

    # Neutral band: much tighter so Neutral doesn’t swallow Up/Down
    neutral_band = 0.003  # Neutral if |next_ret| <= neutral_band * sigma

    def classify(r, s, ratio, h, x, sm, g):
        if pd.isna(r) or pd.isna(s):
            return np.nan
        up = (r >= sm * s) and (ratio > g['ratio_up']) and (h > g['h_up']) and (x > g['x_up'])
        dn = (r <= -sm * s) and (ratio < g['ratio_down']) and (h < g['h_down']) and (x < g['x_down'])
        if up: return 2
        if dn: return 0
        # Only Neutral if movement is truly tiny relative to sigma
        if abs(r) <= neutral_band * s: return 1
        # Outside tiny band but signals disagree → Neutral (safety)
        return 1

    best = None
    best_counts = None

    for sm in sigma_grid:
        for g in gate_grid:
            tgt = [classify(r, s, ra, hh, xx, sm, g)
                   for r, s, ra, hh, xx in zip(d['next_ret'], d['next_sigma'], d['ratio'], d['h'], d['x'])]
            d['target'] = tgt
            d2 = d.dropna(subset=['target']).copy()
            d2['target'] = d2['target'].astype(int)
            counts = d2['target'].value_counts().reindex([0, 1, 2], fill_value=0)
            print(f"Try sigma_mult={sm:.3f}, gates={g}, counts={counts.to_dict()}")

            # Track best split (maximize the smallest class)
            if best is None or (counts.min() > best_counts.min()):
                best, best_counts = (d2.copy(), counts.copy())

            # Success: accept realistic short-term support
            if counts[0] >= 40 and counts[1] >= 300 and counts[2] >= 40:
                d2['sigma_mult'] = sm
                d2['gates'] = str(g)
                d2['neutral_band'] = neutral_band
                return d2

    # Fallback: return the best found (no exception), with metadata
    best['sigma_mult'] = np.nan
    best['gates'] = 'fallback_best'
    best['neutral_band'] = neutral_band
    print(f"Adaptive fallback used. Best counts={best_counts.to_dict()}")
    return best


# -------- Train --------
def train_eth(train_df, feat_cols):
    f = fe_eth(train_df)
    lbl = build_target_adaptive(f)
    X, y = lbl[feat_cols], lbl['target']
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)
    # simple class weights
    counts = y.value_counts().reindex([0,1,2], fill_value=1)
    invw = y.map({c: 1.0 / counts[c] for c in [0,1,2]}).values
    clf = XGBClassifier(n_estimators=250, max_depth=5, learning_rate=0.08, subsample=0.85, colsample_bytree=0.85, eval_metric='mlogloss', random_state=42)
    clf.fit(Xs, y, sample_weight=invw)
    return clf, scaler

# -------- Eval --------
def eval_eth(clf, scaler, new_df, feat_cols, tau=0.6):
    f = fe_eth(new_df)
    lbl = build_target_adaptive(f)  # build on same adaptive rule for comparability
    X, y = lbl[feat_cols], lbl['target']
    Xs = scaler.transform(X)
    proba = clf.predict_proba(Xs)
    top = proba.argmax(1)
    conf = proba.max(1)
    pred = np.where(conf < tau, 1, top)
    print("\nshort-term (multi-variable) report:")
    print(classification_report(y, pred, target_names=["Down","Neutral","Up"], digits=4))
    out = lbl.copy()
    out['pred'] = pred
    out['signal'] = out['pred'].map(LABELS)
    out['target_signal'] = out['target'].map(LABELS)
    out['conf'] = conf
    print("\n=== Comparison (tail) ===")
    print(out[['timestamp','ret_vwap','sigma','ratio','h','x','signal','target_signal','conf']].tail(20))

# -------- MAIN --------
eth_train = df[(df['asset']=='BTC') & (df['gap'].isin(ETH_SHORT_GAPS))].copy()
eth_new   = ndf[(ndf['asset']=='BTC') & (ndf['gap'].isin(ETH_SHORT_GAPS))].copy()

features = ['vwap','sigma','ratio','h','x','ret_vwap','ret_per_sec','ret_std_10']
clf, scaler = train_eth(eth_train, features)
eval_eth(clf, scaler, eth_new, features, tau=0.6)


Try sigma_mult=0.600, gates={'ratio_up': 0.95, 'ratio_down': 1.05, 'h_up': -0.1, 'h_down': 0.1, 'x_up': -0.1, 'x_down': 0.1}, counts={0: 7, 1: 9586, 2: 7}
Try sigma_mult=0.600, gates={'ratio_up': 0.9, 'ratio_down': 1.1, 'h_up': -0.2, 'h_down': 0.2, 'x_up': -0.2, 'x_down': 0.2}, counts={0: 11, 1: 9577, 2: 12}
Try sigma_mult=0.600, gates={'ratio_up': 0.85, 'ratio_down': 1.15, 'h_up': -0.3, 'h_down': 0.3, 'x_up': -0.3, 'x_down': 0.3}, counts={0: 14, 1: 9568, 2: 18}
Try sigma_mult=0.600, gates={'ratio_up': 0.8, 'ratio_down': 1.2, 'h_up': -0.4, 'h_down': 0.4, 'x_up': -0.4, 'x_down': 0.4}, counts={0: 19, 1: 9558, 2: 23}
Try sigma_mult=0.500, gates={'ratio_up': 0.95, 'ratio_down': 1.05, 'h_up': -0.1, 'h_down': 0.1, 'x_up': -0.1, 'x_down': 0.1}, counts={0: 9, 1: 9584, 2: 7}
Try sigma_mult=0.500, gates={'ratio_up': 0.9, 'ratio_down': 1.1, 'h_up': -0.2, 'h_down': 0.2, 'x_up': -0.2, 'x_down': 0.2}, counts={0: 13, 1: 9573, 2: 14}
Try sigma_mult=0.500, gates={'ratio_up': 0.85, 'ratio_down': 1.15, '

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# new*3