In [4]:
# --- 03_ml_training.ipynb (Fixed) ---
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Output locations for the trained artifacts.
MODEL_PATH = 'models/xgb_model.pkl'
SCALER_PATH = 'models/scaler.pkl'

# --- LOAD FEATURES ---
# The 'open_time' column becomes a parsed datetime index.
df = pd.read_csv('data/features.csv', index_col='open_time', parse_dates=True)

# --- CLEAN DATA (Fix inf/NaN/extreme values) ---
# Infinite values are first turned into NaN, then every NaN is filled with 0.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Cap every float column to [-1e6, 1e6]; non-float columns are left untouched.
for float_col in df.select_dtypes(include=['float64', 'float32']).columns:
    df[float_col] = df[float_col].clip(lower=-1e6, upper=1e6)

# --- CREATE LABELS ---
# Three-class target derived from the next bar's return:
#   1 = BUY  (future return above  +RETURN_THRESHOLD)
#   0 = SELL (future return below  -RETURN_THRESHOLD)
#   2 = HOLD (everything in between)
RETURN_THRESHOLD = 0.005

# Percentage change of 'close' from this row to the next, aligned to this row.
df['future_return'] = df['close'].pct_change().shift(-1)

# A close of 0 (which the fillna(0) cleaning step can introduce) makes
# pct_change yield +/-inf. Such a value would pass the threshold comparison
# below and be labelled as a genuine signal, so convert it to NaN and let
# dropna() discard the row, just like the undefined 0/0 case.
df['future_return'] = df['future_return'].replace([np.inf, -np.inf], np.nan)

df['label'] = np.where(
    df['future_return'] > RETURN_THRESHOLD, 1,            # BUY
    np.where(df['future_return'] < -RETURN_THRESHOLD, 0,  # SELL
             2),                                          # HOLD
)

# Removes the last row (it has no next bar) and any row whose future return
# is undefined; NaN rows received a placeholder label of 2 above.
df.dropna(inplace=True)

print("Data after cleaning and labels:")
print(df.head())

# --- FEATURE COLUMNS ---
# Model inputs: return/flow features followed by lagged close and volume values.
feature_cols = [
    'return',
    'log_return',
    'volatility',
    'volume_change',
    'cvd',
    'imbalance',
    'liq_risk',
    'close_lag_1',
    'close_lag_3',
    'close_lag_5',
    'close_lag_10',
    'volume_lag_1',
    'volume_lag_3',
    'volume_lag_5',
    'volume_lag_10',
]

# Any requested feature absent from the frame is reported and then dropped
# from the list, so the selection below only uses columns that exist.
available = set(df.columns)
missing_cols = [name for name in feature_cols if name not in available]
if missing_cols:
    print(f"WARNING: Missing columns: {missing_cols}")
    feature_cols = [name for name in feature_cols if name in available]

# Feature matrix and target vector.
X = df.loc[:, feature_cols]
y = df.loc[:, 'label']

# --- TRAIN-TEST SPLIT ---
# Split BEFORE scaling. shuffle=False keeps row order, so the first 80% of rows
# form the training set and the last 20% the test set.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# --- SCALE FEATURES ---
# Fit the scaler on the training rows only. Fitting on the full dataset would
# leak the test period's mean/variance into training and into the saved scaler,
# making the evaluation optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix in original row order, kept so code that still refers to
# X_scaled continues to work.
X_scaled = np.vstack([X_train, X_test])

# --- TRAIN XGBOOST ---
# Hyperparameters are gathered in one mapping so they can be inspected or
# tweaked in a single place before being handed to the classifier.
xgb_params = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'eval_metric': 'mlogloss',
}
model = XGBClassifier(**xgb_params)

# Fit on the training portion only.
model.fit(X_train, y_train)

# --- EVALUATE ---
# Predict on the held-out rows, then report overall accuracy and the
# per-class classification report.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)

# --- SAVE MODEL & SCALER ---
# Create the target directories first: joblib.dump does not create missing
# parent folders, and failing here would throw away the trained model.
for artifact, artifact_path in ((model, MODEL_PATH), (scaler, SCALER_PATH)):
    # dirname is '' for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(artifact_path) or '.', exist_ok=True)
    joblib.dump(artifact, artifact_path)
print(f"Model saved to {MODEL_PATH}")
print(f"Scaler saved to {SCALER_PATH}")

Data after cleaning and labels:
                        open     high      low    close     volume    return  \
open_time                                                                      
2017-08-19 05:00:00  4123.95  4123.95  4027.83  4077.00  12.514194  0.002148   
2017-08-19 06:00:00  4077.00  4082.00  3986.87  3986.87  51.455431 -0.022107   
2017-08-19 07:00:00  3969.12  4033.47  3933.21  4033.47  31.429222  0.011688   
2017-08-19 08:00:00  4033.47  4064.84  3964.08  3999.00  18.006405 -0.008546   
2017-08-19 09:00:00  3999.00  4082.25  3999.00  4068.20   5.184223  0.017304   

                     log_return  volatility  volume_change  delta  ...  \
open_time                                                          ...   
2017-08-19 05:00:00    0.002146    0.013284      -0.278855 -46.95  ...   
2017-08-19 06:00:00   -0.022355    0.013488       3.111765 -90.13  ...   
2017-08-19 07:00:00    0.011621    0.013957      -0.389195  64.35  ...   
2017-08-19 08:00:00   -0.008583    0.