# Overtrading model training

Train a **window-level overtrading classifier** from `data_preprocessing_training.csv`.

## What this notebook does
- Uses `is_calm` as the label source (`is_overtrading = is_calm == 0`).
- Drops time identifiers (`window_start`, `window_end`, `session_id`) and `is_calm` from features.
- Uses a **time-aware split** (first 80% train, last 20% validation).
- Trains **XGBoost** by default (falls back to sklearn GradientBoosting if XGBoost is unavailable).
- Outputs `overtrading_predictions.csv` with `y_true`, `y_pred`, and `y_score`.


## Rationale

- **Target**: use `is_calm` as the label; we train on **P(is_overtrading)** where `is_overtrading = (is_calm == 0)`.
- **Features**: cross-person comparable window features (`trade_rate_per_min`, gap stats, asset diversity, turnover, PnL shape); `is_calm` is **not** included as a feature to avoid leakage.
- **Output**: the model returns a probability score for overtrading per window.
- **Why boosting**: overtrading is driven by interactions (high trade rate + low gaps + high burst), which tree‑boosting captures well.
- **Time split**: we split by time to reduce leakage across adjacent windows.


In [51]:
import pandas as pd
from pathlib import Path


In [52]:
# Load training data
# This notebook lives in model_training/, so data is in ../data_analysis/
data_path = Path("../data_analysis/data_preprocessing_training.csv")
df = pd.read_csv(data_path)
df.head()


Unnamed: 0,window_start,window_end,n_trades,trade_rate_per_min,median_gap_sec,mean_gap_sec,gap_cv,burst_frac,n_assets,top_asset_share,...,pnl_sum,pnl_mean,pnl_std,win_rate,payoff_ratio,pnl_skew_proxy,session_id,source_file,is_calm,turnover_per_hour
0,2025-01-01 21:52:00+00:00,2025-01-01 22:07:00+00:00,1,0.066667,,,,,1,1.0,...,-1.15,-1.15,,0.0,,-2.0,2025-01-01T21:52:00Z,balanced_example.csv,1,0.089022
1,2025-01-01 21:57:00+00:00,2025-01-01 22:12:00+00:00,0,0.0,,,,,0,,...,0.0,,,,,,2025-01-01T21:57:00Z,balanced_example.csv,1,
2,2025-01-01 22:02:00+00:00,2025-01-01 22:17:00+00:00,0,0.0,,,,,0,,...,0.0,,,,,,2025-01-01T22:02:00Z,balanced_example.csv,1,
3,2025-01-01 22:07:00+00:00,2025-01-01 22:22:00+00:00,0,0.0,,,,,0,,...,0.0,,,,,,2025-01-01T22:07:00Z,balanced_example.csv,1,
4,2025-01-01 22:12:00+00:00,2025-01-01 22:27:00+00:00,0,0.0,,,,,0,,...,0.0,,,,,,2025-01-01T22:12:00Z,balanced_example.csv,1,


In [53]:
# ── Filter: keep only windows with >= 2 trades ──────────────────────
# Gap stats (median_gap_sec, gap_cv, etc.) are undefined for 0-1 trade windows,
# producing all-NaN rows that add noise without signal.
print(f"Rows before filter: {len(df):,}")
df = df[df["n_trades"] >= 2].copy()
print(f"Rows after  filter (n_trades >= 2): {len(df):,}")

# ── Label ────────────────────────────────────────────────────────────
y = (df["is_calm"] == 0).astype(int)
print(f"Class balance — overtrading: {y.mean():.2%}, calm: {1 - y.mean():.2%}")

# ── Feature selection ────────────────────────────────────────────────
drop_cols = {
    # Metadata / identifiers
    "window_start",
    "window_end",
    "session_id",
    "source_file",
    "is_calm",
    # Redundant (keep one per correlated pair)
    "n_trades",       # perfectly correlated with trade_rate_per_min in fixed-length windows
    "turnover",       # scaled version of turnover_per_hour
    # Outcome features — not behavioral signals
    "pnl_sum",
    "pnl_mean",
    "win_rate",
    "payoff_ratio",
    # Guard against stale CSVs
    "trade_rate_gt_p90",
    "p90_trade_rate_global",
    "hot_streak_len",
    "dd_max",
    "window_start_balance",
    "notional_sum",
    "notional_mean",
    "notional_std",
    "avg_gain",
    "avg_loss_abs",
}
X = df.drop(columns=[c for c in drop_cols if c in df.columns])
# NOTE: no fillna here — XGBoost handles NaN natively.
# For the sklearn fallback, fillna(-1) is applied in the model-selection cell.
print(f"Features ({X.shape[1]}): {list(X.columns)}")

# ── Time-aware split (80/20) with gap ───────────────────────────────
df_sorted = df.sort_values("window_start")
split_idx = int(len(df_sorted) * 0.8)
gap = 3  # skip 3 windows (= 1 full window_minutes) to prevent leakage
train_idx = df_sorted.index[:split_idx]
val_idx = df_sorted.index[split_idx + gap:]

X_train, y_train = X.loc[train_idx], y.loc[train_idx]
X_val, y_val = X.loc[val_idx], y.loc[val_idx]

print(f"Train: {X_train.shape}, Val: {X_val.shape}")
X_train.shape, X_val.shape


Rows before filter: 1,214,887
Rows after  filter (n_trades >= 2): 6,487
Class balance — overtrading: 66.92%, calm: 33.08%
Features (11): ['trade_rate_per_min', 'median_gap_sec', 'mean_gap_sec', 'gap_cv', 'burst_frac', 'n_assets', 'top_asset_share', 'asset_switch_rate', 'pnl_std', 'pnl_skew_proxy', 'turnover_per_hour']
Train: (5189, 11), Val: (1295, 11)


((5189, 11), (1295, 11))

In [54]:
# Train model (XGBoost preferred; fallback to sklearn if unavailable)
try:
    from xgboost import XGBClassifier
    USE_XGB = True
except Exception:
    USE_XGB = False

if USE_XGB:
    neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
    model = XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        scale_pos_weight=neg / pos,
        eval_metric="logloss",
        random_state=42,
    )
else:
    from sklearn.ensemble import GradientBoostingClassifier

    # sklearn can't handle NaN -- fill with -1 as a distinguishable sentinel
    X_train = X_train.fillna(-1)
    X_val = X_val.fillna(-1)

    model = GradientBoostingClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.9,
        random_state=42,
    )

model


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'binary:logistic'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.9
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [58]:
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

if USE_XGB:
    # XGBoost handles class imbalance via scale_pos_weight (set in cell 5)
    model.fit(X_train, y_train)
else:
    # GradientBoostingClassifier doesn't support class_weight;
    # pass sample_weight to up-weight the minority class instead.
    neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
    weight_map = {0: len(y_train) / (2 * neg), 1: len(y_train) / (2 * pos)}
    sw = np.array([weight_map[v] for v in y_train])
    model.fit(X_train, y_train, sample_weight=sw)

pred = model.predict(X_val)
proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else None

print(classification_report(y_val, pred))
if proba is not None:
    print("ROC AUC:", roc_auc_score(y_val, proba))


              precision    recall  f1-score   support

           0       0.67      0.02      0.04        88
           1       0.93      1.00      0.97      1207

    accuracy                           0.93      1295
   macro avg       0.80      0.51      0.50      1295
weighted avg       0.92      0.93      0.90      1295

ROC AUC: 0.7015750922648188


In [56]:
# Save trained model to disk
import joblib

# Derive path from data_path (defined in cell 3, known to work)
# data_path = ../data_analysis/data_preprocessing_training.csv
#   .parent  = ../data_analysis/
#   .parent  = ../                (= overtrading_model/)
#   / model_training              (= overtrading_model/model_training/)
model_dir = data_path.parent.parent / "model_training"
model_dir.mkdir(parents=True, exist_ok=True)
model_path = model_dir / "overtrading_model.joblib"
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")
model_path


Model saved to ../model_training/overtrading_model.joblib


PosixPath('../model_training/overtrading_model.joblib')

In [57]:
# Save predictions (optional)
out = df.loc[val_idx, ["window_start", "window_end"]].copy()
out["y_true"] = y_val.values
out["y_pred"] = pred
if proba is not None:
    out["y_score"] = proba

out_path = model_dir / "overtrading_predictions.csv"
out.to_csv(out_path, index=False)
print(f"Predictions saved to {out_path}")
out_path


Predictions saved to ../model_training/overtrading_predictions.csv


PosixPath('../model_training/overtrading_predictions.csv')