# Data preprocessing

Notebook for preprocessing overtrading datasets.


## Concepts

- **Session**: one fixed-time window (e.g., each 15-minute window is a session).
- **Core window**: same as session in this setup.
- **Overtrading indicators**: optional global summaries across all windows (for context only).

This notebook builds one **training row per window**.

**Note:** each row corresponds to a single window and includes both the core-window
features and the per-window overtrading indicators.


In [3]:
import pandas as pd
from pathlib import Path
import sys


# Load mock behaviors datasets
base_dir = Path('.').resolve()
print(base_dir.absolute())

sys.path.append(str(base_dir / '../..'))

data_dir = (base_dir / '../../mock_behaviours').resolve()

bias_present_file = (data_dir / 'revenge_example.csv').absolute()
bias_negative_file = (data_dir / 'balanced_example.csv').absolute()

C:\Users\johnl\Documents\CogniTrade\backend\models\revenge_trading_model


In [4]:
from extract_features import extract_derived_features

bias_present_df = extract_derived_features(bias_present_file)
bias_negative_df = extract_derived_features(bias_negative_file)

2200
2200
1200
1200


In [5]:
# Define a baseline and post loss window, at each loss event
bias_present_loss_events = bias_present_df[~bias_present_df['IsWin']]
bias_negative_loss_events = bias_negative_df[~bias_negative_df['IsWin']]

In [6]:
def compute_core_window_vector(win, eps) -> pd.DataFrame:
    if win.empty:
        return pd.DataFrame()

    win = win.copy()

    start = win.iloc[0]['timestamp']
    end = win.iloc[-1]['timestamp']


    # Time / activity
    n_trades = len(win)
    window_minutes = (end - start).total_seconds() / 60
    trade_rate_per_min = n_trades / window_minutes

    gaps_sec = win['MinsSinceLastTrade'] * 60
    median_gap_sec = gaps_sec.median()
    mean_gap_sec   = gaps_sec.mean()

    burst_frac = (gaps_sec <= 60).mean()

    # Instrument churn
    n_assets = win['asset'].nunique()

    top_asset_share = (win['asset'].value_counts(normalize=True).iloc[0])

    asset_changes = (win['asset'].ne(win['asset'].shift()).sum() - 1)
    asset_switch_rate = asset_changes / max(n_trades - 1, 1)

    # Sizing / turnover
    sizing = win['TradeSize']

    sizing_sum  = sizing.sum()
    sizing_mean = sizing.mean()
    sizing_std  = sizing.std()

    window_start_balance = win.iloc[0]["balance"]
    turnover = sizing_sum / (window_start_balance + eps)    # or sum TradeSizePctBalance

    # P/L distribution
    pnl = win['profit_loss']

    pnl_sum  = pnl.sum()
    pnl_mean = pnl.mean()
    pnl_std  = pnl.std()

    win_rate = win['IsWin'].mean()
    positive_pnl = pnl[pnl > 0]
    avg_gain = positive_pnl.mean() if not positive_pnl.empty else 0.0
    negative_pnl = pnl[pnl < 0]
    avg_loss_abs = negative_pnl.abs().mean() if not negative_pnl.empty else 0.0
    payoff_ratio = avg_gain / (avg_loss_abs + eps)

    pnl_skew_proxy = (pnl.quantile(0.9) + pnl.quantile(0.1)) / (abs(pnl.quantile(0.5)) + eps)

    # Drawdown proxy
    min_balance = win['balance'].min()
    dd_max = (min_balance - window_start_balance) / window_start_balance


    return {
        # "window_start": str(start),
        # "window_end": str(end),
        "n_trades": n_trades,
        "trade_rate_per_min": trade_rate_per_min,
        "median_gap_sec": median_gap_sec,
        "mean_gap_sec": mean_gap_sec,
        # "gap_cv": gap_cv,
        "burst_frac": burst_frac,
        "n_assets": n_assets,
        "top_asset_share": top_asset_share,
        "asset_switch_rate": asset_switch_rate,
        "sizing_sum": sizing_sum,
        "sizing_mean": sizing_mean,
        "sizing_std": sizing_std,
        "turnover": turnover,
        "pnl_sum": pnl_sum,
        "pnl_mean": pnl_mean,
        "pnl_std": pnl_std,
        "win_rate": win_rate,
        "avg_gain": avg_gain,
        "avg_loss_abs": avg_loss_abs,
        "payoff_ratio": payoff_ratio,
        "pnl_skew_proxy": pnl_skew_proxy,
        "dd_max": dd_max,
        "window_start_balance": window_start_balance,
    }

In [7]:
def compute_revenge_trading_window_vector(base, post, eps: float = 1e-9):
    base = compute_core_window_vector(base, eps)
    post = compute_core_window_vector(post, eps)

    # REVENGE TRADING INDICATORS
    post_trade_rate_ratio = post['trade_rate_per_min'] / (base['trade_rate_per_min'] + eps)
    post_turnover_ratio   = post['turnover'] / (base['turnover'] + eps)
    post_sizing_mean_ratio = post['sizing_mean'] / (base['sizing_mean'] + eps)

    post_win_rate_delta = post['win_rate'] - base['win_rate']
    post_pnl_vol_ratio  = post['pnl_std'] / (base['pnl_std'] + eps)

    post_asset_switch_delta = post['asset_switch_rate'] - base['asset_switch_rate']
    post_burst_frac_delta   = post['burst_frac'] - base['burst_frac']

    vec = {
        'post_trade_rate_ratio': post_trade_rate_ratio,
        'post_turnover_delta': post_turnover_ratio,
        'post_sizing_mean_ratio': post_sizing_mean_ratio,
        'post_win_rate_delta': post_win_rate_delta,
        'post_pnl_vol_ratio': post_pnl_vol_ratio,
        'post_asset_switch_delta': post_asset_switch_delta,
        'post_burst_frac_delta': post_burst_frac_delta,
    }
    # print('---------------------------------------------------------------')
    # print(base)
    # print(post)
    return vec | post    


## Training dataframe (one row per window)

Each window is treated as a session, so the training dataframe is simply the
core window features with a `session_id` per window.


In [8]:
BASELINE_WIN_SIZE = 15
POSTLOSS_WIN_SIZE = 5
MIN_WIN=3

samples = pd.DataFrame()

for idx, loss in bias_present_loss_events.iterrows():
    base_start = max(0, idx - BASELINE_WIN_SIZE)
    post_end = min(idx + POSTLOSS_WIN_SIZE, len(bias_present_df))
    baseline = bias_present_df.iloc[base_start:idx]
    postloss = bias_present_df.iloc[idx:post_end]

    if len(baseline) > MIN_WIN and len(postloss) > MIN_WIN:
        vec = compute_revenge_trading_window_vector(baseline, postloss)
        vec['revenge_trader_window'] = 1
        # print(vec)
        samples = pd.concat([samples, pd.DataFrame([vec])], ignore_index=True)


for idx, loss in bias_negative_df.iterrows():
    base_start = max(0, idx - BASELINE_WIN_SIZE)
    post_end = min(idx + POSTLOSS_WIN_SIZE, len(bias_negative_df))
    baseline = bias_negative_df.iloc[base_start:idx]
    postloss = bias_negative_df.iloc[idx:post_end]

    if len(baseline) > MIN_WIN and len(postloss) > MIN_WIN:
        vec = compute_revenge_trading_window_vector(baseline, postloss)
        vec['revenge_trader_window'] = 0
        # print(vec)
        samples = pd.concat([samples, pd.DataFrame([vec])], ignore_index=True)

In [9]:
samples

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(samples[samples.isna().any(axis=1)])

Empty DataFrame
Columns: [post_trade_rate_ratio, post_turnover_delta, post_sizing_mean_ratio, post_win_rate_delta, post_pnl_vol_ratio, post_asset_switch_delta, post_burst_frac_delta, n_trades, trade_rate_per_min, median_gap_sec, mean_gap_sec, burst_frac, n_assets, top_asset_share, asset_switch_rate, sizing_sum, sizing_mean, sizing_std, turnover, pnl_sum, pnl_mean, pnl_std, win_rate, avg_gain, avg_loss_abs, payoff_ratio, pnl_skew_proxy, dd_max, window_start_balance, revenge_trader_window]
Index: []


In [10]:
# import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import joblib

# -----------------------------
# Step 0: Prepare data
# -----------------------------
# X = [[n_trades, trade_rate, gap_mean, gap_std, n_assets, top_asset_share, turnover, win_rate, ...], ...]
# y = [1, 0, 0, 1, ...]  # 1 = revenge_trader window, 0 = calm_trader window
label = 'revenge_trader_window'
feature_cols = [k for k in samples.columns if k != label]
X = samples[feature_cols].to_numpy(dtype=float)
y = samples[label].to_numpy(dtype=float)

X = np.array(X)  # shape (num_windows, num_features)
y = np.array(y)    # shape (num_windows,)

# -----------------------------
# Step 1: Split into train/val
# -----------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# Step 2: Train Gradient Boosted Decision Tree classifier
# -----------------------------
model = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42
)

# model.fit(
#     X_train, y_train,
#     eval_set=[(X_val, y_val)],
#     early_stopping_rounds=20,
#     verbose=True
# )

model.fit(X_train, y_train)


# -----------------------------
# Step 3: Evaluate
# -----------------------------
y_pred_proba = model.predict_proba(X_val)[:, 1]  # probability of revenge
# y_pred = (y_pred_proba >= 0.5).astype(int)    # probability of revenge_trader_window=1

# some accuracy stuff
# print("Accuracy:", accuracy_score(y_val, y_pred_proba))
# print("ROC-AUC:", roc_auc_score(y_val, y_pred_proba))
# print(classification_report(y_val, y_pred_proba))


# SAVE MODEL
artifact = {
    "model": model,
    "feature_keys": feature_cols,   # order matters!
    "threshold": 0.7,               # or whatever you choose
    "window_config": {
        "baseline_num_trades": BASELINE_WIN_SIZE,
        "postloss_num_trades": POSTLOSS_WIN_SIZE
    }
}

joblib.dump(artifact, 'revenge_model.joblib')


# y_proba = model.predict_proba(X)[:, 1]
# samples['pred_revenge_trader_window'] = y_proba

# samples.to_csv(base_dir / 'revenge_out.csv')

['revenge_model.joblib']