# Trading Bot Training (Colab)
**35 Features** | **M15 Timeframe** | **Semantic Actions** | **Position-Aware QNetwork**

Upload your EURUSD M5 CSV (columns: time, open, high, low, close, tick_volume) to `/content/EURUSD_M5.csv`. The M5 bars are resampled to M15 before the features are computed.

In [None]:
# ==================== SETTINGS ====================
class Settings:
    """Central configuration for the notebook: feature list, network sizes,
    RL / reward hyperparameters and the input data location."""

    # V2 Feature Set (35 features, alpha-proven at 55.1%)
    FEATURES = [
        'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos',
        'session_london', 'session_nyc',
        'ret_1', 'ret_5', 'ret_20', 'rsi_14', 'roc_10',
        'zscore_20', 'zscore_50', 'bb_position', 'vwap_deviation',
        'dist_ema_20', 'dist_ema_50', 'ema_slope_20', 'adx_14',
        'atr_normalized', 'bb_width', 'volatility_ratio', 'atr_percentile',
        'body_ratio', 'upper_wick_ratio', 'lower_wick_ratio',
        'volume_ratio', 'volume_trend', 'price_volume_corr',
        'dist_session_high', 'dist_session_low',
        'session_range_position', 'session_range_ratio',
        'h1_trend', 'h1_momentum',
    ]
    INPUT_DIM = len(FEATURES)  # 35
    SEQUENCE_LENGTH = 48       # 12 hours of M15
    ENCODER_DIM = 64           # Width of the per-bar feature encoder output
    HIDDEN_DIM = 128           # LSTM hidden size
    NUM_LAYERS = 2             # Stacked LSTM layers
    DROPOUT = 0.25
    OUTPUT_DIM = 4             # HOLD, BUY, SELL, CLOSE
    ACTION_NAMES = ['HOLD', 'BUY', 'SELL', 'CLOSE']
    TRADE_STATE_DIM = 3        # position, bars_held, unrealized_pnl

    # Training
    EPOCHS = 10
    BATCH_SIZE = 64
    LEARNING_RATE = 0.0003
    GAMMA = 0.95               # Discount factor in the TD target
    EPSILON_START = 1.0        # Initial exploration rate
    EPSILON_DECAY_STEPS = 750_000  # Step horizon of the linear epsilon decay
    EPSILON_MIN = 0.05         # Exploration floor
    TARGET_UPDATE_FREQ = 500   # Environment steps between target-network updates
    TAU = 0.005                # Mixing factor of the soft target update
    TRANSACTION_COST_BPS = 2   # Passed to TradingEnv as cost_bps
    TRADE_PENALTY = 0.5        # Fixed penalty per position change
    HOLD_BONUS = 0.1           # Bonus for patient holding (>4 bars)
    HOLD_BONUS_MIN_BARS = 4    # Min bars held before bonus
    REWARD_CLIP = 10.0         # Per-step reward is clipped to +/- this value
    SCALING_FACTOR = 10000.0   # For EURUSD
    PER_ALPHA = 0.6            # Priority exponent of the replay buffer
    PER_BETA_START = 0.4       # Importance-sampling exponent passed to sample()
    MEMORY_CAPACITY = 100000   # Replay buffer size (transitions)
    EARLY_STOP_PATIENCE = 10   # Epochs without a new best val Sharpe before stopping
    EPISODE_LENGTH = 2000      # ~21 days M15 - enough for trade lifecycle
    EPISODES_PER_EPOCH = 50    # 50x2000 = 100k steps/epoch

    # Data
    CSV_PATH = '/content/EURUSD_M5.csv'  # Upload your CSV here
    PAIR = 'EURUSD'            # Used in the saved model's file name

print(f'Settings loaded: {Settings.INPUT_DIM} features, {Settings.OUTPUT_DIM} actions')


In [None]:
import pandas as pd
import pandas_ta as ta
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import copy
import random
import os
import gc
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')


## V2 Feature Engineering (35 features)
Time structure, VWAP, session context, volume dynamics, multi-TF context

In [None]:
def resample_ohlcv(df, rule='15min'):
    """Aggregate OHLC bars onto a coarser time grid.

    Each output bar takes the first open, highest high, lowest low and last
    close of its bin; ``tick_volume`` is summed when the column exists.
    Rows containing NaN after aggregation are dropped.

    Args:
        df: bars indexed by timestamp with open/high/low/close columns.
        rule: pandas resampling rule for the target bar size.

    Returns:
        The resampled DataFrame.
    """
    how = dict(open='first', high='max', low='min', close='last')
    if 'tick_volume' in df.columns:
        how['tick_volume'] = 'sum'
    coarse = df.resample(rule).agg(how)
    return coarse.dropna()

def prepare_features(df):
    """Compute the V2 feature columns listed in ``Settings.FEATURES``.

    Args:
        df: OHLC bars with ``open``/``high``/``low``/``close`` columns and an
            optional ``tick_volume`` column. Must be indexed by a
            ``DatetimeIndex`` or carry a ``time`` column that can be parsed
            into one.

    Returns:
        A copy of ``df`` with the feature columns (plus a helper ``atr``
        column) added. NaN and +/-inf values are replaced by 0, every feature
        is clipped to [-10, 10], and the first 250 warm-up rows are dropped.

    Raises:
        ValueError: if ``df`` has neither a ``DatetimeIndex`` nor a ``time``
            column.
    """
    df = df.copy()
    if not isinstance(df.index, pd.DatetimeIndex):
        if 'time' not in df.columns:
            raise ValueError("prepare_features needs a DatetimeIndex or a 'time' column")
        df['time'] = pd.to_datetime(df['time'])
        df.set_index('time', inplace=True)

    # 1. TIME STRUCTURE (cyclical encodings + session flags)
    hour = df.index.hour + df.index.minute / 60.0
    df['hour_sin'] = np.sin(2 * np.pi * hour / 24.0)
    df['hour_cos'] = np.cos(2 * np.pi * hour / 24.0)
    dow = df.index.dayofweek
    # NOTE(review): a period of 5 assumes Mon-Fri bars only; a Sunday bar
    # (dayofweek 6) lands on the same angle as Tuesday - confirm intended.
    df['dow_sin'] = np.sin(2 * np.pi * dow / 5.0)
    df['dow_cos'] = np.cos(2 * np.pi * dow / 5.0)
    # NOTE(review): the session windows presume the index is in UTC - confirm.
    utc_hour = df.index.hour
    df['session_london'] = ((utc_hour >= 7) & (utc_hour < 16)).astype(float)
    df['session_nyc'] = ((utc_hour >= 13) & (utc_hour < 22)).astype(float)

    # 2. BASE (ATR is the normaliser for most distance features below)
    atr = ta.atr(df['high'], df['low'], df['close'], length=14)
    df['atr'] = atr.bfill().fillna(df['close'] * 0.001)

    # 3. RETURNS & MOMENTUM (returns are z-scored over a 60-bar window)
    for lag in [1, 5, 20]:
        raw_ret = df['close'].pct_change(lag)
        rm = raw_ret.rolling(60).mean()
        rs = raw_ret.rolling(60).std().replace(0, np.nan).fillna(raw_ret.std())
        df[f'ret_{lag}'] = (raw_ret - rm) / rs
    rsi = ta.rsi(df['close'], length=14)
    df['rsi_14'] = (rsi / 100.0) if rsi is not None else 0.5
    roc = ta.roc(df['close'], length=10)
    df['roc_10'] = (roc / 100.0) if roc is not None else 0.0

    # 4. MEAN REVERSION
    for w in [20, 50]:
        rm = df['close'].rolling(w).mean()
        rs = df['close'].rolling(w).std().replace(0, np.nan).fillna(1e-8)
        df[f'zscore_{w}'] = (df['close'] - rm) / rs
    bb = ta.bbands(df['close'], length=20, std=2)
    if bb is not None and not bb.empty:
        # Presumably the first three columns are lower / middle / upper band;
        # verify against the installed pandas_ta version.
        bbl, bbm, bbu = bb.iloc[:, 0], bb.iloc[:, 1], bb.iloc[:, 2]
        bb_range = (bbu - bbl).replace(0, np.nan).fillna(1e-8)
        df['bb_position'] = (df['close'] - bbl) / bb_range
        df['bb_width'] = (bbu - bbl) / df['close']
    else:
        df['bb_position'], df['bb_width'] = 0.5, 0.0

    # VWAP (cumulative within each calendar date; volume defaults to 1)
    vol_col = 'tick_volume' if 'tick_volume' in df.columns else None
    tv = df[vol_col].astype(float).replace(0, 1) if vol_col else pd.Series(1.0, index=df.index)
    tp = (df['high'] + df['low'] + df['close']) / 3.0
    dg = df.index.date
    cum_tp_vol = (tp * tv).groupby(dg).cumsum()
    cum_vol = tv.groupby(dg).cumsum().replace(0, np.nan).fillna(1)
    vwap = cum_tp_vol / cum_vol
    df['vwap_deviation'] = (df['close'] - vwap) / df['atr']

    # 5. TREND
    ema_20 = ta.ema(df['close'], length=20)
    ema_50 = ta.ema(df['close'], length=50)
    df['dist_ema_20'] = (df['close'] - ema_20) / df['atr']
    df['dist_ema_50'] = (df['close'] - ema_50) / df['atr']
    df['ema_slope_20'] = (ema_20 - ema_20.shift(5)) / df['atr']
    adx_df = ta.adx(df['high'], df['low'], df['close'], length=14)
    df['adx_14'] = (adx_df.iloc[:, 0] / 100.0) if adx_df is not None and not adx_df.empty else 0.0

    # 6. VOLATILITY
    df['atr_normalized'] = df['atr'] / df['close']
    rv = df['close'].pct_change().rolling(20).std()
    rv_mean = rv.rolling(100).mean().replace(0, np.nan).fillna(rv.mean())
    df['volatility_ratio'] = rv / rv_mean
    atr_min = df['atr'].rolling(100, min_periods=1).min()
    atr_max = df['atr'].rolling(100, min_periods=1).max()
    atr_range = (atr_max - atr_min).replace(0, np.nan).fillna(1e-8)
    df['atr_percentile'] = (df['atr'] - atr_min) / atr_range

    # 7. CANDLE (body and wicks in ATR units)
    df['body_ratio'] = (df['close'] - df['open']) / df['atr']
    df['upper_wick_ratio'] = (df['high'] - df[['open', 'close']].max(axis=1)) / df['atr']
    df['lower_wick_ratio'] = (df[['open', 'close']].min(axis=1) - df['low']) / df['atr']

    # 8. VOLUME
    if vol_col:
        vol_avg = tv.rolling(20).mean().replace(0, np.nan).fillna(tv.mean())
        df['volume_ratio'] = tv / vol_avg
        df['volume_trend'] = (tv.rolling(10).mean() - tv.rolling(20).mean()) / (tv.rolling(20).mean() + 1e-8)
        df['price_volume_corr'] = df['close'].pct_change().rolling(10).corr(tv.pct_change())
    else:
        df['volume_ratio'], df['volume_trend'], df['price_volume_corr'] = 1.0, 0.0, 0.0

    # 9. SESSION CONTEXT (running high/low of the current calendar date)
    sh = df['high'].groupby(dg).cummax()
    sl = df['low'].groupby(dg).cummin()
    sr = (sh - sl).replace(0, np.nan).fillna(1e-8)
    df['dist_session_high'] = (sh - df['close']) / df['atr']
    df['dist_session_low'] = (df['close'] - sl) / df['atr']
    df['session_range_position'] = (df['close'] - sl) / sr
    dh = df['high'].resample('D').max()
    dlw = df['low'].resample('D').min()
    dr = (dh - dlw).dropna()
    # Average range of the previous days only: shifting by one row keeps the
    # current day's (still unfinished) range out of its own denominator.
    ar = dr.rolling(20, min_periods=1).mean().shift(1).replace(0, np.nan)
    # The daily series is indexed by midnight Timestamps, so it has to be
    # looked up with normalized timestamps; datetime.date keys never match
    # and would leave the feature at its constant fallback value.
    prev_avg_range = ar.reindex(df.index.normalize()).to_numpy()
    # Running range of the day so far relative to a typical recent day.
    df['session_range_ratio'] = ((sh - sl) / prev_avg_range).fillna(1.0)

    # 10. MULTI-TF (longer windows on the same bars stand in for H1 indicators)
    h1_ema = ta.ema(df['close'], length=48)
    df['h1_trend'] = (h1_ema - h1_ema.shift(12)) / df['atr']
    h1_rsi = ta.rsi(df['close'], length=56)
    df['h1_momentum'] = (h1_rsi / 100.0) if h1_rsi is not None else 0.5

    # CLEANUP
    df.fillna(0, inplace=True)
    df.replace([np.inf, -np.inf], 0, inplace=True)
    for f in Settings.FEATURES:
        if f in df.columns:
            df[f] = df[f].clip(-10, 10)
    # Drop the warm-up rows whose rolling windows are not fully populated.
    return df.iloc[250:]

print('V2 feature engineering loaded.')


## Load & Prepare Data

In [None]:
# Load the raw M5 CSV and index it by timestamp
raw_df = pd.read_csv(Settings.CSV_PATH)
raw_df['time'] = pd.to_datetime(raw_df['time'], format='mixed')
raw_df.set_index('time', inplace=True)
# Resampling and the rolling features rely on chronological order.
raw_df.sort_index(inplace=True)
print(f'Loaded {len(raw_df)} M5 bars: {raw_df.index[0]} -> {raw_df.index[-1]}')

# Resample M5 -> M15
df = resample_ohlcv(raw_df, '15min')
print(f'M15 bars: {len(df)}')

# Compute features
df = prepare_features(df)
print(f'Bars after features: {len(df)}')

# Extract arrays
available_feats = [f for f in Settings.FEATURES if f in df.columns]
print(f'Available features: {len(available_feats)}/{Settings.INPUT_DIM}')
# The network's input width is Settings.INPUT_DIM, so a missing feature would
# only surface later as a shape mismatch; fail here with the column names.
missing_feats = [f for f in Settings.FEATURES if f not in df.columns]
if missing_feats:
    raise ValueError(f'Missing feature columns: {missing_feats}')

feature_data = df[available_feats].values.astype(np.float32)
close_prices = df['close'].values.astype(np.float64)

# Train/Val split (80/20, chronological: validation is the most recent 20%)
total = len(feature_data)
train_end = int(total * 0.8)
print(f'Train: {train_end} bars | Val: {total - train_end} bars')
print(f'Train period: {df.index[0]} -> {df.index[train_end-1]}')
print(f'Val period: {df.index[train_end]} -> {df.index[-1]}')


## PER (Prioritized Experience Replay)

In [None]:
class SumTree:
    """Array-backed binary tree whose internal nodes hold the sum of their
    children; the ``capacity`` leaves hold priorities.

    Leaves live at ``tree[capacity - 1:]`` and leaf ``i`` is paired with
    ``data[i]``. Items are written round-robin, so once ``capacity`` items
    have been added the oldest slot is overwritten.
    """

    def __init__(self, capacity):
        """Create an empty tree.

        Args:
            capacity: number of leaves (items) the tree can hold.

        Raises:
            ValueError: if ``capacity`` is smaller than 1.
        """
        if capacity < 1:
            raise ValueError('capacity must be at least 1')
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        self.write = 0   # next data slot to (over)write
        self.count = 0   # number of filled slots, capped at capacity

    def _propagate(self, idx, change):
        """Add ``change`` to every ancestor of node ``idx``."""
        # Iterative walk to the root; when idx is already the root (only
        # possible with capacity == 1) there is nothing to update.
        while idx != 0:
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def _retrieve(self, idx, s):
        """Descend from node ``idx`` to the leaf whose cumulative priority
        interval contains ``s`` and return that leaf's tree index."""
        n_nodes = len(self.tree)
        while True:
            left = 2 * idx + 1
            if left >= n_nodes:
                return idx
            if s <= self.tree[left]:
                idx = left
            else:
                # Skip the left subtree's mass and continue on the right.
                s -= self.tree[left]
                idx = left + 1

    def total(self):
        """Return the sum of all leaf priorities (the root value)."""
        return self.tree[0]

    def add(self, p, data):
        """Store ``data`` with priority ``p`` in the next write slot."""
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, p)
        self.write = (self.write + 1) % self.capacity
        if self.count < self.capacity:
            self.count += 1

    def update(self, idx, p):
        """Set the priority at tree index ``idx`` to ``p`` and refresh the
        sums above it."""
        change = p - self.tree[idx]
        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        """Return ``(tree_index, priority, data)`` for the leaf selected by
        cumulative priority ``s``."""
        idx = self._retrieve(0, s)
        return (idx, self.tree[idx], self.data[idx - self.capacity + 1])


class PERBuffer:
    """Prioritized replay memory backed by a ``SumTree``.

    New transitions enter with the largest leaf priority currently stored
    (1.0 while all leaves are zero); priorities are later overwritten with
    ``(|error| + 1e-5) ** alpha``.
    """

    def __init__(self, capacity, alpha=0.6):
        self.alpha = alpha
        self.tree = SumTree(capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition using the current maximum leaf priority."""
        leaf_priorities = self.tree.tree[-self.tree.capacity:]
        top = np.max(leaf_priorities)
        priority = 1.0 if top == 0 else top
        self.tree.add(priority, (state, action, reward, next_state, done))

    def sample(self, batch_size, beta=0.4):
        """Draw ``batch_size`` transitions, one from each equal slice of the
        total priority mass.

        Returns:
            ``(states, actions, rewards, next_states, dones, idxs, weights)``
            where ``idxs`` are tree indices for ``update_priorities`` and
            ``weights`` are importance weights scaled so the largest is 1.
        """
        tree = self.tree
        stride = tree.total() / batch_size
        drawn = []
        for k in range(batch_size):
            leaf, prio, item = tree.get(random.uniform(stride * k, stride * (k + 1)))
            if item is None or item == 0:
                # Landed on a slot that was never written: fall back to a
                # uniformly chosen filled slot.
                slot = random.randint(0, tree.count - 1)
                item = tree.data[slot]
                leaf = slot + tree.capacity - 1
                prio = tree.tree[leaf]
            drawn.append((leaf, prio, item))

        idxs = [leaf for leaf, _, _ in drawn]
        probs = np.array([prio for _, prio, _ in drawn]) / tree.total()
        w = np.power(tree.count * probs, -beta)
        w /= w.max()
        states, actions, rewards, next_states, dones = zip(*(item for _, _, item in drawn))
        return (list(states), np.array(actions), np.array(rewards, dtype=np.float32),
                list(next_states), np.array(dones, dtype=bool), idxs,
                np.array(w, dtype=np.float32))

    def update_priorities(self, idxs, errors):
        """Overwrite the priorities at tree indices ``idxs`` from ``errors``."""
        for leaf, err in zip(idxs, errors):
            self.tree.update(leaf, (abs(err) + 1e-5) ** self.alpha)

    def __len__(self):
        return self.tree.count

print('PER buffer loaded.')


## Position-Aware QNetwork

In [None]:
class QNetwork(nn.Module):
    """Q-value network: per-bar MLP encoder -> LSTM -> LayerNorm on the last
    time step, concatenated with the trade-state vector and mapped to one
    Q-value per action. All sizes come from ``Settings``."""

    def __init__(self):
        super().__init__()
        n_in, n_enc, n_hid = Settings.INPUT_DIM, Settings.ENCODER_DIM, Settings.HIDDEN_DIM
        n_layers, p_drop = Settings.NUM_LAYERS, Settings.DROPOUT

        encoder_layers = [
            nn.Linear(n_in, n_in),
            nn.ReLU(),
            nn.Linear(n_in, n_enc),
            nn.ReLU(),
        ]
        self.feature_encoder = nn.Sequential(*encoder_layers)
        self.lstm = nn.LSTM(
            input_size=n_enc,
            hidden_size=n_hid,
            num_layers=n_layers,
            # Inter-layer dropout is only enabled when there are stacked layers.
            dropout=p_drop if n_layers > 1 else 0,
            batch_first=True,
        )
        self.layer_norm = nn.LayerNorm(n_hid)

        head_width = n_hid // 2
        self.fc = nn.Sequential(
            nn.Linear(n_hid + Settings.TRADE_STATE_DIM, head_width),
            nn.ReLU(),
            nn.Dropout(p_drop),
            nn.Linear(head_width, Settings.OUTPUT_DIM),
        )

    def forward(self, x, trade_state=None):
        """Return Q-values for a batch.

        Args:
            x: 3-D tensor unpacked as (batch, steps, features).
            trade_state: optional (batch, TRADE_STATE_DIM) tensor; zeros are
                used when it is omitted.
        """
        batch, steps, _ = x.size()
        flat = x.reshape(batch * steps, -1)
        encoded = self.feature_encoder(flat).reshape(batch, steps, -1)
        seq_out, _ = self.lstm(encoded)
        summary = self.layer_norm(seq_out[:, -1, :])
        if trade_state is None:
            trade_state = torch.zeros(batch, Settings.TRADE_STATE_DIM, device=x.device)
        return self.fc(torch.cat([summary, trade_state], dim=1))

# Smoke test: run a random batch of 2 through a throwaway network and report
# the output shape and the parameter count.
net = QNetwork().to(device)
x = torch.randn(2, Settings.SEQUENCE_LENGTH, Settings.INPUT_DIM).to(device)
ts = torch.randn(2, Settings.TRADE_STATE_DIM).to(device)
print(f'QNetwork output: {net(x, ts).shape}')  # [2, 4]
total_params = sum(p.numel() for p in net.parameters())
print(f'Total parameters: {total_params:,}')
# Remove the temporary names so they do not linger in the notebook namespace.
del net, x, ts


## Trading Environment (Sequential, Position-Aware)

In [None]:
ACT_HOLD, ACT_BUY, ACT_SELL, ACT_CLOSE = 0, 1, 2, 3


class TradingEnv:
    """Sequential single-position trading environment.

    The position is -1, 0 or +1. Every step is rewarded with the
    mark-to-market PnL of the bar that follows the action, minus transaction
    cost and a fixed penalty when the position changed, plus a small bonus
    for holding; the result is clipped to ``+/- Settings.REWARD_CLIP``.
    """

    def __init__(self, features, close_prices, seq_len, cost_bps=2, sf=10000.0):
        """
        Args:
            features: per-bar feature matrix, sliced along its first axis.
            close_prices: close price per bar, aligned with ``features``.
            seq_len: number of bars in one market-state window.
            cost_bps: transaction cost in basis points per unit of position
                change.
            sf: factor that scales price differences into reward units.
        """
        self.features = features
        self.close_prices = close_prices
        self.seq_len = seq_len
        self.cost_frac = cost_bps * 0.0001
        self.sf = sf
        self.reset()

    def reset(self, start_idx=None):
        """Go flat and move the cursor to ``start_idx``.

        ``start_idx`` defaults to ``seq_len``; smaller values are raised to
        ``seq_len`` because a full window must fit before the cursor.

        Returns:
            The initial ``(market_window, trade_state)``.
        """
        self.pos = 0.0
        self.entry_price = 0.0
        self.bars_held = 0
        if start_idx is None:
            self.step_idx = self.seq_len
        else:
            self.step_idx = max(start_idx, self.seq_len)
        self.done = False
        return self._state()

    def _state(self):
        """Return the last ``seq_len`` feature rows before the cursor and the
        trade state ``[position, bars_held / 48 (capped at 1),
        unrealized PnL / 100 (clipped to [-1, 1])]``."""
        mkt = self.features[self.step_idx - self.seq_len:self.step_idx]
        cp = self.close_prices[self.step_idx - 1]
        upnl = self.pos * (cp - self.entry_price) * self.sf if self.pos != 0 and self.entry_price > 0 else 0.0
        ts = np.array([self.pos, min(self.bars_held / 48.0, 1.0), np.clip(upnl / 100.0, -1, 1)], dtype=np.float32)
        return mkt, ts

    def step(self, action):
        """Apply ``action`` at the current close and advance one bar.

        Returns:
            ``(state, reward, done, info)``. ``info`` carries ``pos_changed``
            and ``pos``; it is empty when the episode had already ended.
        """
        if self.done:
            return self._state(), 0.0, True, {}
        cp = self.close_prices[self.step_idx - 1]
        old_pos = self.pos

        # Execute action; BUY/SELL are ignored when already on that side,
        # CLOSE is ignored when flat.
        if action == ACT_BUY and self.pos <= 0:
            self.pos, self.entry_price, self.bars_held = 1.0, cp, 0
        elif action == ACT_SELL and self.pos >= 0:
            self.pos, self.entry_price, self.bars_held = -1.0, cp, 0
        elif action == ACT_CLOSE and self.pos != 0:
            self.pos, self.entry_price, self.bars_held = 0.0, 0.0, 0

        pos_changed = (self.pos != old_pos)
        self.step_idx += 1
        n_bars = len(self.close_prices)
        self.done = self.step_idx >= n_bars
        np_ = self.close_prices[min(self.step_idx, n_bars) - 1]

        reward = 0.0
        if pos_changed:
            # A reversal moves the position by 2 units and pays twice.
            reward -= self.cost_frac * self.sf * abs(self.pos - old_pos)
            reward -= Settings.TRADE_PENALTY  # Discourage churning
        if self.pos != 0:
            # Only the last bar is marked to market, also on the final step:
            # earlier bars were already paid out step by step, so paying the
            # whole entry-to-exit move again would count that PnL twice.
            reward += self.pos * (np_ - cp) * self.sf
            self.bars_held += 1
            if self.bars_held > Settings.HOLD_BONUS_MIN_BARS:
                reward += Settings.HOLD_BONUS  # Patience bonus
            if self.done:
                # Cost of liquidating the position left open at the end.
                reward -= self.cost_frac * self.sf

        reward = np.clip(reward, -Settings.REWARD_CLIP, Settings.REWARD_CLIP)
        return self._state(), reward, self.done, {'pos_changed': pos_changed, 'pos': self.pos}

# Smoke test: build an environment on the first 500 bars, take 10 random
# actions and report the resulting position.
env = TradingEnv(feature_data[:500], close_prices[:500], Settings.SEQUENCE_LENGTH)
s = env.reset()
print(f'Market state: {s[0].shape}, Trade state: {s[1].shape}')
for _ in range(10):
    s, r, d, i = env.step(random.randint(0, 3))
print(f'After 10 random steps: pos={env.pos}, bars_held={env.bars_held}')
# Remove the temporary environment from the notebook namespace.
del env


## Training Loop

In [None]:
def soft_update(policy, target, tau=Settings.TAU):
    """Blend the policy weights into the target network in place:
    ``target <- tau * policy + (1 - tau) * target`` for every parameter."""
    keep = 1 - tau
    for src, dst in zip(policy.parameters(), target.parameters()):
        blended = tau * src.data + keep * dst.data
        dst.data.copy_(blended)

def train_step(policy, target, optimizer, memory, loss_fn):
    """Run one optimisation step on a prioritized batch.

    The target uses the policy network to choose the next action and the
    target network to evaluate it. The per-sample loss is weighted by the
    buffer's importance weights, gradients are clipped to norm 1.0, and the
    absolute TD errors are written back as new priorities.

    Returns:
        The weighted batch loss as a Python float.
    """
    def _stack(pairs, slot):
        # Each state is a (market_window, trade_state) pair; stack one part.
        return torch.FloatTensor(np.array([pair[slot] for pair in pairs])).to(device)

    policy.train()
    batch = memory.sample(Settings.BATCH_SIZE, Settings.PER_BETA_START)
    states, actions, rewards, nstates, dones, idxs, weights = batch

    mkt, ts = _stack(states, 0), _stack(states, 1)
    a = torch.LongTensor(actions).to(device)
    r = torch.FloatTensor(rewards).to(device)
    nmkt, nts = _stack(nstates, 0), _stack(nstates, 1)
    d_t = torch.FloatTensor(dones.astype(np.float32)).to(device)
    w_t = torch.FloatTensor(weights).to(device)

    with torch.enable_grad():
        q_all = policy(mkt, ts)
        q = q_all.gather(1, a.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            greedy = policy(nmkt, nts).argmax(dim=1, keepdim=True)
            bootstrap = target(nmkt, nts).gather(1, greedy).squeeze(1)
            # Terminal transitions (d_t == 1) get no bootstrap term.
            tgt = r + Settings.GAMMA * bootstrap * (1 - d_t)
        per_sample = loss_fn(q, tgt)
        loss = (per_sample * w_t).mean()
        optimizer.zero_grad()
        loss.backward()
    torch.nn.utils.clip_grad_norm_(policy.parameters(), 1.0)
    optimizer.step()

    td_err = (q - tgt).abs().detach().cpu().numpy() + 1e-6
    memory.update_priorities(idxs, td_err)
    return loss.item()

def validate(policy, features, prices, sf):
    """Run one greedy (argmax) pass over a hold-out range and summarise it.

    Args:
        policy: Q-network called as ``policy(market, trade_state)``.
        features: feature matrix handed to ``TradingEnv``.
        prices: close prices aligned with ``features``.
        sf: price scaling factor handed to ``TradingEnv``.

    Returns:
        ``(sharpe, total_reward, trades)``. Rewards are summed into buckets
        of 96 steps; ``sharpe`` is ``mean / std * sqrt(252)`` over those
        buckets. ``(0.0, 0.0, trades)`` is returned when fewer than five
        buckets were collected.
    """
    if len(prices) < Settings.SEQUENCE_LENGTH:
        # Not even one full window: nothing to evaluate.
        return 0.0, 0.0, 0

    was_training = policy.training
    policy.eval()
    # Use the configured transaction cost so validation is charged like
    # training instead of relying on the environment's default.
    env = TradingEnv(features, prices, Settings.SEQUENCE_LENGTH,
                     cost_bps=Settings.TRANSACTION_COST_BPS, sf=sf)
    mkt, ts = env.reset()
    daily, bar_r, bar_c = [], 0.0, 0
    trades = 0
    try:
        with torch.no_grad():
            while not env.done:
                mt = torch.FloatTensor(mkt).unsqueeze(0).to(device)
                tt = torch.FloatTensor(ts).unsqueeze(0).to(device)
                a = policy(mt, tt).argmax(dim=1).item()
                (mkt, ts), r, done, info = env.step(a)
                if info.get('pos_changed'):
                    trades += 1
                bar_r += r
                bar_c += 1
                if bar_c >= 96:  # ~1 day of M15
                    daily.append(bar_r)
                    bar_r, bar_c = 0.0, 0
            # Keep the trailing, partially filled bucket.
            if bar_c > 0:
                daily.append(bar_r)
    finally:
        # Hand the network back in the mode it arrived in.
        policy.train(was_training)

    if len(daily) < 5:
        return 0.0, 0.0, trades
    daily = np.array(daily)
    sharpe = (daily.mean() / (daily.std() + 1e-8)) * np.sqrt(252)
    return sharpe, daily.sum(), trades

print('Training functions loaded.')


## Run Training

In [None]:
# Initialize networks, optimizer, LR scheduler, loss and replay memory
policy_net = QNetwork().to(device)
target_net = copy.deepcopy(policy_net).to(device)
target_net.eval()
optimizer = optim.Adam(policy_net.parameters(), lr=Settings.LEARNING_RATE, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=5, factor=0.5)
loss_fn = nn.SmoothL1Loss(reduction='none')  # per-sample loss, weighted in train_step
memory = PERBuffer(Settings.MEMORY_CAPACITY, Settings.PER_ALPHA)

SF = Settings.SCALING_FACTOR
epsilon = Settings.EPSILON_START
total_steps = 0
best_sharpe = -np.inf
patience = 0
train_feats = feature_data[:train_end]
train_prices = close_prices[:train_end]
# Latest start index that still leaves room for a full episode.
max_start = train_end - Settings.SEQUENCE_LENGTH - Settings.EPISODE_LENGTH - 1
if max_start < Settings.SEQUENCE_LENGTH:
    raise ValueError(
        f'Training range of {train_end} bars is too short for episodes of '
        f'{Settings.EPISODE_LENGTH} steps with windows of {Settings.SEQUENCE_LENGTH} bars')
history = {'epoch': [], 'reward': [], 'trades': [], 'val_sharpe': [], 'val_return': [], 'epsilon': [], 'loss': []}

for epoch in range(1, Settings.EPOCHS + 1):
    policy_net.train()
    # Reward, trade count and losses are accumulated over the whole epoch.
    ep_reward, ep_trades, losses = 0.0, 0, []
    pbar = tqdm(range(Settings.EPISODES_PER_EPOCH), desc=f'Epoch {epoch}/{Settings.EPOCHS}', ncols=110)

    for ep_i in pbar:
        # Random start point for this mini-rollout
        start = random.randint(Settings.SEQUENCE_LENGTH, max_start)
        env = TradingEnv(train_feats, train_prices, Settings.SEQUENCE_LENGTH,
                         Settings.TRANSACTION_COST_BPS, SF)
        mkt, ts = env.reset(start_idx=start)

        for step in range(Settings.EPISODE_LENGTH):
            # Skip forward pass when taking random action (optimization #3)
            if random.random() < epsilon:
                action = random.randint(0, Settings.OUTPUT_DIM - 1)
            else:
                mt = torch.FloatTensor(mkt).unsqueeze(0).to(device)
                tt = torch.FloatTensor(ts).unsqueeze(0).to(device)
                with torch.no_grad():
                    action = policy_net(mt, tt).argmax(dim=1).item()

            (nmkt, nts), reward, done, info = env.step(action)
            if info.get('pos_changed'):
                ep_trades += 1

            memory.push((mkt.copy(), ts.copy()), action, reward, (nmkt.copy(), nts.copy()), done)
            mkt, ts = nmkt, nts
            ep_reward += reward
            total_steps += 1

            # Linear decay from EPSILON_START to EPSILON_MIN, reaching the
            # floor exactly at EPSILON_DECAY_STEPS.
            decay_frac = min(1.0, total_steps / Settings.EPSILON_DECAY_STEPS)
            epsilon = Settings.EPSILON_START + decay_frac * (Settings.EPSILON_MIN - Settings.EPSILON_START)

            # One gradient step every 8 environment steps.
            if len(memory) >= Settings.BATCH_SIZE and total_steps % 8 == 0:
                step_loss = train_step(policy_net, target_net, optimizer, memory, loss_fn)
                losses.append(step_loss)

            if total_steps % Settings.TARGET_UPDATE_FREQ == 0:
                soft_update(policy_net, target_net)

            if done:
                break

        if ep_i % 20 == 0:
            al = np.mean(losses[-200:]) if losses else 0
            pbar.set_postfix({'R': f'{ep_reward:.0f}', 'T': ep_trades, 'e': f'{epsilon:.3f}', 'L': f'{al:.4f}'})

    pbar.close()
    avg_loss = np.mean(losses) if losses else 0

    # Validate on the hold-out range; the scheduler tracks validation Sharpe
    vs, vr, vt = validate(policy_net, feature_data[train_end:], close_prices[train_end:], SF)
    scheduler.step(vs)

    history['epoch'].append(epoch)
    history['reward'].append(ep_reward)
    history['trades'].append(ep_trades)
    history['val_sharpe'].append(vs)
    history['val_return'].append(vr)
    history['epsilon'].append(epsilon)
    history['loss'].append(avg_loss)

    print(f'Epoch {epoch}: R={ep_reward:.0f} T={ep_trades} eps={epsilon:.4f} L={avg_loss:.4f} | '
          f'Val Sharpe={vs:.3f} Val Return={vr:.1f} Val Trades={vt}')

    # Checkpoint on a new best validation Sharpe, otherwise count towards
    # early stopping.
    if vs > best_sharpe:
        best_sharpe = vs
        patience = 0
        torch.save(policy_net.state_dict(), f'/content/best_{Settings.PAIR}.pt')
        print('  ** New best! Saved. **')
    else:
        patience += 1
        if patience >= Settings.EARLY_STOP_PATIENCE:
            print(f'Early stopping at epoch {epoch}')
            break

    gc.collect()
    if device.type == 'cuda':
        torch.cuda.empty_cache()

print(f'\nTraining complete. Best val Sharpe: {best_sharpe:.3f}')


## Training Curves

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# One entry per panel: (row, column, history key, title, draw a zero line?)
panels = [
    (0, 0, 'reward', 'Episode Reward', False),
    (0, 1, 'trades', 'Trades per Epoch', False),
    (0, 2, 'epsilon', 'Epsilon', False),
    (1, 0, 'val_sharpe', 'Validation Sharpe', True),
    (1, 1, 'val_return', 'Validation Return', True),
    (1, 2, 'loss', 'Avg Loss', False),
]
for row, col, key, title, zero_line in panels:
    ax = axes[row, col]
    ax.plot(history['epoch'], history[key])
    if zero_line:
        # Reference line separating positive from negative values.
        ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)
    ax.set_title(title)
    ax.set_xlabel('Epoch')

plt.tight_layout()
plt.savefig('/content/training_curves.png', dpi=150)
plt.show()


## Download Model

In [None]:
from google.colab import files

# Offer the best checkpoint for download if training saved one.
model_path = f'/content/best_{Settings.PAIR}.pt'
if not os.path.exists(model_path):
    print('No model saved (training may not have improved).')
else:
    files.download(model_path)
    print(f'Model downloaded: {model_path}')
