# Buy DDQN training (notebook version)

Converted from your `run_train_buy.py`-style script.

1. Edit paths in **Parameters**.
2. Run cells top-to-bottom.
3. Model + diagnostics saved under `out_dir/<run_id>/`.


In [1]:
# Parameters (edit these)
config_path = "config.yaml"                 # path to your YAML config
features_npy = "data/features.npy"      # (n_steps, state_dim)
prices_npy = "data/prices.npy"          # (n_steps,)
out_dir = "runs"                            # output root folder
run_id = None                               # set to a string to override, or leave None for timestamp


In [2]:
import os
import time
import numpy as np

# If you run this notebook from outside your project root, you may need:
# import sys
# sys.path.append("/absolute/path/to/clean_trading_rl")

from core.config import load_config
from agents.ddqn_agent import DDQNAgent
from envs.buy_env import BuyEnv
from diagnostics.q_gap import compute_q_gap, plot_q_gap


In [3]:
# Load config + data
cfg = load_config(config_path)

features = np.load(features_npy)
prices = np.load(prices_npy)

cfg.agent.state_dim = int(features.shape[1])
cfg.agent.n_actions = 2

env = BuyEnv(features, prices, cfg.reward, cfg.trade_manager)
agent = DDQNAgent(cfg.agent)

print("features:", features.shape, "prices:", prices.shape)
print("state_dim:", cfg.agent.state_dim, "n_actions:", cfg.agent.n_actions)


features: (6195, 10) prices: (6195,)
state_dim: 10 n_actions: 2


In [4]:
# Output folder
if run_id is None:
    run_id = time.strftime("%Y%m%d-%H%M%S")

run_path = os.path.join(out_dir, run_id)
os.makedirs(run_path, exist_ok=True)

print("run_path:", run_path)


run_path: runs/20260114-152909


In [5]:
import torch

print("torch.is_inference_mode_enabled():", torch.is_inference_mode_enabled())
print("torch.is_grad_enabled():", torch.is_grad_enabled())


torch.is_inference_mode_enabled(): False
torch.is_grad_enabled(): True


In [6]:
# Training loop
for ep in range(int(cfg.training.episodes)):
    s = env.reset()
    done = False
    ep_reward = 0.0
    steps = 0
    loss = None

    while not done:
        a = agent.select_action(s, greedy=False)
        ns, r, done, info = env.step(a)

        agent.push(s, a, r, ns, done)

        # ✅ warmup based on env steps (or buffer size), then train
        if agent.total_steps >= int(cfg.training.warmup_steps):
            loss = agent.update()

        s = ns
        ep_reward += float(r)
        steps += 1

        if cfg.training.steps_per_episode is not None and steps >= int(cfg.training.steps_per_episode):
            break

    if (ep + 1) % int(cfg.training.log_every) == 0:
        last_loss = agent.loss_history[-1] if agent.loss_history else None
        print(
            f"[BUY] ep={ep+1}/{cfg.training.episodes} "
            f"reward={ep_reward:.4f} eps={agent.eps:.3f} "
            f"total_steps={agent.total_steps} learn_steps={agent.learn_steps} "
            f"loss={last_loss}"
        )


[BUY] ep=1/200 reward=0.7628 eps=0.976 total_steps=500 learn_steps=263 loss=0.0027448192704468966
target sync @ 500 loss 0.0023499273229390383
[BUY] ep=2/200 reward=0.6765 eps=0.953 total_steps=1000 learn_steps=763 loss=0.00035602442221716046
target sync @ 1000 loss 0.00032906795968301594
[BUY] ep=3/200 reward=0.9156 eps=0.929 total_steps=1500 learn_steps=1263 loss=0.00016380901797674596
target sync @ 1500 loss 0.000389173801522702
[BUY] ep=4/200 reward=1.0683 eps=0.905 total_steps=2000 learn_steps=1763 loss=0.0005383103853091598
target sync @ 2000 loss 0.0001436112797819078
[BUY] ep=5/200 reward=0.8210 eps=0.881 total_steps=2500 learn_steps=2263 loss=0.00032686020131222904
target sync @ 2500 loss 0.0003605523088481277
[BUY] ep=6/200 reward=1.2608 eps=0.858 total_steps=3000 learn_steps=2763 loss=0.00031028559897094965
target sync @ 3000 loss 0.00012002426228718832
[BUY] ep=7/200 reward=0.8094 eps=0.834 total_steps=3500 learn_steps=3263 loss=0.00011967222235398367
target sync @ 3500 los

In [None]:
# Save model + diagnostics
model_path = os.path.join(run_path, "buy_agent.pt")
agent.save(model_path)

gaps = compute_q_gap(agent, features, max_points=2000)
paths = plot_q_gap(gaps, run_path, tag="buy")

print("Saved model:", model_path)
print("Diagnostics:", paths)


Saved model: runs/20260114-152909/buy_agent.pt
Diagnostics: {'line': 'runs/20260114-152909/q_gap_buy.png', 'hist': 'runs/20260114-152909/q_gap_buy_hist.png'}


In [None]:
def compute_mean_delta(trades, prices, horizon, tc):
    deltas = []

    for tr in trades:
        if tr["meta"].get("reason") != "sell_agent":
            continue

        entry = tr["entry_idx"]
        # exit_sell = tr["exit_idx"]

        horizon_exit = min(entry + horizon, len(prices) - 1)

        entry_price = prices[entry]
        horizon_price = prices[horizon_exit]

        gross_horizon = (horizon_price - entry_price) / (entry_price + 1e-12)
        net_horizon = ((1 - tc) ** 2) * (1 + gross_horizon) - 1

        delta = tr["net_return"] - net_horizon
        deltas.append(delta)

    return {
        "count": len(deltas),
        "mean_delta": float(np.mean(deltas)) if deltas else 0.0,
        "median_delta": float(np.median(deltas)) if deltas else 0.0,
        "win_rate": float(np.mean(np.array(deltas) > 0)) if deltas else 0.0,
    }


# TradeManager

In [11]:
import os
import json
import numpy as np

from trade.trade_manager import TradeManager

# -----------------------
# SETTINGS
# -----------------------
SEG_LEN = 1239          # rows per ticker segment from your build_features validation
N_SEGS  = 5             # AAPL, MSFT, NVDA, AMZN, GOOGL
TRAIN_FRAC = 0.70       # time-based split within each segment

# NEW: entry harvesting (for SellAgent training)
TOPK_PER_SEG_TRAIN = 80     # try 50–150
TOPK_PER_SEG_TEST  = 40     # fewer is fine for eval
MIN_GAP_TRAIN = None        # None => defaults inside TradeManager
MIN_GAP_TEST  = None
USE_CONF_SCORE = False      # False => uses q1-q0 margin (recommended)

# -----------------------
# BUILD TRAIN/TEST INDEX (per segment, no leakage)
# -----------------------
train_len = int(SEG_LEN * TRAIN_FRAC)

train_idx = []
test_idx = []

for seg in range(N_SEGS):
    start = seg * SEG_LEN
    train_idx.extend(range(start, start + train_len))
    test_idx.extend(range(start + train_len, start + SEG_LEN))

train_idx = np.array(train_idx, dtype=np.int32)
test_idx  = np.array(test_idx, dtype=np.int32)

X_train = features[train_idx]
p_train = prices[train_idx]
X_test  = features[test_idx]
p_test  = prices[test_idx]

# Segment length inside each split subset (since we concatenated segments in order)
SEG_TRAIN = train_len
SEG_TEST  = SEG_LEN - train_len

print("=== DATA SPLIT ===")
print("features:", features.shape, "prices:", prices.shape)
print("SEG_LEN:", SEG_LEN, "N_SEGS:", N_SEGS, "TRAIN_FRAC:", TRAIN_FRAC)
print("train_len per seg:", SEG_TRAIN, "test_len per seg:", SEG_TEST)
print("X_train:", X_train.shape, "p_train:", p_train.shape)
print("X_test :", X_test.shape,  "p_test :", p_test.shape)

# -----------------------
# HELPER: run TM + debug logs (unchanged backtest)
# -----------------------
def run_tm(name: str, X: np.ndarray, p: np.ndarray, seg_len: int, sell_agent=None):
    tm = TradeManager(
        buy_agent=agent,            # trained buy agent
        sell_agent=None,      # optional
        state=X,
        prices=p,
        reward=cfg.reward,
        trade=cfg.trade_manager,
        segment_len=seg_len,        # IMPORTANT for boundary correctness
    )

    res = tm.run()
    trades = res["trades"]

    from collections import Counter
    reasons = Counter([t["meta"].get("reason", "none") for t in res["trades"]])
    print("Exit reasons:", dict(reasons))
    print("Non-time exits:", [t for t in res["trades"] if t["meta"].get("reason") != "time"][:3])

    print("ENTRY DEBUG:", res["entry_debug"])
    print("SELL DEBUG:", res["sell_debug"])
    print("EXIT REASONS:", res.get("exit_reasons"))


    print(f"\n=== TRADE MANAGER ({name}) ===")
    print("n_steps:", len(p))
    print("segment_len:", seg_len)
    print("n_trades:", res["n_trades"])
    print("final_equity:", res["final_equity"])

    # Boundary-crossing check (must be 0)
    if trades:
        cross = sum((t["entry_idx"] // seg_len) != (t["exit_idx"] // seg_len) for t in trades)
    else:
        cross = 0
    print("Trades crossing segment boundary:", cross)

    # Return stats
    if trades:
        net = np.array([t["net_return"] for t in trades], dtype=float)
        hold = np.array([t["hold_bars"] for t in trades], dtype=float)

        print("avg net return:", float(net.mean()))
        print("win rate:", float((net > 0).mean()))
        print("min/median/max net:", float(net.min()), float(np.median(net)), float(net.max()))
        print("median hold bars:", float(np.median(hold)))
        print("top 5 net:", np.sort(net)[-5:])
        print("bottom 5 net:", np.sort(net)[:5])

        # A few sample trades (head + tail)
        print("\nSample trades (first 3):")
        for t in trades[:3]:
            print(t)
        print("\nSample trades (last 3):")
        for t in trades[-3:]:
            print(t)
    else:
        print("No trades produced. Try lowering buy_min_confidence or disabling trend filter.")

    return tm, res

# -----------------------
# NEW: Harvest entry indices for SellAgent training (no trade execution)
# -----------------------
def harvest_entries(name: str, tm: TradeManager, topk_per_seg: int, min_gap=None, use_confidence_score=False):
    entries = tm.collect_entry_indices_topk(
        topk_per_segment=topk_per_seg,
        min_gap=min_gap,
        use_confidence_score=use_confidence_score,
    )
    entries = np.array(entries, dtype=np.int32)

    # Quick sanity: segment boundary + horizon feasibility check (should hold by construction)
    horizon = int(cfg.trade_manager.sell_horizon)
    if len(entries) > 0:
        seg_ok = np.all((entries % tm.segment_len) <= (tm.segment_len - 1 - horizon))
    else:
        seg_ok = True

    print(f"\n=== ENTRY HARVEST ({name}) ===")
    print("topk_per_segment:", topk_per_seg, "min_gap:", min_gap, "use_conf_score:", use_confidence_score)
    print("n_entries:", len(entries))
    print("horizon:", horizon, "segment_len:", tm.segment_len, "feasible_in_segment:", bool(seg_ok))
    if len(entries) > 0:
        print("first 10:", entries[:10].tolist())
        print("last 10 :", entries[-10:].tolist())

    return entries

# -----------------------
# RUN TRAIN + TEST (backtest as before)
# -----------------------
tm_train, res_train = run_tm("TRAIN", X_train, p_train, seg_len=SEG_TRAIN, sell_agent=None)
tm_test,  res_test  = run_tm("TEST",  X_test,  p_test,  seg_len=SEG_TEST,  sell_agent=None)

# -----------------------
# HARVEST ENTRIES (NEW LOGIC) — use these for SellEnv training
# -----------------------
train_entries = harvest_entries(
    "TRAIN",
    tm_train,
    topk_per_seg=TOPK_PER_SEG_TRAIN,
    min_gap=MIN_GAP_TRAIN,
    use_confidence_score=USE_CONF_SCORE,
)

test_entries = harvest_entries(
    "TEST",
    tm_test,
    topk_per_seg=TOPK_PER_SEG_TEST,
    min_gap=MIN_GAP_TEST,
    use_confidence_score=USE_CONF_SCORE,
)

# -----------------------
# SAVE ARTIFACTS (into out_dir)
# -----------------------
os.makedirs(out_dir, exist_ok=True)

train_entries_path = os.path.join(out_dir, "entry_indices_train.npy")
test_entries_path  = os.path.join(out_dir, "entry_indices_test.npy")

# NEW: save harvested entries (not trade entries)
np.save(train_entries_path, train_entries)
np.save(test_entries_path,  test_entries)

train_trades_json = os.path.join(out_dir, "trades_buy_only_train.json")
test_trades_json  = os.path.join(out_dir, "trades_buy_only_test.json")

with open(train_trades_json, "w") as f:
    json.dump(res_train["trades"], f, indent=2)

with open(test_trades_json, "w") as f:
    json.dump(res_test["trades"], f, indent=2)

print("\n=== SAVED ===")
print(" -", train_entries_path, "(HARVESTED)")
print(" -", test_entries_path,  "(HARVESTED)")
print(" -", train_trades_json)
print(" -", test_trades_json)


=== DATA SPLIT ===
features: (6195, 10) prices: (6195,)
SEG_LEN: 1239 N_SEGS: 5 TRAIN_FRAC: 0.7
train_len per seg: 867 test_len per seg: 372
X_train: (4335, 10) p_train: (4335,)
X_test : (1860, 10) p_test : (1860,)
Exit reasons: {'time': 141, 'segment_end': 1}
Non-time exits: [{'entry_idx': 3447, 'exit_idx': 3467, 'entry_price': 121.18000030517578, 'exit_price': 115.54000091552734, 'gross_return': -0.046542328564489185, 'net_return': -0.04844829044968868, 'hold_bars': 20, 'forced_exit': True, 'meta': {'buy_conf': 0.5080829894395992, 'reason': 'segment_end'}}]
ENTRY DEBUG: {'checked': 927, 'blocked_trend': 771, 'blocked_latest_entry': 11, 'blocked_conf': 3, 'opened': 142, 'conf_min': 0.2442312077388621, 'conf_max': 0.6760742955395784}
SELL DEBUG: {'seen': 0, 'sell_actions': 0}
EXIT REASONS: {'time': 141, 'segment_end': 1}

=== TRADE MANAGER (TRAIN) ===
n_steps: 4335
segment_len: 867
n_trades: 142
final_equity: 21.46589428271698
Trades crossing segment boundary: 0
avg net return: 0.02583

In [12]:
print("TEST final_equity:", res_test["final_equity"])
print("TEST n_trades:", res_test["n_trades"])

trades = res_test["trades"]
net = np.array([t["net_return"] for t in trades], dtype=float) if trades else np.array([])
print("TEST avg net:", net.mean() if len(net) else None)
print("TEST win rate:", (net > 0).mean() if len(net) else None)
print("TEST min/median/max:", (net.min(), np.median(net), net.max()) if len(net) else None)


TEST final_equity: 1.6027123880655472
TEST n_trades: 58
TEST avg net: 0.013725886669135455
TEST win rate: 0.5862068965517241
TEST min/median/max: (np.float64(-0.2515424823319645), np.float64(0.012225928964981758), np.float64(0.32334161900060887))


In [13]:
# def filter_entries(entries, seg_len, horizon, n):
#     kept = []
#     drop_oob = 0
#     drop_seg = 0
#     drop_horizon = 0

#     for e in entries:
#         e = int(e)
#         if e < 0 or e >= n:
#             drop_oob += 1
#             continue

#         # segment end (inclusive)
#         seg_end = min(((e // seg_len) + 1) * seg_len - 1, n - 1)

#         # need room for at least horizon bars INSIDE segment
#         last_allowed = min(e + horizon, seg_end, n - 1)

#         # if episode would immediately be at/over last_allowed, it's useless
#         # (or you can make this stricter: require at least 1 step)
#         if last_allowed <= e:
#             drop_horizon += 1
#             continue

#         kept.append(e)

#     kept = np.array(sorted(set(kept)), dtype=np.int32)

#     print("entries raw:", len(entries))
#     print("entries kept:", len(kept))
#     print("dropped oob:", drop_oob)
#     print("dropped horizon/seg:", drop_horizon)
#     return kept


# Sell

In [14]:
import os
import numpy as np
from copy import deepcopy

from agents.ddqn_agent import DDQNAgent
from envs.sell_env import SellEnv  # <- make sure this is the TM-consistent SellEnv v1

# Load entry indices saved by your TM cell
entries_train = np.load(os.path.join(out_dir, "entry_indices_train.npy"))
entries_test  = np.load(os.path.join(out_dir, "entry_indices_test.npy"))

print("entries_train:", entries_train.shape, "entries_test:", entries_test.shape)

# -----------------------
# Create SellEnv (TRAIN)
# -----------------------
sell_env_train = SellEnv(
    features=X_train,
    prices=p_train,
    entry_indices=entries_train,
    transaction_cost=cfg.reward.transaction_cost,
    sell_horizon=cfg.trade_manager.sell_horizon,
    min_hold_bars=cfg.trade_manager.min_hold_bars,
    segment_len=SEG_TRAIN,
    include_pos_features=True,
)

print("sell include_pos:", sell_env_train.include_pos)
print("sell feat_dim:", sell_env_train.feat_dim, "state_dim:", sell_env_train.state_dim)

# -----------------------
# Create SellAgent config
# # -----------------------
# sell_cfg = deepcopy(cfg.agent)
# sell_cfg.state_dim = int(sell_env_train.state_dim)
# sell_cfg.n_actions = 2

# # faster decay is fine, but DON'T double-count total_steps
# sell_cfg.epsilon_start = 1.0
# sell_cfg.epsilon_end = 0.05
# sell_cfg.epsilon_decay_steps = 40000

# sell_agent = DDQNAgent(sell_cfg)
# -----------------------
# Create SellAgent config
# -----------------------
sell_cfg = deepcopy(cfg.agent)
sell_cfg.state_dim = int(sell_env_train.state_dim)
sell_cfg.n_actions = 2

# OPTIMIZED HYPERPARAMS FOR SELL AGENT
sell_cfg.lr = 0.0005                 # Slightly lower LR for stability
sell_cfg.epsilon_start = 1.0
sell_cfg.epsilon_end = 0.05

# We want decay to finish at ~80% of training
EPISODES = 5000
avg_steps_per_ep = 15  # horizon is 20, exits often happen around 10-20
total_estimated_steps = EPISODES * avg_steps_per_ep

sell_cfg.epsilon_decay_steps = int(total_estimated_steps * 0.8)

sell_agent = DDQNAgent(sell_cfg)
print(f"Sell Decay Steps: {sell_cfg.epsilon_decay_steps} / Est Total: {total_estimated_steps}")

# ... training loop follows ...
print("SELL state_dim:", sell_cfg.state_dim, "n_actions:", sell_cfg.n_actions)

# -----------------------
# Train loop (episode-based)
# -----------------------
# EPISODES = 800
EPISODES = 4000
MAX_STEPS = 200         # safety cap (horizon is small anyway)
UPDATES_PER_STEP = 1

for ep in range(EPISODES):
    s = sell_env_train.reset()
    done = False
    ep_reward = 0.0
    steps = 0

    while (not done) and (steps < MAX_STEPS):
        a = sell_agent.select_action(s, greedy=False)  # <- this increments total_steps internally
        ns, r, done, info = sell_env_train.step(a)

        sell_agent.push(s, a, r, ns, done)

        # update after warmup (based on agent.total_steps, which is now correct)
        if sell_agent.total_steps >= int(cfg.training.warmup_steps):
            for _ in range(UPDATES_PER_STEP):
                sell_agent.update()

        s = ns
        ep_reward += float(r)
        steps += 1

    if (ep + 1) % 10 == 0:
        loss = sell_agent.loss_history[-1] if sell_agent.loss_history else None
        print(f"[SELL] ep={ep+1}/{EPISODES} reward={ep_reward:.4f} eps={sell_agent.eps:.3f} loss={loss}")

# Save model
sell_path = os.path.join(out_dir, "sell_agent.pt")
sell_agent.save(sell_path)
print("Saved:", sell_path)

# -----------------------
# Evaluation helpers
# -----------------------
def eval_sell_agent(env, agent, entry_indices, greedy=True):
    rets, holds, exits, reasons = [], [], [], []
    for e in entry_indices:
        s = env.reset(int(e))
        done = False
        total_r = 0.0
        steps = 0
        last_info = None

        while (not done) and (steps < 500):
            a = agent.select_action(s, greedy=greedy)  # greedy=True => no eps update, no step increment
            ns, r, done, info = env.step(a)
            s = ns
            total_r += float(r)
            steps += 1
            last_info = info

        rets.append(total_r)
        if last_info:
            exits.append(last_info.get("exit_idx", np.nan))
            holds.append(last_info.get("bars_held", np.nan))
            reasons.append(last_info.get("reason", ""))
        else:
            exits.append(np.nan); holds.append(np.nan); reasons.append("")

    return np.array(rets, float), np.array(holds, float), np.array(exits, float), reasons


def eval_fixed_horizon(env, entry_indices):
    # Just HOLD until forced exit
    rets = []
    for e in entry_indices:
        s = env.reset(int(e))
        done = False
        total_r = 0.0
        steps = 0
        while (not done) and (steps < 500):
            ns, r, done, info = env.step(0)  # HOLD
            s = ns
            total_r += float(r)
            steps += 1
        rets.append(total_r)
    return np.array(rets, dtype=float)

# -----------------------
# Evaluate on TEST subset
# -----------------------
sell_env_test = SellEnv(
    features=X_test,
    prices=p_test,
    entry_indices=entries_test,
    transaction_cost=cfg.reward.transaction_cost,
    sell_horizon=cfg.trade_manager.sell_horizon,
    min_hold_bars=cfg.trade_manager.min_hold_bars,
    segment_len=SEG_TEST,
    include_pos_features=True,
)

rets_agent, holds_agent, exits_agent, reasons_agent = eval_sell_agent(
    sell_env_test, sell_agent, entries_test, greedy=True
)
rets_base = eval_fixed_horizon(sell_env_test, entries_test)

print("\n=== SELL EVAL (TEST entries) ===")
print("n_entries:", len(entries_test))

print("\nSellAgent:")
print("mean:", float(rets_agent.mean()),
      "median:", float(np.median(rets_agent)),
      "win_rate:", float((rets_agent > 0).mean()),
      "min/max:", float(rets_agent.min()), float(rets_agent.max()))

print("\nFixed horizon baseline (hold->forced exit):")
print("mean:", float(rets_base.mean()),
      "median:", float(np.median(rets_base)),
      "win_rate:", float((rets_base > 0).mean()),
      "min/max:", float(rets_base.min()), float(rets_base.max()))

delta = rets_agent - rets_base
print("\nDelta (agent - baseline):")
print("mean delta:", float(delta.mean()),
      "median delta:", float(np.median(delta)),
      "better %:", float((delta > 0).mean()))

print("\nPer-entry delta:", np.round(delta, 4))
print("Better count:", int((delta > 0).sum()), "/", len(delta))

# Optional: inspect exit behavior
if len(exits_agent) > 0:
    print("\nSellAgent exit stats:")
    print("avg hold bars:", float(np.nanmean(holds_agent)),
          "min/max hold bars:", float(np.nanmin(holds_agent)), float(np.nanmax(holds_agent)))
    # quick breakdown of reasons
    unique, counts = np.unique(np.array(reasons_agent, dtype=str), return_counts=True)
    print("exit reasons:", dict(zip(unique.tolist(), counts.tolist())))


entries_train: (249,) entries_test: (98,)
sell include_pos: True
sell feat_dim: 10 state_dim: 13
Sell Decay Steps: 60000 / Est Total: 75000
SELL state_dim: 13 n_actions: 2
[SELL] ep=10/4000 reward=0.0050 eps=0.998 loss=None
[SELL] ep=20/4000 reward=0.0243 eps=0.996 loss=0.0010292499791830778
[SELL] ep=30/4000 reward=0.0110 eps=0.994 loss=0.0007058464689180255
[SELL] ep=40/4000 reward=0.0400 eps=0.992 loss=0.0009508313378319144
[SELL] ep=50/4000 reward=0.0321 eps=0.990 loss=0.0008290411788038909
[SELL] ep=60/4000 reward=0.0312 eps=0.988 loss=0.0008469241438433528
target sync @ 500 loss 0.0005994084058329463
[SELL] ep=70/4000 reward=-0.0079 eps=0.986 loss=0.00021494048996828496
[SELL] ep=80/4000 reward=-0.0344 eps=0.984 loss=0.00026774549041874707
[SELL] ep=90/4000 reward=-0.0442 eps=0.982 loss=0.00024250360729638487
[SELL] ep=100/4000 reward=0.0110 eps=0.981 loss=0.0003875322872772813
target sync @ 1000 loss 0.0005280726472847164
[SELL] ep=110/4000 reward=0.0183 eps=0.979 loss=0.0002166

# TradeManager with Sell

In [15]:
import os
import json
import numpy as np

from trade.trade_manager import TradeManager

# -----------------------
# SETTINGS
# -----------------------
SEG_LEN = 1239          # rows per ticker segment from your build_features validation
N_SEGS  = 5             # AAPL, MSFT, NVDA, AMZN, GOOGL
TRAIN_FRAC = 0.70       # time-based split within each segment

# NEW: entry harvesting (for SellAgent training)
TOPK_PER_SEG_TRAIN = 80     # try 50–150
TOPK_PER_SEG_TEST  = 40     # fewer is fine for eval
MIN_GAP_TRAIN = None        # None => defaults inside TradeManager
MIN_GAP_TEST  = None
USE_CONF_SCORE = False      # False => uses q1-q0 margin (recommended)

# -----------------------
# BUILD TRAIN/TEST INDEX (per segment, no leakage)
# -----------------------
train_len = int(SEG_LEN * TRAIN_FRAC)

train_idx = []
test_idx = []

for seg in range(N_SEGS):
    start = seg * SEG_LEN
    train_idx.extend(range(start, start + train_len))
    test_idx.extend(range(start + train_len, start + SEG_LEN))

train_idx = np.array(train_idx, dtype=np.int32)
test_idx  = np.array(test_idx, dtype=np.int32)

X_train = features[train_idx]
p_train = prices[train_idx]
X_test  = features[test_idx]
p_test  = prices[test_idx]

# Segment length inside each split subset (since we concatenated segments in order)
SEG_TRAIN = train_len
SEG_TEST  = SEG_LEN - train_len

print("=== DATA SPLIT ===")
print("features:", features.shape, "prices:", prices.shape)
print("SEG_LEN:", SEG_LEN, "N_SEGS:", N_SEGS, "TRAIN_FRAC:", TRAIN_FRAC)
print("train_len per seg:", SEG_TRAIN, "test_len per seg:", SEG_TEST)
print("X_train:", X_train.shape, "p_train:", p_train.shape)
print("X_test :", X_test.shape,  "p_test :", p_test.shape)

# -----------------------
# HELPER: run TM + debug logs (unchanged backtest)
# -----------------------
def run_tm(name: str, X: np.ndarray, p: np.ndarray, seg_len: int, sell_agent=None):
    tm = TradeManager(
        buy_agent=agent,            # trained buy agent
        sell_agent=sell_agent,      # optional
        state=X,
        prices=p,
        reward=cfg.reward,
        trade=cfg.trade_manager,
        segment_len=seg_len,        # IMPORTANT for boundary correctness
    )

    res = tm.run()
    trades = res["trades"]

    print("Sell seen:", tm._sell_debug["seen"], "Sell actions:", tm._sell_debug["sell_actions"])

    from collections import Counter
    reasons = Counter([t["meta"].get("reason", "none") for t in res["trades"]])
    print("Exit reasons:", dict(reasons))
    print("Non-time exits:", [t for t in res["trades"] if t["meta"].get("reason") != "time"][:3])

    print("ENTRY DEBUG:", res["entry_debug"])
    print("SELL DEBUG:", res["sell_debug"])
    print("EXIT REASONS:", res.get("exit_reasons"))


    print(f"\n=== TRADE MANAGER ({name}) ===")
    print("n_steps:", len(p))
    print("segment_len:", seg_len)
    print("n_trades:", res["n_trades"])
    print("final_equity:", res["final_equity"])

    # Boundary-crossing check (must be 0)
    if trades:
        cross = sum((t["entry_idx"] // seg_len) != (t["exit_idx"] // seg_len) for t in trades)
    else:
        cross = 0
    print("Trades crossing segment boundary:", cross)

    # Return stats
    if trades:
        net = np.array([t["net_return"] for t in trades], dtype=float)
        hold = np.array([t["hold_bars"] for t in trades], dtype=float)

        print("avg net return:", float(net.mean()))
        print("win rate:", float((net > 0).mean()))
        print("min/median/max net:", float(net.min()), float(np.median(net)), float(net.max()))
        print("median hold bars:", float(np.median(hold)))
        print("top 5 net:", np.sort(net)[-5:])
        print("bottom 5 net:", np.sort(net)[:5])

        # A few sample trades (head + tail)
        print("\nSample trades (first 3):")
        for t in trades[:3]:
            print(t)
        print("\nSample trades (last 3):")
        for t in trades[-3:]:
            print(t)
    else:
        print("No trades produced. Try lowering buy_min_confidence or disabling trend filter.")

    return tm, res

# -----------------------
# NEW: Harvest entry indices for SellAgent training (no trade execution)
# -----------------------
# def harvest_entries(name: str, tm: TradeManager, topk_per_seg: int, min_gap=None, use_confidence_score=False):
#     entries = tm.collect_entry_indices_topk(
#         topk_per_segment=topk_per_seg,
#         min_gap=min_gap,
#         use_confidence_score=use_confidence_score,
#     )
#     entries = np.array(entries, dtype=np.int32)

#     # Quick sanity: segment boundary + horizon feasibility check (should hold by construction)
#     horizon = int(cfg.trade_manager.sell_horizon)
#     if len(entries) > 0:
#         seg_ok = np.all((entries % tm.segment_len) <= (tm.segment_len - 1 - horizon))
#     else:
#         seg_ok = True

#     print(f"\n=== ENTRY HARVEST ({name}) ===")
#     print("topk_per_segment:", topk_per_seg, "min_gap:", min_gap, "use_conf_score:", use_confidence_score)
#     print("n_entries:", len(entries))
#     print("horizon:", horizon, "segment_len:", tm.segment_len, "feasible_in_segment:", bool(seg_ok))
#     if len(entries) > 0:
#         print("first 10:", entries[:10].tolist())
#         print("last 10 :", entries[-10:].tolist())

#     return entries

# -----------------------
# RUN TRAIN + TEST (backtest as before)
# -----------------------
tm_train_sell, res_train_sell = run_tm("TRAIN", X_train, p_train, seg_len=SEG_TRAIN, sell_agent=sell_agent)
tm_test_sell,  res_test_sell  = run_tm("TEST",  X_test,  p_test,  seg_len=SEG_TEST,  sell_agent=sell_agent)

stats_train = compute_mean_delta(
    trades=res_train_sell["trades"],
    prices=p_train,
    horizon=cfg.trade_manager.sell_horizon,
    tc=cfg.reward.transaction_cost,
)

stats_test = compute_mean_delta(
    trades=res_test_sell["trades"],
    prices=p_test,
    horizon=cfg.trade_manager.sell_horizon,
    tc=cfg.reward.transaction_cost,
)

print("SELL Δ TRAIN:", stats_train)
print("SELL Δ TEST :", stats_test)


# -----------------------
# HARVEST ENTRIES (NEW LOGIC) — use these for SellEnv training
# -----------------------
# train_entries_sell = harvest_entries(
#     "TRAIN",
#     tm_train_sell,
#     topk_per_seg=TOPK_PER_SEG_TRAIN,
#     min_gap=MIN_GAP_TRAIN,
#     use_confidence_score=USE_CONF_SCORE,
# )

# test_entries_sell = harvest_entries(
#     "TEST",
#     tm_test_sell,
#     topk_per_seg=TOPK_PER_SEG_TEST,
#     min_gap=MIN_GAP_TEST,
#     use_confidence_score=USE_CONF_SCORE,
# )

# -----------------------
# SAVE ARTIFACTS (into out_dir)
# -----------------------
os.makedirs(out_dir, exist_ok=True)

train_entries_path = os.path.join(out_dir, "entry_indices_train_sell.npy")
test_entries_path  = os.path.join(out_dir, "entry_indices_test_sell.npy")

# NEW: save harvested entries (not trade entries)
np.save(train_entries_path, train_entries)
np.save(test_entries_path,  test_entries)

train_trades_sell_json = os.path.join(out_dir, "trades_buy_sell_train.json")
test_trades_sell_json  = os.path.join(out_dir, "trades_buy_sell_test.json")

with open(train_trades_json, "w") as f:
    json.dump(res_train["trades"], f, indent=2)

with open(test_trades_json, "w") as f:
    json.dump(res_test["trades"], f, indent=2)

print("\n=== SAVED ===")
print(" -", train_entries_path, "(HARVESTED)")
print(" -", test_entries_path,  "(HARVESTED)")
print(" -", train_trades_json)
print(" -", test_trades_json)


=== DATA SPLIT ===
features: (6195, 10) prices: (6195,)
SEG_LEN: 1239 N_SEGS: 5 TRAIN_FRAC: 0.7
train_len per seg: 867 test_len per seg: 372
X_train: (4335, 10) p_train: (4335,)
X_test : (1860, 10) p_test : (1860,)
Sell seen: 1332 Sell actions: 243
Exit reasons: {'time': 110, 'sell_agent': 39, 'segment_end': 1}
Non-time exits: [{'entry_idx': 54, 'exit_idx': 69, 'entry_price': 48.49269104003906, 'exit_price': 47.91735076904297, 'gross_return': -0.011864473978584514, 'net_return': -0.013839756895101307, 'hold_bars': 15, 'forced_exit': False, 'meta': {'buy_conf': 0.5191797409353899, 'reason': 'sell_agent', 'sell_conf': 0.7618173928959574, 'sell_q0': 0.0325348861515522, 'sell_q1': 0.05578827112913132, 'sell_margin': 0.023253384977579117, 'sell_delta_vs_hold': 0.04867945231696269, 'sell_baseline_net': -0.062519209212064, 'sell_net_now': -0.013839756895101307}}, {'entry_idx': 170, 'exit_idx': 189, 'entry_price': 52.672393798828125, 'exit_price': 58.525146484375, 'gross_return': 0.11111613244

## How to "pass args" in a notebook

Instead of CLI args, edit the **Parameters** cell.

If you really want args-style overrides, you can do:

```python
import os
config_path = os.getenv("CFG", config_path)
features_npy = os.getenv("FEAT", features_npy)
prices_npy = os.getenv("PRICES", prices_npy)
```
