In [None]:
# =============================
# 1) Imports & config
# =============================
# %pip install tensorflow numpy matplotlib tqdm --upgrade --quiet

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
import matplotlib.pyplot as plt
from collections import deque
from tqdm import tqdm

np.set_printoptions(suppress=True)
print("✅ TensorFlow :", tf.__version__)

# =============================
# 2) Data loading
# =============================
print("📦 Chargement btc_dataset_for_agent.npz ...")
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code. Keep it only if the archive is trusted and
# really contains object arrays; plain numeric arrays load fine without it.
# The context manager closes the underlying zip handle once the arrays have
# been copied out (astype() returns independent copies).
with np.load("btc_dataset_for_agent.npz", allow_pickle=True) as data:
    features = data["features"].astype(np.float32)   # (N, d_f)
    signals  = data["signals"].astype(np.float32)    # (N, d_s)
    prices   = data["prices"].astype(np.float32).squeeze()  # (N,)

# The three arrays are indexed with the same time index everywhere below,
# so a length mismatch would silently misalign prices and inputs.
if not (len(features) == len(signals) == len(prices)):
    raise ValueError(
        f"Misaligned dataset: features={len(features)}, "
        f"signals={len(signals)}, prices={len(prices)}"
    )

pos_dim   = 1  # one extra column carrying the current position
base_dim  = features.shape[1] + signals.shape[1] + pos_dim
n_actions = 3  # 0=Hold, 1=Buy, 2=Sell

# Chronological 80/20 split (no shuffling).
split = int(0.8 * len(prices))
Xf_train, Xf_test = features[:split], features[split:]
Xs_train, Xs_test = signals[:split],  signals[split:]
P_train,  P_test  = prices[:split],   prices[split:]

print(f"✅ Données : features {features.shape}, signals {signals.shape}, prices {prices.shape}")
print(f"🔧 Split -> Train: {len(P_train)} | Test: {len(P_test)}")

# =============================
# 3) Environnement avec fenêtre (seq_len)
# =============================
class TradingEnv:
    """
    Windowed trading environment over a single price series.

    Actions: 0=Hold (flat), 1=Long, 2=Short
    Reward = reward_scale * ( position * Δlog(P) - turnover cost ),
             minus hold_penalty whenever the action is 0
    Observation = sliding window (seq_len x state_dim) whose rows are
                  [features | signals | position]

    fee_bps is applied as a plain multiplier on |new position - old position|.
    """
    def __init__(self, features, signals, prices, seq_len=16, fee_bps=0.0005, reward_scale=100.0, hold_penalty=0.01):
        # Market data, stored as float32
        self.features = features.astype(np.float32)
        self.signals  = signals.astype(np.float32)
        self.prices   = prices.astype(np.float32).squeeze()
        # Scalar settings, coerced to plain Python numbers
        self.seq_len  = int(seq_len)
        self.fee_bps  = float(fee_bps)
        self.scale    = float(reward_scale)
        self.hold_penalty = float(hold_penalty)

        # Log prices (clipped away from zero); their differences drive the reward
        self.logp = np.log(np.clip(self.prices, 1e-6, None))
        self.reset()

    def _state_now(self):
        """Return the state row for the current time index."""
        position_col = np.asarray([self.position], dtype=np.float32)
        return np.concatenate(
            (self.features[self.t], self.signals[self.t], position_col),
            axis=-1,
        )

    def _obs_from_buffer(self):
        """Return the window as one float32 array with one row per buffered state."""
        return np.array(self.buffer, dtype=np.float32)

    def reset(self):
        """Restart the episode and return the first observation window."""
        self.t = 0
        self.position = 0  # -1, 0 or +1
        self.pnl = 0.0     # running sum of net log returns
        self.buffer = deque(maxlen=self.seq_len)
        # Pre-fill the window with flat-position rows, moving the clock
        # forward one index per row while there is still data ahead.
        final_index = len(self.prices) - 1
        for _ in range(self.seq_len):
            self.buffer.append(self._state_now())
            self.t += int(self.t < final_index)
        return self._obs_from_buffer()

    def step(self, action:int):
        """Apply `action`; return (next_obs or None when done, reward, done)."""
        old_position = self.position
        self.position = 1 if action == 1 else (-1 if action == 2 else 0)

        if self.t < len(self.prices) - 1:
            log_return = float(self.logp[self.t + 1] - self.logp[self.t])
            cost = self.fee_bps * abs(self.position - old_position)
            net = self.position * log_return - cost
            reward = net * self.scale
            if action == 0:
                reward -= self.hold_penalty  # discourages always staying flat
            self.pnl += net
        else:
            reward = 0.0

        # Record the row for the current index, then move the clock forward
        self.buffer.append(self._state_now())
        self.t += 1
        finished = self.t >= (len(self.prices) - 2)
        if finished:
            return None, float(reward), bool(finished)
        return self._obs_from_buffer(), float(reward), bool(finished)

# Build one environment per data split
SEQ_LEN = 16
# The hold penalty is only applied while training; evaluation uses none.
env_train = TradingEnv(
    features=Xf_train, signals=Xs_train, prices=P_train,
    seq_len=SEQ_LEN, hold_penalty=0.01,
)
env_test = TradingEnv(
    features=Xf_test, signals=Xs_test, prices=P_test,
    seq_len=SEQ_LEN, hold_penalty=0.0,
)

state_dim = base_dim  # width of one row of the observation window
print(f"🧩 Observation : (seq_len={SEQ_LEN}, state_dim={state_dim})")

# =============================
# 4) DARQN (LSTM + Dueling) TF
# =============================

# Run everything in plain float32.
tf.keras.mixed_precision.set_global_policy("float32")


def build_darqn(input_dim, n_actions, hidden=128, seq_len=16):
    """Build the recurrent dueling Q-network.

    Args:
        input_dim: width of one row of the observation window.
        n_actions: number of Q-values produced per observation.
        hidden: units of the per-step dense layer and of the LSTM.
        seq_len: number of rows in the observation window.

    Returns:
        A Keras model named "DARQN" mapping (batch, seq_len, input_dim)
        windows to (batch, n_actions) Q-values.
    """
    window = layers.Input(shape=(seq_len, input_dim))
    # Per-step dense embedding, then an LSTM that keeps only its final output
    embedded = layers.TimeDistributed(layers.Dense(hidden, activation='relu'))(window)
    encoded = layers.LSTM(hidden, return_sequences=False)(embedded)

    # --- Dueling head: state-value stream ---
    value_hidden = layers.Dense(64, activation='relu')(encoded)
    state_value = layers.Dense(1, name="value")(value_hidden)               # (B,1)

    # --- Dueling head: advantage stream ---
    adv_hidden = layers.Dense(64, activation='relu')(encoded)
    advantage = layers.Dense(n_actions, name="advantage")(adv_hidden)       # (B,nA)

    # Centre the advantages by subtracting their per-sample mean
    mean_advantage = layers.Lambda(
        lambda a: tf.reduce_mean(a, axis=1, keepdims=True),
        name="adv_mean",
    )(advantage)
    centred_advantage = layers.Subtract(name="adv_centered")([advantage, mean_advantage])

    # Q(s,a) = V(s) + centred A(s,a); the (B,1) value is broadcast over actions
    q_values = layers.Add(name="q_values")([state_value, centred_advantage])

    return models.Model(window, q_values, name="DARQN")

# Online network and target network share one architecture
qnet = build_darqn(input_dim=state_dim, n_actions=n_actions, hidden=128, seq_len=SEQ_LEN)
tgt  = build_darqn(input_dim=state_dim, n_actions=n_actions, hidden=128, seq_len=SEQ_LEN)
# Start the target network as an exact copy of the online network
tgt.set_weights(qnet.get_weights())
qnet.summary()

# Adam on a Huber loss for the TD error
opt = optimizers.Adam(learning_rate=1e-4)
loss_fn = tf.keras.losses.Huber()

# =============================
# 5) Replay Buffer simple
# =============================
class ReplayBuffer:
    """Bounded FIFO store of (s, a, r, sn, d) transitions with uniform sampling.

    Each field lives in its own deque (S, A, R, SN, D); once `capacity`
    transitions are stored, adding a new one drops the oldest.
    """
    def __init__(self, capacity=20000):
        self.S, self.SN, self.A, self.R, self.D = (
            deque(maxlen=capacity) for _ in range(5)
        )

    def add(self, s, a, r, sn, d):
        """Append one transition; all five deques stay index-aligned."""
        fields = (self.S, self.A, self.R, self.SN, self.D)
        for store, item in zip(fields, (s, a, r, sn, d)):
            store.append(item)

    def sample(self, batch):
        """Draw `batch` distinct transitions and return (S, A, R, SN, D) arrays.

        Actions come back as int32, every other field as float32.
        """
        picks = np.random.choice(len(self.S), batch, replace=False)

        def gather(store, dtype):
            return np.array([store[j] for j in picks], dtype=dtype)

        return (
            gather(self.S, np.float32),
            gather(self.A, np.int32),
            gather(self.R, np.float32),
            gather(self.SN, np.float32),
            gather(self.D, np.float32),
        )

buffer = ReplayBuffer(20000)

# =============================
# 6) RL hyperparameters
# =============================
gamma         = 0.99
epsilon       = 1.0
epsilon_min   = 0.05
# NOTE(review): epsilon is decayed once per episode (see end of the loop).
# With 60 episodes, 0.997**60 ~= 0.83, so the policy stays almost fully random
# for the whole run. Confirm whether a per-step decay or a faster rate was
# intended.
epsilon_decay = 0.997
tau           = 0.01         # soft-update rate for the target network
batch_size    = 64
update_every  = 32           # environment steps between gradient updates
warmup        = 512          # transitions collected before the first update
episodes      = 60

print("\n🚀 Entraînement sur TRAIN ...\n")
for ep in range(episodes):
    s = env_train.reset()
    done = False
    total_R = 0.0
    step = 0

    while not done:
        # ε-greedy action selection
        if np.random.rand() < epsilon:
            a = np.random.randint(0, n_actions)
        else:
            # Call the model directly: predict() builds a data pipeline on
            # every call and is very slow for a single small input in a loop.
            q = qnet(s[np.newaxis, :], training=False).numpy()[0]
            a = int(np.argmax(q))

        sn, r, done = env_train.step(a)
        # Store every transition, including the terminal one (where the env
        # returns sn=None). The current window is used as a placeholder next
        # state there; the (1 - D) mask removes its contribution to the target.
        buffer.add(s, a, r, s if sn is None else sn, float(done))
        if sn is not None:
            s = sn

        total_R += r
        step += 1

        # Periodic gradient update once enough transitions are stored
        if len(buffer.S) >= max(batch_size, warmup) and (step % update_every == 0):
            S, A, R, SN, D = buffer.sample(batch_size)

            # TD target from the target network, computed outside the tape so
            # that no gradient is tracked through it.
            Qnext = tgt(SN, training=False)
            maxQn = tf.reduce_max(Qnext, axis=1)
            target = R + gamma * maxQn * (1.0 - D)

            with tf.GradientTape() as tape:
                Q     = qnet(S, training=True)                                # (B, nA)
                # Pick Q(s, a) for the action actually taken in each sample
                Q_sel = tf.gather_nd(Q, tf.stack([tf.range(batch_size), A], axis=1))
                loss  = loss_fn(target, Q_sel)

            grads = tape.gradient(loss, qnet.trainable_variables)
            opt.apply_gradients(zip(grads, qnet.trainable_variables))

            # Soft update of the target network towards the online network
            for tp, p in zip(tgt.variables, qnet.variables):
                tp.assign(tau * p + (1.0 - tau) * tp)

    epsilon = max(epsilon * epsilon_decay, epsilon_min)
    print(f"Épisode {ep+1}/{episodes} — Récompense totale (TRAIN) : {total_R:.3f} — ε={epsilon:.3f}")

print("\n✅ Entraînement terminé.")

# =============================
# 7) Greedy evaluation on TEST
# =============================
print("\n🔍 Phase de TEST ...\n")
s = env_test.reset()
# reset() already advanced the environment clock while filling the first
# window, so the i-th action below is taken at price index t_start + i.
t_start = env_test.t
done = False
actions, rewards, pnls = [], [], []

while not done:
    # Direct model call: much faster than predict() for one small input.
    q = qnet(s[np.newaxis, :], training=False).numpy()[0]
    a = int(np.argmax(q))
    sn, r, done = env_test.step(a)
    actions.append(a); rewards.append(r); pnls.append(env_test.pnl)
    if sn is not None: s = sn

print("✅ Test terminé.")

# =============================
# 8) Plots & metrics
# =============================
plt.figure(figsize=(12,6))
plt.plot(P_test, label='Prix BTC (TEST)', alpha=0.7)
# Shift action indices by t_start so each marker sits on the price at which
# the decision was actually taken (not seq_len steps too early).
buy  = [t_start + i for i, a in enumerate(actions) if a == 1]
sell = [t_start + i for i, a in enumerate(actions) if a == 2]
if buy:  plt.scatter(buy,  P_test[buy],  color='green', marker='^', label='Achat')
if sell: plt.scatter(sell, P_test[sell], color='red',   marker='v', label='Vente')
plt.title("Décisions de l’agent — TEST")
plt.legend(); plt.show()

plt.figure(figsize=(10,4))
plt.plot(np.array(pnls)*100, color='orange', label='PnL cumulé (%)')
plt.title("Profit net cumulé — TEST"); plt.legend(); plt.show()

R = np.array(rewards, np.float32)
# env.pnl is a running sum of net log returns; x100 reports it as a percentage.
pnl_total_pct = float(pnls[-1])*100 if len(pnls) else 0.0
# Per-step mean/std of the rewards (not annualised). Guarded so that an empty
# episode reports 0 instead of NaN with a runtime warning.
sharpe = float(R.mean() / (R.std() + 1e-8)) if R.size else 0.0
# Share of steps with a strictly positive reward.
win = float((R > 0).mean())*100.0 if R.size else 0.0

print("\n📊 Évaluation finale (TEST)")
print(f"💰 PnL total : {pnl_total_pct:.2f}%")
print(f"⚖️ Sharpe : {sharpe:.4f}")
print(f"🏆 Taux de succès : {win:.2f}%")


Note: you may need to restart the kernel to use updated packages.
✅ TensorFlow : 2.20.0-rc0
📦 Chargement btc_dataset_for_agent.npz ...
✅ Données : features (1367, 5), signals (1367, 2), prices (1367,)
🔧 Split -> Train: 1093 | Test: 274
🧩 Observation : (seq_len=16, state_dim=8)




🚀 Entraînement sur TRAIN ...



KeyboardInterrupt: 