### Imports

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Tuple

import gymnasium as gym
import gym_trading_env  # IMPORTANT: Pour enregistrer l'environnement

from stable_baselines3 import PPO, SAC
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.utils import set_random_seed

import wandb

os.makedirs("models", exist_ok=True)
os.makedirs("outputs", exist_ok=True)

print("‚úÖ Imports r√©ussis")

ModuleNotFoundError: No module named 'gym_trading_env'

### Configuration WandB et des Tests

In [None]:
# WandB (mettre False pour d√©sactiver)
USE_WANDB = True  

# Liste des configurations √† tester
TEST_CONFIGS = [
    # # Test 1 : Simple Return (baseline)
    # {
    #     "name": "simple_return_baseline",
    #     "algo": "PPO",
    #     "reward_type": "simple_return",
    #     "risk_penalty": 0.0,
    #     "reward_scaling": 1.0,
    #     "learning_rate": 3e-4,
    #     "timesteps": 500000,  # ‚Üê R√©duire √† 5000 pour test ultra-rapide
    # },
    
    # # Test 2 : Sharpe Conservateur
    # {
    #     "name": "sharpe_conservative",
    #     "algo": "PPO",
    #     "reward_type": "clipped_sharpe",
    #     "risk_penalty": 0.3,
    #     "reward_scaling": 1.0,
    #     "learning_rate": 1e-4,
    #     "timesteps": 500000,
    # },
    
    # # Test 3 : Sharpe √âquilibr√©
    # {
    #     "name": "sharpe_balanced",
    #     "algo": "PPO",
    #     "reward_type": "clipped_sharpe",
    #     "risk_penalty": 0.1,
    #     "reward_scaling": 1.5,
    #     "learning_rate": 3e-4,
    #     "timesteps": 500000,
    # },
    
    # # Test 4 : Sharpe Agressif
    # {
    #     "name": "sharpe_aggressive",
    #     "algo": "PPO",
    #     "reward_type": "clipped_sharpe",
    #     "risk_penalty": 0.05,
    #     "reward_scaling": 2.0,
    #     "learning_rate": 5e-4,
    #     "timesteps": 500000,
    # },
    
    # # Test 5 : Momentum-Based
    # {
    #     "name": "momentum_aggressive",
    #     "algo": "PPO",
    #     "reward_type": "momentum_based",
    #     "risk_penalty": 0.05,
    #     "reward_scaling": 2.0,
    #     "learning_rate": 3e-4,
    #     "timesteps": 500000,
    # },
    
    # # Test 6 : Profit avec P√©nalit√© Drawdown
    # {
    #     "name": "profit_with_drawdown",
    #     "algo": "PPO",
    #     "reward_type": "profit_drawdown",
    #     "risk_penalty": 0.5,
    #     "reward_scaling": 1.5,
    #     "learning_rate": 3e-4,
    #     "timesteps": 500000,
    # },
    
    # Test 7 : SAC avec Simple Return
    # {
    #     "name": "sac_simple",
    #     "algo": "SAC",
    #     "reward_type": "simple_return",
    #     "risk_penalty": 0.0,
    #     "reward_scaling": 1.0,
    #     "learning_rate": 3e-4,
    #     "timesteps": 500000,
    # },
    
    # # Test 8 : SAC avec Sharpe
    # {
    #     "name": "sac_sharpe",
    #     "algo": "SAC",
    #     "reward_type": "clipped_sharpe",
    #     "risk_penalty": 0.1,
    #     "reward_scaling": 1.5,
    #     "learning_rate": 3e-4,
    #     "timesteps": 500000,
    # },
     {
        "name": "sharpe_balanced_v2",
        "algo": "PPO",
        "reward_type": "clipped_sharpe",
        "risk_penalty": 0.1,
        "reward_scaling": 1.5,
        "learning_rate": 3e-4,
        "n_steps": 2048,
        "batch_size": 64,
        "n_epochs": 10,
        "ent_coef": 0.01,  # Exploration
        "timesteps": 500000,  # 50k pour meilleurs r√©sultats
    },
    
    # # Test 2 : Sharpe Plus Agressif (moins de p√©nalit√© risque)
    # {
    #     "name": "sharpe_aggressive_v2",
    #     "algo": "PPO",
    #     "reward_type": "clipped_sharpe",
    #     "risk_penalty": 0.05,  # ‚Üê Plus faible = plus agressif
    #     "reward_scaling": 2.0,  # ‚Üê Plus de scaling
    #     "learning_rate": 5e-4,  # ‚Üê Learning rate plus √©lev√©
    #     "n_steps": 2048,
    #     "batch_size": 64,
    #     "n_epochs": 10,
    #     "ent_coef": 0.02,  # Plus d'exploration
    #     "timesteps": 500000,
    # },
    
    # # Test 3 : Sharpe Conservateur (plus de p√©nalit√© risque)
    # {
    #     "name": "sharpe_conservative_v2",
    #     "algo": "PPO",
    #     "reward_type": "clipped_sharpe",
    #     "risk_penalty": 0.2,  # ‚Üê Plus √©lev√© = plus conservateur
    #     "reward_scaling": 1.0,
    #     "learning_rate": 1e-4,  # ‚Üê Learning rate plus faible
    #     "n_steps": 4096,  # ‚Üê Plus de steps = plus stable
    #     "batch_size": 128,  # ‚Üê Batch plus grand
    #     "n_epochs": 15,  # ‚Üê Plus d'√©poques
    #     "ent_coef": 0.005,  # Moins d'exploration
    #     "timesteps": 500000,
    # },
    
   
    
    # # Sortino Ratio (p√©nalise seulement downside)
    # {
    #     "name": "sortino_balanced",
    #     "algo": "PPO",
    #     "reward_type": "sortino_ratio",
    #     "risk_penalty": 0.1,
    #     "reward_scaling": 1.5,
    #     "learning_rate": 3e-4,
    #     "n_steps": 2048,
    #     "batch_size": 64,
    #     "n_epochs": 10,
    #     "ent_coef": 0.01,
    #     "timesteps": 500000,
    # },
    
   
    
    # # Simple Return avec Hyperparam√®tres Optimis√©s
    # {
    #     "name": "simple_return_optimized",
    #     "algo": "PPO",
    #     "reward_type": "simple_return",
    #     "risk_penalty": 0.0,  # Pas utilis√©
    #     "reward_scaling": 1.0,
    #     "learning_rate": 3e-4,
    #     "n_steps": 2048,
    #     "batch_size": 64,
    #     "n_epochs": 10,
    #     "ent_coef": 0.015,  # Un peu plus d'exploration
    #     "timesteps": 500000,
    # },
    
   
    # # Sharpe + Momentum Hybride
    # {
    #     "name": "hybrid_sharpe_momentum",
    #     "algo": "PPO",
    #     "reward_type": "hybrid_sharpe_momentum", 
    #     "risk_penalty": 0.1,
    #     "reward_scaling": 1.5,
    #     "learning_rate": 3e-4,
    #     "n_steps": 2048,
    #     "batch_size": 64,
    #     "n_epochs": 10,
    #     "ent_coef": 0.01,
    #     "timesteps": 500000,
    # }
]

print(f"‚úÖ {len(TEST_CONFIGS)} configurations √† tester")
print(f"   WandB: {'Activ√©' if USE_WANDB else 'D√©sactiv√©'}")

‚úÖ 1 configurations √† tester
   WandB: Activ√©


### Fonctions de Pr√©traitement

In [None]:
def load_pkl(path: str) -> pd.DataFrame:
    df = pd.read_pickle(path)
    if not isinstance(df.index, pd.DatetimeIndex):
        for col in ["datetime", "date", "time", "timestamp", "Timestamp"]:
            if col in df.columns:
                df = df.set_index(pd.to_datetime(df[col]))
                break
    df.index = pd.to_datetime(df.index)
    return df


def feature_log_return(df: pd.DataFrame, col: str = "close") -> pd.Series:
    return np.log(df[col]).diff()


def feature_moving_average(df: pd.DataFrame, col: str = "close", window: int = 20) -> pd.Series:
    return df[col].rolling(window).mean()


def feature_volatility(df: pd.DataFrame, col: str = "close", window: int = 20) -> pd.Series:
    return df[col].pct_change().rolling(window).std()


def feature_RSI(df: pd.DataFrame, col: str = "close", window: int = 14) -> pd.Series:
    delta = df[col].diff()
    up = delta.clip(lower=0)
    down = -delta.clip(upper=0)
    ma_up = up.rolling(window=window).mean()
    ma_down = down.rolling(window=window).mean()
    rs = ma_up / (ma_down + 1e-9)
    rsi = 100 - (100 / (1 + rs))
    return rsi


def feature_MACD(df: pd.DataFrame, col: str = "close", fast: int = 12, slow: int = 26, signal: int = 9):
    ema_fast = df[col].ewm(span=fast, adjust=False).mean()
    ema_slow = df[col].ewm(span=slow, adjust=False).mean()
    macd = ema_fast - ema_slow
    macd_signal = macd.ewm(span=signal, adjust=False).mean()
    return macd, macd_signal


def feature_OBV(df: pd.DataFrame, col_close: str = "close", col_volume: str = "volume") -> pd.Series:
    if col_volume not in df.columns:
        return pd.Series(0, index=df.index)
    obv = [0]
    for i in range(1, len(df)):
        if df[col_close].iat[i] > df[col_close].iat[i - 1]:
            obv.append(obv[-1] + df[col_volume].iat[i])
        elif df[col_close].iat[i] < df[col_close].iat[i - 1]:
            obv.append(obv[-1] - df[col_volume].iat[i])
        else:
            obv.append(obv[-1])
    return pd.Series(obv, index=df.index)


def zscore(series: pd.Series) -> pd.Series:
    return (series - series.mean()) / (series.std() + 1e-9)


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df = df.sort_index().drop_duplicates().dropna(how="all")
    
    if "close" not in df.columns:
        for c in ["Close", "Adj Close", "adjclose", "adj_close"]:
            if c in df.columns:
                df["close"] = df[c]
                break
    if "volume" not in df.columns:
        df["volume"] = 0

    # Features
    df["feature_log_return"] = feature_log_return(df, "close")
    df["feature_RSI"] = feature_RSI(df, "close")
    macd, macd_signal = feature_MACD(df, "close")
    df["feature_MACD"] = macd
    df["feature_MACD_signal"] = macd_signal
    
    for w in [5, 20, 50]:
        df[f"feature_ma_{w}"] = feature_moving_average(df, "close", w)
    for w in [10, 20, 50]:
        df[f"feature_vol_{w}"] = feature_volatility(df, "close", w)
    df["feature_OBV"] = feature_OBV(df, "close", "volume")

    # Normalisation
    feat_cols = [c for c in df.columns if c.startswith("feature_")]
    for c in feat_cols:
        df[c] = df[c].ffill().fillna(0) 
        df[c] = zscore(df[c])
    
    return df.dropna()


# V√©rifier les donn√©es
dataset_dir = "data/*.pkl"
files = sorted(glob.glob(dataset_dir))
print(f"‚úÖ Trouv√© {len(files)} datasets")

‚úÖ Trouv√© 9 datasets


### Fonctions de R√©compense

In [None]:

def reward_simple_return(history, **kwargs) -> float:
    """REWARD 1 : Simple rendement en pourcentage"""
    current_value = history['portfolio_valuation', -1]
    
    if len(history) < 2:
        return 0.0
    
    previous_value = history['portfolio_valuation', -2]
    if previous_value <= 0:
        return -1.0
    
    pct_change = (current_value - previous_value) / previous_value
    return np.clip(pct_change * 100, -10, 10)


def reward_clipped_sharpe(history, risk_penalty=0.1, reward_scaling=1.0) -> float:
    """REWARD 2 : Ratio de Sharpe clipp√©"""
    current_value = history['portfolio_valuation', -1]
    
    if len(history) < 2:
        return 0.0
    
    previous_value = history['portfolio_valuation', -2]
    if previous_value <= 0 or current_value <= 0:
        return -1.0
    
    instant_log_return = np.log(current_value / previous_value)
    instant_log_return = np.clip(instant_log_return, -0.1, 0.1)
    
    WINDOW = 20
    all_values_np = np.asarray(history['portfolio_valuation'], dtype=np.float64)
    safe_values = np.where(all_values_np <= 0, 1e-9, all_values_np)
    
    if len(safe_values) > 1:
        log_returns = np.diff(np.log(safe_values))
    else:
        log_returns = np.array([0.0])
    
    if len(log_returns) >= WINDOW:
        volatility = np.std(log_returns[-WINDOW:])
    else:
        volatility = np.std(log_returns) if len(log_returns) > 1 else 1e-9
    
    volatility = max(volatility, 0.001)
    sharpe = instant_log_return / (volatility * risk_penalty)
    sharpe = np.clip(sharpe, -10, 10)
    
    return sharpe * reward_scaling


def reward_momentum_based(history, risk_penalty=0.05, reward_scaling=2.0) -> float:
    """REWARD 3 : Bas√© sur le momentum"""
    current_value = history['portfolio_valuation', -1]
    
    if len(history) < 3:
        return 0.0
    
    values = np.asarray(history['portfolio_valuation'][-10:], dtype=np.float64)
    safe_values = np.where(values <= 0, 1e-9, values)
    
    if len(safe_values) < 2:
        return 0.0
    
    returns = np.diff(np.log(safe_values))
    momentum = np.mean(returns[-5:]) if len(returns) >= 5 else np.mean(returns)
    momentum_bonus = np.tanh(momentum * 10) * reward_scaling
    
    instant_return = np.log(safe_values[-1] / safe_values[-2])
    instant_return = np.clip(instant_return, -0.1, 0.1)
    
    total_reward = (instant_return * 100) + momentum_bonus
    return np.clip(total_reward, -10, 10)


def reward_profit_drawdown(history, risk_penalty=0.5, reward_scaling=1.5) -> float:
    """REWARD 4 : Profit avec p√©nalit√© pour drawdown"""
    current_value = history['portfolio_valuation', -1]
    
    if len(history) < 2:
        return 0.0
    
    previous_value = history['portfolio_valuation', -2]
    if previous_value <= 0:
        return -1.0
    
    pct_change = (current_value - previous_value) / previous_value
    
    all_values = np.asarray(history['portfolio_valuation'], dtype=np.float64)
    all_values = np.where(all_values <= 0, 1e-9, all_values)
    
    peak_value = np.max(all_values)
    drawdown = (current_value - peak_value) / peak_value
    
    reward = (pct_change * 100) - (abs(drawdown) * 100 * risk_penalty)
    return np.clip(reward, -10, 10) * reward_scaling


def reward_sortino_ratio(history, risk_penalty=0.1, reward_scaling=1.0) -> float:
    """REWARD 5 : Ratio de Sortino (p√©nalise seulement downside)"""
    current_value = history['portfolio_valuation', -1]
    
    if len(history) < 2:
        return 0.0
    
    previous_value = history['portfolio_valuation', -2]
    if previous_value <= 0 or current_value <= 0:
        return -1.0
    
    instant_log_return = np.log(current_value / previous_value)
    instant_log_return = np.clip(instant_log_return, -0.1, 0.1)
    
    WINDOW = 20
    all_values_np = np.asarray(history['portfolio_valuation'], dtype=np.float64)
    safe_values = np.where(all_values_np <= 0, 1e-9, all_values_np)
    
    if len(safe_values) > 1:
        log_returns = np.diff(np.log(safe_values))
    else:
        log_returns = np.array([0.0])
    
    if len(log_returns) >= WINDOW:
        negative_returns = log_returns[-WINDOW:][log_returns[-WINDOW:] < 0]
        downside_vol = np.std(negative_returns) if len(negative_returns) > 0 else 0.001
    else:
        negative_returns = log_returns[log_returns < 0]
        downside_vol = np.std(negative_returns) if len(negative_returns) > 0 else 0.001
    
    downside_vol = max(downside_vol, 0.001)
    sortino = instant_log_return / (downside_vol * risk_penalty)
    sortino = np.clip(sortino, -10, 10)
    
    return sortino * reward_scaling


def reward_calmar_ratio(history, risk_penalty=0.2, reward_scaling=1.0) -> float:
    """REWARD 6 : Ratio de Calmar (rendement / max drawdown)"""
    current_value = history['portfolio_valuation', -1]
    
    if len(history) < 2:
        return 0.0
    
    initial_value = 1000
    total_return = (current_value - initial_value) / initial_value
    
    all_values = np.asarray(history['portfolio_valuation'], dtype=np.float64)
    all_values = np.where(all_values <= 0, 1e-9, all_values)
    
    running_max = np.maximum.accumulate(all_values)
    drawdowns = (all_values - running_max) / running_max
    max_drawdown = abs(np.min(drawdowns)) if len(drawdowns) > 0 else 0.001
    max_drawdown = max(max_drawdown, 0.01)
    
    calmar = total_return / (max_drawdown * risk_penalty)
    calmar = np.clip(calmar, -10, 10)
    
    return calmar * reward_scaling


def reward_hybrid_sharpe_momentum(history, risk_penalty=0.1, reward_scaling=1.5) -> float:
    """
    REWARD HYBRIDE : Combine Sharpe + Momentum
    Le meilleur des deux mondes !
    """
    current_value = history['portfolio_valuation', -1]
    
    if len(history) < 3:
        return 0.0
    
    previous_value = history['portfolio_valuation', -2]
    if previous_value <= 0 or current_value <= 0:
        return -1.0
    
    # ========== PARTIE 1 : SHARPE ==========
    instant_log_return = np.log(current_value / previous_value)
    instant_log_return = np.clip(instant_log_return, -0.1, 0.1)
    
    WINDOW = 20
    all_values_np = np.asarray(history['portfolio_valuation'], dtype=np.float64)
    safe_values = np.where(all_values_np <= 0, 1e-9, all_values_np)
    
    if len(safe_values) > 1:
        log_returns = np.diff(np.log(safe_values))
    else:
        log_returns = np.array([0.0])
    
    if len(log_returns) >= WINDOW:
        volatility = np.std(log_returns[-WINDOW:])
    else:
        volatility = np.std(log_returns) if len(log_returns) > 1 else 1e-9
    
    volatility = max(volatility, 0.001)
    sharpe_component = instant_log_return / (volatility * risk_penalty)
    sharpe_component = np.clip(sharpe_component, -5, 5)  # Limiter √† ¬±5
    
    # ========== PARTIE 2 : MOMENTUM ==========
    values = safe_values[-10:]
    
    if len(values) >= 5:
        returns = np.diff(np.log(values))
        momentum = np.mean(returns[-5:])
        momentum_bonus = np.tanh(momentum * 10) * 0.5  # Bonus ¬±0.5 max
    else:
        momentum_bonus = 0
    
    # ========== COMBINAISON ==========
    # 70% Sharpe + 30% Momentum
    hybrid_reward = (0.7 * sharpe_component) + (0.3 * momentum_bonus * 10)
    
    return np.clip(hybrid_reward, -10, 10) * reward_scaling


# Dictionnaire de toutes les fonctions
REWARD_FUNCTIONS = {
    "simple_return": reward_simple_return,
    "clipped_sharpe": reward_clipped_sharpe,
    "momentum_based": reward_momentum_based,
    "profit_drawdown": reward_profit_drawdown,
    "sortino_ratio": reward_sortino_ratio,
    "calmar_ratio": reward_calmar_ratio,
    "hybrid_sharpe_momentum" :reward_hybrid_sharpe_momentum
}

print(f"‚úÖ {len(REWARD_FUNCTIONS)} fonctions de r√©compense disponibles")

‚úÖ 7 fonctions de r√©compense disponibles


###  Fonction d'Entra√Ænement avec M√©triques Compl√®tes WandB

In [None]:
def train_single_config(config, use_wandb=True):
    """Entra√Æne un seul mod√®le avec une configuration donn√©e"""
    
    print(f"\n{'='*80}")
    print(f"üöÄ ENTRA√éNEMENT: {config['name']}")
    print(f"{'='*80}")
    
    # Extraire les param√®tres
    algo = config['algo']
    reward_type = config['reward_type']
    risk_penalty = config['risk_penalty']
    reward_scaling = config['reward_scaling']
    learning_rate = config['learning_rate']
    timesteps = config['timesteps']
    
    # Cr√©er la fonction de r√©compense
    reward_fn = REWARD_FUNCTIONS[reward_type]
    
    def reward_wrapper(history):
        return reward_fn(history, risk_penalty=risk_penalty, reward_scaling=reward_scaling)
    
    # Cr√©er l'environnement
    env = gym.make(
        "MultiDatasetTradingEnv",
        dataset_dir="data/*.pkl",
        preprocess=preprocess,
        portfolio_initial_value=1_000,
        trading_fees=0.1/100,
        borrow_interest_rate=0.02/100/24,
        reward_function=reward_wrapper,
    )
    
    env.add_metric('Portfolio Valuation', lambda h: round(h['portfolio_valuation', -1], 2))
    
    # Wrapping
    log_dir = f"models/{config['name']}"
    os.makedirs(log_dir, exist_ok=True)
    
    env = Monitor(env, filename=os.path.join(log_dir, "monitor.csv"))
    vec_env = DummyVecEnv([lambda: env])
    vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, clip_obs=10.)
    
    # WandB avec TensorBoard
    tensorboard_log_dir = f"runs/{config['name']}"
    
    if use_wandb:
        wandb.init(
            project="RL-project-trading",
            name=config['name'],
            config=config,
            reinit=True,
            sync_tensorboard=True,  # Synchronise TensorBoard
        )
        print(f"  üìä WandB: https://wandb.ai/{wandb.run.entity}/{wandb.run.project}/runs/{wandb.run.id}")
    
    # Callback D√âTAILL√â avec TOUTES les m√©triques
    class DetailedWandbCallback(BaseCallback):
        def __init__(self):
            super().__init__()
            self.episode_rewards = []
            self.episode_lengths = []
            self.portfolio_values = []
            self.episode_returns = []
            self.max_drawdowns = []
            
        def _on_step(self):
            # Logger tous les 100 steps
            if use_wandb and self.n_calls % 100 == 0:
                # M√©triques d'entra√Ænement de l'algorithme
                if hasattr(self.model, 'logger') and self.model.logger:
                    # Ces m√©triques viennent de l'algorithme lui-m√™me
                    wandb.log({
                        "train/learning_rate": self.model.learning_rate,
                        "timesteps": self.num_timesteps,
                    })
            
            # Logger les infos d'√©pisode (quand un √©pisode se termine)
            for idx, info in enumerate(self.locals.get('infos', [])):
                if 'episode' in info:
                    episode_reward = info['episode']['r']
                    episode_length = info['episode']['l']
                    
                    self.episode_rewards.append(episode_reward)
                    self.episode_lengths.append(episode_length)
                    
                    try:
                        # R√©cup√©rer les infos d√©taill√©es de l'environnement
                        base_env = self.training_env.envs[idx].unwrapped
                        
                        if hasattr(base_env, 'historical_info') and len(base_env.historical_info) > 0:
                            # Portfolio value
                            portfolio_value = base_env.historical_info[-1].get('portfolio_valuation', 1000)
                            self.portfolio_values.append(portfolio_value)
                            
                            # Calcul du rendement
                            total_return_pct = (portfolio_value - 1000) / 1000 * 100
                            self.episode_returns.append(total_return_pct)
                            
                            # Calcul du max drawdown
                            all_portfolio_values = [h['portfolio_valuation'] for h in base_env.historical_info]
                            running_max = np.maximum.accumulate(all_portfolio_values)
                            drawdowns = (np.array(all_portfolio_values) - running_max) / running_max
                            max_drawdown = np.min(drawdowns) if len(drawdowns) > 0 else 0
                            self.max_drawdowns.append(abs(max_drawdown) * 100)
                            
                            # Calcul de la volatilit√©
                            if len(all_portfolio_values) > 1:
                                returns = np.diff(np.log(all_portfolio_values))
                                volatility = np.std(returns) * 100
                            else:
                                volatility = 0
                            
                            # Ratio de Sharpe r√©alis√© (approximatif)
                            if volatility > 0:
                                sharpe_ratio = total_return_pct / volatility
                            else:
                                sharpe_ratio = 0
                            
                            if use_wandb:
                                # LOG COMPLET dans WandB
                                wandb.log({
                                    # √âpisode de base
                                    "episode/reward": episode_reward,
                                    "episode/length": episode_length,
                                    "episode/num_episodes": len(self.episode_rewards),
                                    
                                    # Portfolio
                                    "episode/portfolio_value": portfolio_value,
                                    "episode/total_return_pct": total_return_pct,
                                    
                                    # Risque
                                    "episode/max_drawdown_pct": abs(max_drawdown) * 100,
                                    "episode/volatility_pct": volatility,
                                    "episode/sharpe_ratio": sharpe_ratio,
                                    
                                    # Moyennes mobiles (importantes !)
                                    "episode/mean_reward_100": np.mean(self.episode_rewards[-100:]),
                                    "episode/mean_portfolio_100": np.mean(self.portfolio_values[-100:]),
                                    "episode/mean_return_100": np.mean(self.episode_returns[-100:]),
                                    
                                    # M√©triques cumulatives
                                    "cumulative/total_episodes": len(self.episode_rewards),
                                    "cumulative/best_portfolio": max(self.portfolio_values),
                                    "cumulative/worst_portfolio": min(self.portfolio_values),
                                    "cumulative/avg_episode_length": np.mean(self.episode_lengths),
                                    
                                    # Timesteps
                                    "timesteps": self.num_timesteps,
                                })
                            
                            # Print console
                            print(f"  Episode {len(self.episode_rewards)}: "
                                  f"Reward={episode_reward:.2f}, "
                                  f"Portfolio=${portfolio_value:.2f} ({total_return_pct:+.1f}%), "
                                  f"Drawdown={abs(max_drawdown)*100:.1f}%")
                    
                    except Exception as e:
                        # Fallback : logger au moins les rewards
                        if use_wandb:
                            wandb.log({
                                "episode/reward": episode_reward,
                                "episode/length": episode_length,
                                "episode/num_episodes": len(self.episode_rewards),
                                "episode/mean_reward_100": np.mean(self.episode_rewards[-100:]),
                                "timesteps": self.num_timesteps,
                            })
                        print(f"  Episode {len(self.episode_rewards)}: Reward={episode_reward:.2f}")
            
            return True
    
    callback = DetailedWandbCallback()
    
    # Cr√©er le mod√®le AVEC tensorboard_log
    if algo == "PPO":
        model = PPO(
            "MlpPolicy", vec_env,
            learning_rate=config.get('learning_rate', 3e-4),
            n_steps=config.get('n_steps', 2048),
            batch_size=config.get('batch_size', 64),
            n_epochs=config.get('n_epochs', 10),
            gamma=config.get('gamma', 0.99),
            gae_lambda=config.get('gae_lambda', 0.95),
            clip_range=config.get('clip_range', 0.2),
            ent_coef=config.get('ent_coef', 0.01),
            vf_coef=config.get('vf_coef', 0.5),
            verbose=0,
            tensorboard_log=tensorboard_log_dir,
        )
    elif algo == "SAC":
        model = SAC(
            "MlpPolicy", vec_env,
            learning_rate=learning_rate,
            buffer_size=50000, batch_size=256,
            gamma=0.99, tau=0.005,
            verbose=0,
            tensorboard_log=tensorboard_log_dir,
        )
   
    # Entra√Æner
    print(f"  Timesteps: {timesteps}")
    print(f"  Entra√Ænement en cours...")
    
    model.learn(total_timesteps=timesteps, callback=callback)
    
    # R√©sultats
    results = {
        "name": config['name'],
        "algo": algo,
        "reward_type": reward_type,
        "num_episodes": len(callback.episode_rewards),
        "mean_reward": np.mean(callback.episode_rewards) if callback.episode_rewards else 0,
        "final_portfolio": callback.portfolio_values[-1] if callback.portfolio_values else 1000,
        "mean_portfolio": np.mean(callback.portfolio_values) if callback.portfolio_values else 1000,
        "max_portfolio": np.max(callback.portfolio_values) if callback.portfolio_values else 1000,
        "mean_return": np.mean(callback.episode_returns) if callback.episode_returns else 0,
        "max_drawdown": np.max(callback.max_drawdowns) if callback.max_drawdowns else 0,
    }
    
    # Logging final dans WandB
    if use_wandb and len(callback.episode_rewards) > 0:
        wandb.log({
            "final/total_episodes": len(callback.episode_rewards),
            "final/mean_reward": results["mean_reward"],
            "final/portfolio_value": results["final_portfolio"],
            "final/mean_portfolio": results["mean_portfolio"],
            "final/max_portfolio": results["max_portfolio"],
            "final/total_return_pct": (results["final_portfolio"] - 1000) / 10,
            "final/mean_return_pct": results["mean_return"],
            "final/max_drawdown_pct": results["max_drawdown"],
        })
        
        # Cr√©er un graphique r√©capitulatif custom
        if len(callback.portfolio_values) > 0:
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
            
            # Portfolio evolution
            ax1.plot(callback.portfolio_values, linewidth=2, color='green', alpha=0.7)
            ax1.axhline(y=1000, color='red', linestyle='--', alpha=0.5, label='Initial')
            ax1.fill_between(range(len(callback.portfolio_values)), 
                            1000, callback.portfolio_values, 
                            alpha=0.3, color='green' if callback.portfolio_values[-1] > 1000 else 'red')
            ax1.set_title(f'Portfolio Evolution - {config["name"]}')
            ax1.set_xlabel('Episode')
            ax1.set_ylabel('Portfolio Value ($)')
            ax1.grid(True, alpha=0.3)
            ax1.legend()
            
            # Rewards evolution
            ax2.plot(callback.episode_rewards, alpha=0.3, color='blue', label='Raw')
            if len(callback.episode_rewards) > 10:
                window = min(20, len(callback.episode_rewards) // 5)
                smoothed = pd.Series(callback.episode_rewards).rolling(window=window).mean()
                ax2.plot(smoothed, linewidth=2, color='darkblue', label=f'Smoothed ({window})')
            ax2.axhline(y=0, color='red', linestyle='--', alpha=0.5)
            ax2.set_title('Episode Rewards')
            ax2.set_xlabel('Episode')
            ax2.set_ylabel('Reward')
            ax2.grid(True, alpha=0.3)
            ax2.legend()
            
            plt.tight_layout()
            wandb.log({f"charts/{config['name']}_summary": wandb.Image(fig)})
            plt.close(fig)
        
        wandb.finish()
    
    # Sauvegarder
    model.save(os.path.join(log_dir, "model.zip"))
    vec_env.save(os.path.join(log_dir, "vec_normalize.pkl"))
    
    # Nettoyer
    vec_env.close()
    
    print(f"  ‚úÖ Portfolio Final: ${results['final_portfolio']:.2f} "
          f"({(results['final_portfolio']-1000)/10:+.1f}%)")
    print(f"  ‚úÖ Max Drawdown: {results['max_drawdown']:.1f}%")
    
    return results

print("‚úÖ Fonction d'entra√Ænement avec m√©triques compl√®tes pr√™te")

‚úÖ Fonction d'entra√Ænement avec m√©triques compl√®tes pr√™te


### Lancer tous les tests


In [None]:
all_results = []

for config in TEST_CONFIGS:
    try:
        result = train_single_config(config, use_wandb=USE_WANDB)
        all_results.append(result)
    except Exception as e:
        print(f"‚ùå Erreur avec {config['name']}: {e}")
        import traceback
        traceback.print_exc()

print(f"\n‚úÖ {len(all_results)} configurations test√©es avec succ√®s")


üöÄ ENTRA√éNEMENT: sharpe_balanced_v2


Traceback (most recent call last):
  File "C:\Users\lilia\AppData\Local\Temp\ipykernel_1368\243672580.py", line 5, in <module>
    result = train_single_config(config, use_wandb=USE_WANDB)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\lilia\AppData\Local\Temp\ipykernel_1368\1906654240.py", line 47, in train_single_config
    wandb.init(
    ^^^^^^^^^^
AttributeError: module 'wandb' has no attribute 'init'
Exception ignored in: <_io.FileIO name='C:\\Users\\lilia\\OneDrive\\Documents\\Etude\\CPE Lyon\\5A\\RL\\RL-projet-trading\\models\\sharpe_balanced_v2\\monitor.csv' mode='wb' closefd=True>
Traceback (most recent call last):
  File "C:\Users\lilia\AppData\Local\Temp\ipykernel_1368\243672580.py", line 10, in <module>


‚ùå Erreur avec sharpe_balanced_v2: module 'wandb' has no attribute 'init'

‚úÖ 0 configurations test√©es avec succ√®s


### Test et Comparaison de TOUS les Mod√®les

In [None]:
print("="*100)
print("TEST DE PORTFOLIO FINAL - TOUS LES MOD√àLES")
print("="*100)

# 1. Trouver tous les mod√®les
model_dirs = sorted(glob.glob("models/*/"))
print(f"\n‚úÖ Trouv√© {len(model_dirs)} mod√®les √† tester\n")

# 2. Stocker les r√©sultats
portfolio_results = []
failed_models = []

# 3. Tester chaque mod√®le
for idx, model_dir in enumerate(model_dirs, 1):
    model_name = os.path.basename(model_dir.rstrip('/'))
    
    print(f"\n{'‚îÄ'*100}")
    print(f"[{idx}/{len(model_dirs)}] Test de : {model_name}")
    print(f"{'‚îÄ'*100}")
    
    try:
        # V√©rifier que les fichiers existent
        model_file = os.path.join(model_dir, "model.zip")
        vec_normalize_file = os.path.join(model_dir, "vec_normalize.pkl")
        
        if not os.path.exists(model_file):
            print(f"   Mod√®le introuvable")
            failed_models.append({'name': model_name, 'reason': 'Model file not found'})
            continue
        
        if not os.path.exists(vec_normalize_file):
            print(f"   vec_normalize.pkl introuvable")
            failed_models.append({'name': model_name, 'reason': 'vec_normalize.pkl not found'})
            continue
        
        # D√©tecter l'algorithme
        algo = None
        if 'sac' in model_name.lower():
            algo = 'SAC'
        else:
            algo = 'PPO'
        
        print(f"   üì¶ Algorithme : {algo}")
        
        # Charger le mod√®le
        if algo == 'PPO':
            loaded_model = PPO.load(model_file)
        elif algo == 'SAC':
            loaded_model = SAC.load(model_file)
       
        print(f"‚úÖ Mod√®le charg√©")
        
        # Trouver la config correspondante
        config = None
        for test_config in TEST_CONFIGS:
            if test_config['name'] == model_name:
                config = test_config
                break
        
        # Si pas de config, utiliser simple_return
        if config is None:
            print(f"   Config non trouv√©e, utilisation de simple_return")
            reward_fn = reward_simple_return
            def reward_wrapper(history):
                return reward_fn(history)
        else:
            reward_fn = REWARD_FUNCTIONS[config['reward_type']]
            def reward_wrapper(history):
                return reward_fn(history, 
                               risk_penalty=config.get('risk_penalty', 0.1), 
                               reward_scaling=config.get('reward_scaling', 1.0))
        
        # Cr√©er un environnement de test (comme dans le code original)
        test_env = gym.make(
            "MultiDatasetTradingEnv",
            dataset_dir="data/*.pkl",
            preprocess=preprocess,
            portfolio_initial_value=1_000,
            trading_fees=0.1/100,
            borrow_interest_rate=0.02/100/24,
            reward_function=reward_wrapper,
        )
        
        test_env.add_metric('Portfolio Valuation', lambda h: round(h['portfolio_valuation', -1], 2))
        
        # Wrapping (EXACTEMENT comme le code)
        test_env = Monitor(test_env)
        test_vec_env = DummyVecEnv([lambda: test_env])
        
        # Charger la normalisation COMPL√àTE
        test_vec_env = VecNormalize.load(vec_normalize_file, test_vec_env)
        
        print(f"‚úÖ Environnement cr√©√© et normalis√©")
        
        # Tester sur 10 √©pisodes
        print(f"Test sur 10 √©pisodes...")
        
        episode_portfolios = []
        episode_returns = []
        
        for ep in range(10):
            try:
                # EXACTEMENT LE CODE DE LA CELL 9
                obs = test_vec_env.reset()
                done = False
                final_info = None
                
                while not done:
                    action, _ = loaded_model.predict(obs, deterministic=True)
                    obs, reward, done_ancien, info = test_vec_env.step(action)
                    
                    if done_ancien[0]:
                        final_info = info[0]
                        done = True
                
                # R√©cup√©rer le portfolio final (EXACTEMENT LE CODE)
                if final_info and 'episode' in final_info:
                    base_env_unwrapped = test_vec_env.venv.envs[0].unwrapped
                    
                    try:
                        final_metrics = base_env_unwrapped.get_metrics()
                        final_portfolio_value = final_metrics.get('Portfolio Valuation', None)
                        
                        if final_portfolio_value is not None:
                            episode_portfolios.append(final_portfolio_value)
                            episode_returns.append((final_portfolio_value - 1000) / 1000 * 100)
                            
                            if ep == 0:  # Afficher le premier pour debug
                                print(f"      √âpisode 1: ${final_portfolio_value:.2f}")
                    except AttributeError:
                        print(f"       √âpisode {ep+1}: Impossible d'acc√©der aux m√©triques")
                else:
                    print(f"       √âpisode {ep+1}: Non termin√© correctement")
            
            except Exception as ep_error:
                print(f"      ‚ùå √âpisode {ep+1}: Erreur - {ep_error}")
        
        # Fermer proprement
        base_env_unwrapped = test_vec_env.venv.envs[0].unwrapped
        base_env_unwrapped.close()
        test_vec_env.close()
        
        # Calculer les statistiques
        if len(episode_portfolios) >= 3:  # Au moins 3 √©pisodes r√©ussis
            result = {
                'name': model_name,
                'algo': algo,
                'num_episodes_tested': len(episode_portfolios),
                'mean_portfolio': np.mean(episode_portfolios),
                'std_portfolio': np.std(episode_portfolios),
                'min_portfolio': np.min(episode_portfolios),
                'max_portfolio': np.max(episode_portfolios),
                'median_portfolio': np.median(episode_portfolios),
                'mean_return_pct': np.mean(episode_returns),
                'std_return_pct': np.std(episode_returns),
                'success_rate': sum(1 for p in episode_portfolios if p > 1000) / len(episode_portfolios) * 100,
            }
            
            portfolio_results.append(result)
            
            # Afficher r√©sultat
            print(f"\n    R√âSULTATS ({len(episode_portfolios)}/10 √©pisodes):")
            print(f"      Portfolio Moyen    : ${result['mean_portfolio']:.2f} ¬± ${result['std_portfolio']:.2f}")
            print(f"      Rendement Moyen    : {result['mean_return_pct']:+.2f}% ¬± {result['std_return_pct']:.2f}%")
            print(f"      Min - Max          : ${result['min_portfolio']:.2f} - ${result['max_portfolio']:.2f}")
            print(f"      M√©diane            : ${result['median_portfolio']:.2f}")
            print(f"      Taux de R√©ussite   : {result['success_rate']:.0f}%")
            print(f"   ‚úÖ Test r√©ussi")
        else:
            print(f"   ‚ùå Trop peu d'√©pisodes compl√©t√©s ({len(episode_portfolios)}/10)")
            failed_models.append({'name': model_name, 'reason': f'Only {len(episode_portfolios)} episodes completed'})
    
    except Exception as e:
        print(f"   ‚ùå Erreur : {e}")
        failed_models.append({'name': model_name, 'reason': str(e)})
        import traceback
        traceback.print_exc()

print(f"\n{'='*100}")
print(f"‚úÖ Tests termin√©s : {len(portfolio_results)}/{len(model_dirs)} mod√®les test√©s avec succ√®s")

if len(failed_models) > 0:
    print(f"\n‚ö†Ô∏è {len(failed_models)} mod√®les ont √©chou√©:")
    for failed in failed_models:
        print(f"   - {failed['name']}: {failed['reason']}")

print(f"{'='*100}")

TEST DE PORTFOLIO FINAL - TOUS LES MOD√àLES

‚úÖ Trouv√© 1 mod√®les √† tester


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
[1/1] Test de : 
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   Mod√®le introuvable

‚úÖ Tests termin√©s : 0/1 mod√®les test√©s avec succ√®s

‚ö†Ô∏è 1 mod√®les ont √©chou√©:
   - : Model file not found
