#  Notebook 3 : Reinforcement Learning (T08)

Ce notebook explore l'agent de trading basé sur le Reinforcement Learning :
- **Environnement Gym** : Custom trading environment
- **Agent DQN** : Deep Q-Network avec experience replay
- **Entraînement** : 2022 (train) + 2023 (validation)
- **Test** : Performance sur 2024
- **Comparaison** : DQN vs ML (T07)

---

##  Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import sys

# Make the project's RL modules importable. The path is relative, so the
# notebook must be run from a directory where '../src/rl' points at them.
sys.path.append('../src/rl')

from trading_env import TradingEnv
from dqn_agent import DQNAgent

# Plot configuration shared by every figure in this notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

##  Environnement de Trading

### Espace d'actions
- **0** : HOLD (ne rien faire)
- **1** : BUY (position longue)
- **2** : SELL (position courte / fermer position)

### Espace d'états
- Features techniques (RSI, MACD, EMAs, etc.)
- Position actuelle
- P&L non réalisé
- Balance normalisée

In [None]:
# Load the 2022 dataset from the processed parquet file
df_2022 = pd.read_parquet('../data/processed/ml_dataset_2022.parquet')

# Report the size of the loaded frame (rows, then columns)
n_rows, n_cols = df_2022.shape
print(" Donn√©es charg√©es (2022)")
print(f"   Lignes: {n_rows:,}\n   Colonnes: {n_cols}")

# Preview the first rows as the cell's rich output
df_2022.head()

In [None]:
# Build the trading environment on the 2022 data with a 10,000 starting balance
env = TradingEnv(df_2022, initial_balance=10000)

# Summarise the observation/action spaces and the feature columns the env exposes
env_summary = [
    " Environnement cr√©√©",
    f"   State size: {env.observation_space.shape[0]}",
    f"   Action size: {env.action_space.n}",
    f"   Feature columns: {len(env.feature_cols)}",
    "\nüìã Premi√®res features:",
]
print("\n".join(env_summary))
print(env.feature_cols[:10])

##  Test de l'environnement

Simulons quelques actions pour comprendre le fonctionnement

In [None]:
# Smoke test of the environment: reset it, then take up to 10 random actions
# and print the reward and balance after each one.

# Seed the action sampler so the printed sequence is the same on every
# "Restart & Run All".
# NOTE(review): assumes env.action_space is a Gym/Gymnasium space exposing
# .seed(); it already provides .sample() and .n -- confirm in trading_env.py.
env.action_space.seed(42)

# Start a fresh episode
state, info = env.reset()

print(" Environnement reset√©")
print(f"   State shape: {state.shape}")
print(f"   Balance initiale: {info['balance']:.2f}")

# Display names indexed by action id (0=HOLD, 1=BUY, 2=SELL); built once
# instead of on every iteration.
demo_action_names = ('HOLD', 'BUY', 'SELL')

# Actions and rewards observed during the smoke test
actions_log = []
rewards_log = []

for i in range(10):
    action = env.action_space.sample()  # random action
    state, reward, terminated, truncated, info = env.step(action)

    actions_log.append(action)
    rewards_log.append(reward)

    action_name = demo_action_names[action]
    print(f"Step {i+1}: Action={action_name}, Reward={reward:.4f}, Balance={info['balance']:.2f}")

    # Stop early if the episode ended before 10 steps
    if terminated or truncated:
        break

## Analyse de l'entraînement

Chargeons l'historique d'entraînement

In [None]:
def load_training_history(path):
    """Read a training-history JSON file.

    Args:
        path: Location of the JSON file (str or Path).

    Returns:
        The decoded JSON content, or None when the file does not exist.
    """
    path = Path(path)
    if not path.exists():
        return None
    # JSON is UTF-8 by specification; be explicit so the read does not depend
    # on the platform's default encoding.
    with path.open('r', encoding='utf-8') as f:
        return json.load(f)


# Load the training history (per the warning below, produced by train_rl.py)
history_file = Path('../models/saved/rl/training_history.json')
history = load_training_history(history_file)

if history is not None:
    train_hist = history['train']
    val_hist = history['val']

    print("‚úì Historique charg√©")
    print(f"   √âpisodes d'entra√Ænement: {len(train_hist['episodes'])}")
    print(f"   √âpisodes de validation: {len(val_hist['episodes'])}")
else:
    # train_hist / val_hist stay undefined here; the analysis cells that
    # follow need them, so train the agent first.
    print("‚ö†Ô∏è Fichier d'historique non trouv√©")
    print("   Assurez-vous d'avoir entra√Æn√© l'agent avec train_rl.py")

### Évolution des performances

In [None]:
# Gather the per-episode training series into one DataFrame.
# Maps each DataFrame column to the key holding that series in train_hist.
history_key_by_column = {
    'episode': 'episodes',
    'return': 'returns',
    'balance': 'balances',
    'trades': 'trades',
    'win_rate': 'win_rates',
    'loss': 'losses',
    'epsilon': 'epsilons',
}
train_df = pd.DataFrame(
    {column: train_hist[key] for column, key in history_key_by_column.items()}
)

# Summary statistics for every training metric
print(" Statistiques d'entra√Ænement:")
print(train_df.describe())

In [None]:
# Training dashboard: a 3x2 grid with one metric per panel
fig, axes = plt.subplots(3, 2, figsize=(16, 12))

# Validation curves are drawn only when validation episodes were recorded
has_validation = bool(val_hist['episodes'])

# Panels overlaying train and validation curves plus a dashed reference line:
# (axis, train_df column, val_hist key, title, y-label, axhline arguments)
overlay_panels = [
    (axes[0, 0], 'return', 'returns', 'Total Return (%)', 'Return %',
     {'y': 0}),
    (axes[0, 1], 'balance', 'balances', 'Balance Finale', 'Balance ($)',
     {'y': 10000, 'label': 'Initial'}),
    (axes[1, 1], 'win_rate', 'win_rates', 'Win Rate', 'Win Rate',
     {'y': 0.5, 'label': '50%'}),
]
for ax, train_col, val_key, title, y_label, reference in overlay_panels:
    ax.plot(train_df['episode'], train_df[train_col],
            label='Train', marker='o', alpha=0.7)
    if has_validation:
        ax.plot(val_hist['episodes'], val_hist[val_key],
                label='Validation', marker='s', alpha=0.7, color='orange')
    ax.axhline(color='black', linestyle='--', alpha=0.3, **reference)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('√âpisode')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True, alpha=0.3)

# Panels showing a single training series:
# (axis, train_df column, line color, title, y-label)
train_only_panels = [
    (axes[1, 0], 'trades', 'green', 'Nombre de Trades', 'Trades'),
    (axes[2, 0], 'epsilon', 'purple', "Taux d'Exploration (Epsilon)", 'Epsilon'),
    (axes[2, 1], 'loss', 'red', "Loss (erreur d'apprentissage)", 'Loss'),
]
for ax, train_col, line_color, title, y_label in train_only_panels:
    ax.plot(train_df['episode'], train_df[train_col],
            marker='o', alpha=0.7, color=line_color)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('√âpisode')
    ax.set_ylabel(y_label)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Analyse détaillée

### Problèmes potentiels

In [None]:
# Text diagnostic of the training run: average return, trading activity,
# win rate and first-vs-last-episodes progression.
print(" DIAGNOSTIC\n")

# 1. Is the average return negative?
avg_return = train_df['return'].mean()
if avg_return < 0:
    print(
        f"‚ùå PROBL√àME: Return moyen n√©gatif ({avg_return:.2f}%)\n"
        "   ‚Üí L'agent perd de l'argent en moyenne"
    )
else:
    print(f"‚úÖ Return moyen positif: {avg_return:.2f}%")

# 2. Trading activity: average trades per episode relative to the number of
#    rows left after removing the environment's lookback window
avg_trades = train_df['trades'].mean()
total_steps = len(df_2022) - env.lookback_window
trade_ratio = avg_trades / total_steps

print(
    "\n Activit√© de trading:\n"
    f"   Trades moyens: {avg_trades:.0f}\n"
    f"   Ratio: {trade_ratio:.2%} (trades / p√©riodes)"
)

if trade_ratio > 0.3:
    trading_verdict = [
        "    L'agent trade BEAUCOUP (>30% du temps)",
        "   ‚Üí Les frais de transaction mangent les profits",
        "   ‚Üí Solution: Augmenter transaction_cost ou p√©naliser les trades",
    ]
elif trade_ratio < 0.05:
    trading_verdict = [
        "    L'agent trade TR√àS PEU (<5% du temps)",
        "   ‚Üí Peut-√™tre trop conservateur",
    ]
else:
    trading_verdict = ["    Bon niveau de trading"]
print("\n".join(trading_verdict))

# 3. Average win rate, judged against a 45%-55% band
avg_win_rate = train_df['win_rate'].mean()
print(f"\n Win Rate: {avg_win_rate:.2%}")
if avg_win_rate < 0.45:
    win_rate_verdict = "    Win rate trop faible (<45%)"
elif avg_win_rate > 0.55:
    win_rate_verdict = "    Bon win rate (>55%)"
else:
    win_rate_verdict = "    Win rate moyen (proche de 50%)"
print(win_rate_verdict)

# 4. Learning progress: mean return of the first 5 vs the last 5 episodes
#    (only reported when more than 5 episodes exist)
if len(train_df) > 5:
    first_5 = train_df['return'].head(5).mean()
    last_5 = train_df['return'].tail(5).mean()
    improvement = last_5 - first_5

    print(
        "\n Progression:\n"
        f"   Return premiers 5 √©pisodes: {first_5:.2f}%\n"
        f"   Return derniers 5 √©pisodes: {last_5:.2f}%\n"
        f"   Am√©lioration: {improvement:+.2f}%"
    )

    if improvement > 5:
        progress_verdict = "    L'agent s'am√©liore !"
    elif improvement < -5:
        progress_verdict = "    L'agent r√©gresse"
    else:
        progress_verdict = "    Peu d'am√©lioration"
    print(progress_verdict)

## Test de l'agent entraîné

Chargeons le meilleur agent

In [None]:
def checkpoint_episode(path):
    """Return the episode number encoded in a ``best_agent_ep<N>.pth`` name.

    Args:
        path: Checkpoint path (str or Path).

    Returns:
        N as an int, or -1 when the file name does not follow the
        ``best_agent_ep<digits>`` pattern.
    """
    prefix = 'best_agent_ep'
    stem = Path(path).stem
    suffix = stem[len(prefix):] if stem.startswith(prefix) else ''
    return int(suffix) if suffix.isdecimal() else -1


def latest_checkpoint(paths):
    """Pick the checkpoint with the highest episode number.

    A plain ``sorted(paths)[-1]`` compares names as text, so ``ep9`` would win
    over ``ep10`` and ``ep100``; this compares the episode numerically and
    falls back to the file name only to break ties.

    Args:
        paths: Iterable of Path objects.

    Returns:
        The selected Path, or None when ``paths`` is empty.
    """
    return max(paths, key=lambda p: (checkpoint_episode(p), p.name), default=None)


# Load the best agent
best_agent_path = Path('../models/saved/rl/best_agent.pth')

# Fall back to the most recent per-episode checkpoint when the exact file is missing
if not best_agent_path.exists():
    rl_models = list(Path('../models/saved/rl').glob('best_agent_ep*.pth'))
    if rl_models:
        best_agent_path = latest_checkpoint(rl_models)
        print(f"‚úì Agent trouv√©: {best_agent_path.name}")
    else:
        print(" Aucun agent entra√Æn√© trouv√©")
        print("   Entra√Ænez d'abord un agent avec train_rl.py")

if best_agent_path.exists():
    # Size the agent from the environment's observation and action spaces
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size=state_size, action_size=action_size)
    agent.load(str(best_agent_path))

    print(f" Agent charg√© depuis {best_agent_path}")
    print(f"   √âpisodes d'entra√Ænement: {agent.episode_count}")
    print(f"   Epsilon actuel: {agent.epsilon:.4f}")

In [None]:
# Run the loaded agent greedily for one (capped) episode and report the result.
# NOTE(review): `env` here is whatever environment is currently in scope; if it
# still wraps the training data, this is not an out-of-sample test.
state, info = env.reset()
actions_taken = []
balance_history = [info['balance']]

max_steps = 1000  # cap the rollout to keep the notebook fast
steps = 0

for _ in range(max_steps):
    # Greedy action: exploration is switched off
    action = agent.select_action(state, training=False)

    # Advance the environment and record what happened
    state, reward, terminated, truncated, info = env.step(action)
    actions_taken.append(action)
    balance_history.append(info['balance'])
    steps += 1

    if terminated or truncated:
        break

# Performance summary as computed by the environment
perf = env.get_performance_summary()

report_lines = [
    f"\nüéÆ Test de l'agent ({steps} steps)",
    f"   Balance initiale: {perf['initial_balance']:.2f} ‚Ç¨",
    f"   Balance finale: {perf['final_balance']:.2f} ‚Ç¨",
    f"   Return: {perf['total_return_pct']:.2f}%",
    f"   Trades: {perf['total_trades']}",
    f"   Win Rate: {perf['win_rate']:.2%}",
]
print("\n".join(report_lines))

In [None]:
# Visualise what the agent did: share of each action (pie) and the balance
# trajectory over the rollout (line).
action_counts = pd.Series(actions_taken).value_counts().sort_index()
action_names = {0: 'HOLD', 1: 'BUY', 2: 'SELL'}
# Colors are keyed by action id rather than passed as a positional list: the
# pie only has wedges for actions that actually occurred, so a positional list
# would color BUY gray and SELL green whenever HOLD is absent.
action_colors = {0: 'gray', 1: 'green', 2: 'red'}

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Action distribution
action_labels = [action_names[i] for i in action_counts.index]
pie_colors = [action_colors[i] for i in action_counts.index]
axes[0].pie(action_counts.values, labels=action_labels, autopct='%1.1f%%',
            colors=pie_colors, startangle=90)
axes[0].set_title('Distribution des Actions', fontsize=14)

# Balance over time, with the initial balance as a dashed reference
axes[1].plot(balance_history, linewidth=2)
axes[1].axhline(y=perf['initial_balance'], color='black',
                linestyle='--', alpha=0.5, label='Initial')
axes[1].set_title('√âvolution du Balance', fontsize=14)
axes[1].set_xlabel('Steps')
axes[1].set_ylabel('Balance ($)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

##  Comparaison : DQN (T08) vs ML (T07)

Comparons les performances de l'agent RL avec les modèles ML

In [None]:
# Metrics reported for every model, in the order of the renamed columns below
metric_keys = ('return', 'trades', 'win_rate')

# T07 (ML) results, hard-coded from the earlier run
ml_results = {
    model: dict(zip(metric_keys, metrics))
    for model, metrics in (
        ('Logistic Regression', (297.54, 10, 0.40)),
        ('Random Forest', (0.00, 0, 0.00)),
        ('XGBoost', (-0.14, 2, 0.00)),
    )
}

# T08 (RL) results: metrics of the last training episode
rl_results = {
    'DQN Agent': {key: train_df[key].iloc[-1] for key in metric_keys}
}

# Merge both result sets and tabulate them, one row per model
all_results = {**ml_results, **rl_results}
comparison_df = pd.DataFrame(all_results).transpose()
comparison_df.columns = ['Return %', 'Trades', 'Win Rate']

banner = "=" * 80
print("\n" + banner)
print("COMPARAISON T07 (ML) vs T08 (RL)")
print(banner)
print(comparison_df.to_string())

# Model with the highest return
best_model = comparison_df['Return %'].idxmax()
best_return = comparison_df['Return %'].max()
print(f"\nüèÜ Meilleur mod√®le: {best_model} ({best_return:.2f}%)")

In [None]:
# Side-by-side comparison: return per model (bars) and win rate vs return (scatter)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
returns_pct = comparison_df['Return %']
win_rates = comparison_df['Win Rate']

# Bar chart of returns: green for strictly positive values, red otherwise
colors = [('green' if value > 0 else 'red') for value in returns_pct]
returns_pct.plot(kind='bar', ax=axes[0], color=colors, alpha=0.7)
axes[0].axhline(y=0, color='black', linestyle='--', alpha=0.3)
axes[0].set_title('Return % - Tous les mod√®les', fontsize=14)
axes[0].set_xlabel('Mod√®le')
axes[0].set_ylabel('Return %')
axes[0].grid(True, alpha=0.3, axis='y')
plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=45, ha='right')

# Scatter of win rate against return, each point annotated with its model name
axes[1].scatter(win_rates, returns_pct, s=200, alpha=0.6)
for model_name, win_rate, return_pct in zip(comparison_df.index, win_rates, returns_pct):
    axes[1].annotate(model_name, (win_rate, return_pct), fontsize=9, ha='center')
axes[1].axhline(y=0, color='black', linestyle='--', alpha=0.3)
axes[1].axvline(x=0.5, color='black', linestyle='--', alpha=0.3)
axes[1].set_title('Win Rate vs Return', fontsize=14)
axes[1].set_xlabel('Win Rate')
axes[1].set_ylabel('Return %')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

##  Conclusion

### Points clés T08 :

**Forces** :
- ✅ Environnement Gym custom fonctionnel
- ✅ Agent DQN avec experience replay
- ✅ Training loop complet avec validation
- ✅ Apprentissage actif (l'agent explore et apprend)

**Faiblesses actuelles** :
- ❌ Returns négatifs (si l'agent trade trop)
- ❌ Frais de transaction trop élevés
- ❌ Besoin d'optimisation des hyperparamètres
