# Policy Gradient Training for Financial Trading

This notebook demonstrates how to train a Policy Gradient agent for financial trading using the organized codebase structure.

## 1. Importing Agents

Import the agent classes from the respective modules in the agents folder.

In [1]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

# Add project root to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))

# Import agents
from agents.policy_gradient.policy_gradient_agent import (
    PolicyGradientAgent,
    train_policy_gradient_agent,
    evaluate_policy_gradient_agent
)

# Import environment
from environments.env_stocktrading import StockTradingEnv

print("âœ“ All imports successful!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

âœ“ All imports successful!
PyTorch version: 2.7.0+cpu
CUDA available: False


## 2. DQN Agent Implementation

Demonstrate the DQN agent from dqn_agent.py, including initialization and training logic.

In [2]:
# Import DQN agent for comparison
from agents.dqn.dqn_agent import DQNAgent

# Example DQN agent initialization
state_dim = 10  # Example state dimension
action_dim = 3  # Example action dimension (hold, buy, sell)

dqn_agent = DQNAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    hidden_dim=128,
    lr=1e-3,
    gamma=0.99,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=0.995,
    buffer_size=10000,
    batch_size=64,
    target_update_freq=100
)

print("âœ“ DQN Agent initialized")
print(f"State dim: {dqn_agent.state_dim}")
print(f"Action dim: {dqn_agent.action_dim}")
print(f"Hidden dim: {dqn_agent.hidden_dim}")

TypeError: DQNAgent.__init__() got an unexpected keyword argument 'action_dim'

## 3. Policy Gradient Agent Implementation

Demonstrate the Policy Gradient agent from policy_gradient_agent.py, including policy updates and gradient computations.

In [None]:
# Policy Gradient Agent initialization
pg_agent = PolicyGradientAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    hidden_dim=128,
    lr=1e-3,
    gamma=0.99,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

print("âœ“ Policy Gradient Agent initialized")
print(f"State dim: {pg_agent.state_dim}")
print(f"Action dim: {pg_agent.action_dim}")
print(f"Device: {pg_agent.device}")

# Test action selection
test_state = np.random.randn(state_dim)
action, log_prob = pg_agent.select_action(test_state)
print(f"Test state shape: {test_state.shape}")
print(f"Selected action: {action}")
print(f"Log probability: {log_prob:.4f}")

# Test policy network
state_tensor = torch.FloatTensor(test_state).unsqueeze(0).to(pg_agent.device)
with torch.no_grad():
    probs = pg_agent.policy_net(state_tensor)
    print(f"Action probabilities: {probs.cpu().numpy().flatten()}")

## 4. Deep SARSA Agent Implementation

Demonstrate the Deep SARSA agents from deep_sarsa_agent.py and deep_sarsa_agent_paper.py, including Q-value updates and action selection.

In [None]:
# Import SARSA agents for comparison
from agents.sarsa.deep_sarsa_agent import DeepSARSAAgent
from agents.sarsa.deep_sarsa_agent_paper import DeepSARSAAgent as DeepSARSAAgentPaper

# Mock environment for demonstration
class MockEnv:
    def __init__(self, state_dim=10, action_dim=3):
        self.state_dim = state_dim
        self.action_dim = action_dim

    def reset(self):
        return np.random.randn(self.state_dim)

    def step(self, action):
        next_state = np.random.randn(self.state_dim)
        reward = np.random.randn()
        done = np.random.rand() > 0.95  # 5% chance of episode end
        return next_state, reward, done, {}

# Initialize mock environment
mock_env = MockEnv(state_dim, action_dim)

# Initialize SARSA agents
sarsa_agent = DeepSARSAAgent(
    env=mock_env,
    state_dim=state_dim,
    action_dim=action_dim,
    hidden_dim=128,
    lr=1e-3,
    gamma=0.99
)

sarsa_paper_agent = DeepSARSAAgentPaper(
    state_dim=state_dim,
    action_dim=action_dim,
    hidden_dim=128,
    lr=1e-3,
    gamma=0.99
)

print("âœ“ SARSA Agents initialized")
print(f"DeepSARSA Agent - State dim: {sarsa_agent.state_dim}, Action dim: {sarsa_agent.action_dim}")
print(f"DeepSARSA Paper Agent - State dim: {sarsa_paper_agent.state_dim}, Action dim: {sarsa_paper_agent.action_dim}")

# Test action selection for SARSA agents
test_state = np.random.randn(state_dim)
action_sarsa = sarsa_agent.select_action(test_state)
action_sarsa_paper = sarsa_paper_agent.select_action(test_state)

print(f"SARSA action: {action_sarsa}")
print(f"SARSA Paper action: {action_sarsa_paper}")

## 5. Training and Evaluation

Demonstrate training the Policy Gradient agent and evaluating its performance.

In [None]:
# Training demonstration (short training for demo)
print("ðŸš€ Training Policy Gradient Agent...")
print("(This is a short demo training - increase num_episodes for real training)")

training_rewards = train_policy_gradient_agent(
    env=mock_env,
    agent=pg_agent,
    num_episodes=50,  # Short training for demo
    max_steps=100,
    update_freq=5,
    save_path=None  # Skip saving for demo
)

print("âœ“ Training completed!")

# Plot training rewards
plt.figure(figsize=(10, 6))
plt.plot(training_rewards, alpha=0.7)
plt.title('Policy Gradient Training Rewards')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.grid(True, alpha=0.3)
plt.show()

# Moving average
window_size = 10
if len(training_rewards) >= window_size:
    moving_avg = np.convolve(training_rewards, np.ones(window_size)/window_size, mode='valid')
    plt.figure(figsize=(10, 6))
    plt.plot(moving_avg, color='red', linewidth=2, label=f'Moving Average ({window_size} episodes)')
    plt.title('Policy Gradient Training Rewards - Moving Average')
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()

print(f"Final episode reward: {training_rewards[-1]:.2f}")
print(f"Average reward (last 10 episodes): {np.mean(training_rewards[-10:]):.2f}")
print(f"Best episode reward: {max(training_rewards):.2f}")

In [None]:
# Evaluation
print("\nðŸ“Š Evaluating trained Policy Gradient Agent...")

evaluation_rewards = evaluate_policy_gradient_agent(
    env=mock_env,
    agent=pg_agent,
    num_episodes=10,
    max_steps=100
)

print("âœ“ Evaluation completed!")

# Plot evaluation results
plt.figure(figsize=(10, 6))
plt.bar(range(len(evaluation_rewards)), evaluation_rewards, alpha=0.7, color='skyblue')
plt.axhline(y=np.mean(evaluation_rewards), color='red', linestyle='--',
            linewidth=2, label=f'Mean: {np.mean(evaluation_rewards):.2f}')
plt.title('Policy Gradient Agent Evaluation Results')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

print(f"Evaluation Statistics:")
print(f"  Mean Reward: {np.mean(evaluation_rewards):.2f}")
print(f"  Std Reward: {np.std(evaluation_rewards):.2f}")
print(f"  Min Reward: {min(evaluation_rewards):.2f}")
print(f"  Max Reward: {max(evaluation_rewards):.2f}")

## Summary

This notebook demonstrated:

1. **Importing Agents**: Successfully imported all agent classes from the organized structure
2. **DQN Agent**: Initialized DQN agent with Q-networks and replay buffer
3. **Policy Gradient Agent**: Implemented REINFORCE algorithm with baseline for variance reduction
4. **Deep SARSA Agents**: Showed both FinRL-integrated and paper-based SARSA implementations
5. **Training & Evaluation**: Demonstrated training loop and performance evaluation

### Key Features of Policy Gradient Agent:
- **Direct Policy Learning**: Learns action probabilities directly instead of Q-values
- **REINFORCE Algorithm**: Uses Monte Carlo policy gradient with discounted returns
- **Baseline Subtraction**: Uses value function approximation to reduce gradient variance
- **PyTorch Implementation**: Fully vectorized neural network implementation

### Next Steps:
- Replace mock environment with real FinRL StockTradingEnv
- Load actual financial data (FPT, VN30, etc.)
- Tune hyperparameters for better performance
- Compare with DQN and SARSA agents
- Implement more advanced policy gradient methods (PPO, TRPO)

The organized codebase structure makes it easy to experiment with different RL algorithms and compare their performance on financial trading tasks.