In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silences every Python warning for the rest of the session, deprecation
# notices from pandas / yfinance included.
warnings.filterwarnings('ignore')



In [3]:
# Additional imports for Deep RL
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
from copy import deepcopy

# Set random seeds for reproducibility
# Seeds the three global RNGs used in this notebook: NumPy, Python's `random`, and PyTorch.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Report the PyTorch build and whether a CUDA device is visible.
print("Deep RL libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

Deep RL libraries imported successfully!
PyTorch version: 2.7.0+cpu
CUDA available: False


In [4]:
# Download the training data for the "Good" market situation.
# NOTE(review): `tickers`, `train_good_start` and `train_good_end` are not defined
# in any cell visible before this one — presumably a configuration cell sets
# them; confirm it runs first under Restart & Run All.
# NOTE(review): the captured output reports that yfinance changed the
# `auto_adjust` default to True; consider passing it explicitly to pin behaviour.
print("Đang tải dữ liệu training (Good situation)...")
df_train_good = yf.download(tickers, start=train_good_start, end=train_good_end, group_by='ticker')


Đang tải dữ liệu training (Good situation)...
YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  6 of 6 completed


In [15]:
# Download the test data for the "Good" market situation.
# NOTE(review): `tickers`, `test_good_start` and `test_good_end` are not defined
# in any visible cell — confirm the cell that sets them runs before this one.
print("Đang tải dữ liệu test (Good situation)...")
df_test_good = yf.download(tickers, start=test_good_start, end=test_good_end, group_by='ticker')


Đang tải dữ liệu test (Good situation)...


[*********************100%***********************]  6 of 6 completed


In [16]:
# Download the training data for the "Bad" market situation.
# NOTE(review): `tickers`, `train_bad_start` and `train_bad_end` are not defined
# in any visible cell — confirm the cell that sets them runs before this one.
print("Đang tải dữ liệu training (Bad situation)...")
df_train_bad = yf.download(tickers, start=train_bad_start, end=train_bad_end, group_by='ticker')


Đang tải dữ liệu training (Bad situation)...


[*********************100%***********************]  6 of 6 completed


In [17]:
# Download the test data for the "Bad" market situation.
# NOTE(review): `tickers`, `test_bad_start` and `test_bad_end` are not defined
# in any visible cell — confirm the cell that sets them runs before this one.
print("Đang tải dữ liệu test (Bad situation)...")
df_test_bad = yf.download(tickers, start=test_bad_start, end=test_bad_end, group_by='ticker')


Đang tải dữ liệu test (Bad situation)...


[*********************100%***********************]  6 of 6 completed


### Gắn dữ liệu

In [5]:
# Calculate Technical Indicators (MACD, RSI, CCI, ADX)
def calculate_technical_indicators(df, window=14):
    """
    Compute MACD, RSI, CCI and a simplified ADX for a single-stock price frame.

    Args:
        df: DataFrame with at least 'High', 'Low' and 'Close' columns, one row
            per bar. The input frame is not modified.
        window: Rolling look-back (in rows) used for RSI, CCI and ADX.

    Returns:
        A copy of ``df`` with 'MACD', 'RSI', 'CCI' and 'ADX' columns added.
        Remaining NaNs (in any column) are back-filled, then forward-filled,
        then set to 0.
    """
    data = df.copy()

    # MACD (Moving Average Convergence Divergence): EMA(12) - EMA(26) of Close
    exp1 = data['Close'].ewm(span=12, adjust=False).mean()
    exp2 = data['Close'].ewm(span=26, adjust=False).mean()
    data['MACD'] = exp1 - exp2

    # RSI (Relative Strength Index), using simple rolling means of gains/losses
    delta = data['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    data['RSI'] = 100 - (100 / (1 + rs))

    # CCI (Commodity Channel Index); the rolling std is used as the deviation term
    tp = (data['High'] + data['Low'] + data['Close']) / 3
    data['CCI'] = (tp - tp.rolling(window=window).mean()) / (0.015 * tp.rolling(window=window).std())

    # ADX (Average Directional Index) - Simplified version (rolling means
    # instead of Wilder smoothing)
    high_diff = data['High'].diff()
    low_diff = -data['Low'].diff()

    pos_dm = high_diff.where((high_diff > low_diff) & (high_diff > 0), 0)
    neg_dm = low_diff.where((low_diff > high_diff) & (low_diff > 0), 0)

    # True range: the largest of the three candidate ranges per row
    tr = pd.concat([data['High'] - data['Low'],
                    (data['High'] - data['Close'].shift()).abs(),
                    (data['Low'] - data['Close'].shift()).abs()], axis=1).max(axis=1)

    atr = tr.rolling(window=window).mean()
    pos_di = 100 * (pos_dm.rolling(window=window).mean() / atr)
    neg_di = 100 * (neg_dm.rolling(window=window).mean() / atr)

    dx = 100 * (pos_di - neg_di).abs() / (pos_di + neg_di)
    data['ADX'] = dx.rolling(window=window).mean()

    # Fill NaN values. `fillna(method=...)` is deprecated in pandas 2.x, so the
    # dedicated bfill()/ffill() methods are used instead (same result).
    # Note: the back-fill copies the first valid indicator value into the
    # warm-up rows, i.e. those rows see slightly "future" information.
    data = data.bfill().ffill().fillna(0)

    return data

# Test with one stock
# Smoke test: compute the indicators for the AAPL slice of the Good-period
# training download and show the last rows. `test_data` is reused by the
# environment smoke test in a later cell.
test_data = calculate_technical_indicators(df_train_good['AAPL'])
print("Technical indicators calculated!")
print(f"\nColumns: {test_data.columns.tolist()}")
print(f"\nSample data:")
print(test_data[['Close', 'MACD', 'RSI', 'CCI', 'ADX']].tail())

Technical indicators calculated!

Columns: ['Open', 'High', 'Low', 'Close', 'Volume', 'MACD', 'RSI', 'CCI', 'ADX']

Sample data:
Price           Close      MACD        RSI         CCI        ADX
Date                                                             
2018-12-24  34.936462 -2.477554  11.321534 -143.637251  42.996215
2018-12-26  37.396755 -2.386403  30.982132  -87.823527  45.199311
2018-12-27  37.154049 -2.307154  31.566454  -71.825673  46.995164
2018-12-28  37.173096 -2.217253  36.137583  -45.891443  47.928414
2018-12-31  37.532394 -2.092887  36.710180  -27.391718  47.097537


### Xây dựng môi trường và agent

In [6]:
class StockTradingEnv:
    """
    Single-stock trading environment (described in this notebook as following
    Yang et al. (2020)).

    State: [price, balance, holdings, MACD, RSI, CCI, ADX], each divided by a
           fixed scaling constant (see ``_get_state``).
    Action: {-k, ..., 0, ..., k} where negative=sell, positive=buy, 0=hold
    Reward: Change in portfolio value (cash + holdings * Close) between the
            current row and the next one. Fees are paid from cash, so they
            lower the reward of the step in which the trade happens.
    """

    def __init__(self, data, initial_balance=1000, max_shares=5, fee_rate=0.001, min_balance_tolerance=0):
        """
        Args:
            data: DataFrame with columns ['Close', 'MACD', 'RSI', 'CCI', 'ADX']
            initial_balance: Starting cash balance
            max_shares: Maximum number of shares to trade in one action (k)
            fee_rate: Trading fee rate (default 0.1% = 0.001)
            min_balance_tolerance: Minimum balance allowed (e.g., 0 or -100)
        """
        # Positional index so rows can be addressed with iloc[current_step].
        self.data = data.reset_index(drop=True)
        self.initial_balance = initial_balance
        self.max_shares = max_shares
        self.fee_rate = fee_rate
        self.min_balance_tolerance = min_balance_tolerance

        # Action space: {-k, -k+1, ..., -1, 0, 1, ..., k-1, k}
        self.action_space = list(range(-max_shares, max_shares + 1))
        self.n_actions = len(self.action_space)

        # State dimension: [price, balance, holdings, MACD, RSI, CCI, ADX]
        self.state_dim = 7

        self.reset()

    def reset(self):
        """Reset cash, holdings and trade log; return the state at row 0."""
        self.current_step = 0
        self.balance = self.initial_balance
        self.holdings = 0
        self.total_trades = 0
        self.trade_history = []

        return self._get_state()

    def _get_state(self):
        """Build the 7-element float32 state vector for the current row."""
        row = self.data.iloc[self.current_step]

        state = np.array([
            row['Close'] / 1000.0,  # Normalize price
            self.balance / self.initial_balance,  # Normalize balance
            self.holdings / self.max_shares if self.max_shares > 0 else 0,  # Normalize holdings
            row['MACD'] / 100.0,  # Normalize MACD
            row['RSI'] / 100.0,  # Normalize RSI (0-100)
            row['CCI'] / 200.0,  # Normalize CCI
            row['ADX'] / 100.0   # Normalize ADX (0-100)
        ], dtype=np.float32)

        return state

    def step(self, action_idx):
        """
        Execute one step in the environment

        Args:
            action_idx: Index of action in action_space

        Returns:
            next_state, reward, done, info
            ``next_state`` is None once ``done`` is True (callers in this
            notebook rely on that to detect the end of an episode).
        """
        action = self.action_space[action_idx]  # Convert index to actual action

        current_price = self.data.iloc[self.current_step]['Close']
        prev_portfolio_value = self.balance + current_price * self.holdings

        # Execute action with constraints
        executed_shares = self._execute_action(action, current_price)

        # Move to next step
        self.current_step += 1
        last_index = len(self.data) - 1
        done = self.current_step >= last_index

        # The row at `current_step` is still valid when it is the last row, so
        # the final price move must be valued at that row's Close. Falling back
        # to `current_price` is only needed if we stepped past the data.
        if self.current_step <= last_index:
            next_price = self.data.iloc[self.current_step]['Close']
        else:
            next_price = current_price

        # Calculate reward: change in portfolio value
        new_portfolio_value = self.balance + next_price * self.holdings
        reward = new_portfolio_value - prev_portfolio_value

        next_state = self._get_state() if not done else None

        info = {
            'portfolio_value': new_portfolio_value,
            'balance': self.balance,
            'holdings': self.holdings,
            'executed_shares': executed_shares,
            'price': next_price
        }

        return next_state, reward, done, info

    def _buy_cost(self, shares, price):
        """Total cash needed to buy ``shares`` at ``price``, fee included."""
        cost = shares * price
        return cost + self.fee_rate * cost

    def _execute_action(self, action, current_price):
        """
        Execute trading action with constraints

        Args:
            action: Number of shares to trade (negative=sell, positive=buy, 0=hold)
            current_price: Current stock price

        Returns:
            executed_shares: Actual number of shares traded (negative for sells)
        """
        executed_shares = 0

        # A non-positive or NaN price cannot be traded on (and would divide by
        # zero below); treat it as a forced hold.
        if not current_price > 0:
            return executed_shares

        if action > 0:  # Buy
            # Constraint 1: Cannot make balance below min_balance_tolerance.
            # The per-share cost includes the fee; otherwise an order that is
            # affordable before fees would be rejected outright instead of
            # being reduced to the largest affordable size.
            available = self.balance - self.min_balance_tolerance
            unit_cost = current_price * (1 + self.fee_rate)
            max_affordable = max(0, int(available / unit_cost))
            shares_to_buy = min(action, max_affordable)

            # Float rounding in the division can overshoot by one share.
            while shares_to_buy > 0 and \
                    self.balance < self._buy_cost(shares_to_buy, current_price) + self.min_balance_tolerance:
                shares_to_buy -= 1

            if shares_to_buy > 0:
                self.balance -= self._buy_cost(shares_to_buy, current_price)
                self.holdings += shares_to_buy
                executed_shares = shares_to_buy
                self.total_trades += 1
                self.trade_history.append(('BUY', shares_to_buy, current_price, self.current_step))

        elif action < 0:  # Sell
            # Constraint 2: Cannot sell more than current holdings
            shares_to_sell = min(abs(action), self.holdings)

            if shares_to_sell > 0:
                revenue = shares_to_sell * current_price
                fee = self.fee_rate * revenue
                net_revenue = revenue - fee

                self.balance += net_revenue
                self.holdings -= shares_to_sell
                executed_shares = -shares_to_sell
                self.total_trades += 1
                self.trade_history.append(('SELL', shares_to_sell, current_price, self.current_step))

        # action == 0: Hold (do nothing)

        return executed_shares

    def get_portfolio_value(self):
        """Get current total portfolio value (cash + holdings at the current Close)."""
        # Clamp so the call stays valid even if step() ran past the last row.
        row_index = min(self.current_step, len(self.data) - 1)
        current_price = self.data.iloc[row_index]['Close']
        return self.balance + current_price * self.holdings

# Test the environment
# Smoke test on the indicator frame built in the earlier cell (`test_data`):
# build an environment, reset it, and print the initial state and action space.
print("Testing Stock Trading Environment...")
test_env = StockTradingEnv(test_data, initial_balance=1000, max_shares=5)
state = test_env.reset()
print(f"\nInitial state shape: {state.shape}")
print(f"Initial state: {state}")
print(f"Action space size: {test_env.n_actions}")
print(f"Action space: {test_env.action_space}")

Testing Stock Trading Environment...

Initial state shape: (7,)
Initial state: [ 0.01590992  1.          0.          0.          0.2862478  -0.59649104
  0.4053651 ]
Action space size: 11
Action space: [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]


In [7]:
class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    Each hidden layer is Linear -> ReLU -> Dropout(0.2); the output layer is a
    plain Linear with ``n_actions`` units.
    """

    def __init__(self, state_dim, n_actions, hidden_dims=[128, 128, 64]):
        super().__init__()

        # Layer widths from input to last hidden layer, e.g. [7, 128, 128, 64].
        widths = [state_dim, *hidden_dims]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.2)]

        # Output head: raw (unbounded) Q-values.
        stack.append(nn.Linear(widths[-1], n_actions))

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        q_values = self.network(x)
        return q_values

# Test DQN
# Smoke test: instantiate the network and print its layout and parameter count.
# NOTE(review): n_actions=21 here does not match the 11-action space printed by
# the environment smoke test (max_shares=5); presumably a leftover from k=10 — confirm.
test_dqn = DQN(state_dim=7, n_actions=21)
print("DQN Network Architecture:")
print(test_dqn)
print(f"\nTotal parameters: {sum(p.numel() for p in test_dqn.parameters()):,}")

DQN Network Architecture:
DQN(
  (network): Sequential(
    (0): Linear(in_features=7, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=128, out_features=64, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=64, out_features=21, bias=True)
  )
)

Total parameters: 27,157


In [8]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of (state, action, reward, next_state, done) tuples.

    Once ``capacity`` transitions are held, pushing a new one drops the oldest.
    """

    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple of five NumPy arrays: states, actions, rewards, next_states, dones.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into per-field columns.
        s_col, a_col, r_col, s2_col, d_col = zip(*picked)
        return tuple(np.array(column) for column in (s_col, a_col, r_col, s2_col, d_col))

    def __len__(self):
        return len(self.buffer)

print("Replay Buffer implemented!")

Replay Buffer implemented!


In [9]:
class DQNAgent:
    """
    Deep Q-Learning agent with epsilon-greedy exploration and an alpha-blended
    TD target.

    The agent keeps an online Q-network, a periodically synchronised target
    network, an Adam optimiser and a replay buffer.
    """

    def __init__(self, state_dim, n_actions, lr=0.00001, gamma=0.6, alpha=0.7,
                 epsilon_start=0.8, epsilon_end=0.2, epsilon_decay=0.9,
                 buffer_capacity=10000, batch_size=64, target_update=10):
        """
        Args:
            state_dim: Length of the state vector.
            n_actions: Number of discrete actions.
            lr: Adam learning rate for the Q-network.
            gamma: Discount factor.
            alpha: Blending weight between the current Q-value and the TD target.
            epsilon_start: Initial exploration probability.
            epsilon_end: Lower bound for the exploration probability.
            epsilon_decay: Multiplicative decay applied after every train step.
            buffer_capacity: Maximum number of stored transitions.
            batch_size: Number of transitions per gradient step.
            target_update: Sync the target network every this many train steps.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.n_actions = n_actions
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.target_update = target_update

        # Q-networks: the target network starts as an exact copy and stays in
        # eval mode (no dropout) for stable bootstrap targets.
        self.q_network = DQN(state_dim, n_actions).to(self.device)
        self.target_network = DQN(state_dim, n_actions).to(self.device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.replay_buffer = ReplayBuffer(buffer_capacity)

        self.update_counter = 0
        self.loss_history = []

    def select_action(self, state, training=True):
        """
        Epsilon-greedy action selection.

        Args:
            state: State vector (array-like of length ``state_dim``).
            training: When True, explore with probability ``epsilon``;
                when False, always act greedily.

        Returns:
            Index of the chosen action.
        """
        if training and random.random() < self.epsilon:
            return random.randrange(self.n_actions)

        # The Q-network contains Dropout layers. Evaluate it in eval mode so the
        # greedy action is a deterministic function of the state (otherwise
        # evaluation runs would be randomly perturbed), then restore the mode.
        was_training = self.q_network.training
        self.q_network.eval()
        try:
            with torch.no_grad():
                state_tensor = torch.as_tensor(
                    state, dtype=torch.float32, device=self.device
                ).unsqueeze(0)
                q_values = self.q_network(state_tensor)
        finally:
            self.q_network.train(was_training)
        return q_values.argmax(dim=1).item()

    def store_transition(self, state, action, reward, next_state, done):
        """Store transition in replay buffer"""
        self.replay_buffer.push(state, action, reward, next_state, done)

    def train_step(self):
        """
        Perform one gradient step on a sampled mini-batch.

        Returns:
            The scalar loss, or None while the buffer holds fewer than
            ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return None

        # Sample batch
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        # Convert to tensors (explicit dtypes; `dones` arrives as a bool array)
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.long, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # Current Q values: Q(s, a) for the actions actually taken
        current_q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Target Q values (with alpha blending). Computed under no_grad, so the
        # target is a constant; since target - current = alpha * (raw - current),
        # the MSE below equals alpha**2 times the squared TD error.
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0]
            q_target_raw = rewards + (1 - dones) * self.gamma * next_q_values
            target_q_values = (1 - self.alpha) * current_q_values + self.alpha * q_target_raw

        # Compute loss
        loss = F.mse_loss(current_q_values, target_q_values)

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), 1.0)
        self.optimizer.step()

        # Update target network
        self.update_counter += 1
        if self.update_counter % self.target_update == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())

        # Decay epsilon.
        # NOTE(review): this runs once per gradient step, so with decay=0.9 the
        # floor is reached after about 14 steps — confirm per-step (rather than
        # per-episode) decay is intended.
        self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)

        loss_value = loss.item()
        self.loss_history.append(loss_value)
        return loss_value

    def save(self, filepath):
        """Save networks, optimizer state, epsilon and alpha to ``filepath``."""
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'alpha': self.alpha
        }, filepath)

    def load(self, filepath):
        """Load a checkpoint written by ``save``."""
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        checkpoint = torch.load(filepath, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.epsilon = checkpoint['epsilon']
        self.alpha = checkpoint.get('alpha', 0.7)

print("✅ DQN Agent with alpha implemented successfully!")

✅ DQN Agent with alpha implemented successfully!


In [10]:
class DeepSARSAAgent:
    """
    Deep SARSA Agent

    Key difference from DQN: Uses next action selected by policy (on-policy)
    instead of max Q-value (off-policy)

    Hyperparameters as per Yang et al. (2020):
    - lr: 1e-5 (neural network learning rate)
    - alpha: 0.7 (Q-function update learning rate for smooth update)
    - gamma: 0.6 (discount factor)
    - epsilon: 0.8 → 0.2 with decay 0.9
    """

    def __init__(self, state_dim, n_actions, lr=1e-5, gamma=0.6, alpha=0.7,
                 epsilon_start=0.8, epsilon_end=0.2, epsilon_decay=0.9,
                 buffer_capacity=10000, batch_size=64, target_update=10):
        """
        Args:
            state_dim: Length of the state vector.
            n_actions: Number of discrete actions.
            lr: Adam learning rate for the Q-network.
            gamma: Discount factor.
            alpha: Blending weight between the current Q-value and the TD target.
            epsilon_start: Initial exploration probability.
            epsilon_end: Lower bound for the exploration probability.
            epsilon_decay: Multiplicative decay applied after every train step.
            buffer_capacity: Maximum number of stored transitions.
            batch_size: Number of transitions per gradient step.
            target_update: Sync the target network every this many train steps.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.n_actions = n_actions
        self.gamma = gamma
        self.alpha = alpha  # Learning rate for Q-function smooth update
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.target_update = target_update

        # Q-networks
        self.q_network = DQN(state_dim, n_actions).to(self.device)
        self.target_network = DQN(state_dim, n_actions).to(self.device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.replay_buffer = ReplayBuffer(buffer_capacity)
        # a' for each stored transition, kept index-aligned with the replay
        # buffer: same capacity, appended in lock-step, so both evict together.
        self.next_actions = deque(maxlen=buffer_capacity)

        self.update_counter = 0
        self.loss_history = []

    def select_action(self, state, training=True):
        """
        Epsilon-greedy action selection.

        Args:
            state: State vector (array-like of length ``state_dim``).
            training: When True, explore with probability ``epsilon``;
                when False, always act greedily.

        Returns:
            Index of the chosen action.
        """
        if training and random.random() < self.epsilon:
            return random.randrange(self.n_actions)

        # Evaluate with Dropout disabled so the greedy action is deterministic,
        # then restore whatever mode the network was in.
        was_training = self.q_network.training
        self.q_network.eval()
        try:
            with torch.no_grad():
                state_tensor = torch.as_tensor(
                    state, dtype=torch.float32, device=self.device
                ).unsqueeze(0)
                q_values = self.q_network(state_tensor)
        finally:
            self.q_network.train(was_training)
        return q_values.argmax(dim=1).item()

    def store_transition(self, state, action, reward, next_state, next_action, done, buffer_capacity=10000):
        """
        Store SARSA transition (includes next_action).

        ``buffer_capacity`` is accepted for backward compatibility only; the
        capacity given to the constructor is what bounds both stores.
        """
        self.replay_buffer.push(state, action, reward, next_state, done)
        self.next_actions.append(next_action)

    def train_step(self):
        """
        Perform one SARSA training step.

        Returns:
            The scalar loss, or None while fewer than ``batch_size``
            transitions are stored.
        """
        n_stored = len(self.replay_buffer)
        if n_stored < self.batch_size:
            return None
        if len(self.next_actions) != n_stored:
            raise RuntimeError(
                "replay buffer and next_actions are out of sync; "
                "add transitions only through store_transition()"
            )

        # Sample indices once and use them for BOTH stores, so every
        # (s, a, r, s', done) is paired with its own a'. Sampling the two
        # stores independently would pair s' with an unrelated action.
        indices = random.sample(range(n_stored), self.batch_size)
        transitions = [self.replay_buffer.buffer[i] for i in indices]
        states, actions, rewards, next_states, dones = zip(*transitions)
        next_actions_batch = [self.next_actions[i] for i in indices]

        # Convert to tensors
        states = torch.as_tensor(np.array(states), dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(np.array(actions), dtype=torch.long, device=self.device)
        rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32, device=self.device)
        next_actions = torch.as_tensor(np.array(next_actions_batch), dtype=torch.long, device=self.device)
        dones = torch.as_tensor(np.array(dones), dtype=torch.float32, device=self.device)

        # Current Q values: Q(s, a)
        current_q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # SARSA Target with smooth update (as per paper):
        # Q_target = (1 - α) * Q_current + α * [r + γ * Q(s', a')]
        with torch.no_grad():
            next_q_values = self.target_network(next_states).gather(1, next_actions.unsqueeze(1)).squeeze(1)
            td_target = rewards + (1 - dones) * self.gamma * next_q_values
            # Smooth update: blend current Q-value with TD target
            target_q_values = (1 - self.alpha) * current_q_values + self.alpha * td_target

        # Compute loss
        loss = F.mse_loss(current_q_values, target_q_values)

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), 1.0)
        self.optimizer.step()

        # Update target network
        self.update_counter += 1
        if self.update_counter % self.target_update == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())

        # Decay epsilon.
        # NOTE(review): applied once per gradient step, so the floor is reached
        # after about 14 steps with decay=0.9 — confirm this is intended.
        self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)

        loss_value = loss.item()
        self.loss_history.append(loss_value)
        return loss_value

    def save(self, filepath):
        """Save networks, optimizer state, epsilon and alpha to ``filepath``."""
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'alpha': self.alpha
        }, filepath)

    def load(self, filepath):
        """Load a checkpoint written by ``save`` (older files without 'alpha' keep the current value)."""
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        checkpoint = torch.load(filepath, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.epsilon = checkpoint['epsilon']
        self.alpha = checkpoint.get('alpha', self.alpha)

print("Deep SARSA Agent implemented successfully!")
print("\nKey difference from DQN:")
print("- DQN uses max Q(s', a') for target (off-policy)")
print("- SARSA uses Q(s', a') where a' is actually selected action (on-policy)")

Deep SARSA Agent implemented successfully!

Key difference from DQN:
- DQN uses max Q(s', a') for target (off-policy)
- SARSA uses Q(s', a') where a' is actually selected action (on-policy)


In [12]:
# Policy Gradient Network
class PolicyNetwork(nn.Module):
    """MLP policy: maps a state vector to a probability distribution over actions.

    Hidden layers are Linear -> ReLU -> Dropout(0.2); the head is a Linear
    layer followed by a Softmax over the last dimension.
    """

    def __init__(self, state_dim, n_actions, hidden_dims=[128, 128, 64]):
        super().__init__()

        modules = []
        prev_width = state_dim

        for width in hidden_dims:
            modules.extend((nn.Linear(prev_width, width), nn.ReLU(), nn.Dropout(0.2)))
            prev_width = width

        # Output head: logits turned into action probabilities.
        modules.extend((nn.Linear(prev_width, n_actions), nn.Softmax(dim=-1)))

        self.network = nn.Sequential(*modules)

    def forward(self, x):
        action_probs = self.network(x)
        return action_probs

print("Policy Network implemented!")
print("\nNetwork outputs probability distribution over actions using Softmax")

Policy Network implemented!

Network outputs probability distribution over actions using Softmax


In [11]:
def train_dqn(agent, env, n_episodes=100, print_every=10):
    """
    Train DQN agent

    Args:
        agent: DQNAgent instance
        env: StockTradingEnv instance
        n_episodes: Number of training episodes
        print_every: Print statistics every N episodes

    Returns:
        episode_rewards, episode_portfolio_values
    """
    episode_rewards = []
    episode_portfolio_values = []

    for episode in range(n_episodes):
        state = env.reset()
        episode_reward = 0
        done = False

        while not done:
            # Select action
            action = agent.select_action(state, training=True)

            # Take action
            next_state, reward, done, info = env.step(action)

            # Store transition. The environment returns next_state=None on the
            # final step; store that transition too (with a zero placeholder
            # for s') so the last reward is learned and `done` is actually seen
            # by the agent. The placeholder is never bootstrapped from because
            # the target multiplies the next Q-value by (1 - done).
            if next_state is not None:
                agent.store_transition(state, action, reward, next_state, done)
            elif done:
                agent.store_transition(state, action, reward, np.zeros_like(state), done)

            # Train agent
            agent.train_step()

            episode_reward += reward
            if next_state is not None:
                state = next_state

        # Record metrics
        final_portfolio_value = env.get_portfolio_value()
        episode_rewards.append(episode_reward)
        episode_portfolio_values.append(final_portfolio_value)

        # Print progress
        if (episode + 1) % print_every == 0:
            avg_reward = np.mean(episode_rewards[-print_every:])
            avg_portfolio = np.mean(episode_portfolio_values[-print_every:])
            print(f"Episode {episode + 1}/{n_episodes} | "
                  f"Avg Reward: {avg_reward:.2f} | "
                  f"Avg Portfolio: ${avg_portfolio:.2f} | "
                  f"Epsilon: {agent.epsilon:.4f}")

    return episode_rewards, episode_portfolio_values

print("DQN Training function ready!")

DQN Training function ready!


In [None]:
def train_sarsa(agent, env, n_episodes=100, print_every=10):
    """
    Train Deep SARSA agent

    Args:
        agent: DeepSARSAAgent instance
        env: StockTradingEnv instance
        n_episodes: Number of training episodes
        print_every: Print statistics every N episodes

    Returns:
        episode_rewards, episode_portfolio_values
    """
    episode_rewards = []
    episode_portfolio_values = []

    for episode in range(n_episodes):
        state = env.reset()
        action = agent.select_action(state, training=True)
        episode_reward = 0
        done = False

        while not done:
            # Take action
            next_state, reward, done, info = env.step(action)

            if next_state is not None:
                # Select next action (this is key for SARSA)
                next_action = agent.select_action(next_state, training=True)

                # Store SARSA transition (s, a, r, s', a')
                agent.store_transition(state, action, reward, next_state, next_action, done)

                # Train agent
                agent.train_step()

                # Update for next iteration
                state = next_state
                action = next_action
            else:
                # The environment signals the end of the episode with
                # next_state=None. Store the terminal transition as well so the
                # last reward is learned; s' and a' are placeholders that the
                # target ignores because it multiplies Q(s', a') by (1 - done).
                done = True
                agent.store_transition(state, action, reward, np.zeros_like(state), action, done)
                agent.train_step()

            episode_reward += reward

        # Record metrics
        final_portfolio_value = env.get_portfolio_value()
        episode_rewards.append(episode_reward)
        episode_portfolio_values.append(final_portfolio_value)

        # Print progress
        if (episode + 1) % print_every == 0:
            avg_reward = np.mean(episode_rewards[-print_every:])
            avg_portfolio = np.mean(episode_portfolio_values[-print_every:])
            print(f"Episode {episode + 1}/{n_episodes} | "
                  f"Avg Reward: {avg_reward:.2f} | "
                  f"Avg Portfolio: ${avg_portfolio:.2f} | "
                  f"Epsilon: {agent.epsilon:.4f}")

    return episode_rewards, episode_portfolio_values

print("SARSA Training function ready!")

In [13]:
def train_policy_gradient(agent, env, n_episodes=30, print_every=10):
    """
    Run the policy-gradient training loop: roll out one full episode, then
    update the agent once on that episode.

    Args:
        agent: Object exposing ``select_action(state, training=True)``,
            ``store_reward(reward)`` and ``train_episode()``
        env: StockTradingEnv instance
        n_episodes: Number of training episodes (paper uses 30)
        print_every: Print statistics every N episodes

    Returns:
        episode_rewards, episode_portfolio_values
    """
    episode_rewards, episode_portfolio_values = [], []

    for episode in range(n_episodes):
        observation = env.reset()
        running_reward = 0
        finished = False

        # Roll out the episode, handing every reward to the agent.
        while not finished:
            chosen = agent.select_action(observation, training=True)
            upcoming, step_reward, finished, _ = env.step(chosen)
            agent.store_reward(step_reward)
            running_reward += step_reward
            # The environment may return None at the end; keep the last state then.
            if upcoming is not None:
                observation = upcoming

        # One update per completed episode.
        agent.train_episode()

        closing_value = env.get_portfolio_value()
        episode_rewards.append(running_reward)
        episode_portfolio_values.append(closing_value)

        # Periodic progress report over the last `print_every` episodes.
        if (episode + 1) % print_every != 0:
            continue
        avg_reward = np.mean(episode_rewards[-print_every:])
        avg_portfolio = np.mean(episode_portfolio_values[-print_every:])
        print(f"Episode {episode + 1}/{n_episodes} | "
              f"Avg Reward: {avg_reward:.2f} | "
              f"Avg Portfolio: ${avg_portfolio:.2f}")

    return episode_rewards, episode_portfolio_values

print("Policy Gradient training function ready!")

Policy Gradient training function ready!


In [14]:
def calculate_annual_return(env, agent, agent_name="Agent"):
    """
    Run one greedy (non-exploring) episode and report the annualized return.

    Args:
        env: Test environment
        agent: Trained agent
        agent_name: Name for printing

    Returns:
        annual_return_pct: Annualized return percentage
    """
    state = env.reset()
    initial_value = env.get_portfolio_value()

    # Play the whole test period with exploration switched off.
    finished = False
    while not finished:
        greedy_action = agent.select_action(state, training=False)
        observation, _, finished, _ = env.step(greedy_action)
        if observation is not None:
            state = observation

    final_value = env.get_portfolio_value()
    total_return = (final_value - initial_value) / initial_value

    # Length of the test period, counting 252 trading days per year.
    n_days = len(env.data)
    n_years = n_days / 252

    # Compound the total return down to a one-year rate.
    growth_factor = 1 + total_return
    annual_return_pct = ((growth_factor ** (1 / n_years)) - 1) * 100

    print(f"\n{agent_name} Results:")
    print(f"  Initial Portfolio: ${initial_value:.2f}")
    print(f"  Final Portfolio: ${final_value:.2f}")
    print(f"  Total Return: {total_return*100:.2f}%")
    print(f"  Test Period: {n_days} days ({n_years:.2f} years)")
    print(f"  Annual Return: {annual_return_pct:.2f}%")

    return annual_return_pct

print("Annual return calculation function ready!")

Annual return calculation function ready!


### Training stage

In [None]:
# Prepare data for Good Period experiment
print("="*80)
print("PREPARING DATA - GOOD PERIOD (AAPL)")
print("="*80)

# Calculate technical indicators
aapl_train_good = calculate_technical_indicators(df_train_good['AAPL'])
aapl_test_good = calculate_technical_indicators(df_test_good['AAPL'])

# One shared configuration for both environments, so train and test cannot
# drift apart and the summary below is printed from the values actually used.
env_config_good = dict(
    initial_balance=1000,
    max_shares=5,  # k=5 as per paper
    fee_rate=0.001,  # 0.1%
    min_balance_tolerance=0,
)

# Create environments with k=5 (as per paper)
train_env_good = StockTradingEnv(aapl_train_good, **env_config_good)
test_env_good = StockTradingEnv(aapl_test_good, **env_config_good)

print(f"\nTraining period: {len(aapl_train_good)} days")
print(f"Test period: {len(aapl_test_good)} days")
print(f"\nEnvironment configuration:")
# Previously this line was hard-coded to "$10,000" although the environments
# start with 1,000; derive every figure from the configuration instead.
print(f"  - Initial balance: ${env_config_good['initial_balance']:,}")
print(f"  - Max shares per action (k): {env_config_good['max_shares']}")
print(f"  - Trading fee: {env_config_good['fee_rate']:.1%}")
print(f"  - State dimension: {train_env_good.state_dim}")
print(f"  - Number of actions: {train_env_good.n_actions}")