# DRL Walk-Forward Training

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import gym
from collections import deque
import yfinance as yf
import random
import os

In [16]:
# Select the compute device: CUDA when a GPU is visible, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA or CPU: ", device)
# torch.cuda.get_device_name() raises on machines without CUDA, so only
# query the GPU name when the CUDA device was actually selected.
if device.type == "cuda":
    print("Device: ", torch.cuda.get_device_name(0))
else:
    print("Device: ", "CPU (no CUDA device available)")

CUDA or CPU:  cuda
Device:  NVIDIA GeForce RTX 4080


In [17]:
# ---- State construction ----
WINDOW_SIZE = 120       # rows of history in each state window
STD_WINDOW_SIZE = 20    # trailing rows behind every rolling-volatility estimate
NUM_ASSETS = 14         # asset count the networks and state-shape checks expect
ENV_STEP_SIZE = 1       # rows the environment advances per step

# ---- Optimisation ----
LEARNING_RATE = 1e-2    # Adam learning rate (shared by actor and critic)
WEIGHT_DECAY = 1e-4     # Adam weight decay (shared by actor and critic)
DISCOUNT_FACTOR = 0.9   # discount applied to the target critic's Q-value
BATCH_SIZE = 50         # replay mini-batch size
EPISODES = 100          # training episodes per walk-forward period

# ---- Ornstein-Uhlenbeck exploration noise ----
THETA = 0.5             # mean-reversion speed
SIGMA = 0.02            # noise scale

# ---- Reward shaping ----
LAMBDA = 2              # weight of the variance penalty in the reward

In [18]:
# Investable universe: sector / industry ETFs (label -> Yahoo Finance ticker)
tickers = {
    'XLF': 'XLF',   # Financials Select Sector SPDR Fund
    'XLK': 'XLK',   # Technology Select Sector SPDR Fund
    'XLE': 'XLE',   # Energy Select Sector SPDR Fund
    'XLP': 'XLP',   # Consumer Staples Select Sector SPDR Fund
    'XLY': 'XLY',   # Consumer Discretionary Select Sector SPDR Fund
    'XLU': 'XLU',   # Utilities Select Sector SPDR Fund
    'XLI': 'XLI',   # Industrials Select Sector SPDR Fund
    'XLV': 'XLV',   # Health Care Select Sector SPDR Fund
    'VNQ': 'VNQ',   # Vanguard Real Estate ETF
    'IYZ': 'IYZ',   # iShares U.S. Telecommunications ETF
    'XBI': 'XBI',   # SPDR S&P Biotech ETF
    'XOP': 'XOP',   # SPDR S&P Oil & Gas Exploration ETF
    'ITA': 'ITA',   # iShares U.S. Aerospace & Defense ETF
    'KBE': 'KBE'    # SPDR S&P Bank ETF
}

# Walk-forward periods start in each year of [start_year, end_year);
# note that range() below excludes end_year itself.
start_year = 2011
end_year = 2019

# One returns DataFrame per period, each including its look-back prefix
filtered_returns_list = []

# Loop through each year in the range
for year in range(start_year, end_year):
    # Each period covers five calendar years of analysis, plus the previous
    # calendar year of extra history to seed the look-back window.
    start_analysis_date = f"{year}-01-01"
    extra_data_start_date = f"{year - 1}-01-01"
    end_date = f"{year + 4}-12-31"

    # Download data. auto_adjust is passed explicitly because the 'Adj Close'
    # column only exists when prices are NOT auto-adjusted, and the library
    # default for this flag differs between yfinance versions.
    data = yf.download(
        list(tickers.values()),
        start=extra_data_start_date,
        end=end_date,
        auto_adjust=False,
    )['Adj Close']

    # Calculate daily returns (rows with any missing asset are dropped)
    returns = data.pct_change().dropna()

    # Print data summary
    print(f"\n--- Period: {start_analysis_date} to {end_date} ---")
    print("Data Start Date:", returns.index.min())
    print("Data End Date:", returns.index.max())
    print("Total Days in Dataset:", len(returns))

    # Convert start_analysis_date to pandas Timestamp
    start_analysis_date = pd.Timestamp(start_analysis_date)

    # Ensure the index is timezone-naive so it compares with the naive Timestamp
    returns.index = returns.index.tz_localize(None)

    # Snap to the closest available date before or equal to start_analysis_date
    if start_analysis_date not in returns.index:
        start_analysis_date = returns.index.asof(start_analysis_date)

    # asof() yields NaT when no row exists on or before the requested date;
    # get_loc() would then fail, so skip the period with a clear message.
    if pd.isna(start_analysis_date):
        print("Warning: No return data on or before the analysis start date!")
        continue

    analysis_index = returns.index.get_loc(start_analysis_date)

    # Compute the index for the look-back start date
    lookback_start_index = analysis_index - (WINDOW_SIZE + STD_WINDOW_SIZE)

    # Validate the computed index
    if lookback_start_index < 0:
        print("Warning: Not enough data available for the look-back period!")
        continue

    print("Lookback Start Index:", lookback_start_index)
    print("Analysis Index:", analysis_index)

    # Get the look-back start date from the index
    lookback_start_date = returns.index[lookback_start_index]

    # Keep everything from the look-back start onwards
    filtered_returns = returns.loc[lookback_start_date:]
    filtered_returns_list.append(filtered_returns)

    # Confirm the filtered data range
    print(f"Filtered Data Start Date (includes look-back): {filtered_returns.index.min()}")
    print(f"Filtered Data End Date: {filtered_returns.index.max()}")

# Output the number of processed periods
print(f"\nTotal periods processed: {len(filtered_returns_list)}")

[*********************100%***********************]  14 of 14 completed
[**************        29%                       ]  4 of 14 completed


--- Period: 2011-01-01 to 2015-12-31 ---
Data Start Date: 2010-01-05 00:00:00+00:00
Data End Date: 2015-12-30 00:00:00+00:00
Total Days in Dataset: 1508
Lookback Start Index: 110
Analysis Index: 250
Filtered Data Start Date (includes look-back): 2010-06-14 00:00:00
Filtered Data End Date: 2015-12-30 00:00:00


[*********************100%***********************]  14 of 14 completed
[*******               14%                       ]  2 of 14 completed


--- Period: 2012-01-01 to 2016-12-31 ---
Data Start Date: 2011-01-04 00:00:00+00:00
Data End Date: 2016-12-30 00:00:00+00:00
Total Days in Dataset: 1509
Lookback Start Index: 110
Analysis Index: 250
Filtered Data Start Date (includes look-back): 2011-06-13 00:00:00
Filtered Data End Date: 2016-12-30 00:00:00


[*********************100%***********************]  14 of 14 completed
[                       0%                       ]


--- Period: 2013-01-01 to 2017-12-31 ---
Data Start Date: 2012-01-04 00:00:00+00:00
Data End Date: 2017-12-29 00:00:00+00:00
Total Days in Dataset: 1508
Lookback Start Index: 108
Analysis Index: 248
Filtered Data Start Date (includes look-back): 2012-06-08 00:00:00
Filtered Data End Date: 2017-12-29 00:00:00


[*********************100%***********************]  14 of 14 completed
[*****************     36%                       ]  5 of 14 completed


--- Period: 2014-01-01 to 2018-12-31 ---
Data Start Date: 2013-01-03 00:00:00+00:00
Data End Date: 2018-12-28 00:00:00+00:00
Total Days in Dataset: 1508
Lookback Start Index: 110
Analysis Index: 250
Filtered Data Start Date (includes look-back): 2013-06-12 00:00:00
Filtered Data End Date: 2018-12-28 00:00:00


[*********************100%***********************]  14 of 14 completed
[*****************     36%                       ]  5 of 14 completed


--- Period: 2015-01-01 to 2019-12-31 ---
Data Start Date: 2014-01-03 00:00:00+00:00
Data End Date: 2019-12-30 00:00:00+00:00
Total Days in Dataset: 1508
Lookback Start Index: 110
Analysis Index: 250
Filtered Data Start Date (includes look-back): 2014-06-12 00:00:00
Filtered Data End Date: 2019-12-30 00:00:00


[*********************100%***********************]  14 of 14 completed
[*******               14%                       ]  2 of 14 completed


--- Period: 2016-01-01 to 2020-12-31 ---
Data Start Date: 2015-01-05 00:00:00+00:00
Data End Date: 2020-12-30 00:00:00+00:00
Total Days in Dataset: 1509
Lookback Start Index: 110
Analysis Index: 250
Filtered Data Start Date (includes look-back): 2015-06-12 00:00:00
Filtered Data End Date: 2020-12-30 00:00:00


[*********************100%***********************]  14 of 14 completed
[*******               14%                       ]  2 of 14 completed


--- Period: 2017-01-01 to 2021-12-31 ---
Data Start Date: 2016-01-05 00:00:00+00:00
Data End Date: 2021-12-30 00:00:00+00:00
Total Days in Dataset: 1509
Lookback Start Index: 110
Analysis Index: 250
Filtered Data Start Date (includes look-back): 2016-06-13 00:00:00
Filtered Data End Date: 2021-12-30 00:00:00


[*********************100%***********************]  14 of 14 completed


--- Period: 2018-01-01 to 2022-12-31 ---
Data Start Date: 2017-01-04 00:00:00+00:00
Data End Date: 2022-12-30 00:00:00+00:00
Total Days in Dataset: 1509
Lookback Start Index: 109
Analysis Index: 249
Filtered Data Start Date (includes look-back): 2017-06-12 00:00:00
Filtered Data End Date: 2022-12-30 00:00:00

Total periods processed: 8





In [19]:
def calculate_features(data, window_size):
    """Build return, rolling-volatility and drawdown features for one window.

    Only the first ``STD_WINDOW_SIZE + window_size`` rows of ``data`` are used:
    the leading ``STD_WINDOW_SIZE`` rows act as warm-up for the volatility
    estimate and the following ``window_size`` rows are the feature window.

    Args:
        data (pd.DataFrame): Per-asset returns, one row per period.
        window_size (int): Number of rows in the feature window.

    Returns:
        tuple of np.ndarray: ``(returns, volatilities, drawdowns)``, each of
        shape ``(window_size, num_assets)``.

    Raises:
        ValueError: If ``data`` has fewer than ``STD_WINDOW_SIZE + window_size`` rows.
    """
    required_rows = STD_WINDOW_SIZE + window_size
    if len(data) < required_rows:
        raise ValueError("Data length must be at least STD_WINDOW_SIZE + window_size to calculate rolling metrics.")

    # Work on the configured device as float32
    values = torch.tensor(data.values, dtype=torch.float32).to(device)
    asset_count = data.shape[1]

    # Returns feature: the raw rows that follow the warm-up prefix
    window_returns = values[STD_WINDOW_SIZE:required_rows]

    # Drawdown feature: distance of compounded wealth below its running peak
    # (zero whenever wealth sits at the peak)
    wealth = torch.cumprod(1 + window_returns, dim=0)
    running_peak = torch.cummax(wealth, dim=0).values
    zero = torch.tensor(0.0, device=device)
    window_drawdowns = torch.where(wealth < running_peak, wealth - running_peak, zero)

    # Volatility feature: for feature row `offset`, the std of the
    # STD_WINDOW_SIZE rows immediately preceding it
    window_vols = torch.zeros((window_size, asset_count), device=device)
    for offset in range(window_size):
        trailing_rows = values[offset:offset + STD_WINDOW_SIZE]
        window_vols[offset, :] = trailing_rows.std(dim=0)

    # Hand NumPy arrays back to the caller
    return tuple(
        feature.cpu().numpy()
        for feature in (window_returns, window_vols, window_drawdowns)
    )

In [20]:
def create_state_matrix(returns, volatilities, drawdowns):
    """Stack the three feature arrays along a new leading (channel) axis.

    Channel order is returns, volatilities, drawdowns; all three inputs must
    share the same shape.
    """
    channels = (returns, volatilities, drawdowns)
    state_matrix = np.stack(channels, axis=0)
    return state_matrix

In [21]:
# Smoke-test the feature pipeline on the first STD_WINDOW_SIZE + WINDOW_SIZE rows
# of the first period. calculate_features() compounds (1 + value), i.e. it
# expects *returns*, so use a returns frame rather than the raw price frame
# `data` left over from the download loop.
window_data = filtered_returns_list[0].iloc[0:STD_WINDOW_SIZE + WINDOW_SIZE]
print(window_data.shape)
returns, volatilities, drawdowns = calculate_features(window_data, WINDOW_SIZE)

# Stack features to form the state, ensuring a shape of (3, WINDOW_SIZE, NUM_ASSETS)
state = create_state_matrix(returns, volatilities, drawdowns)
if state.shape != (3, WINDOW_SIZE, NUM_ASSETS):
    raise ValueError(f"State shape is {state.shape} but expected (3, {WINDOW_SIZE}, {NUM_ASSETS})")

(140, 14)


In [22]:
def calculate_adjusted_rewards(data, action, annualized_return, periods_per_year=252):
    """
    Compute a variance-penalised, per-period reward for a set of portfolio weights.

    The reward is
        annualized_return / periods_per_year - (LAMBDA / 2) * variance
    where ``variance`` is the squared standard deviation of the weighted
    portfolio returns over the rows of ``data``. This is a return-minus-risk
    score, not a Sharpe ratio.

    Args:
        data (pd.DataFrame): Asset returns over the risk window, one column per asset.
        action (np.array): Portfolio weights, one per asset.
        annualized_return (float): Precomputed annualized portfolio return.
        periods_per_year (int): Periods per year used to de-annualize the return
            (e.g., 252 for daily data).

    Returns:
        adjusted_reward (float): Per-period return minus the variance penalty.
    """
    # Bring the annualized return back to a single-period scale
    per_period_return = annualized_return / periods_per_year

    # Portfolio return for every row in the window: (time_window, num_assets) @ (num_assets,)
    asset_returns = torch.tensor(data.values, dtype=torch.float32).to(device)
    weights = torch.tensor(action, dtype=torch.float32).to(device)
    window_portfolio_returns = torch.matmul(asset_returns, weights)

    # Risk penalty: LAMBDA / 2 times the variance of those portfolio returns
    portfolio_std = window_portfolio_returns.std().item()
    risk_penalty = (LAMBDA / 2) * portfolio_std**2

    return per_period_return - risk_penalty

In [23]:
class PortfolioEnv(gym.Env):
    """Walk-forward portfolio environment over a DataFrame of asset returns.

    Observations are dicts with:
        'features':   array of shape (3, WINDOW_SIZE, NUM_ASSETS) holding
                      returns, rolling volatilities and drawdowns
        'covariance': asset covariance matrix over the last WINDOW_SIZE rows
    Actions are portfolio weight vectors (one weight per asset).
    """

    def __init__(self, data):
        """
        Args:
            data (pd.DataFrame): Asset returns, one row per period and one
                column per asset. Rows containing NaN are dropped.
        """
        super().__init__()
        self.data = data.dropna()  # Drop NaN rows to ensure complete data
        self.current_step = STD_WINDOW_SIZE + WINDOW_SIZE  # Start with enough data for the rolling window
        self.done = False

    def reset(self):
        """Rewind to the first step that has a full look-back window and return its state."""
        self.current_step = STD_WINDOW_SIZE + WINDOW_SIZE
        self.done = False  # clear the terminal flag left over from the previous episode
        return self.get_state()

    def step(self, action):
        """Score `action`, advance by ENV_STEP_SIZE rows and return the new state.

        The reward combines the annualized return of row ``current_step + 1``
        with a variance penalty measured over the STD_WINDOW_SIZE rows ending
        at ``current_step`` (inclusive).

        Returns:
            tuple: (next_state, reward, done, info) with an empty info dict.
        """
        # NOTE(review): the state window ends at row current_step - 1 while the
        # return is taken from row current_step + 1, so row current_step itself
        # is never rewarded -- confirm this one-row gap is intended.
        portfolio_returns = self.get_portfolio_return(action)
        vol_window = self.data.iloc[self.current_step - STD_WINDOW_SIZE + 1:self.current_step + 1]
        reward = calculate_adjusted_rewards(vol_window, action, portfolio_returns)

        # Advance, then flag the episode as done once another step would run
        # past the data needed by get_portfolio_return()
        self.current_step += ENV_STEP_SIZE
        self.done = self.current_step + ENV_STEP_SIZE >= len(self.data)
        return self.get_state(), reward, self.done, {}

    def get_state(self):
        """Build the observation dict for the window ending just before ``current_step``.

        Raises:
            ValueError: If there is not enough history, or the feature tensor
                does not have shape (3, WINDOW_SIZE, NUM_ASSETS).
        """
        if self.current_step < STD_WINDOW_SIZE + WINDOW_SIZE:
            raise ValueError("current_step must be at least STD_WINDOW_SIZE + WINDOW_SIZE to have a full double-window.")

        # The past STD_WINDOW_SIZE + WINDOW_SIZE rows, up to (excluding) current_step
        window_data = self.data.iloc[self.current_step - STD_WINDOW_SIZE - WINDOW_SIZE:self.current_step]
        returns, volatilities, drawdowns = calculate_features(window_data, WINDOW_SIZE)

        # Stack features to form the state, ensuring a shape of (3, WINDOW_SIZE, num_assets)
        state = create_state_matrix(returns, volatilities, drawdowns)
        if state.shape != (3, WINDOW_SIZE, NUM_ASSETS):
            raise ValueError(f"State shape is {state.shape} but expected (3, {WINDOW_SIZE}, {NUM_ASSETS})")

        # Rolling covariance over the same WINDOW_SIZE rows as the features.
        # This must read the environment's own frame (self.data); the previous
        # code read the notebook-global `data`, i.e. whatever frame happened to
        # be left in the kernel, not this period's returns.
        cov = self.data.iloc[self.current_step - WINDOW_SIZE:self.current_step].cov()

        # Bundle features and covariance into the observation dict
        state = {
            'features': state,  # Original features (returns, volatilities, drawdowns)
            'covariance': cov.values  # Covariance matrix
        }

        return state

    def get_portfolio_return(self, action, periods_per_year=252):
        """Annualized portfolio return for the row at ``current_step + 1``.

        Args:
            action (np.array): Portfolio weights, one per asset.
            periods_per_year (int): Multiplier used to annualize the one-row return.

        Raises:
            ValueError: If ``current_step + 1`` is beyond the end of the data.
        """
        # Ensure there is a next row available
        if self.current_step + 1 >= len(self.data):
            raise ValueError("Not enough data to calculate next day's return.")

        next_day_returns = self.data.iloc[self.current_step + 1]

        # Weighted sum of asset returns, scaled to an annual figure
        portfolio_return = np.dot(action, next_day_returns) * periods_per_year

        return portfolio_return

In [24]:
class ActorNetwork(nn.Module):
    """Policy network: maps a state dict to softmax portfolio weights.

    The 'features' tensor goes through six 3x3 same-padding convolutions
    (conv1..conv6), is flattened, concatenated with the flattened 'covariance'
    tensor and fed to a two-layer head ending in a softmax over assets.
    """

    def __init__(self, num_assets, window_size=WINDOW_SIZE, hidden_dim=256):
        super().__init__()

        # conv1..conv6: channel widths of consecutive layers. Attribute names
        # and creation order are kept so saved state_dicts remain loadable.
        channel_plan = (3, 16, 32, 32, 64, 64, 128)
        for layer_no, (c_in, c_out) in enumerate(zip(channel_plan[:-1], channel_plan[1:]), start=1):
            setattr(self, f"conv{layer_no}", nn.Conv2d(c_in, c_out, kernel_size=3, padding=1).to(device))

        # Size of the flattened convolutional output (sets self.feature_output_size)
        self._initialize_feature_branch(num_assets, window_size)

        # Size of the flattened covariance matrix
        self.covariance_size = num_assets * num_assets

        # Head over [conv features | covariance] -> portfolio weights
        head_inputs = self.feature_output_size + self.covariance_size
        self.fc = nn.Sequential(
            nn.Linear(head_inputs, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_assets),
            nn.Softmax(dim=-1)  # Output portfolio weights
        ).to(device)

    def _conv_stack(self, x):
        """Apply conv1..conv6 in order, each followed by a ReLU."""
        for layer_no in range(1, 7):
            x = torch.relu(getattr(self, f"conv{layer_no}")(x))
        return x

    def _initialize_feature_branch(self, num_assets, window_size):
        """Probe the conv stack with a zero input to learn its flattened output size."""
        probe = torch.zeros(1, 3, window_size, num_assets).to(device)
        self.feature_output_size = self._conv_stack(probe).view(1, -1).shape[1]

    def forward(self, state):
        """Return portfolio weights for a batch of states.

        Args:
            state (dict): 'features' and 'covariance' tensors; the covariance
                tensor is reshaped to one row per sample of the feature batch.
        """
        # Convolutional branch, flattened per sample
        encoded = self._conv_stack(state['features'].to(device))
        encoded = encoded.view(encoded.size(0), -1)

        # Covariance branch, flattened per sample
        cov_flat = state['covariance'].to(device).view(encoded.size(0), -1)

        # Joint representation -> softmax weights
        return self.fc(torch.cat([encoded, cov_flat], dim=1))

In [25]:
class CriticNetwork(nn.Module):
    """Q-network: scores a (state, action) pair with a single scalar.

    The 'features' tensor goes through six 3x3 same-padding convolutions
    (conv1..conv6) and is flattened; the action is embedded by `fc_action`;
    both are concatenated with the flattened 'covariance' tensor and passed
    to `fc_q`, which outputs the Q-value.
    """

    def __init__(self, num_assets, window_size=WINDOW_SIZE, hidden_dim=256):
        super().__init__()

        # conv1..conv6: channel widths of consecutive layers. Attribute names
        # and creation order are kept so saved state_dicts remain loadable.
        channel_plan = (3, 16, 32, 32, 64, 64, 128)
        for layer_no, (c_in, c_out) in enumerate(zip(channel_plan[:-1], channel_plan[1:]), start=1):
            setattr(self, f"conv{layer_no}", nn.Conv2d(c_in, c_out, kernel_size=3, padding=1).to(device))

        # Size of the flattened convolutional output (sets self.feature_output_size)
        self._initialize_feature_branch(num_assets, window_size)

        # Size of the flattened covariance matrix
        self.covariance_size = num_assets * num_assets

        # Action embedding
        self.fc_action = nn.Sequential(
            nn.Linear(num_assets, hidden_dim),
            nn.ReLU()
        ).to(device)

        # Head over [conv features | covariance | action embedding] -> Q-value
        head_inputs = self.feature_output_size + self.covariance_size + hidden_dim
        self.fc_q = nn.Sequential(
            nn.Linear(head_inputs, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)  # Output Q-value
        ).to(device)

    def _conv_stack(self, x):
        """Apply conv1..conv6 in order, each followed by a ReLU."""
        for layer_no in range(1, 7):
            x = torch.relu(getattr(self, f"conv{layer_no}")(x))
        return x

    def _initialize_feature_branch(self, num_assets, window_size):
        """Probe the conv stack with a zero input to learn its flattened output size."""
        probe = torch.zeros(1, 3, window_size, num_assets).to(device)
        self.feature_output_size = self._conv_stack(probe).view(1, -1).shape[1]

    def forward(self, state, action):
        """Return the Q-value for each (state, action) pair in the batch.

        Args:
            state (dict): 'features' and 'covariance' tensors; the covariance
                tensor is reshaped to one row per sample of the feature batch.
            action (torch.Tensor): Portfolio weights, one row per sample.
        """
        # Convolutional branch, flattened per sample
        state_out = self._conv_stack(state['features'].to(device))
        state_out = state_out.view(state_out.size(0), -1)

        # Covariance branch, flattened per sample
        cov_out = state['covariance'].to(device).view(state_out.size(0), -1)

        # Action branch
        action_out = self.fc_action(action.to(device))

        # Joint representation -> Q-value
        joint = torch.cat([state_out, cov_out, action_out], dim=1)
        return self.fc_q(joint)

In [26]:
class OUNoise:
    """Discrete-time Ornstein-Uhlenbeck noise process for action exploration."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        """
        Args:
            size (int): Dimension of the noise.
            mu (float): Mean of the process.
            theta (float): Speed of mean reversion.
            sigma (float): Volatility parameter.
        """
        self.size = size
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.ones(self.size) * self.mu

    def sample(self):
        """Advance the process by one step and return the new noise vector.

        Returns:
            np.ndarray: A copy of the internal state. Returning a copy (rather
            than the state array itself) means callers may modify the sample
            in place without corrupting the process.
        """
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.normal(size=self.size)
        self.state = self.state + dx
        return self.state.copy()

In [27]:
class DDPGAgent:
    """DDPG agent: actor/critic networks with target copies, replay memory and OU exploration noise."""

    def __init__(self, num_assets):
        """
        Args:
            num_assets (int): Number of assets, i.e. the action dimension.
        """
        # Initialize actor and critic networks, and target networks
        self.actor = ActorNetwork(num_assets).to(device)
        self.critic = CriticNetwork(num_assets).to(device)
        self.target_actor = ActorNetwork(num_assets).to(device)
        self.target_critic = CriticNetwork(num_assets).to(device)

        # Synchronize target networks with the main networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Optimizers
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

        # Replay memory of (state, action, reward, next_state, done) tuples
        self.memory = deque(maxlen=10000)
        self.batch_size = BATCH_SIZE
        self.discount_factor = DISCOUNT_FACTOR
        self.tau = 0.005  # Soft update rate for target networks

        # OU Noise for exploration
        self.ou_noise = OUNoise(size=num_assets, theta=THETA, sigma=SIGMA)

    def act(self, state, add_noise=True):
        """
        Get an action from the actor network, optionally adding noise.

        Args:
            state (dict): Current state with 'features' and 'covariance'
                array entries for a single (unbatched) observation.
            add_noise (bool): Whether to add exploration noise.

        Returns:
            np.array: Non-negative portfolio weights that sum to 1.
        """
        # Add a leading batch dimension of 1 to both components
        features_tensor = torch.tensor(state['features'], dtype=torch.float32).unsqueeze(0).to(device)  # Shape: (1, 3, WINDOW_SIZE, NUM_ASSETS)
        covariance_tensor = torch.tensor(state['covariance'], dtype=torch.float32).unsqueeze(0).to(device)  # Shape: (1, NUM_ASSETS, NUM_ASSETS)

        # Combine tensors into a dictionary to match the actor's expected input
        state_tensor = {
            'features': features_tensor,
            'covariance': covariance_tensor
        }

        # Pure inference: no autograd graph is needed for action selection
        with torch.no_grad():
            action = self.actor(state_tensor).detach().cpu().numpy()[0]

        # Optionally add Ornstein-Uhlenbeck noise for exploration
        if add_noise:
            noise = self.ou_noise.sample()
            action += noise

        action = np.clip(action, 0, 1)  # Ensure valid portfolio weights

        # If noise pushed every weight to zero, renormalizing would divide by
        # zero and yield NaN weights; fall back to an equal-weight portfolio.
        total = action.sum()
        if total <= 0:
            return np.full_like(action, 1.0 / action.size)
        return action / total  # Ensure weights sum to 1

    def reset_noise(self):
        """Reset the Ornstein-Uhlenbeck noise process."""
        self.ou_noise.reset()

    def update(self):
        """Run one DDPG update (critic, actor, soft target update) on a replay mini-batch.

        Does nothing until the replay memory holds at least `batch_size` transitions.
        """
        # Check if there are enough samples in memory
        if len(self.memory) < self.batch_size:
            return

        # Sample a mini-batch of experiences from the replay buffer
        mini_batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*mini_batch)

        # Convert rewards and dones to column tensors, actions to a (batch, num_assets) tensor
        rewards = torch.tensor(np.array(rewards), dtype=torch.float32).view(-1, 1).to(device)
        dones = torch.tensor(np.array(dones), dtype=torch.float32).view(-1, 1).to(device)
        actions = torch.tensor(np.array(actions), dtype=torch.float32).to(device)

        # Process states and next_states into features and covariance components
        states_features = torch.tensor(np.array([s['features'] for s in states]), dtype=torch.float32).to(device)
        states_covariance = torch.tensor(np.array([s['covariance'] for s in states]), dtype=torch.float32).unsqueeze(1).to(device)

        next_states_features = torch.tensor(np.array([s['features'] for s in next_states]), dtype=torch.float32).to(device)
        next_states_covariance = torch.tensor(np.array([s['covariance'] for s in next_states]), dtype=torch.float32).unsqueeze(1).to(device)

        # Create structured state dictionaries for compatibility with the networks
        states_dict = {'features': states_features, 'covariance': states_covariance}
        next_states_dict = {'features': next_states_features, 'covariance': next_states_covariance}

        # Critic update: regress Q(s, a) onto r + gamma * Q_target(s', pi_target(s')),
        # with the bootstrap term zeroed on terminal transitions
        with torch.no_grad():
            next_actions = self.target_actor(next_states_dict)
            target_q_values = self.target_critic(next_states_dict, next_actions)
            target_values = rewards + self.discount_factor * target_q_values * (1 - dones)

        current_q_values = self.critic(states_dict, actions)
        critic_loss = nn.MSELoss()(current_q_values, target_values)

        # Backpropagate the loss for the critic network
        self.optimizer_critic.zero_grad()
        critic_loss.backward()
        self.optimizer_critic.step()

        # Actor update: ascend the critic's Q-value of the actor's own actions
        predicted_actions = self.actor(states_dict)
        actor_loss = -self.critic(states_dict, predicted_actions).mean()  # Maximize Q-value
        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()

        # Soft update for target networks: target <- tau * online + (1 - tau) * target
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

In [28]:
# Annualized average reward per episode, keyed by 1-based period number
all_rewards = {}

# Walk forward through the periods, warm-starting each from the previous one's last weights
for i in range(len(filtered_returns_list)):
    print(f"\n--- Training for Period {i + 1} ---")

    # Set up the environment for the current period
    env = PortfolioEnv(filtered_returns_list[i])

    # Instantiate the agent
    agent = DDPGAgent(num_assets=NUM_ASSETS)

    # Load the models from the previous iteration
    actor_path = f"saved_models/last_actor_7-{i+1}.pth"
    critic_path = f"saved_models/last_critic_7-{i+1}.pth"
    if os.path.exists(actor_path) and os.path.exists(critic_path):
        # map_location lets checkpoints saved on a GPU load on a CPU-only machine
        agent.actor.load_state_dict(torch.load(actor_path, map_location=device))
        agent.critic.load_state_dict(torch.load(critic_path, map_location=device))
        # The agent's constructor synced the target networks with the *random*
        # initial weights; re-sync them now that the real weights are loaded.
        agent.target_actor.load_state_dict(agent.actor.state_dict())
        agent.target_critic.load_state_dict(agent.critic.state_dict())
        print(f"Loaded actor model from {actor_path} and critic model from {critic_path}")
    else:
        raise FileNotFoundError(f"Missing required models: {actor_path} or {critic_path}")

    # Create an array to store rewards for the current iteration
    rewards = []

    # Training parameters
    MAX_ITERATIONS = EPISODES  # Maximum number of episodes

    # Best model tracking
    best_avg_reward = -float('inf')  # Track the best average reward
    best_actor_state = None          # To store the best actor state
    best_critic_state = None         # To store the best critic state

    # Training loop
    for episode in range(MAX_ITERATIONS):
        state = env.reset()
        agent.reset_noise()  # Reset the OU noise process for the new episode

        episode_reward = 0
        count = 0

        while True:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)

            # Store transition in memory and update agent
            agent.memory.append((state, action, reward, next_state, done))
            agent.update()

            # Update state and episode reward
            state = next_state
            episode_reward += reward
            count += 1

            if done:
                break

        # Mean per-step reward for this episode, scaled by 252 to an annual figure
        ann_episode_reward = episode_reward / count * 252
        rewards.append(ann_episode_reward)
        print(f"Iteration {i + 1}, Episode {episode + 1}/{MAX_ITERATIONS}, Average Reward: {ann_episode_reward}")

        # Snapshot the best-performing model. state_dict() returns tensors that
        # share storage with the live parameters, so they must be cloned;
        # otherwise later optimizer steps overwrite the "best" weights in place.
        if ann_episode_reward > best_avg_reward:
            best_avg_reward = ann_episode_reward
            best_actor_state = {k: v.detach().clone() for k, v in agent.actor.state_dict().items()}
            best_critic_state = {k: v.detach().clone() for k, v in agent.critic.state_dict().items()}

    # Save the final best model for this period
    torch.save(best_actor_state, f'saved_models/best_actor_7-{i+2}.pth')
    torch.save(best_critic_state, f'saved_models/best_critic_7-{i+2}.pth')

    # Save the last model for the next iteration
    torch.save(agent.actor.state_dict(), f"saved_models/last_actor_7-{i+2}.pth")
    torch.save(agent.critic.state_dict(), f"saved_models/last_critic_7-{i+2}.pth")

    # Store rewards for this iteration
    all_rewards[i + 1] = rewards


--- Training for Period 1 ---
Loaded actor model from saved_models/last_actor_7-1.pth and critic model from saved_models/last_critic_7-1.pth
Iteration 1, Episode 1/100, Average Reward: 0.11322103544715809
Iteration 1, Episode 2/100, Average Reward: 0.09901942546390542
Iteration 1, Episode 3/100, Average Reward: 0.10866979610205244
Iteration 1, Episode 4/100, Average Reward: 0.051840591096521345
Iteration 1, Episode 5/100, Average Reward: 0.09779456549384487
Iteration 1, Episode 6/100, Average Reward: 0.10915470368685194
Iteration 1, Episode 7/100, Average Reward: 0.10110961674711855
Iteration 1, Episode 8/100, Average Reward: 0.11251596563351801
Iteration 1, Episode 9/100, Average Reward: 0.09789016253836323
Iteration 1, Episode 10/100, Average Reward: 0.13210296236783609
Iteration 1, Episode 11/100, Average Reward: 0.043017163346862834
Iteration 1, Episode 12/100, Average Reward: 0.09327386749802435
Iteration 1, Episode 13/100, Average Reward: 0.1018528778744472
Iteration 1, Episode 