In [1]:
import pandas as pd
import numpy as np
import torch
import os
from datetime import datetime

# Assuming utils are in parent directory or PYTHONPATH is set
from utils.portfolio_env import PortfolioEnv
from utils.drl_agent_jules import DRLAgent  # Import the modified agent

# For learning rate schedule
from typing import Callable

%load_ext autoreload
%autoreload 2

## TensorBoard Logging

This notebook logs training progress using TensorBoard. Each run of the notebook writes its logs to a timestamped subdirectory of `../tensorboard_logs/` (relative to this notebook's location), with one experiment per window and agent.

To view the logs:
1. Open a terminal or command prompt.
2. Navigate to the directory *containing* the `tensorboard_logs` directory (i.e., the root of this repository if you are running the notebook from the `notebooks` folder).
3. Run the command: `tensorboard --logdir tensorboard_logs/`
4. Open the URL provided by TensorBoard (usually http://localhost:6006/) in your web browser.

You should see experiments named like `PPO_WindowX_AgentY_SeedZ`.

In [2]:
# --- Main Configuration ---
# Sliding-window experiment layout. Window k (0-based) starts training in
# BASE_START_YEAR + k; AGENTS_PER_WINDOW agents are trained per window.
N_WINDOWS = 2  # 10 in paper
AGENTS_PER_WINDOW = 2  # 5 in paper
BASE_START_YEAR = 2006

# Data paths (relative to the notebook's working directory)
PRICE_DATA_PATH = "../data/prices.parquet"
RETURNS_DATA_PATH = "../data/returns.parquet"
VOLA_DATA_PATH = "../data/vola.parquet"

# The timestamp is taken when this cell runs, so re-running the cell starts a
# fresh pair of output directories.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
MODEL_SAVE_DIR = f"../models/sliding_window_jules/{timestamp}/"
TENSORBOARD_LOG_DIR = f"../tensorboard_logs/{timestamp}/"

# Ensure the model and TensorBoard output directories exist
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
os.makedirs(TENSORBOARD_LOG_DIR, exist_ok=True)

# --- DRL Agent Hyperparameters (from paper) ---
N_ENVS = 10
TOTAL_TIMESTEPS_PER_ROUND = 10**6 # 7_500_000 in paper
N_STEPS_PER_ENV = 252 * 3  # rollout steps per environment (paper: n_steps = 252 * 3 * n_envs in total)
# Total rollout buffer before each update = N_STEPS_PER_ENV * N_ENVS = 7560,
# which is an exact multiple of BATCH_SIZE (7560 / 1260 = 6).

BATCH_SIZE = 1260
N_EPOCHS = 16
GAMMA = 0.9
GAE_LAMBDA = 0.9
CLIP_RANGE = 0.25
LOG_STD_INIT = -1.0
POLICY_KWARGS = dict(
    activation_fn=torch.nn.Tanh,
    # Two hidden layers of 64 units.
    # NOTE(review): whether these layers are shared between the policy and value
    # networks or duplicated per network depends on the RL library version — confirm.
    net_arch=[64, 64],
    log_std_init=LOG_STD_INIT,
)

# Learning rate schedule: linear decay from 3e-4 to 1e-5
INITIAL_LR = 3e-4
FINAL_LR = 1e-5


def linear_schedule(
    initial_value: float, final_value: float
) -> Callable[[float], float]:
    """
    Build a learning-rate schedule that interpolates linearly between two rates.

    :param initial_value: Learning rate when progress remaining is 1.0 (start of training).
    :param final_value: Learning rate when progress remaining is 0.0 (end of training).
    :return: Function mapping the remaining progress (1.0 -> 0.0) to the current learning rate.
    """
    # Total amount by which the rate changes over the whole run.
    lr_span = initial_value - final_value

    def lr_at(progress_remaining: float) -> float:
        """Return the learning rate for the given fraction of training still to go."""
        return final_value + progress_remaining * lr_span

    return lr_at


# Schedule callable handed to DRLAgent as `learning_rate`; it maps the remaining
# training progress (1.0 -> 0.0) to a rate between INITIAL_LR and FINAL_LR.
LEARNING_RATE_SCHEDULE = linear_schedule(INITIAL_LR, FINAL_LR)

# --- PortfolioEnv Parameters ---
# These values are passed unchanged to the train, validation and test environments.
ENV_WINDOW_SIZE = 60  # Lookback window for features in PortfolioEnv
TRANSACTION_COST = 0.0  # As per paper (or can be adjusted)
INITIAL_BALANCE = 100_000
REWARD_SCALING = 1.0
ETA_DSR = 1 / 252  # For Differential Sharpe Ratio in PortfolioEnv

In [3]:
# Load the full datasets once; every sliding window is sliced from these frames.
def _with_sorted_datetime_index(frame):
    """Return `frame` with a chronologically sorted DatetimeIndex.

    `slice_data` selects rows by date-label slices, which is only reliable on a
    sorted DatetimeIndex, so both properties are enforced right after loading.
    """
    if not isinstance(frame.index, pd.DatetimeIndex):
        frame.index = pd.to_datetime(frame.index)
    if not frame.index.is_monotonic_increasing:
        frame = frame.sort_index()
    return frame


# Only the file reads are guarded, so an unrelated error is not mistaken for a
# missing data file.
try:
    print("Loading data...")
    prices_df_full = pd.read_parquet(PRICE_DATA_PATH)
    returns_df_full = pd.read_parquet(RETURNS_DATA_PATH)
    vola_df_full = pd.read_parquet(VOLA_DATA_PATH)
    print("Data loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: Data file not found. {e}")
    print("Please ensure data is generated and paths are correct in Cell 2.")
    # The rest of the notebook cannot run without the data, so stop here.
    raise

prices_df_full = _with_sorted_datetime_index(prices_df_full)
returns_df_full = _with_sorted_datetime_index(returns_df_full)
vola_df_full = _with_sorted_datetime_index(vola_df_full)

Loading data...
Data loaded successfully.


In [4]:
def slice_data(
    year_start,
    num_train_years,
    num_val_years,
    num_test_years,
    prices_df,
    returns_df,
    vol_df,
):
    """Split the price, return and volatility frames into train/val/test periods.

    The periods are consecutive blocks of whole calendar years: training starts on
    January 1st of `year_start`, validation follows directly after training, and
    testing follows directly after validation. Rows are selected by date label,
    with both period boundaries inclusive.

    Returns three tuples (train, val, test), each holding the
    (prices, returns, volatility) slices for that period. A warning is printed,
    but nothing is raised, when any of the price slices is empty.
    """
    # First calendar year of each period, plus the year right after the test period.
    val_year = year_start + num_train_years
    test_year = val_year + num_val_years
    end_year = test_year + num_test_years

    def calendar_span(first_year, following_year):
        # Jan 1st of the first year through Dec 31st of the last year in the period.
        return (
            pd.to_datetime(f"{first_year}-01-01"),
            pd.to_datetime(f"{following_year - 1}-12-31"),
        )

    train_span = calendar_span(year_start, val_year)
    val_span = calendar_span(val_year, test_year)
    test_span = calendar_span(test_year, end_year)

    for prefix, (first_day, last_day) in (
        ("  Train Period:", train_span),
        ("  Val Period  :", val_span),
        ("  Test Period :", test_span),
    ):
        print(f"{prefix} {first_day.date()} to {last_day.date()}")

    def cut(span):
        # Date-label slice of each input frame, in (prices, returns, vola) order.
        first_day, last_day = span
        return tuple(
            frame[first_day:last_day] for frame in (prices_df, returns_df, vol_df)
        )

    train_part = cut(train_span)
    val_part = cut(val_span)
    test_part = cut(test_span)

    # An empty price slice would halt environment creation later on; only warn here.
    if any(part[0].empty for part in (train_part, val_part, test_part)):
        print(
            "WARNING: One or more data slices are empty. Check date ranges and data availability."
        )

    return train_part, val_part, test_part


In [None]:
all_backtest_results = []
# One entry per window: path of the best agent's checkpoint, or None when the
# window was skipped or no agent was selected.
best_agent_paths_per_window = []

# PortfolioEnv settings shared by the train, validation and test environments.
shared_env_kwargs = {
    'window_size': ENV_WINDOW_SIZE, 'transaction_cost': TRANSACTION_COST,
    'initial_balance': INITIAL_BALANCE, 'reward_scaling': REWARD_SCALING, 'eta': ETA_DSR,
}

# --- Main Loop for Sliding Windows ---
for i_window in range(N_WINDOWS):
    current_start_year = BASE_START_YEAR + i_window
    print(f"--- Starting Window {i_window+1}/{N_WINDOWS} (Train Year Start: {current_start_year}) ---")

    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
    # 1. Slice Data for the current window
    # 5 years train, 1 year validation, 1 year test
    train_data, val_data, test_data = slice_data(
        year_start=current_start_year,
        num_train_years=5,
        num_val_years=1,
        num_test_years=1,
        prices_df=prices_df_full,
        returns_df=returns_df_full,
        vol_df=vola_df_full
    )

    # Unpack data
    (train_prices, train_returns, train_vola) = train_data
    (val_prices, val_returns, val_vola) = val_data
    (test_prices, test_returns, test_vola) = test_data

    # Skip the window when any period is too short for the environment:
    # at least window_size + 1 rows are needed to take a single step.
    min_data_len = ENV_WINDOW_SIZE + 1
    if len(train_prices) < min_data_len or len(val_prices) < min_data_len or len(test_prices) < min_data_len:
        print(f"SKIPPING Window {i_window+1} due to insufficient data length for one or more periods.")
        print(f"  Train length: {len(train_prices)}, Val length: {len(val_prices)}, Test length: {len(test_prices)}")
        print(f"  Required minimum: {min_data_len}")
        best_agent_paths_per_window.append(None)  # Keeps list index == window index
        all_backtest_results.append({"window": i_window+1, "status": "skipped_insufficient_data", "metrics": {}})
        continue

    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
    # 2. Train AGENTS_PER_WINDOW agents and keep the best one on validation
    # Reset the best-agent tracking for THIS window. Without the reset, a window in
    # which no agent beats -inf (e.g. a NaN validation reward) would silently reuse
    # the previous window's best path.
    best_agent_for_window_path = None
    best_val_reward = -np.inf

    for i_agent in range(AGENTS_PER_WINDOW):
        agent_seed = (i_window * AGENTS_PER_WINDOW) + i_agent  # Unique seed for each agent run
        print(f"  Training Agent {i_agent+1}/{AGENTS_PER_WINDOW} with seed {agent_seed}...")

        # Environments are re-created for every agent so each one starts from a
        # fresh state; the underlying data slices are shared within the window.
        # This single training env is only the template handed to DRLAgent, which
        # is asked for N_ENVS environments via `n_envs`.
        single_env_for_init_train = PortfolioEnv(
            returns_df=train_returns, prices_df=train_prices, vol_df=train_vola,
            **shared_env_kwargs
        )

        # Validation env (single, not vectorized, used only for evaluation)
        env_val = PortfolioEnv(
            returns_df=val_returns, prices_df=val_prices, vol_df=val_vola,
            **shared_env_kwargs
        )

        # Instantiate DRL Agent
        agent = DRLAgent(
            env=single_env_for_init_train,
            n_envs=N_ENVS,
            policy_kwargs=POLICY_KWARGS,
            n_steps=N_STEPS_PER_ENV,  # rollout steps per environment
            batch_size=BATCH_SIZE,
            n_epochs=N_EPOCHS,
            learning_rate=LEARNING_RATE_SCHEDULE,
            gamma=GAMMA,
            gae_lambda=GAE_LAMBDA,
            clip_range=CLIP_RANGE,
            seed=agent_seed,
            tensorboard_log=TENSORBOARD_LOG_DIR
        )

        # Warm start: from the second window on, continue from the previous
        # window's best agent (when that window produced one).
        if i_window > 0 and best_agent_paths_per_window[i_window-1] is not None:
            previous_best_agent_path = best_agent_paths_per_window[i_window-1]
            print(f"    Seeding agent from: {previous_best_agent_path}")
            # env=None: the agent keeps the environment it was constructed with,
            # i.e. the one built on this window's training data.
            agent.load_from_file(path=previous_best_agent_path, env=None)
            # Re-apply this run's seed after loading.
            # NOTE(review): the PROBLEMS notes at the end of the notebook report
            # warm-started agents of one window ending up identical — confirm that
            # this call really re-seeds the model and its environments.
            agent.model.set_random_seed(agent_seed)

        # Train the agent
        print(f"    Starting training for {TOTAL_TIMESTEPS_PER_ROUND} timesteps...")
        # Training can take very long; lower TOTAL_TIMESTEPS_PER_ROUND for quick tests.
        agent.train(
            total_timesteps=TOTAL_TIMESTEPS_PER_ROUND,
            tb_experiment_name=f"PPO_Window{i_window+1}_Agent{i_agent+1}_Seed{agent_seed}",
        )

        # Evaluate the agent on the validation set (one episode, for speed)
        print("    Evaluating agent on validation set...")
        val_metrics = agent.evaluate(eval_env=env_val, n_eval_episodes=1)
        current_val_reward = val_metrics.get("mean_reward", -np.inf)
        print(f"    Validation Mean Reward: {current_val_reward:.4f}")

        # Save every agent; the file name records window, seed and validation reward.
        current_agent_model_name = f"agent_win{i_window+1}_seed{agent_seed}_valrew{current_val_reward:.2f}.zip"
        current_agent_save_path = os.path.join(MODEL_SAVE_DIR, current_agent_model_name)
        agent.save(current_agent_save_path)
        print(f"    Agent saved to: {current_agent_save_path}")

        if current_val_reward > best_val_reward:
            best_val_reward = current_val_reward
            best_agent_for_window_path = current_agent_save_path
            print(f"    New best agent for this window with validation reward: {best_val_reward:.4f}")

        # Drop references so memory can be reclaimed before the next agent.
        del agent
        del single_env_for_init_train
        del env_val
        torch.cuda.empty_cache()  # If using GPU

    best_agent_paths_per_window.append(best_agent_for_window_path)

    if best_agent_for_window_path is None:
        print(f"  No best agent found or saved for window {i_window+1}. Skipping backtest.")
        all_backtest_results.append({"window": i_window+1, "status": "no_best_agent", "metrics": {}})
        continue

    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
    # 3. Backtest the best agent of the window
    print(f"  Backtesting best agent for Window {i_window+1} ({best_agent_for_window_path})" )

    # Backtesting environment on the held-out test period
    env_test = PortfolioEnv(
        returns_df=test_returns, prices_df=test_prices, vol_df=test_vola,
        **shared_env_kwargs
    )

    # DRLAgent's constructor needs an env instance even when a saved model is loaded
    # right afterwards. A minimal slice of the training data serves as that template;
    # the loaded model is attached to `env_test` by the load() call below.
    temp_env_for_load_init = PortfolioEnv(
        returns_df=train_returns.iloc[:ENV_WINDOW_SIZE+5],  # minimal data for init
        prices_df=train_prices.iloc[:ENV_WINDOW_SIZE+5],
        vol_df=train_vola.iloc[:ENV_WINDOW_SIZE+5],
        window_size=ENV_WINDOW_SIZE,
        initial_balance=INITIAL_BALANCE
    )

    best_agent_loaded = DRLAgent(
        env=temp_env_for_load_init,  # Template env
        n_envs=1,  # A single env is enough for evaluation
        policy_kwargs=POLICY_KWARGS
        # Remaining hyperparameters are left at their defaults: a trained model is loaded next.
    )

    print(f"    Loading model from: {best_agent_for_window_path}")
    best_agent_loaded.load(path=best_agent_for_window_path, env=env_test)

    print("    Running backtest evaluation...")
    backtest_metrics = best_agent_loaded.evaluate(eval_env=env_test, n_eval_episodes=1)

    print(f"    Backtest Metrics for Window {i_window+1}:")
    for key, value in backtest_metrics.items():
        print(f"      {key}: {value}")

    all_backtest_results.append({
        "window": i_window+1,
        "best_agent_path": best_agent_for_window_path,
        "status": "completed",
        "metrics": backtest_metrics
    })

    del best_agent_loaded
    del temp_env_for_load_init
    del env_test
    torch.cuda.empty_cache()  # If using GPU

# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# 4. Report results (the CSV export happens in the next cell)
print("\n--- All Windows Processed ---")
print("Summary of Best Agent Paths:")
for window_number, agent_path in enumerate(best_agent_paths_per_window, start=1):
    print(f"Window {window_number}: {agent_path}")

print("\nSummary of Backtest Results:")
for result in all_backtest_results:
    print(f"Window {result['window']} ({result['status']}):")
    # Skipped or failed windows carry no metrics, so only the status line is shown.
    if result['status'] != 'completed':
        continue
    for metric_name, metric_value in result['metrics'].items():
        # Floats are rounded to four decimals; anything else is printed as-is.
        if isinstance(metric_value, float):
            print(f"    {metric_name}: {metric_value:.4f}")
        else:
            print(f"    {metric_name}: {metric_value}")

--- Starting Window 1/2 (Train Year Start: 2006) ---
  Train Period: 2006-01-01 to 2010-12-31
  Val Period  : 2011-01-01 to 2011-12-31
  Test Period : 2012-01-01 to 2012-12-31
  Training Agent 1/2 with seed 0...


Output()

    Starting training for 1000000 timesteps...



Training complete. Trained for 1000000 timesteps.
TensorBoard logs for experiment 'PPO_Window1_Agent1_Seed0' saved in directory: ../tensorboard_logs/20250531_205413/
    Evaluating agent on validation set...
    Validation Mean Reward: 0.9366
    Agent saved to: ../models/sliding_window_jules/20250531_205413/agent_win1_seed0_valrew0.94.zip
    New best agent for this window with validation reward: 0.9366
  Training Agent 2/2 with seed 1...


Output()

    Starting training for 1000000 timesteps...



Training complete. Trained for 1000000 timesteps.
TensorBoard logs for experiment 'PPO_Window1_Agent2_Seed1' saved in directory: ../tensorboard_logs/20250531_205413/
    Evaluating agent on validation set...
    Validation Mean Reward: 0.7091
    Agent saved to: ../models/sliding_window_jules/20250531_205413/agent_win1_seed1_valrew0.71.zip
  Backtesting best agent for Window 1 (../models/sliding_window_jules/20250531_205413/agent_win1_seed0_valrew0.94.zip)
    Loading model from: ../models/sliding_window_jules/20250531_205413/agent_win1_seed0_valrew0.94.zip
    Running backtest evaluation...
    Backtest Metrics for Window 1:
      Annual return: -0.0136138887051519
      Cumulative returns: -0.010227891285999546
      Annual volatility: 0.11287923825024124
      Sharpe ratio: -0.24218885407465138
      Calmar ratio: -0.2959433792822436
      Stability: 0.8985700924498159
      Max drawdown: -0.04600166673155483
      Omega ratio: 0.961026191085054
      Sortino ratio: -0.387184485532

Output()

    Seeding agent from: ../models/sliding_window_jules/20250531_205413/agent_win1_seed0_valrew0.94.zip
Model loaded from ../models/sliding_window_jules/20250531_205413/agent_win1_seed0_valrew0.94.zip
    Starting training for 1000000 timesteps...



Training complete. Trained for 1000000 timesteps.
TensorBoard logs for experiment 'PPO_Window2_Agent1_Seed2' saved in directory: ../tensorboard_logs/20250531_205413/
    Evaluating agent on validation set...
    Validation Mean Reward: -0.4063
    Agent saved to: ../models/sliding_window_jules/20250531_205413/agent_win2_seed2_valrew-0.41.zip
    New best agent for this window with validation reward: -0.4063
  Training Agent 2/2 with seed 3...


Output()

    Seeding agent from: ../models/sliding_window_jules/20250531_205413/agent_win1_seed0_valrew0.94.zip
Model loaded from ../models/sliding_window_jules/20250531_205413/agent_win1_seed0_valrew0.94.zip
    Starting training for 1000000 timesteps...



Training complete. Trained for 1000000 timesteps.
TensorBoard logs for experiment 'PPO_Window2_Agent2_Seed3' saved in directory: ../tensorboard_logs/20250531_205413/
    Evaluating agent on validation set...
    Validation Mean Reward: 0.0972
    Agent saved to: ../models/sliding_window_jules/20250531_205413/agent_win2_seed3_valrew0.10.zip
    New best agent for this window with validation reward: 0.0972
  Backtesting best agent for Window 2 (../models/sliding_window_jules/20250531_205413/agent_win2_seed3_valrew0.10.zip)
    Loading model from: ../models/sliding_window_jules/20250531_205413/agent_win2_seed3_valrew0.10.zip
    Running backtest evaluation...
    Backtest Metrics for Window 2:
      Annual return: 0.0257492772058614
      Cumulative returns: 0.019456134240499523
      Annual volatility: 0.10605036983949467
      Sharpe ratio: 0.10423545782037345
      Calmar ratio: 0.46144547351441667
      Stability: 0.9041179563505012
      Max drawdown: -0.055801343135413656
      Ome

In [6]:
# One row per window, built from the records collected by the training loop.
results_df = pd.DataFrame(all_backtest_results)

# Turn each row's 'metrics' dict into its own set of columns and attach them in
# place of the original 'metrics' column.
metrics_df = results_df["metrics"].apply(pd.Series)
results_df = pd.concat(
    [results_df.drop(columns="metrics"), metrics_df],
    axis="columns",
)

# The CSV is written next to the saved models, with its own timestamp in the name.
csv_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_filename = f"backtest_results_summary_{csv_stamp}.csv"
results_save_path = os.path.join(MODEL_SAVE_DIR, results_filename)
results_df.to_csv(results_save_path, index=False)

print(f"\nBacktest results summary saved to: {results_save_path}")
print("\nFinal Results DataFrame:")
results_df.head()


Backtest results summary saved to: ../models/sliding_window_jules/20250531_205413/backtest_results_summary_20250531_210357.csv

Final Results DataFrame:


Unnamed: 0,window,best_agent_path,status,Annual return,Cumulative returns,Annual volatility,Sharpe ratio,Calmar ratio,Stability,Max drawdown,...,Sortino ratio,Skew,Kurtosis,Tail ratio,Daily value at risk (95%),Portfolio turnover,mean_reward,std_reward,n_eval_episodes,final_portfolio_value_first_episode
0,1,../models/sliding_window_jules/20250531_205413...,completed,-0.013614,-0.010228,0.112879,-0.242189,-0.295943,0.89857,-0.046002,...,-0.387184,0.086205,0.508573,1.222959,-0.010403,,-0.146446,0.0,1.0,98977.210871
1,2,../models/sliding_window_jules/20250531_205413...,completed,0.025749,0.019456,0.10605,0.104235,0.461445,0.904118,-0.055801,...,0.160031,-0.263293,0.640463,0.945525,-0.011436,,2.57342,0.0,1.0,101945.613424


In [7]:
# PROBLEMS

# P1
# After the first window, the best agent is used as the starting point for all agents in the next window,
# but something is wrong with the training process (randomization or similar):
# all 5 agents in the following window are identical, with the same rewards, performance, etc.
# Therefore all 9 windows after the first are superfluous,
# or at least training 5 agents in each of the following windows is a waste of time.

# P2
# The performance is getting better, but the final portfolio value is still only 101919, i.e. about +$2k, which is really not good.

# P3
# Training logs are not saved. The print statement says they are saved,
# but there is no such directory and I can't find the log files anywhere.

# P4
# Training progress is not visible; there is only a tqdm progress bar.
# Something like PyTorch Lightning with live monitoring would be great.

# P5
# In the calculated metrics, portfolio turnover is always NaN for all agents and windows.