# DRL Model Evaluation

In [None]:
import os
import json
import pandas as pd
from datetime import datetime
import glob
import numpy as np

from stable_baselines3 import PPO

from utils.config import DRLConfig # Assuming DRLConfig can be instantiated from a dict
from utils.portfolio import Portfolio
from utils.portfolio_env import PortfolioEnv # Make sure this is the correct environment class
from utils.drl_train import slice_data # Or replicate logic if it's complex

%load_ext autoreload
%autoreload 2

## User Inputs

Please specify the following parameters for evaluation:

In [None]:
# REQUIRED: Timestamp of the DRL model training run (e.g., "20250611_234904").
# It names the run folder `../models/<timestamp>/`, which must also contain
# `config_<timestamp>.json`.
MODEL_TIMESTAMP = "20250611_234904"  # <--- CHANGE THIS

# OPTIONAL: Evaluation period as "YYYY-MM-DD" strings.
# Both values must be set to take effect. If either one is None/empty, the
# notebook ignores both and falls back to the last year of data available in
# the prices file (it does NOT read a test period from the DRL config).
EVAL_START_DATE = "2012-01-01"  # Example: "2012-01-01" or None
EVAL_END_DATE = "2022-12-31"    # Example: "2022-12-31" or None

# OPTIONAL: Path to an MVO metrics CSV to compare against; None skips the comparison.
# Example: "../results/20250626_0038_mvo_backtest_[2012-01-01,2022-12-31]_daily/mvo_metrics.csv"
MVO_RESULTS_CSV_PATH = "../results/20250626_0038_mvo_backtest_[2012-01-01,2022-12-31]_daily/mvo_metrics.csv" # <--- CHANGE THIS or set to None

# OPTIONAL: File names (relative to the model folder) of the agents to evaluate.
# If None or an empty list, every file matching `agent_seed*_valrew*.zip` in the
# model folder is evaluated.
# Example: ["agent_seed0_valrew-8.70.zip", "agent_seed1_valrew-11.50.zip"]
SPECIFIC_AGENTS_TO_EVALUATE = [] # or None

## 1. Load Configuration and Data

In [None]:
# Locate the artifacts of the training run: ../models/<timestamp>/config_<timestamp>.json
model_dir = f"../models/{MODEL_TIMESTAMP}"
config_path = os.path.join(model_dir, f"config_{MODEL_TIMESTAMP}.json")

# Use isdir/isfile rather than exists: a stray file named like the run folder
# (or a directory named like the config) is reported here instead of failing
# later with a less obvious error.
if not os.path.isdir(model_dir):
    raise FileNotFoundError(f"Model directory not found: {model_dir}")
if not os.path.isfile(config_path):
    raise FileNotFoundError(f"Config file not found: {config_path}")

# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(config_path, 'r', encoding='utf-8') as f:
    config_dict = json.load(f)

# Both DRLConfig(**...) and the Namespace fallback need a mapping of keyword
# arguments, so reject any other JSON payload with a clear message up front.
if not isinstance(config_dict, dict):
    raise TypeError(
        f"Expected a JSON object in {config_path}, got {type(config_dict).__name__}."
    )

# Build the config object by unpacking the JSON keys as keyword arguments.
# If DRLConfig rejects the keys (TypeError), fall back to a plain attribute
# container so the rest of the notebook can still read `drl_config.<key>`.
try:
    drl_config = DRLConfig(**config_dict)
except TypeError as e:
    print(f"Error instantiating DRLConfig: {e}")
    print("Please ensure DRLConfig can be instantiated from the dictionary keys in the JSON.")
    # Fallback: Create a simple namespace object if DRLConfig is problematic
    from argparse import Namespace
    drl_config = Namespace(**config_dict)

print(f"Successfully loaded configuration from {config_path}")
print(f"DRL Config: {drl_config}")

In [None]:
# Input locations. The original note says these mirror drl_train.ipynb;
# change DATA_DIR if the parquet files live somewhere else.
DATA_DIR = "../data/snp_new" # Or use a path from drl_config if available
RETURNS_PATH = os.path.join(DATA_DIR, "returns_1d.parquet")
PRICES_PATH = os.path.join(DATA_DIR, "prices_1d.parquet")
VOLA_PATH = os.path.join(DATA_DIR, "vola_1d.parquet")

# Fail fast, in a fixed order, if any input file is missing.
if not os.path.exists(RETURNS_PATH):
    raise FileNotFoundError(f"Returns data not found: {RETURNS_PATH}. Please check DATA_DIR.")
if not os.path.exists(PRICES_PATH):
    raise FileNotFoundError(f"Prices data not found: {PRICES_PATH}. Please check DATA_DIR.")
if not os.path.exists(VOLA_PATH):
    raise FileNotFoundError(f"Volatility data not found: {VOLA_PATH}. Please check DATA_DIR.")

# Read the three frames in the same order as the checks above.
df_ret, df_prices, df_vol = (
    pd.read_parquet(parquet_path)
    for parquet_path in (RETURNS_PATH, PRICES_PATH, VOLA_PATH)
)

# Coerce every row index to datetime so date-based slicing works downstream.
for _frame in (df_ret, df_prices, df_vol):
    _frame.index = pd.to_datetime(_frame.index)

# Report the shape and date coverage of each input.
print("Data loaded successfully:")
print(f"Returns shape: {df_ret.shape}, from {df_ret.index.min()} to {df_ret.index.max()}")
print(f"Prices shape: {df_prices.shape}, from {df_prices.index.min()} to {df_prices.index.max()}")
print(f"Volatility shape: {df_vol.shape}, from {df_vol.index.min()} to {df_vol.index.max()}")

## 2. Determine Evaluation Period and Slice Data

In [None]:
# --- Resolve the evaluation window ---
# Both user dates must be provided; otherwise fall back to the last year of
# data present in df_prices.
if EVAL_START_DATE and EVAL_END_DATE:
    eval_start_date = pd.to_datetime(EVAL_START_DATE)
    eval_end_date = pd.to_datetime(EVAL_END_DATE)
    print(f"Using user-defined evaluation period: {eval_start_date.date()} to {eval_end_date.date()}")
else:
    # Compute the fallback window once and reuse it in the messages below.
    eval_end_date = df_prices.index.max()
    eval_start_date = eval_end_date - pd.DateOffset(years=1) + pd.DateOffset(days=1)
    print("Warning: EVAL_START_DATE or EVAL_END_DATE not fully specified by the user.")
    print("The DRL training process uses rolling windows, so a single 'test period' from the config isn't directly applicable for a continuous evaluation run.")
    print(f"Defaulting to the last full year of available data based on df_prices: {eval_start_date} to {eval_end_date}")
    EVAL_START_DATE = eval_start_date.strftime('%Y-%m-%d') # Update global var for reporting
    EVAL_END_DATE = eval_end_date.strftime('%Y-%m-%d')     # Update global var for reporting
    print(f"Using default evaluation period: {EVAL_START_DATE} to {EVAL_END_DATE}")

# An inverted window would only surface as an "insufficient data" error below;
# report the real cause instead.
if eval_start_date > eval_end_date:
    raise ValueError(
        f"Evaluation start date {eval_start_date.date()} is after end date {eval_end_date.date()}."
    )

# Slice data according to the evaluation period (label-based, both ends inclusive)
eval_df_prices = df_prices.loc[eval_start_date:eval_end_date].copy()
eval_df_ret = df_ret.loc[eval_start_date:eval_end_date].copy()
eval_df_vol = df_vol.loc[eval_start_date:eval_end_date].copy()

print(f"\nEvaluation data shapes after slicing ({eval_start_date.date()} to {eval_end_date.date()}):")
print(f"Prices: {eval_df_prices.shape}")
print(f"Returns: {eval_df_ret.shape}")
print(f"Volatility: {eval_df_vol.shape}")

# Align the three frames on BOTH axes: the same tickers (columns) and the same
# dates (rows). Aligning only the columns would let frames with different date
# coverage reach the environment, and later per-date price lookups could miss.
common_tickers = eval_df_ret.columns.intersection(eval_df_prices.columns).intersection(eval_df_vol.columns)
common_dates = eval_df_ret.index.intersection(eval_df_prices.index).intersection(eval_df_vol.index)

rows_before = {
    "returns": len(eval_df_ret),
    "prices": len(eval_df_prices),
    "volatility": len(eval_df_vol),
}
if any(n_rows != len(common_dates) for n_rows in rows_before.values()):
    print(
        f"Warning: date indices differ across inputs {rows_before}; "
        f"keeping the {len(common_dates)} dates common to all three."
    )

eval_df_ret = eval_df_ret.loc[common_dates, list(common_tickers)]
eval_df_prices = eval_df_prices.loc[common_dates, list(common_tickers)]
eval_df_vol = eval_df_vol.loc[common_dates, list(common_tickers)]

# Check for sufficient data for the environment window, measured on the
# aligned frames that the environment will actually receive.
if len(common_dates) == 0 or len(common_dates) < drl_config.env_window_size:
    raise ValueError(
        f"Insufficient data for the evaluation period {eval_start_date.date()} to {eval_end_date.date()} "
        f"after slicing. Need at least {drl_config.env_window_size} days for the environment observation window. "
        f"Found {len(common_dates)} days. Please check your dates or data source."
    )

if eval_df_ret.empty or eval_df_prices.empty or eval_df_vol.empty:
    raise ValueError("Dataframes are empty after aligning common tickers. Check data consistency for the selected period.")

print(f"Data aligned to {len(common_tickers)} common tickers. Example tickers: {list(common_tickers[:5])}...")

## 3. Load DRL Models and Evaluate

In [None]:
# Decide which saved agents to evaluate: an explicit user list if one was
# given, otherwise every agent archive found in the model directory.
if SPECIFIC_AGENTS_TO_EVALUATE:
    requested_agent_files = [os.path.join(model_dir, fname) for fname in SPECIFIC_AGENTS_TO_EVALUATE]
    agent_model_files = [path for path in requested_agent_files if os.path.exists(path)]
    # Tell the user about requested files that do not exist instead of
    # dropping them silently (a typo would otherwise go unnoticed).
    missing_agent_files = [path for path in requested_agent_files if path not in agent_model_files]
    if missing_agent_files:
        print(f"Warning: {len(missing_agent_files)} requested agent model(s) not found and skipped:")
        for missing_path in missing_agent_files:
            print(f"  - {missing_path}")
    print(f"Using user-specified agent models: {agent_model_files}")
else:
    # glob returns matches in arbitrary order; sort so evaluation and the
    # resulting reports are reproducible across runs and platforms.
    agent_model_files = sorted(glob.glob(os.path.join(model_dir, "agent_seed*_valrew*.zip")))
    print(f"Found {len(agent_model_files)} agent models in {model_dir}:")
    for f in agent_model_files:
        print(f"  - {os.path.basename(f)}")

if not agent_model_files:
    raise FileNotFoundError(f"No agent model files found in {model_dir} matching criteria.")

evaluated_portfolios = {} # Portfolio objects keyed by agent file name
agent_errors = {}         # Error message keyed by agent file name, for agents that failed

In [None]:
# Evaluate every selected agent over the evaluation data. Each agent is run in
# its own try/except so that one failing agent does not stop the others:
# successes are stored in `evaluated_portfolios`, failures in `agent_errors`.
for agent_path in agent_model_files:
    agent_name = os.path.basename(agent_path)
    print(f"\n--- Evaluating Agent: {agent_name} ---")

    try:
        # 1. Load the saved PPO model onto the CPU
        model = PPO.load(agent_path, device='cpu')
        print(f"Loaded model from {agent_path}")

        # 2. Create the Portfolio Environment for evaluation, fed with the
        #    sliced/aligned evaluation frames and the training-time settings
        #    read from drl_config.
        eval_env = PortfolioEnv(
            df_prices=eval_df_prices,
            df_returns=eval_df_ret,
            df_features=eval_df_vol, # Assuming vol is the feature, adjust if more features are used
            tickers=list(common_tickers),
            initial_balance=drl_config.initial_balance,
            window_size=drl_config.env_window_size,
            transaction_cost=drl_config.transaction_cost,
            reward_scaling=drl_config.reward_scaling,
            eta_dsr=drl_config.eta_dsr,
            seed=None # No seed is passed; actions below are predicted deterministically
        )
        print(f"PortfolioEnv created for evaluation with window size {drl_config.env_window_size}")

        # 3. Initialize a separate Portfolio object to track performance,
        #    using the same tickers and starting balance as the environment.
        portfolio = Portfolio(tickers=list(common_tickers), initial_balance=drl_config.initial_balance)
        print(f"Portfolio tracker initialized with balance: {portfolio.initial_balance}")

        # 4. Run the evaluation loop
        obs, info = eval_env.reset()
        done = False
        episode_rewards = []

        # Record initial state at the environment's starting date.
        # NOTE(review): presumably Portfolio.update also appends to
        # portfolio.history — confirm against utils.portfolio.
        initial_date = eval_env.current_date()
        if initial_date:
             portfolio.update(current_prices=eval_df_prices.loc[initial_date], date=initial_date)
        else:
            print("Warning: Could not get initial date from environment for portfolio recording.")

        print(f"Starting evaluation loop from {eval_env.current_date()}...")
        num_steps = 0
        while not done:
            # Greedy (deterministic) action from the policy; the episode ends
            # when the environment reports terminated or truncated.
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = eval_env.step(action)
            done = terminated or truncated
            episode_rewards.append(reward)
            num_steps += 1

            # Date the environment is at after this step, and that day's prices.
            # NOTE(review): current_prices_series is not used below.
            current_date = eval_env.current_date()
            current_prices_series = eval_df_prices.loc[current_date]

            # The external tracker is not advanced by its own logic here; it is
            # overwritten with the environment's internal portfolio state.
            # NOTE(review): presumably eval_env.portfolio reflects the state
            # after the action has been applied, and the agent's action is the
            # target weights (plus cash) — confirm against PortfolioEnv.step.

            # Sync our external Portfolio tracker with the environment's internal one.
            portfolio.current_balance = eval_env.portfolio.current_balance
            portfolio.cash = eval_env.portfolio.cash
            portfolio.positions = eval_env.portfolio.positions.copy() # copy to be safe
            portfolio.weights = eval_env.portfolio.weights.copy()
            portfolio.w_c = eval_env.portfolio.w_c

            # Record this state in our external portfolio's history: one row
            # per step with value, cash, cash weight (w_c), and per-ticker
            # weight (w_<ticker>) and position (s_<ticker>) columns; tickers
            # missing from the mappings are recorded as 0.0.
            portfolio.history.append({
                'date': current_date,
                'portfolio_value': portfolio.current_balance,
                'cash': portfolio.cash,
                'w_c': portfolio.w_c,
                **{f'w_{t}': portfolio.weights.get(t, 0.0) for t in common_tickers},
                **{f's_{t}': portfolio.positions.get(t, 0.0) for t in common_tickers}
            })

            if num_steps % 252 == 0: # Log progress every 252 steps (approx. one trading year of daily data)
                print(f"  Step {num_steps}, Date: {current_date}, Portfolio Value: {portfolio.current_balance:.2f}")

        # Only agents that finish the whole episode are kept for reporting.
        evaluated_portfolios[agent_name] = portfolio
        mean_reward = np.mean(episode_rewards) if episode_rewards else 0
        print(f"Evaluation complete for {agent_name}. Steps: {num_steps}. Mean reward: {mean_reward:.4f}")
        print(f"Final portfolio value: {portfolio.current_balance:.2f}")

    except Exception as e:
        # Best-effort per agent: log the failure with its traceback, remember
        # the message for the info file, and continue with the next agent.
        print(f"ERROR evaluating agent {agent_name}: {e}")
        import traceback
        traceback.print_exc()
        agent_errors[agent_name] = str(e)

# Summary of how many agents completed evaluation.
if not evaluated_portfolios:
    print("\nNo agents were successfully evaluated. Check errors above.")
else:
    print(f"\nSuccessfully evaluated {len(evaluated_portfolios)} agents.")

## 4. Calculate DRL Performance Metrics

In [None]:
import traceback

# One record per evaluated agent: either its metrics or an 'Error' entry.
drl_performance_metrics_list = []

if not evaluated_portfolios:
    print("No portfolios were evaluated, so no metrics to calculate.")
else:
    # The risk-free rate is the same for every agent; read it once.
    # Defaults to 0 if the config does not define it.
    risk_free_rate = getattr(drl_config, 'risk_free_rate', 0.0)

    for agent_name, portfolio_obj in evaluated_portfolios.items():
        print(f"Calculating metrics for {agent_name}...")
        try:
            if not portfolio_obj.history:
                # Nothing to compute from; record the reason instead.
                print(f"  Skipping metrics for {agent_name}: Portfolio history is empty.")
                metrics = {'Agent': agent_name, 'Error': 'Empty history'}
            else:
                metrics = portfolio_obj.calc_metrics(risk_free_rate=risk_free_rate)
                metrics['Agent'] = agent_name # Add agent name for identification
                # Report success only when metrics were actually computed
                # (not for agents skipped above).
                print(f"  Metrics calculated for {agent_name}.")

            drl_performance_metrics_list.append(metrics)
        except Exception as e:
            # Best-effort per agent: keep an error row so the agent still
            # appears in the summary table, then continue with the next one.
            print(f"  Error calculating metrics for {agent_name}: {e}")
            traceback.print_exc()
            drl_performance_metrics_list.append({'Agent': agent_name, 'Error': str(e)})

if drl_performance_metrics_list:
    df_drl_metrics = pd.DataFrame(drl_performance_metrics_list)
    # Reorder columns to have 'Agent' first, then 'Error' if it exists, then others
    cols = ['Agent']
    if 'Error' in df_drl_metrics.columns: # Check if Error column was added for any agent
        cols.append('Error')
    cols.extend([col for col in df_drl_metrics.columns if col not in cols])
    df_drl_metrics = df_drl_metrics[cols]

    print("\nDRL Performance Metrics Summary:")
    try:
        print(df_drl_metrics.to_markdown(index=False))
    except ImportError:
        # DataFrame.to_markdown needs the optional `tabulate` package; fall
        # back to plain text rather than aborting the cell after the metrics
        # have already been computed.
        print(df_drl_metrics.to_string(index=False))
else:
    print("\nNo DRL performance metrics were generated.")
    df_drl_metrics = pd.DataFrame() # Ensure df_drl_metrics exists for later steps

## 5. Create Results Directory and Save Artifacts

In [None]:
# Each evaluation run gets its own folder under ../results, named from the
# source model timestamp plus the wall-clock time at which this cell runs
# (second resolution, local time).
eval_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_dir_name = f"drl_eval_{MODEL_TIMESTAMP}_{eval_timestamp}"
results_save_dir = os.path.join("../results", results_dir_name)

# exist_ok=True: re-running within the same second reuses the folder instead of raising.
os.makedirs(results_save_dir, exist_ok=True)
print(f"Created results directory: {results_save_dir}")

In [None]:
# Save info.txt: a human-readable summary of this evaluation run (timestamps,
# evaluation period, evaluated agents, per-agent errors, MVO comparison file).
# The report is collected as a list of lines and joined once.
info_lines = [
    "DRL Evaluation Run Information",
    "------------------------------------",
    f"Evaluation Timestamp: {eval_timestamp}",
    f"Source DRL Model Timestamp: {MODEL_TIMESTAMP}",
    f"Evaluation Start Date: {EVAL_START_DATE}",
    f"Evaluation End Date: {EVAL_END_DATE}",
    "",
    f"Evaluated DRL Agents from {model_dir}:",
]
if evaluated_portfolios:
    info_lines.extend(f"  - {agent_name}" for agent_name in evaluated_portfolios)
else:
    info_lines.append("  - No agents were successfully evaluated.")

if agent_errors:
    info_lines.append("")
    info_lines.append("Errors during evaluation for some agents:")
    info_lines.extend(
        f"  - {agent_name}: {error_msg}" for agent_name, error_msg in agent_errors.items()
    )

info_lines.append("")
info_lines.append(f"MVO Comparison File: {MVO_RESULTS_CSV_PATH if MVO_RESULTS_CSV_PATH else 'Not specified'}")

# Every line, including the last, is newline-terminated.
info_content = "\n".join(info_lines) + "\n"

info_file_path = os.path.join(results_save_dir, "info.txt")
# Explicit UTF-8: recorded exception messages and paths may contain non-ASCII
# characters that the platform default encoding cannot represent.
with open(info_file_path, 'w', encoding='utf-8') as f:
    f.write(info_content)
print(f"Saved evaluation info to: {info_file_path}")

In [None]:
# Persist the DRL metrics table into the results folder; an empty table is
# reported and skipped rather than written.
if df_drl_metrics.empty:
    print("DRL performance metrics DataFrame is empty. Not saving.")
else:
    drl_metrics_path = os.path.join(results_save_dir, "drl_performance_metrics.csv")
    df_drl_metrics.to_csv(drl_metrics_path, index=False)
    print(f"Saved DRL performance metrics to: {drl_metrics_path}")

In [None]:
# Write one CSV of portfolio history per successfully evaluated agent.
if not evaluated_portfolios:
    print("No evaluated portfolios. Not saving any portfolio histories.")
else:
    for agent_name, portfolio_obj in evaluated_portfolios.items():
        # Agents without any recorded history are reported and skipped.
        if not portfolio_obj.history:
            print(f"Portfolio history for {agent_name} is empty. Not saving.")
            continue

        history_df = portfolio_obj.get_history()
        # Make the agent file name safe to embed in another file name:
        # drop ".zip", then turn remaining dots (e.g. in the reward) into "_".
        safe_agent_name = agent_name.replace(".zip", "").replace(".", "_")
        history_path = os.path.join(
            results_save_dir,
            f"drl_agent_{safe_agent_name}_portfolio_history.csv",
        )
        # The index is written out too (the original note says it holds the dates).
        history_df.to_csv(history_path, index=True)
        print(f"Saved portfolio history for {agent_name} to: {history_path}")

## 6. Compare with MVO Results (Optional)

In [None]:
def _format_metrics_table(df):
    """Render *df* as a markdown table, or as plain text if `tabulate` is missing.

    DataFrame.to_markdown depends on the optional `tabulate` package. Without
    this fallback a missing package would be reported below as an MVO loading
    error and would abort the comparison before anything is saved.
    """
    try:
        return df.to_markdown(index=False)
    except ImportError:
        return df.to_string(index=False)


# Optional comparison against a previously saved MVO metrics CSV: print both
# tables, copy the MVO metrics into the results folder, and save a combined
# DRL-vs-MVO table when DRL metrics are available.
if MVO_RESULTS_CSV_PATH and os.path.isfile(MVO_RESULTS_CSV_PATH):
    print("\n--- MVO Comparison ---")
    print(f"Loading MVO results from: {MVO_RESULTS_CSV_PATH}")
    try:
        df_mvo_metrics = pd.read_csv(MVO_RESULTS_CSV_PATH)

        # The MVO metrics CSV might have an unnamed index column if saved with index=True
        if 'Unnamed: 0' in df_mvo_metrics.columns:
            df_mvo_metrics = df_mvo_metrics.rename(columns={'Unnamed: 0': 'MVO_Strategy_ID'})

        print("\nMVO Performance Metrics:")
        print(_format_metrics_table(df_mvo_metrics))

        # Save MVO metrics to the results folder as well for completeness
        mvo_metrics_copy_path = os.path.join(results_save_dir, "mvo_comparison_metrics.csv")
        df_mvo_metrics.to_csv(mvo_metrics_copy_path, index=False)
        print(f"\nCopied MVO metrics to: {mvo_metrics_copy_path}")

        if not df_drl_metrics.empty:
            print("\nDRL Performance Metrics (repeated for comparison):")
            print(_format_metrics_table(df_drl_metrics))

            # Basic comparison: stack the two tables row-wise with a 'Type'
            # column. Columns present in only one table are filled with NaN
            # by the concatenation.
            try:
                # Add a 'Type' column to distinguish DRL from MVO
                df_drl_metrics_typed = df_drl_metrics.copy()
                df_drl_metrics_typed['Type'] = 'DRL'

                df_mvo_metrics_typed = df_mvo_metrics.copy()
                df_mvo_metrics_typed['Type'] = 'MVO'
                # Give the MVO rows an 'Agent' identifier. Only derive one when
                # the CSV does not already have an 'Agent' column; renaming
                # another column onto an existing name would create duplicate
                # columns and break the steps below.
                if 'Agent' not in df_mvo_metrics_typed.columns:
                    if 'MVO_Strategy_ID' in df_mvo_metrics_typed.columns:
                        df_mvo_metrics_typed = df_mvo_metrics_typed.rename(columns={'MVO_Strategy_ID': 'Agent'})
                    elif 'lookback' in df_mvo_metrics_typed.columns:
                        df_mvo_metrics_typed = df_mvo_metrics_typed.rename(columns={'lookback': 'Agent'})
                    else: # Add a placeholder agent column if no clear identifier
                        df_mvo_metrics_typed['Agent'] = 'MVO_lookback_' + df_mvo_metrics_typed.index.astype(str)

                # Ensure 'Agent' column is string type for both before concat if it exists
                if 'Agent' in df_drl_metrics_typed.columns:
                    df_drl_metrics_typed['Agent'] = df_drl_metrics_typed['Agent'].astype(str)
                if 'Agent' in df_mvo_metrics_typed.columns:
                    df_mvo_metrics_typed['Agent'] = df_mvo_metrics_typed['Agent'].astype(str)

                df_combined_metrics = pd.concat([df_drl_metrics_typed, df_mvo_metrics_typed], ignore_index=True)

                # Reorder columns for better readability
                cols_combined = ['Type', 'Agent']
                if 'Error' in df_combined_metrics.columns:
                    cols_combined.append('Error')
                cols_combined.extend([col for col in df_combined_metrics.columns if col not in cols_combined])
                df_combined_metrics = df_combined_metrics[cols_combined]

                print("\nCombined DRL and MVO Metrics:")
                print(_format_metrics_table(df_combined_metrics))

                combined_metrics_path = os.path.join(results_save_dir, "drl_vs_mvo_metrics_comparison.csv")
                df_combined_metrics.to_csv(combined_metrics_path, index=False)
                print(f"Saved combined metrics comparison to: {combined_metrics_path}")

            except Exception as e:
                # The combined table is a convenience; the individual tables
                # above have already been printed and saved.
                print(f"Could not create a combined comparison table: {e}")
        else:
            print("DRL metrics are empty, cannot show side-by-side comparison.")

    except Exception as e:
        # The comparison is optional: report the problem and let the notebook finish.
        print(f"Error loading or processing MVO results: {e}")
        import traceback
        traceback.print_exc()
elif MVO_RESULTS_CSV_PATH:
    print(f"\nSpecified MVO results file not found: {MVO_RESULTS_CSV_PATH}")
else:
    print("\nNo MVO results CSV path specified. Skipping MVO comparison.")

print("\n--- Evaluation Notebook Complete ---")