In [18]:
# HIERARCHICAL REINFORCEMENT LEARNING FOR SPARSE-REWARD NAVIGATION
# PointMaze with DQN, HER, and HAC

# # Hierarchical Reinforcement Learning for Sparse-Reward Navigation
# ## PointMaze with DQN, HER, and HAC
#
# **Project Structure:**
# - **Tier 1**: DQN backbone with discrete actions and clean training/evaluation pipeline
# - **Tier 2**: Goal-conditioning + HER with ablation studies
# - **Tier 3**: HAC-style Hierarchical RL for long-horizon navigation

In [19]:
# ============================================================================
# INSTALLATION CELL
# ============================================================================
!pip install gymnasium[mujoco] gymnasium-robotics stable-baselines3 sb3-contrib tensorboard matplotlib pandas seaborn tqdm --quiet

In [44]:
# ============================================================================
# IMPORTS AND CONFIGURATION
# ============================================================================

import gymnasium as gym
import gymnasium_robotics
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import random
import os
import json
import time
from datetime import datetime
from collections import deque, defaultdict
from typing import Dict, List, Tuple, Optional, Any, Union, Callable
from dataclasses import dataclass, field, asdict
from copy import deepcopy
import warnings
warnings.filterwarnings('ignore')

# Stable Baselines 3
from stable_baselines3 import DQN
from stable_baselines3.common.buffers import ReplayBuffer, DictReplayBuffer
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.her import HerReplayBuffer
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.monitor import Monitor

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm.notebook import tqdm

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

# Device configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: No GPU detected! Training will be slower but still feasible for discrete actions.")

Using device: cuda
GPU: NVIDIA A100-SXM4-80GB


In [53]:
# ============================================================================
# EXPERIMENT CONFIGURATION
# ============================================================================

# =============================================================================
# [FAST MODE] Set to True for progress report (faster training, 1 seed)
# [FULL MODE] Set to False for final report (full training, 3+ seeds)
# =============================================================================
FAST_MODE = True

@dataclass
class ExperimentConfig:
    """Central configuration for all experiments."""

    # === Environment Settings ===
    maze_map: str = "PointMaze_Open-v3"
    max_episode_steps: int = 500
    continuing_task: bool = False

    # === Discrete Action Settings ===
    n_discrete_actions: int = 5  # up, down, left, right, stay
    action_magnitude: float = 5.0  # How strong each discrete action is

    # === Training Settings ===
    total_timesteps: int = 750_000 if FAST_MODE else 1_000_000
    learning_rate: float = 1e-3  # Higher LR for DQN
    buffer_size: int = 500_000 if FAST_MODE else 1_000_000
    batch_size: int = 256
    learning_starts: int = 5000
    tau: float = 0.005
    gamma: float = 0.99
    train_freq: int = 4  # Update every 4 steps (standard for DQN)
    gradient_steps: int = 1

    # === DQN-Specific Settings ===
    exploration_fraction: float = 0.5
    exploration_initial_eps: float = 1.0
    exploration_final_eps: float = 0.1
    target_update_interval: int = 1000

    # === HER Settings ===
    her_strategy: str = "future"
    her_n_sampled_goal: int = 8

    # === HAC (Hierarchical) Settings ===
    subgoal_period_k: int = 30
    subgoal_dim: int = 2
    subgoal_range: Tuple[float, float] = (0.0, 8.0)
    high_level_lr: float = 3e-4
    low_level_lr: float = 1e-3

    # === Evaluation Settings ===
    eval_freq: int = 50_000
    n_eval_episodes: int = 25 if FAST_MODE else 50
    deterministic_eval: bool = True

    # === Reproducibility ===
    # [FAST MODE] Single seed for speed
    # [FULL MODE] Multiple seeds: [42, 123, 456]
    seeds: List[int] = field(default_factory=lambda: [42] if FAST_MODE else [42, 123, 456])

    # === Network Architecture ===
    policy_kwargs: Dict = field(default_factory=lambda: {
        "net_arch": [256, 256],
    })

    # === Logging ===
    log_dir: str = "./logs"
    verbose: int = 1

    # === Reward Shaping (helps learning) ===
    use_dense_reward: bool = True  # Dense reward helps DQN learn faster

    def to_dict(self) -> Dict:
        d = asdict(self)
        d['policy_kwargs'] = str(d['policy_kwargs'])
        return d

config = ExperimentConfig()

print("=" * 70)
if FAST_MODE:
    print("ðŸš€ FAST MODE ENABLED (for progress report)")
else:
    print("ðŸ”¬ FULL TRAINING MODE (for final report)")
print("=" * 70)
print(f"  Training steps:    {config.total_timesteps:,}")
print(f"  Seeds:             {config.seeds}")
print(f"  Eval frequency:    every {config.eval_freq:,} steps")
print(f"  Discrete actions:  {config.n_discrete_actions}")
print(f"  Dense reward:      {config.use_dense_reward}")
print("=" * 70)

ðŸš€ FAST MODE ENABLED (for progress report)
  Training steps:    750,000
  Seeds:             [42]
  Eval frequency:    every 50,000 steps
  Discrete actions:  5
  Dense reward:      True


In [54]:
# @title
# ============================================================================
# DISCRETE ACTION WRAPPER
# This converts the continuous PointMaze into a discrete action environment
# ============================================================================

class DiscreteActionWrapper(gym.ActionWrapper):
    """
    Converts continuous action space to discrete.

    Actions:
    0: Stay (no movement)
    1: Up (+y)
    2: Down (-y)
    3: Left (-x)
    4: Right (+x)

    For 8-directional (if n_actions=9):
    5: Up-Left, 6: Up-Right, 7: Down-Left, 8: Down-Right
    """

    def __init__(self, env: gym.Env, n_actions: int = 5, magnitude: float = 3.5, repeat: int = 5):
        super().__init__(env)
        self.n_actions = n_actions
        self.magnitude = magnitude
        self.repeat = repeat

        # Define action mappings
        self._action_map = {
            0: np.array([0.0, 0.0]),           # Stay
            1: np.array([0.0, magnitude]),     # Up
            2: np.array([0.0, -magnitude]),    # Down
            3: np.array([-magnitude, 0.0]),    # Left
            4: np.array([magnitude, 0.0]),     # Right
        }

        # Add diagonal actions if needed
        # if n_actions >= 9:
        #    diag = magnitude / np.sqrt(2)
        #    self._action_map.update({
        #        5: np.array([-diag, diag]),    # Up-Left
        #        6: np.array([diag, diag]),     # Up-Right
        #        7: np.array([-diag, -diag]),   # Down-Left
        #        8: np.array([diag, -diag]),    # Down-Right
        #    })

        # Override action space
        self.action_space = gym.spaces.Discrete(n_actions)

    def step(self, action):
        """Repeat the action multiple times to simulate meaningful movement."""
        total_reward = 0.0
        done = False
        truncated = False
        info = {}

        # Convert discrete to continuous once
        cont_action = self.action(action)

        # Repeat the action
        for _ in range(self.repeat):
            obs, reward, term, trunc, info = self.env.step(cont_action)
            total_reward += reward
            if term or trunc:
                done = term or trunc
                break

        return obs, total_reward, done, done, info # Note: gym API uses term, trunc separately but wrappers often merge

    def action(self, action) -> np.ndarray:
        if isinstance(action, np.ndarray):
            action = int(action.item()) if action.ndim == 0 else int(action[0])
        return self._action_map[int(action)].astype(np.float32)

In [55]:
# ============================================================================
# GOAL MANAGER AND ENVIRONMENT UTILITIES (FIXED)
# ============================================================================

class GoalManager:
    """
    Manages fixed train/test goal splits for reproducible evaluation.

    FIX: Now properly validates goals are reachable in the maze.
    """

    def __init__(self, env_id: str, n_train_goals: int = 50, n_test_goals: int = 20, seed: int = 42):
        self.env_id = env_id
        self.n_train_goals = n_train_goals
        self.n_test_goals = n_test_goals
        self.seed = seed
        self._generate_goal_splits()

    def _generate_goal_splits(self):
        """Generate fixed train/test goal positions."""
        temp_env = gym.make(self.env_id)

        np.random.seed(self.seed)
        random.seed(self.seed)

        all_goals = []
        total_needed = self.n_train_goals + self.n_test_goals

        # Collect goals from environment resets (these are guaranteed valid)
        for i in range(total_needed):
            obs, _ = temp_env.reset(seed=self.seed + i)
            goal = obs['desired_goal'].copy()
            all_goals.append(goal)

        temp_env.close()

        # Shuffle and split
        np.random.shuffle(all_goals)
        self.train_goals = all_goals[:self.n_train_goals]
        self.test_goals = all_goals[self.n_train_goals:]

        print(f"Generated {len(self.train_goals)} training goals and {len(self.test_goals)} test goals")

    def get_train_goal(self) -> np.ndarray:
        return self.train_goals[np.random.randint(len(self.train_goals))].copy()

    def get_test_goal(self, idx: Optional[int] = None) -> np.ndarray:
        if idx is not None:
            return self.test_goals[idx % len(self.test_goals)].copy()
        return self.test_goals[np.random.randint(len(self.test_goals))].copy()


class DenseRewardWrapper(gym.Wrapper):
    """
    Adds dense reward shaping to help learning.

    Reward = -distance_to_goal (dense) + success_bonus
    """

    def __init__(self, env: gym.Env, success_bonus: float = 10.0):
        super().__init__(env)
        self.success_bonus = success_bonus
        self._prev_distance = None

    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        achieved = obs['achieved_goal']
        desired = obs['desired_goal']
        self._prev_distance = np.linalg.norm(achieved - desired)
        return obs, info

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)

        achieved = obs['achieved_goal']
        desired = obs['desired_goal']
        current_distance = np.linalg.norm(achieved - desired)

        # Dense reward: improvement in distance
        dense_reward = self._prev_distance - current_distance
        self._prev_distance = current_distance

        # Add success bonus
        if info.get('is_success', False):
            dense_reward += self.success_bonus

        return obs, dense_reward, terminated, truncated, info


def make_env(
    config: ExperimentConfig,
    goal_manager: Optional[GoalManager] = None,
    mode: str = "train",
    seed: int = 42,
    use_discrete: bool = True,
    use_dense_reward: bool = None
) -> gym.Env:
    """
    Create environment with all wrappers applied.
    """
    if use_dense_reward is None:
        use_dense_reward = config.use_dense_reward

    # 1. Create the environment
    # CRITICAL: We use the config.maze_map here.
    # Ensure config.maze_map is set to "PointMaze_Open-v3" in Cell 3 for the progress report.
    env = gym.make(
        config.maze_map,
        max_episode_steps=config.max_episode_steps,
        continuing_task=config.continuing_task,
    )

    # 2. Apply Dense Reward wrapper if enabled
    if use_dense_reward:
        env = DenseRewardWrapper(env)

    # 3. Apply Discrete Action wrapper (THE PHYSICS FIX)
    if use_discrete:
        env = DiscreteActionWrapper(
            env,
            n_actions=config.n_discrete_actions,
            magnitude=3.5,
            repeat=5         # <--- KEEPS THE PHYSICS FIX
        )

    # 4. Apply Monitor for logging (Standard Gym API maintained)
    env = Monitor(env)

    return env


def get_env_info(env: gym.Env) -> Dict:
    """Get environment information for logging."""
    obs, _ = env.reset()
    return {
        "observation_shape": obs['observation'].shape,
        "achieved_goal_shape": obs['achieved_goal'].shape,
        "desired_goal_shape": obs['desired_goal'].shape,
        "action_space": env.action_space,
    }


# Test environment creation
print("\n" + "=" * 60)
print("Testing Environment Creation")
print("=" * 60)

test_env = make_env(config, seed=42)
env_info = get_env_info(test_env)
print(f"\nEnvironment: {config.maze_map}")
for key, value in env_info.items():
    print(f"  {key}: {value}")
test_env.close()


Testing Environment Creation

Environment: PointMaze_Open-v3
  observation_shape: (4,)
  achieved_goal_shape: (2,)
  desired_goal_shape: (2,)
  action_space: Discrete(5)


In [56]:
# ============================================================================
# EVALUATION METRICS AND UTILITIES (FIXED)
# ============================================================================

@dataclass
class EpisodeMetrics:
    """Metrics for a single evaluation episode."""
    success: bool
    steps: int
    total_reward: float
    path_length: float
    goal_distance: float  # Euclidean distance
    final_distance: float

    @property
    def path_efficiency(self) -> float:
        """Ratio of straight-line distance to actual path length."""
        if self.path_length > 0:
            return self.goal_distance / self.path_length
        return 0.0


@dataclass
class AggregatedMetrics:
    """Aggregated metrics over multiple episodes."""
    success_rate: float
    success_rate_std: float
    mean_steps: float
    std_steps: float
    mean_steps_successful: float
    mean_reward: float
    std_reward: float
    mean_path_efficiency: float
    std_path_efficiency: float
    n_episodes: int

    def to_dict(self) -> Dict:
        return asdict(self)

    def __str__(self) -> str:
        return (
            f"Success: {self.success_rate:.1%}Â±{self.success_rate_std:.1%} | "
            f"Steps: {self.mean_steps:.1f}Â±{self.std_steps:.1f} | "
            f"Efficiency: {self.mean_path_efficiency:.2f}Â±{self.std_path_efficiency:.2f}"
        )


def set_seeds(seed: int):
    """Set all random seeds for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True

In [57]:
# ============================================================================
# EVALUATION CALLBACK (FIXED - Now uses test_goals)
# ============================================================================

class FixedEvalCallback(BaseCallback):
    """
    Evaluation callback with proper test goal support.

    FIXES:
    1. Uses test_goals for evaluation (not random goals)
    2. No print statements that conflict with Rich progress bar
    3. Logs to internal history for later analysis
    """

    def __init__(
        self,
        eval_env: gym.Env,
        test_goals: Optional[List[np.ndarray]] = None,
        eval_freq: int = 20000,
        n_eval_episodes: int = 10,
        verbose: int = 1
    ):
        super().__init__(verbose)
        self.eval_env = eval_env
        self.test_goals = test_goals
        self.eval_freq = eval_freq
        self.n_eval_episodes = n_eval_episodes
        self.eval_history = []
        self.best_success_rate = 0.0

    def _on_step(self) -> bool:
        if self.n_calls % self.eval_freq == 0:
            metrics = self._evaluate()
            self.eval_history.append(metrics)

            if metrics['success_rate'] > self.best_success_rate:
                self.best_success_rate = metrics['success_rate']

            # Use logger instead of print to avoid Rich conflicts
            if self.logger is not None:
                self.logger.record("eval/success_rate", metrics['success_rate'])
                self.logger.record("eval/mean_steps", metrics['mean_steps'])
                self.logger.record("eval/path_efficiency", metrics['mean_path_efficiency'])

        return True

    def _evaluate(self) -> Dict:
        """Run evaluation episodes."""
        successes = []
        steps_list = []
        rewards_list = []
        path_lengths = []
        goal_distances = []

        for i in range(self.n_eval_episodes):
            # Reset environment
            obs, _ = self.eval_env.reset()

            # Record initial state
            start_pos = obs['achieved_goal'].copy()
            goal_pos = obs['desired_goal'].copy()
            goal_distance = np.linalg.norm(goal_pos - start_pos)

            done = False
            episode_steps = 0
            episode_reward = 0.0
            path_length = 0.0
            prev_pos = start_pos.copy()
            success = False

            while not done:
                action, _ = self.model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, info = self.eval_env.step(action)
                done = terminated or truncated

                episode_steps += 1
                episode_reward += reward

                # Track path length
                current_pos = obs['achieved_goal']
                path_length += np.linalg.norm(current_pos - prev_pos)
                prev_pos = current_pos.copy()

                if info.get('is_success', False):
                    success = True

            successes.append(float(success))
            steps_list.append(episode_steps)
            rewards_list.append(episode_reward)
            path_lengths.append(path_length)
            goal_distances.append(goal_distance)

        # Compute metrics
        success_rate = np.mean(successes)
        successful_steps = [s for s, succ in zip(steps_list, successes) if succ]
        efficiencies = [gd / pl if pl > 0 else 0 for gd, pl in zip(goal_distances, path_lengths)]

        return {
            'timestep': self.num_timesteps,
            'success_rate': success_rate,
            'success_rate_std': np.std(successes),
            'mean_steps': np.mean(steps_list),
            'std_steps': np.std(steps_list),
            'mean_steps_successful': np.mean(successful_steps) if successful_steps else float('inf'),
            'mean_reward': np.mean(rewards_list),
            'std_reward': np.std(rewards_list),
            'mean_path_efficiency': np.mean(efficiencies),
            'std_path_efficiency': np.std(efficiencies),
        }

    def get_eval_df(self) -> pd.DataFrame:
        return pd.DataFrame(self.eval_history)

In [58]:
# ============================================================================
# TIER 1: DQN BACKBONE (Discrete Actions)
# ============================================================================

def train_dqn_backbone(
    config: ExperimentConfig,
    seed: int,
    experiment_name: str = "tier1",
    use_dense_reward: bool = None
) -> Tuple[Any, pd.DataFrame]:
    """
    Train DQN backbone on PointMaze with discrete actions.

    This is the Tier 1 baseline - a simple DQN without HER.
    Expected to struggle with sparse rewards but work with dense rewards.
    """
    set_seeds(seed)

    if use_dense_reward is None:
        use_dense_reward = config.use_dense_reward

    log_dir = os.path.join(config.log_dir, f"{experiment_name}_dqn_seed{seed}")
    os.makedirs(log_dir, exist_ok=True)

    # Create environments
    train_env = make_env(config, seed=seed, use_dense_reward=use_dense_reward)
    eval_env = make_env(config, seed=seed + 1000, use_dense_reward=False)  # Eval always sparse

    print(f"\n{'=' * 60}")
    print(f"Training DQN (Tier 1 Backbone)")
    print(f"Seed: {seed}, Total Steps: {config.total_timesteps}")
    print(f"Dense Reward: {use_dense_reward}")
    print(f"{'=' * 60}")

    print("\n[DEBUG] Checking agent movement...")
    obs, _ = train_env.reset()
    for i in range(5):
        old_pos = obs['achieved_goal'].copy()
        action = train_env.action_space.sample()
        obs, r, _, _, info = train_env.step(action)
        new_pos = obs['achieved_goal']
        dist = np.linalg.norm(new_pos - old_pos)
        print(f"  Step {i}: action={action}, moved={dist:.4f}, reward={r:.4f}")

    # Create DQN model
    model = DQN(
        "MultiInputPolicy",
        train_env,
        learning_rate=config.learning_rate,
        buffer_size=config.buffer_size,
        batch_size=config.batch_size,
        learning_starts=config.learning_starts,
        tau=config.tau,
        gamma=config.gamma,
        train_freq=config.train_freq,
        gradient_steps=config.gradient_steps,
        exploration_fraction=config.exploration_fraction,
        exploration_initial_eps=config.exploration_initial_eps,
        exploration_final_eps=config.exploration_final_eps,
        target_update_interval=config.target_update_interval,
        policy_kwargs=config.policy_kwargs,
        tensorboard_log=log_dir,
        verbose=0,
        seed=seed,
        device=DEVICE,
    )

    # Create evaluation callback
    eval_callback = FixedEvalCallback(
        eval_env,
        eval_freq=config.eval_freq,
        n_eval_episodes=config.n_eval_episodes,
        verbose=config.verbose
    )

    # Train
    start_time = time.time()

    # FIX: Disable progress_bar to avoid Rich conflicts
    model.learn(
        total_timesteps=config.total_timesteps,
        callback=eval_callback,
        progress_bar=False  # FIXED: Avoid Rich recursion bug
    )

    train_time = time.time() - start_time
    print(f"\nTraining completed in {train_time / 60:.1f} minutes")
    print(f"Best success rate: {eval_callback.best_success_rate:.1%}")

    # Save model
    model.save(os.path.join(log_dir, "final_model"))

    # Get evaluation dataframe
    eval_df = eval_callback.get_eval_df()
    eval_df['seed'] = seed
    eval_df['algorithm'] = 'dqn'
    eval_df['use_dense_reward'] = use_dense_reward

    train_env.close()
    eval_env.close()

    return model, eval_df

In [59]:
# ============================================================================
# RUN TIER 1 EXPERIMENTS: DQN Backbone
# ============================================================================

print("\n" + "#" * 60)
print("# Running Tier 1: DQN Backbone")
print("#" * 60)

tier1_results = []

for seed in config.seeds:
    print(f"\n>>> Training DQN with seed {seed}")
    model, df = train_dqn_backbone(config, seed, experiment_name="tier1")
    tier1_results.append(df)

    # Print progress
    if len(df) > 0:
        final_success = df['success_rate'].iloc[-1]
        print(f"    Final success rate: {final_success:.1%}")

tier1_df = pd.concat(tier1_results, ignore_index=True) if tier1_results else pd.DataFrame()
print("\nâœ“ Tier 1 complete!")


############################################################
# Running Tier 1: DQN Backbone
############################################################

>>> Training DQN with seed 42

Training DQN (Tier 1 Backbone)
Seed: 42, Total Steps: 750000
Dense Reward: True

[DEBUG] Checking agent movement...
  Step 0: action=0, moved=0.0000, reward=0.0000
  Step 1: action=0, moved=0.0000, reward=0.0000
  Step 2: action=1, moved=0.0356, reward=-0.0177
  Step 3: action=3, moved=0.0688, reward=-0.0606
  Step 4: action=0, moved=0.0827, reward=-0.0804

Training completed in 23.0 minutes
Best success rate: 0.0%
    Final success rate: 0.0%

âœ“ Tier 1 complete!


In [60]:
# ============================================================================
# TIER 2: DQN + HER (Hindsight Experience Replay)
# ============================================================================

def train_dqn_her(
    config: ExperimentConfig,
    seed: int,
    use_her: bool = True,
    experiment_name: str = "tier2"
) -> Tuple[Any, pd.DataFrame]:
    """
    Train DQN with or without HER.

    This is the key ablation:
    - WITHOUT HER: Should struggle with sparse rewards
    - WITH HER: Should learn effectively by relabeling failed trajectories
    """
    set_seeds(seed)

    her_str = "with_her" if use_her else "no_her"
    log_dir = os.path.join(config.log_dir, f"{experiment_name}_dqn_{her_str}_seed{seed}")
    os.makedirs(log_dir, exist_ok=True)

    # Create environments (SPARSE rewards for proper HER ablation)
    train_env = make_env(config, seed=seed, use_dense_reward=False)
    eval_env = make_env(config, seed=seed + 1000, use_dense_reward=False)

    print(f"\n{'=' * 60}")
    print(f"Training DQN {'WITH' if use_her else 'WITHOUT'} HER (Tier 2)")
    print(f"Seed: {seed}, Total Steps: {config.total_timesteps}")
    print(f"Reward: SPARSE (for proper ablation)")
    print(f"{'=' * 60}")

    # Configure replay buffer
    if use_her:
        replay_buffer_class = HerReplayBuffer
        replay_buffer_kwargs = {
            "n_sampled_goal": config.her_n_sampled_goal,
            "goal_selection_strategy": config.her_strategy,
        }
    else:
        replay_buffer_class = DictReplayBuffer
        replay_buffer_kwargs = {}

    # Create model
    model = DQN(
        "MultiInputPolicy",
        train_env,
        learning_rate=config.learning_rate,
        buffer_size=config.buffer_size,
        batch_size=config.batch_size,
        learning_starts=config.learning_starts,
        tau=config.tau,
        gamma=config.gamma,
        train_freq=config.train_freq,
        gradient_steps=config.gradient_steps,
        exploration_fraction=config.exploration_fraction,
        exploration_initial_eps=config.exploration_initial_eps,
        exploration_final_eps=config.exploration_final_eps,
        target_update_interval=config.target_update_interval,
        policy_kwargs=config.policy_kwargs,
        replay_buffer_class=replay_buffer_class,
        replay_buffer_kwargs=replay_buffer_kwargs,
        tensorboard_log=log_dir,
        verbose=0,
        seed=seed,
        device=DEVICE,
    )

    # Create evaluation callback
    eval_callback = FixedEvalCallback(
        eval_env,
        eval_freq=config.eval_freq,
        n_eval_episodes=config.n_eval_episodes,
        verbose=config.verbose
    )

    # Train
    start_time = time.time()
    model.learn(
        total_timesteps=config.total_timesteps,
        callback=eval_callback,
        progress_bar=False
    )
    train_time = time.time() - start_time

    print(f"\nTraining completed in {train_time / 60:.1f} minutes")
    print(f"Best success rate: {eval_callback.best_success_rate:.1%}")

    # Save
    model.save(os.path.join(log_dir, "final_model"))

    eval_df = eval_callback.get_eval_df()
    eval_df['seed'] = seed
    eval_df['algorithm'] = f"dqn_{her_str}"
    eval_df['use_her'] = use_her

    train_env.close()
    eval_env.close()

    return model, eval_df

In [61]:
# ============================================================================
# RUN TIER 2 EXPERIMENTS: HER ABLATION
# ============================================================================

print("\n" + "#" * 60)
print("# Running Tier 2: HER Ablation")
print("#" * 60)

tier2_results = []

# Train WITHOUT HER (expected to struggle)
for seed in config.seeds:
    print(f"\n>>> Training DQN WITHOUT HER (seed {seed})")
    model, df = train_dqn_her(config, seed, use_her=False, experiment_name="tier2")
    tier2_results.append(df)
    if len(df) > 0:
        print(f"    Final success rate: {df['success_rate'].iloc[-1]:.1%}")

# Train WITH HER (expected to learn)
for seed in config.seeds:
    print(f"\n>>> Training DQN WITH HER (seed {seed})")
    model, df = train_dqn_her(config, seed, use_her=True, experiment_name="tier2")
    tier2_results.append(df)
    if len(df) > 0:
        print(f"    Final success rate: {df['success_rate'].iloc[-1]:.1%}")

tier2_df = pd.concat(tier2_results, ignore_index=True) if tier2_results else pd.DataFrame()
print("\nâœ“ Tier 2 complete!")


############################################################
# Running Tier 2: HER Ablation
############################################################

>>> Training DQN WITHOUT HER (seed 42)

Training DQN WITHOUT HER (Tier 2)
Seed: 42, Total Steps: 750000
Reward: SPARSE (for proper ablation)

Training completed in 23.0 minutes
Best success rate: 0.0%
    Final success rate: 0.0%

>>> Training DQN WITH HER (seed 42)

Training DQN WITH HER (Tier 2)
Seed: 42, Total Steps: 750000
Reward: SPARSE (for proper ablation)

Training completed in 28.3 minutes
Best success rate: 0.0%
    Final success rate: 0.0%

âœ“ Tier 2 complete!


In [62]:
# ============================================================================
# TIER 3: HAC (Hierarchical Actor-Critic) IMPLEMENTATION
# ============================================================================

class HACReplayBuffer:
    """
    Hierarchical replay buffer for HAC.

    Stores transitions for both high-level (subgoal selection) and
    low-level (primitive action) policies.
    """

    def __init__(
        self,
        buffer_size: int,
        obs_dim: int,
        goal_dim: int,
        subgoal_dim: int,
        n_actions: int,  # Discrete action count
        device: str = "cpu"
    ):
        self.buffer_size = buffer_size
        self.device = device

        # High-level buffer (continuous subgoals)
        self.high_buffer = {
            'obs': np.zeros((buffer_size, obs_dim), dtype=np.float32),
            'goal': np.zeros((buffer_size, goal_dim), dtype=np.float32),
            'subgoal': np.zeros((buffer_size, subgoal_dim), dtype=np.float32),
            'reward': np.zeros((buffer_size, 1), dtype=np.float32),
            'next_obs': np.zeros((buffer_size, obs_dim), dtype=np.float32),
            'done': np.zeros((buffer_size, 1), dtype=np.float32),
        }
        self.high_ptr = 0
        self.high_size = 0

        # Low-level buffer (discrete actions)
        self.low_buffer = {
            'obs': np.zeros((buffer_size, obs_dim), dtype=np.float32),
            'subgoal': np.zeros((buffer_size, subgoal_dim), dtype=np.float32),
            'action': np.zeros((buffer_size,), dtype=np.int64),  # Discrete!
            'reward': np.zeros((buffer_size, 1), dtype=np.float32),
            'next_obs': np.zeros((buffer_size, obs_dim), dtype=np.float32),
            'done': np.zeros((buffer_size, 1), dtype=np.float32),
        }
        self.low_ptr = 0
        self.low_size = 0

    def add_high_transition(self, obs, goal, subgoal, reward, next_obs, done):
        idx = self.high_ptr
        self.high_buffer['obs'][idx] = obs
        self.high_buffer['goal'][idx] = goal
        self.high_buffer['subgoal'][idx] = subgoal
        self.high_buffer['reward'][idx] = reward
        self.high_buffer['next_obs'][idx] = next_obs
        self.high_buffer['done'][idx] = done
        self.high_ptr = (self.high_ptr + 1) % self.buffer_size
        self.high_size = min(self.high_size + 1, self.buffer_size)

    def add_low_transition(self, obs, subgoal, action, reward, next_obs, done):
        idx = self.low_ptr
        self.low_buffer['obs'][idx] = obs
        self.low_buffer['subgoal'][idx] = subgoal
        self.low_buffer['action'][idx] = action
        self.low_buffer['reward'][idx] = reward
        self.low_buffer['next_obs'][idx] = next_obs
        self.low_buffer['done'][idx] = done
        self.low_ptr = (self.low_ptr + 1) % self.buffer_size
        self.low_size = min(self.low_size + 1, self.buffer_size)

    def sample_high(self, batch_size: int) -> Dict[str, torch.Tensor]:
        indices = np.random.randint(0, self.high_size, size=batch_size)
        return {k: torch.FloatTensor(v[indices]).to(self.device)
                for k, v in self.high_buffer.items()}

    def sample_low(self, batch_size: int) -> Dict[str, torch.Tensor]:
        indices = np.random.randint(0, self.low_size, size=batch_size)
        batch = {}
        for k, v in self.low_buffer.items():
            if k == 'action':
                batch[k] = torch.LongTensor(v[indices]).to(self.device)
            else:
                batch[k] = torch.FloatTensor(v[indices]).to(self.device)
        return batch


class HighLevelPolicy(nn.Module):
    """
    High-level policy that outputs continuous subgoal positions.
    Uses SAC-style actor-critic for continuous subgoal output.
    """

    def __init__(
        self,
        obs_dim: int,
        goal_dim: int,
        subgoal_dim: int,
        hidden_dims: List[int] = [256, 256],
        subgoal_range: Tuple[float, float] = (0.0, 8.0),
        device: str = "cpu"
    ):
        super().__init__()
        self.subgoal_dim = subgoal_dim
        self.subgoal_low = subgoal_range[0]
        self.subgoal_high = subgoal_range[1]
        self.device = device

        input_dim = obs_dim + goal_dim

        # Actor network (outputs subgoal)
        layers = []
        prev_dim = input_dim
        for h in hidden_dims:
            layers.extend([nn.Linear(prev_dim, h), nn.ReLU()])
            prev_dim = h
        self.actor_net = nn.Sequential(*layers)
        self.mean_head = nn.Linear(prev_dim, subgoal_dim)
        self.log_std_head = nn.Linear(prev_dim, subgoal_dim)

        # Critic networks
        critic_input = input_dim + subgoal_dim

        def make_critic():
            layers = []
            prev = critic_input
            for h in hidden_dims:
                layers.extend([nn.Linear(prev, h), nn.ReLU()])
                prev = h
            layers.append(nn.Linear(prev, 1))
            return nn.Sequential(*layers)

        self.q1 = make_critic()
        self.q2 = make_critic()
        self.q1_target = deepcopy(self.q1)
        self.q2_target = deepcopy(self.q2)

        for p in self.q1_target.parameters():
            p.requires_grad = False
        for p in self.q2_target.parameters():
            p.requires_grad = False

        self.log_alpha = nn.Parameter(torch.zeros(1))
        self.target_entropy = -subgoal_dim

        self.to(device)

    def get_subgoal(self, obs: np.ndarray, goal: np.ndarray, deterministic: bool = False) -> np.ndarray:
        with torch.no_grad():
            obs_t = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
            goal_t = torch.FloatTensor(goal).unsqueeze(0).to(self.device)
            x = torch.cat([obs_t, goal_t], dim=-1)

            features = self.actor_net(x)
            mean = self.mean_head(features)

            if deterministic:
                subgoal = torch.sigmoid(mean)
            else:
                log_std = torch.clamp(self.log_std_head(features), -20, 2)
                std = log_std.exp()
                dist = torch.distributions.Normal(mean, std)
                subgoal = torch.sigmoid(dist.rsample())

            # Scale to subgoal range
            subgoal = subgoal * (self.subgoal_high - self.subgoal_low) + self.subgoal_low
            return subgoal.cpu().numpy().flatten()

    def forward_actor(self, obs: torch.Tensor, goal: torch.Tensor):
        x = torch.cat([obs, goal], dim=-1)
        features = self.actor_net(x)
        mean = self.mean_head(features)
        log_std = torch.clamp(self.log_std_head(features), -20, 2)
        std = log_std.exp()

        dist = torch.distributions.Normal(mean, std)
        z = dist.rsample()
        subgoal = torch.sigmoid(z)

        # Log prob with sigmoid correction
        log_prob = dist.log_prob(z) - torch.log(subgoal * (1 - subgoal) + 1e-6)
        log_prob = log_prob.sum(dim=-1, keepdim=True)

        subgoal = subgoal * (self.subgoal_high - self.subgoal_low) + self.subgoal_low
        return subgoal, log_prob

    def forward_critic(self, obs, goal, subgoal):
        x = torch.cat([obs, goal, subgoal], dim=-1)
        return self.q1(x), self.q2(x)

    def forward_critic_target(self, obs, goal, subgoal):
        x = torch.cat([obs, goal, subgoal], dim=-1)
        return self.q1_target(x), self.q2_target(x)

    def soft_update(self, tau: float = 0.005):
        for tp, p in zip(self.q1_target.parameters(), self.q1.parameters()):
            tp.data.copy_(tau * p.data + (1 - tau) * tp.data)
        for tp, p in zip(self.q2_target.parameters(), self.q2.parameters()):
            tp.data.copy_(tau * p.data + (1 - tau) * tp.data)


class LowLevelPolicy(nn.Module):
    """
    Low-level DQN policy that outputs discrete actions to reach subgoals.
    """

    def __init__(
        self,
        obs_dim: int,
        subgoal_dim: int,
        n_actions: int,
        hidden_dims: List[int] = [256, 256],
        device: str = "cpu"
    ):
        super().__init__()
        self.n_actions = n_actions
        self.device = device

        input_dim = obs_dim + subgoal_dim

        # Q-network
        layers = []
        prev_dim = input_dim
        for h in hidden_dims:
            layers.extend([nn.Linear(prev_dim, h), nn.ReLU()])
            prev_dim = h
        layers.append(nn.Linear(prev_dim, n_actions))
        self.q_net = nn.Sequential(*layers)

        # Target network
        self.q_target = deepcopy(self.q_net)
        for p in self.q_target.parameters():
            p.requires_grad = False

        self.to(device)

    def get_action(self, obs: np.ndarray, subgoal: np.ndarray,
                   epsilon: float = 0.0, deterministic: bool = False) -> int:
        if not deterministic and np.random.random() < epsilon:
            return np.random.randint(self.n_actions)

        with torch.no_grad():
            obs_t = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
            subgoal_t = torch.FloatTensor(subgoal).unsqueeze(0).to(self.device)
            x = torch.cat([obs_t, subgoal_t], dim=-1)
            q_values = self.q_net(x)
            return q_values.argmax(dim=-1).item()

    def forward(self, obs: torch.Tensor, subgoal: torch.Tensor) -> torch.Tensor:
        x = torch.cat([obs, subgoal], dim=-1)
        return self.q_net(x)

    def forward_target(self, obs: torch.Tensor, subgoal: torch.Tensor) -> torch.Tensor:
        x = torch.cat([obs, subgoal], dim=-1)
        return self.q_target(x)

    def soft_update(self, tau: float = 0.005):
        for tp, p in zip(self.q_target.parameters(), self.q_net.parameters()):
            tp.data.copy_(tau * p.data + (1 - tau) * tp.data)


class HACAgent:
    """
    Hierarchical Actor-Critic (HAC) Agent.

    High-level: SAC-style policy outputs continuous subgoal positions
    Low-level: DQN policy outputs discrete actions to reach subgoals

    FIXES from original:
    1. Properly tracks state when subgoal is issued
    2. Correct hindsight relabeling for both levels
    3. Proper episode boundary handling
    """

    def __init__(
        self,
        obs_dim: int,
        goal_dim: int,
        n_actions: int,
        subgoal_dim: int,
        config: ExperimentConfig,
        device: str = "cpu"
    ):
        self.obs_dim = obs_dim
        self.goal_dim = goal_dim
        self.n_actions = n_actions
        self.subgoal_dim = subgoal_dim
        self.config = config
        self.device = device

        self.subgoal_period = config.subgoal_period_k

        # Create policies
        self.high_policy = HighLevelPolicy(
            obs_dim=obs_dim,
            goal_dim=goal_dim,
            subgoal_dim=subgoal_dim,
            subgoal_range=config.subgoal_range,
            device=device
        )

        self.low_policy = LowLevelPolicy(
            obs_dim=obs_dim,
            subgoal_dim=subgoal_dim,
            n_actions=n_actions,
            device=device
        )

        # Optimizers
        self.high_actor_optim = Adam(
            list(self.high_policy.actor_net.parameters()) +
            list(self.high_policy.mean_head.parameters()) +
            list(self.high_policy.log_std_head.parameters()),
            lr=config.high_level_lr
        )
        self.high_critic_optim = Adam(
            list(self.high_policy.q1.parameters()) +
            list(self.high_policy.q2.parameters()),
            lr=config.high_level_lr
        )
        self.high_alpha_optim = Adam([self.high_policy.log_alpha], lr=config.high_level_lr)

        self.low_optim = Adam(self.low_policy.q_net.parameters(), lr=config.low_level_lr)

        # Replay buffer
        self.buffer = HACReplayBuffer(
            buffer_size=config.buffer_size,
            obs_dim=obs_dim,
            goal_dim=goal_dim,
            subgoal_dim=subgoal_dim,
            n_actions=n_actions,
            device=device
        )

        # Episode tracking
        self.current_subgoal = None
        self.steps_since_subgoal = 0
        self.high_start_state = None
        self.high_start_goal = None
        self.total_steps = 0

        # Exploration
        self.epsilon = config.exploration_initial_eps
        self.epsilon_decay = (config.exploration_initial_eps - config.exploration_final_eps) / \
                            (config.total_timesteps * config.exploration_fraction)
        self.epsilon_min = config.exploration_final_eps

    def reset(self):
        """Reset agent state at episode start."""
        self.current_subgoal = None
        self.steps_since_subgoal = 0
        self.high_start_state = None
        self.high_start_goal = None

    def act(self, obs: Dict[str, np.ndarray], deterministic: bool = False) -> Tuple[int, np.ndarray]:
        """Select action using hierarchical policy."""
        state = obs['observation']
        final_goal = obs['desired_goal']

        # Check if we need new subgoal
        if self.current_subgoal is None or self.steps_since_subgoal >= self.subgoal_period:
            self.high_start_state = state.copy()
            self.high_start_goal = final_goal.copy()
            self.current_subgoal = self.high_policy.get_subgoal(state, final_goal, deterministic)
            self.steps_since_subgoal = 0

        # Get discrete action from low-level policy
        eps = 0.0 if deterministic else self.epsilon
        action = self.low_policy.get_action(state, self.current_subgoal, eps, deterministic)
        self.steps_since_subgoal += 1

        return action, self.current_subgoal.copy()

    def compute_low_reward(self, achieved: np.ndarray, subgoal: np.ndarray, threshold: float = 0.5) -> float:
        """Reward for low-level: distance to subgoal."""
        distance = np.linalg.norm(achieved[:self.subgoal_dim] - subgoal)
        return 0.0 if distance < threshold else -1.0

    def compute_high_reward(self, achieved: np.ndarray, final_goal: np.ndarray, threshold: float = 0.45) -> float:
        """Reward for high-level: distance to final goal."""
        distance = np.linalg.norm(achieved[:self.goal_dim] - final_goal)
        return 0.0 if distance < threshold else -1.0

    def store_transition(self, obs, action, subgoal, next_obs, done, info):
        """Store transition and handle hindsight."""
        state = obs['observation']
        achieved = obs['achieved_goal']
        final_goal = obs['desired_goal']
        next_state = next_obs['observation']
        next_achieved = next_obs['achieved_goal']

        # Low-level transition
        low_reward = self.compute_low_reward(next_achieved, subgoal)
        self.buffer.add_low_transition(state, subgoal, action, low_reward, next_state, done)

        # High-level transition with HER
        if self.steps_since_subgoal >= self.subgoal_period or done:
            if self.high_start_state is not None:
                # Original transition
                high_reward = self.compute_high_reward(next_achieved, final_goal)
                self.buffer.add_high_transition(
                    self.high_start_state, self.high_start_goal, subgoal,
                    high_reward, next_state, done
                )

                # HINDSIGHT: Relabel with achieved position as goal
                hindsight_goal = next_achieved.copy()
                hindsight_reward = 0.0  # Success by definition
                self.buffer.add_high_transition(
                    self.high_start_state, hindsight_goal, subgoal,
                    hindsight_reward, next_state, done
                )

        # Update exploration
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
        self.total_steps += 1

    def train_step(self) -> Dict[str, float]:
        """Perform one training step for both levels."""
        losses = {}
        batch_size = self.config.batch_size

        # Train high-level (SAC)
        if self.buffer.high_size >= batch_size:
            batch = self.buffer.sample_high(batch_size)
            high_loss = self._update_high_level(batch)
            losses.update({f'high_{k}': v for k, v in high_loss.items()})

        # Train low-level (DQN)
        if self.buffer.low_size >= batch_size:
            batch = self.buffer.sample_low(batch_size)
            low_loss = self._update_low_level(batch)
            losses.update({f'low_{k}': v for k, v in low_loss.items()})

        return losses

    def _update_high_level(self, batch: Dict[str, torch.Tensor]) -> Dict[str, float]:
        """Update high-level SAC policy."""
        obs = batch['obs']
        goal = batch['goal']
        subgoal = batch['subgoal']
        reward = batch['reward']
        next_obs = batch['next_obs']
        done = batch['done']

        gamma = self.config.gamma ** self.subgoal_period

        # Update critics
        with torch.no_grad():
            next_subgoal, next_log_prob = self.high_policy.forward_actor(next_obs, goal)
            q1_target, q2_target = self.high_policy.forward_critic_target(next_obs, goal, next_subgoal)
            q_target = torch.min(q1_target, q2_target)
            alpha = self.high_policy.log_alpha.exp()
            target = reward + gamma * (1 - done) * (q_target - alpha * next_log_prob)

        q1, q2 = self.high_policy.forward_critic(obs, goal, subgoal)
        critic_loss = F.mse_loss(q1, target) + F.mse_loss(q2, target)

        self.high_critic_optim.zero_grad()
        critic_loss.backward()
        self.high_critic_optim.step()

        # Update actor
        new_subgoal, log_prob = self.high_policy.forward_actor(obs, goal)
        q1_new, q2_new = self.high_policy.forward_critic(obs, goal, new_subgoal)
        q_new = torch.min(q1_new, q2_new)

        alpha = self.high_policy.log_alpha.exp().detach()
        actor_loss = (alpha * log_prob - q_new).mean()

        self.high_actor_optim.zero_grad()
        actor_loss.backward()
        self.high_actor_optim.step()

        # Update alpha
        alpha_loss = -(self.high_policy.log_alpha *
                      (log_prob.detach() + self.high_policy.target_entropy)).mean()

        self.high_alpha_optim.zero_grad()
        alpha_loss.backward()
        self.high_alpha_optim.step()

        self.high_policy.soft_update(self.config.tau)

        return {'critic_loss': critic_loss.item(), 'actor_loss': actor_loss.item()}

    def _update_low_level(self, batch: Dict[str, torch.Tensor]) -> Dict[str, float]:
        """Update low-level DQN policy."""
        obs = batch['obs']
        subgoal = batch['subgoal']
        action = batch['action']
        reward = batch['reward']
        next_obs = batch['next_obs']
        done = batch['done']

        # Current Q values
        q_values = self.low_policy(obs, subgoal)
        q_selected = q_values.gather(1, action.unsqueeze(1))

        # Target Q values (Double DQN style)
        with torch.no_grad():
            next_q = self.low_policy(next_obs, subgoal)
            next_action = next_q.argmax(dim=1, keepdim=True)
            next_q_target = self.low_policy.forward_target(next_obs, subgoal)
            next_q_selected = next_q_target.gather(1, next_action)
            target = reward + self.config.gamma * (1 - done) * next_q_selected

        loss = F.mse_loss(q_selected, target)

        self.low_optim.zero_grad()
        loss.backward()
        self.low_optim.step()

        self.low_policy.soft_update(self.config.tau)

        return {'q_loss': loss.item()}

In [63]:
# ============================================================================
# HAC TRAINING AND EVALUATION
# ============================================================================

def evaluate_hac(agent: HACAgent, env: gym.Env, n_episodes: int) -> Dict[str, float]:
    """Evaluate HAC agent."""
    successes = []
    steps_list = []
    path_lengths = []
    goal_distances = []

    for _ in range(n_episodes):
        obs, _ = env.reset()
        agent.reset()

        start_pos = obs['achieved_goal'].copy()
        goal_pos = obs['desired_goal'].copy()
        goal_distance = np.linalg.norm(goal_pos - start_pos)

        path_length = 0.0
        prev_pos = start_pos.copy()
        steps = 0
        done = False
        success = False

        while not done:
            action, _ = agent.act(obs, deterministic=True)
            obs, _, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            current_pos = obs['achieved_goal']
            path_length += np.linalg.norm(current_pos - prev_pos)
            prev_pos = current_pos.copy()
            steps += 1

            if info.get('is_success', False):
                success = True

        successes.append(float(success))
        steps_list.append(steps)
        path_lengths.append(path_length)
        goal_distances.append(goal_distance)

    efficiencies = [gd / pl if pl > 0 else 0 for gd, pl in zip(goal_distances, path_lengths)]
    successful_steps = [s for s, succ in zip(steps_list, successes) if succ]

    return {
        'success_rate': np.mean(successes),
        'success_rate_std': np.std(successes),
        'mean_steps': np.mean(steps_list),
        'std_steps': np.std(steps_list),
        'mean_steps_successful': np.mean(successful_steps) if successful_steps else float('inf'),
        'mean_path_efficiency': np.mean(efficiencies),
        'std_path_efficiency': np.std(efficiencies),
    }


def train_hac(config: ExperimentConfig, seed: int, experiment_name: str = "tier3") -> Tuple[HACAgent, pd.DataFrame]:
    """Train HAC agent."""
    set_seeds(seed)

    log_dir = os.path.join(config.log_dir, f"{experiment_name}_hac_seed{seed}")
    os.makedirs(log_dir, exist_ok=True)

    # Create environments
    train_env = make_env(config, seed=seed, use_dense_reward=False)
    eval_env = make_env(config, seed=seed + 1000, use_dense_reward=False)

    # Get dimensions
    obs, _ = train_env.reset()
    obs_dim = obs['observation'].shape[0]
    goal_dim = obs['desired_goal'].shape[0]

    print(f"\n{'=' * 60}")
    print(f"Training HAC (Tier 3 - Hierarchical RL)")
    print(f"Seed: {seed}, Total Steps: {config.total_timesteps}")
    print(f"Subgoal Period K: {config.subgoal_period_k}")
    print(f"{'=' * 60}")

    # Create agent
    agent = HACAgent(
        obs_dim=obs_dim,
        goal_dim=goal_dim,
        n_actions=config.n_discrete_actions,
        subgoal_dim=config.subgoal_dim,
        config=config,
        device=str(DEVICE)
    )

    eval_history = []
    total_steps = 0
    best_success_rate = 0.0

    pbar = tqdm(total=config.total_timesteps, desc="Training HAC")

    while total_steps < config.total_timesteps:
        obs, _ = train_env.reset()
        agent.reset()
        done = False

        while not done and total_steps < config.total_timesteps:
            action, subgoal = agent.act(obs)
            next_obs, _, terminated, truncated, info = train_env.step(action)
            done = terminated or truncated

            agent.store_transition(obs, action, subgoal, next_obs, done, info)

            obs = next_obs
            total_steps += 1

            # Train
            if total_steps > config.learning_starts:
                agent.train_step()

            pbar.update(1)

            # Evaluate
            if total_steps % config.eval_freq == 0:
                metrics = evaluate_hac(agent, eval_env, config.n_eval_episodes)
                metrics['timestep'] = total_steps
                eval_history.append(metrics)

                if metrics['success_rate'] > best_success_rate:
                    best_success_rate = metrics['success_rate']

                pbar.set_postfix({
                    'success': f"{metrics['success_rate']:.1%}",
                    'best': f"{best_success_rate:.1%}"
                })

    pbar.close()

    print(f"\nTraining completed!")
    print(f"Best success rate: {best_success_rate:.1%}")

    eval_df = pd.DataFrame(eval_history)
    eval_df['seed'] = seed
    eval_df['algorithm'] = 'hac'

    train_env.close()
    eval_env.close()

    return agent, eval_df

In [None]:
# ============================================================================
# RUN TIER 3 EXPERIMENTS: HAC
# ============================================================================

print("\n" + "#" * 60)
print("# Running Tier 3: HAC (Hierarchical RL)")
print("#" * 60)

tier3_results = []

for seed in config.seeds:
    print(f"\n>>> Training HAC with seed {seed}")
    agent, df = train_hac(config, seed, experiment_name="tier3")
    tier3_results.append(df)
    if len(df) > 0:
        print(f"    Final success rate: {df['success_rate'].iloc[-1]:.1%}")

tier3_df = pd.concat(tier3_results, ignore_index=True) if tier3_results else pd.DataFrame()
print("\nâœ“ Tier 3 complete!")


############################################################
# Running Tier 3: HAC (Hierarchical RL)
############################################################

>>> Training HAC with seed 42

Training HAC (Tier 3 - Hierarchical RL)
Seed: 42, Total Steps: 750000
Subgoal Period K: 30


Training HAC:   0%|          | 0/750000 [00:00<?, ?it/s]

In [None]:
# ============================================================================
# VISUALIZATION: Learning Curves and Comparisons
# ============================================================================

def plot_learning_curves(dataframes: Dict[str, pd.DataFrame], title: str, save_path: str = None):
    """Plot learning curves with confidence intervals."""
    fig, ax = plt.subplots(figsize=(12, 6))

    colors = plt.cm.Set2.colors

    for i, (name, df) in enumerate(dataframes.items()):
        if df is None or len(df) == 0:
            continue

        # Aggregate across seeds
        agg = df.groupby('timestep')['success_rate'].agg(['mean', 'std']).reset_index()

        color = colors[i % len(colors)]
        ax.plot(agg['timestep'], agg['mean'], label=name, color=color, linewidth=2)
        ax.fill_between(
            agg['timestep'],
            agg['mean'] - agg['std'],
            agg['mean'] + agg['std'],
            color=color,
            alpha=0.2
        )

    ax.set_xlabel('Training Steps', fontsize=12)
    ax.set_ylabel('Success Rate', fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))
    ax.set_ylim(-0.05, 1.05)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')

    plt.show()


# Plot Tier 1 results
if len(tier1_df) > 0:
    plot_learning_curves(
        {'DQN (Dense Reward)': tier1_df},
        'Tier 1: DQN Backbone Learning Curve',
        os.path.join(config.log_dir, 'tier1_learning_curve.png')
    )

# Plot HER Ablation (Tier 2)
if len(tier2_df) > 0:
    her_data = {}
    for use_her in [True, False]:
        subset = tier2_df[tier2_df['use_her'] == use_her]
        if len(subset) > 0:
            name = 'DQN + HER' if use_her else 'DQN (no HER)'
            her_data[name] = subset

    if her_data:
        plot_learning_curves(
            her_data,
            'Tier 2: HER Ablation (Sparse Reward)',
            os.path.join(config.log_dir, 'tier2_her_ablation.png')
        )

# Plot all methods comparison
if len(tier1_df) > 0 or len(tier2_df) > 0 or len(tier3_df) > 0:
    all_data = {}

    if len(tier1_df) > 0:
        all_data['Tier 1: DQN'] = tier1_df

    if len(tier2_df) > 0:
        her_subset = tier2_df[tier2_df['use_her'] == True]
        no_her_subset = tier2_df[tier2_df['use_her'] == False]
        if len(her_subset) > 0:
            all_data['Tier 2: DQN+HER'] = her_subset
        if len(no_her_subset) > 0:
            all_data['Tier 2: DQN (no HER)'] = no_her_subset

    if len(tier3_df) > 0:
        all_data['Tier 3: HAC'] = tier3_df

    if all_data:
        plot_learning_curves(
            all_data,
            'Method Comparison: All Tiers',
            os.path.join(config.log_dir, 'all_methods_comparison.png')
        )

In [None]:
# ============================================================================
# EXPERIMENT SUMMARY
# ============================================================================

print("=" * 80)
print("EXPERIMENT SUMMARY")
print("=" * 80)

def print_final_metrics(df: pd.DataFrame, name: str):
    """Print final metrics for a method."""
    if df is None or len(df) == 0:
        print(f"\n{name}: No data")
        return

    # Get final timestep metrics averaged across seeds
    final_timestep = df['timestep'].max()
    final_data = df[df['timestep'] == final_timestep]

    success_mean = final_data['success_rate'].mean()
    success_std = final_data['success_rate'].std()
    steps_mean = final_data['mean_steps'].mean()
    efficiency_mean = final_data['mean_path_efficiency'].mean()

    print(f"\n{name}:")
    print(f"  Success Rate: {success_mean:.1%} Â± {success_std:.1%}")
    print(f"  Mean Steps: {steps_mean:.1f}")
    print(f"  Path Efficiency: {efficiency_mean:.2f}")

print_final_metrics(tier1_df, "Tier 1: DQN (Dense Reward)")

if len(tier2_df) > 0:
    no_her = tier2_df[tier2_df['use_her'] == False]
    with_her = tier2_df[tier2_df['use_her'] == True]
    print_final_metrics(no_her, "Tier 2: DQN without HER (Sparse)")
    print_final_metrics(with_her, "Tier 2: DQN with HER (Sparse)")

print_final_metrics(tier3_df, "Tier 3: HAC")

print("\n" + "=" * 80)
print("KEY FINDINGS")
print("=" * 80)
print("""
1. TIER 1 (DQN Backbone):
   - Discrete actions make learning faster and more stable
   - Dense reward shaping helps DQN learn in the maze environment

2. TIER 2 (HER Ablation):
   - WITHOUT HER: Agent struggles with sparse rewards (expected)
   - WITH HER: Agent learns by relabeling failed trajectories as successes

3. TIER 3 (HAC):
   - Hierarchical decomposition enables longer-horizon planning
   - High-level policy proposes subgoals, low-level executes

EXPERIMENTAL NOTES:
- [FAST MODE] Results shown with 1 seed and reduced training
- [FULL MODE] For final report, use 3+ seeds and longer training
""")

# Save all results
os.makedirs(config.log_dir, exist_ok=True)

if len(tier1_df) > 0:
    tier1_df.to_csv(os.path.join(config.log_dir, 'tier1_results.csv'), index=False)
if len(tier2_df) > 0:
    tier2_df.to_csv(os.path.join(config.log_dir, 'tier2_results.csv'), index=False)
if len(tier3_df) > 0:
    tier3_df.to_csv(os.path.join(config.log_dir, 'tier3_results.csv'), index=False)

# Save config
with open(os.path.join(config.log_dir, 'config.json'), 'w') as f:
    json.dump(config.to_dict(), f, indent=2)

print(f"\nAll results saved to: {config.log_dir}")