In [None]:
! pip install swig
! pip install box2d-py
! pip install gym[box2d]

Collecting swig
  Downloading swig-4.4.0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.4.0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.4.0
Collecting box2d-py
  Downloading box2d-py-2.3.8.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.5/374.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.8-cp312-cp312-linux_x86_64.whl size=2399004 sha256=3e507bb57d81f0a9007f98609836909488db87209e8ce28728f0216e8991f4c5
  Stored in directory: /root/.cache/pip/wheels/d6/3c/ab/b6fd75459cadc56f4a4125d4

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import os
import glob
import io
import base64
import pickle
import matplotlib.pyplot as plt
from IPython.display import HTML, display
from google.colab import drive

# Mount Google Drive into the Colab filesystem so artifacts can be written to it.
drive.mount('/content/drive')

# Folder on Drive that receives model checkpoints, normalisation stats and plots.
DRIVE_FOLDER = "/content/drive/MyDrive/BipedalWalker_Project"
# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(DRIVE_FOLDER, exist_ok=True)
print(f"✅ Saving all models to: {DRIVE_FOLDER}")

def show_video(video_folder="videos"):
    """Embed the most recently created ``.mp4`` from ``video_folder`` in the notebook.

    Args:
        video_folder: Directory searched (non-recursively) for ``*.mp4`` files.

    Prints "No video found yet." when the folder contains no mp4 file.
    """
    mp4list = glob.glob(f'{video_folder}/*.mp4')
    if not mp4list:
        print("No video found yet.")
        return

    # Newest file by creation/change time is the clip that was just recorded.
    mp4 = max(mp4list, key=os.path.getctime)
    # Read-only binary mode inside a context manager: the handle is always
    # closed and no write permission on the file is required.
    with open(mp4, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

# Model architecture
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise a linear layer in place and hand it back.

    Args:
        layer: Module exposing ``weight`` and ``bias`` tensors.
        std: Gain passed to the orthogonal weight initialiser.
        bias_const: Value every bias entry is set to.

    Returns:
        The same ``layer`` object, so the call can be nested in a constructor.
    """
    initialisers = torch.nn.init
    initialisers.orthogonal_(layer.weight, gain=std)
    initialisers.constant_(layer.bias, val=bias_const)
    return layer

class ActorCritic(nn.Module):
    """Separate actor and critic MLPs with a state-independent Gaussian policy.

    Both networks are ``Linear -> Tanh -> Linear -> Tanh -> Linear`` stacks with
    512 hidden units. The policy's log standard deviation is a learnable
    ``(1, act_dim)`` parameter initialised to -0.5.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        hidden = 512

        # The critic is built before the actor so parameters are created in
        # the same order as before.
        self.critic = nn.Sequential(
            layer_init(nn.Linear(obs_dim, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, 1), std=1.0),
        )
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(obs_dim, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, hidden)),
            nn.Tanh(),
            # Small gain keeps the initial action means close to zero.
            layer_init(nn.Linear(hidden, act_dim), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.full((1, act_dim), -0.5))

    def get_value(self, x):
        """Return the critic output for ``x`` (trailing dimension of size 1 kept)."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and critic on a batch of observations.

        Args:
            x: Batch of observations.
            action: Actions to score; when ``None`` an action is sampled.

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where log-probability
            and entropy are summed over dimension 1 and the value has its
            dimension 1 squeezed out.
        """
        mean = self.actor_mean(x)
        std = self.actor_logstd.expand_as(mean).exp()
        policy = Normal(mean, std)

        chosen = policy.sample() if action is None else action

        summed_log_prob = policy.log_prob(chosen).sum(1)
        summed_entropy = policy.entropy().sum(1)
        state_value = self.critic(x).squeeze(1)
        return chosen, summed_log_prob, summed_entropy, state_value

class RolloutBuffer:
    """Fixed-size on-policy rollout storage with per-path GAE computation.

    Transitions are appended with :meth:`store`. Whenever an episode ends (or
    the rollout is cut off) :meth:`finish_path` computes advantages and returns
    for the transitions added since the previous path boundary. :meth:`get`
    hands the full buffer back as tensors and resets the write pointers.
    """

    def __init__(self, size, obs_dim, act_dim, device):
        """Allocate zero-filled float32 storage for ``size`` transitions.

        Args:
            size: Number of transitions the buffer holds.
            obs_dim: Length of one observation vector.
            act_dim: Length of one action vector.
            device: Torch device the tensors returned by :meth:`get` live on.
        """
        self.size = size
        self.device = device
        self.obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.actions = np.zeros((size, act_dim), dtype=np.float32)
        self.log_probs = np.zeros(size, dtype=np.float32)
        self.rewards = np.zeros(size, dtype=np.float32)
        self.dones = np.zeros(size, dtype=np.float32)
        self.values = np.zeros(size, dtype=np.float32)
        self.advantages = np.zeros(size, dtype=np.float32)
        self.returns = np.zeros(size, dtype=np.float32)
        self.ptr = 0  # next free slot
        self.path_start_idx = 0  # first slot of the path currently being collected

    def store(self, obs, action, log_prob, reward, done, value):
        """Append one transition; asserts that the buffer is not already full."""
        assert self.ptr < self.size
        self.obs[self.ptr] = obs
        self.actions[self.ptr] = action
        self.log_probs[self.ptr] = log_prob
        self.rewards[self.ptr] = reward
        self.dones[self.ptr] = done
        self.values[self.ptr] = value
        self.ptr += 1

    def finish_path(self, last_value, gamma, lam):
        """Compute GAE advantages and returns for the path that just ended.

        Args:
            last_value: Value used to bootstrap beyond the final stored step:
                0 for a true terminal state, the critic's estimate when the
                path was truncated or cut off by the end of the rollout.
            gamma: Discount factor.
            lam: GAE lambda.

        The stored ``dones`` flags are intentionally not used as a mask here.
        A path never spans an episode boundary, and whether the last step is
        terminal is already encoded in ``last_value``. Masking with a flag
        that is also set on truncation would zero the bootstrap value.
        """
        path_slice = slice(self.path_start_idx, self.ptr)
        rewards = self.rewards[path_slice]
        # values[t + 1] for the last step of the path is the bootstrap value.
        values = np.append(self.values[path_slice], last_value)

        gae = 0.0
        adv = np.zeros_like(rewards)

        for t in reversed(range(len(rewards))):
            delta = rewards[t] + gamma * values[t + 1] - values[t]
            gae = delta + gamma * lam * gae
            adv[t] = gae

        self.advantages[path_slice] = adv
        self.returns[path_slice] = adv + self.values[path_slice]
        self.path_start_idx = self.ptr

    def get(self):
        """Return the full buffer as float32 tensors and reset the pointers.

        Asserts the buffer is full. Advantages are normalised to zero mean and
        unit standard deviation (with a 1e-8 epsilon) in the returned copy
        only; the stored array is left untouched.

        Returns:
            dict with keys ``obs``, ``actions``, ``log_probs``, ``advantages``,
            ``returns`` and ``values``.
        """
        assert self.ptr == self.size
        self.ptr = 0
        self.path_start_idx = 0

        adv = self.advantages
        adv = (adv - adv.mean()) / (adv.std() + 1e-8)

        return dict(
            obs=torch.tensor(self.obs, dtype=torch.float32, device=self.device),
            actions=torch.tensor(self.actions, dtype=torch.float32, device=self.device),
            log_probs=torch.tensor(self.log_probs, dtype=torch.float32, device=self.device),
            advantages=torch.tensor(adv, dtype=torch.float32, device=self.device),
            returns=torch.tensor(self.returns, dtype=torch.float32, device=self.device),
            values=torch.tensor(self.values, dtype=torch.float32, device=self.device),
        )

class PPOAgent:
    """Single-environment PPO trainer with clipped surrogate objective and GAE.

    The training environment is wrapped with episode statistics, action
    clipping, running observation normalisation and observation clipping to
    [-10, 10]. Episode returns and the timestep at which each episode finished
    are recorded in ``ep_returns`` / ``ep_timesteps`` for plotting.
    """

    def __init__(
        self,
        env_id="BipedalWalker-v3",
        total_timesteps=2_000_000,
        rollout_steps=4096,
        gamma=0.99,
        lam=0.95,
        clip_eps=0.2,
        learning_rate=2.5e-4,
        train_epochs=10,
        minibatch_size=512,
        vf_coef=0.5,
        ent_coef=0.00,
        render_freq=25,
        device="cpu",
    ):
        """Build the environment, network, optimiser, LR schedule and buffer.

        Args:
            env_id: Gymnasium environment id.
            total_timesteps: Environment steps to collect over the whole run.
            rollout_steps: Steps collected between policy updates.
            gamma: Discount factor.
            lam: GAE lambda.
            clip_eps: PPO ratio clipping range.
            learning_rate: Initial Adam learning rate (linearly decayed to 0).
            train_epochs: Passes over each rollout per update.
            minibatch_size: Minibatch size within each pass.
            vf_coef: Weight of the critic loss.
            ent_coef: Weight of the entropy bonus.
            render_freq: Record a video every this many updates.
            device: Torch device for the network and training tensors.
        """
        self.env_id = env_id
        self.total_timesteps = total_timesteps
        self.rollout_steps = rollout_steps
        self.gamma = gamma
        self.lam = lam
        self.clip_eps = clip_eps
        self.learning_rate = learning_rate
        self.train_epochs = train_epochs
        self.minibatch_size = minibatch_size
        self.vf_coef = vf_coef
        self.ent_coef = ent_coef
        self.render_freq = render_freq
        self.device = device

        # Stat tracking for plots
        self.ep_returns = []
        self.ep_timesteps = []

        self.env = gym.make(env_id)
        self.env = gym.wrappers.RecordEpisodeStatistics(self.env)
        self.env = gym.wrappers.ClipAction(self.env)

        self.env = gym.wrappers.NormalizeObservation(self.env)
        self.env = gym.wrappers.TransformObservation(self.env, lambda obs: np.clip(obs, -10, 10), self.env.observation_space)

        self.obs_dim = self.env.observation_space.shape[0]
        self.act_dim = self.env.action_space.shape[0]

        self.ac = ActorCritic(self.obs_dim, self.act_dim).to(device)
        self.optimizer = optim.Adam(self.ac.parameters(), lr=self.learning_rate, eps=1e-5)

        # One scheduler step per policy update; the LR reaches 0 at the last update.
        self.num_updates = total_timesteps // rollout_steps
        self.lr_scheduler = torch.optim.lr_scheduler.LinearLR(
            self.optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.num_updates
        )

        self.buffer = RolloutBuffer(self.rollout_steps, self.obs_dim, self.act_dim, self.device)

    def visualize_agent(self, update_count):
        """Record one deterministic episode to ``videos/`` and display it.

        The actor mean is used as the action (no sampling). The video
        environment reuses the training environment's observation statistics
        when they are available.

        Args:
            update_count: Current update index; used in the video file prefix.
        """
        print(f"\n--- Visualizing Agent at Update {update_count} ---")
        vis_env = gym.make(self.env_id, render_mode="rgb_array")
        vis_env = gym.wrappers.RecordVideo(
            vis_env,
            video_folder="videos",
            name_prefix=f"update_{update_count}",
            disable_logger=True
        )
        vis_env = gym.wrappers.ClipAction(vis_env)
        vis_norm = gym.wrappers.NormalizeObservation(vis_env)
        try:
            vis_norm.obs_rms = self.env.get_wrapper_attr('obs_rms')
            # The statistics object is shared with the training env, so stop
            # the evaluation rollout from updating (and skewing) it.
            vis_norm.update_running_mean = False
        except AttributeError:
            # Best effort: fall back to fresh statistics, but say so instead
            # of failing silently.
            print("Warning: could not reuse training observation statistics for visualization.")
        vis_env = gym.wrappers.TransformObservation(vis_norm, lambda obs: np.clip(obs, -10, 10), vis_env.observation_space)

        ret = 0
        try:
            obs, _ = vis_env.reset()
            while True:
                obs_tensor = torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
                with torch.no_grad():
                    action = self.ac.actor_mean(obs_tensor).squeeze(0).cpu().numpy()
                obs, reward, terminated, truncated, _ = vis_env.step(action)
                ret += reward
                if terminated or truncated:
                    break
        finally:
            # Always close so the recorder flushes the video and frees the renderer.
            vis_env.close()
        print(f"Visualization finished with return: {ret:.2f}")
        show_video("videos")

    def train(self):
        """Run the collect/update loop until ``total_timesteps`` are gathered.

        Each iteration collects up to ``rollout_steps`` transitions, closes
        the open path with a bootstrap value, and runs a PPO update when the
        buffer is full. Every 50 updates the model is saved to
        ``DRIVE_FOLDER``; every ``render_freq`` updates a video is recorded.
        The training environment is closed at the end.
        """
        obs, _ = self.env.reset()
        timesteps_collected = 0
        update_count = 0

        while timesteps_collected < self.total_timesteps:
            for _ in range(self.rollout_steps):
                obs_tensor = torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
                with torch.no_grad():
                    action, log_prob, _, value = self.ac.get_action_and_value(obs_tensor)

                action = action.cpu().numpy().squeeze(0)
                log_prob = log_prob.item()
                value = value.item()

                next_obs, reward, terminated, truncated, infos = self.env.step(action)
                done = terminated or truncated

                self.buffer.store(obs, action, log_prob, reward, done, value)
                timesteps_collected += 1
                obs = next_obs

                # RecordEpisodeStatistics adds an "episode" entry when an episode ends.
                if "episode" in infos:
                    ret = infos['episode']['r']
                    self.ep_returns.append(ret)
                    self.ep_timesteps.append(timesteps_collected)
                    print(f"Update {update_count} | Steps: {timesteps_collected} | Return: {ret:.2f}")

                if done:
                    if truncated:
                        # Truncation is not a real terminal state: bootstrap
                        # from the critic's estimate of the final observation.
                        last_val_obs = torch.tensor(next_obs, dtype=torch.float32, device=self.device).unsqueeze(0)
                        with torch.no_grad():
                            last_value = self.ac.get_value(last_val_obs).item()
                    else:
                        last_value = 0
                    self.buffer.finish_path(last_value=last_value, gamma=self.gamma, lam=self.lam)
                    obs, _ = self.env.reset()

                if timesteps_collected >= self.total_timesteps:
                    break

            # The rollout ended mid-episode: close the open path with a bootstrap value.
            if not done:
                obs_tensor = torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
                with torch.no_grad():
                    last_value = self.ac.get_value(obs_tensor).item()
                self.buffer.finish_path(last_value=last_value, gamma=self.gamma, lam=self.lam)

            # A final partial rollout (early break above) is not used for an update.
            if self.buffer.ptr == self.buffer.size:
                data = self.buffer.get()
                self._update(data)
                self.lr_scheduler.step()
                update_count += 1

                if update_count % 50 == 0:
                    self.save(os.path.join(DRIVE_FOLDER, "ppo_bipedal_checkpoint.pt"))
                    print(f"Checkpoint saved to drive at update {update_count}")

            if update_count > 0 and update_count % self.render_freq == 0:
                self.visualize_agent(update_count)

        self.env.close()

    def _update(self, data):
        """Run ``train_epochs`` passes of minibatch PPO updates on one rollout.

        Args:
            data: Dict from ``RolloutBuffer.get()``; the keys ``obs``,
                ``actions``, ``log_probs``, ``advantages`` and ``returns``
                are read.
        """
        obs = data["obs"]
        actions = data["actions"]
        old_log_probs = data["log_probs"]
        advantages = data["advantages"]
        returns = data["returns"]

        batch_size = len(obs)
        inds = np.arange(batch_size)

        for _ in range(self.train_epochs):
            np.random.shuffle(inds)
            for start in range(0, batch_size, self.minibatch_size):
                end = start + self.minibatch_size
                mb_inds = inds[start:end]
                mb_obs = obs[mb_inds]
                mb_actions = actions[mb_inds]
                mb_old_log_probs = old_log_probs[mb_inds]
                mb_adv = advantages[mb_inds]
                mb_returns = returns[mb_inds]

                _, new_log_probs, entropy, values = self.ac.get_action_and_value(mb_obs, mb_actions)
                # Probability ratio between the current and the data-collecting policy.
                ratio = torch.exp(new_log_probs - mb_old_log_probs)
                surr1 = ratio * mb_adv
                surr2 = torch.clamp(ratio, 1.0 - self.clip_eps, 1.0 + self.clip_eps) * mb_adv

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = 0.5 * ((values - mb_returns) ** 2).mean()
                entropy_loss = entropy.mean()
                loss = actor_loss + self.vf_coef * critic_loss - self.ent_coef * entropy_loss

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.ac.parameters(), max_norm=0.5)
                self.optimizer.step()

    def plot_results(self, window_size=50):
        """Plot per-episode returns and, when possible, their running average.

        The running average is drawn only once at least ``window_size``
        episodes have finished; with fewer episodes the averaged curve and its
        x-values would have different lengths. The figure is saved to
        ``DRIVE_FOLDER/training_plot.png`` and shown.

        Args:
            window_size: Number of episodes in the running-average window.
        """
        if not self.ep_returns:
            print("Can not plot, no rewards")
            return

        returns = np.array(self.ep_returns)

        plt.figure(figsize=(10, 6))
        plt.plot(self.ep_timesteps, returns, alpha=0.3, color='blue', label='Episode Return')

        if 1 <= window_size <= len(returns):
            # Calculate Running Average
            running_avg = np.convolve(returns, np.ones(window_size)/window_size, mode='valid')
            # 'valid' mode drops the first window_size - 1 points; align the x-axis.
            avg_timesteps = self.ep_timesteps[window_size-1:]
            plt.plot(avg_timesteps, running_avg, color='red', linewidth=2, label=f'Running Avg (last {window_size} eps)')

        plt.title(f"PPO Training Progress: {self.env_id}")
        plt.xlabel("Total Timesteps")
        plt.ylabel("Return")
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.6)

        # Save plot to drive
        plot_path = os.path.join(DRIVE_FOLDER, "training_plot.png")
        plt.savefig(plot_path)
        plt.show()
        print(f"Plot saved to: {plot_path}")

    def save(self, path):
        """Save the actor-critic ``state_dict`` to ``path``.

        Only network weights are written; observation-normalisation
        statistics are not included.
        """
        torch.save(self.ac.state_dict(), path)

if __name__ == "__main__":
    # Train on the GPU when PyTorch can see one, otherwise on the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device: {device}")

    agent = PPOAgent(
        env_id="BipedalWalker-v3",
        total_timesteps=2_000_000,
        rollout_steps=4096,
        render_freq=25,
        device=device,
    )
    agent.train()

    # Final plot and save
    agent.plot_results(window_size=50)

    # The weights and the observation-normalisation statistics are stored
    # side by side so a later stage can load them together.
    final_model_path = os.path.join(DRIVE_FOLDER, "ppo_bipedal_final.pt")
    final_stats_path = os.path.join(DRIVE_FOLDER, "obs_stats.pkl")

    agent.save(final_model_path)
    with open(final_stats_path, "wb") as f:
        pickle.dump(agent.env.get_wrapper_attr('obs_rms'), f)

    print(f"Training Complete. Files saved to Drive:\n1. {final_model_path}\n2. {final_stats_path}")

### Consistent finisher, part 1: the agent is first trained with reward and observation normalisation ###

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
import pickle
import glob
import io
import base64
import matplotlib.pyplot as plt
from torch.distributions import Normal
from IPython.display import HTML, display
from google.colab import drive

# Mount Google Drive only if the mount point does not exist yet.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Project folder on Drive shared by all training stages.
DRIVE_FOLDER = "/content/drive/MyDrive/BipedalWalker_Project"
# Inputs: weights and observation statistics used to initialise the hardcore agent.
BASE_MODEL = os.path.join(DRIVE_FOLDER, "ppo_bipedal_final.pt")
BASE_STATS = os.path.join(DRIVE_FOLDER, "obs_stats.pkl")
# Outputs: periodic resumable checkpoint and the file written once training succeeds.
CHECKPOINT = os.path.join(DRIVE_FOLDER, "hardcore_checkpoint.pt")
FINAL_SAVE = os.path.join(DRIVE_FOLDER, "ppo_hardcore_solved.pt")

os.makedirs(DRIVE_FOLDER, exist_ok=True)

def show_video(folder="videos"):
    """Embed the most recently created ``.mp4`` from ``folder`` in the notebook.

    Does nothing when the folder contains no mp4 file.

    Args:
        folder: Directory searched (non-recursively) for ``*.mp4`` files.
    """
    mp4list = glob.glob(f'{folder}/*.mp4')
    if not mp4list:
        return

    mp4 = max(mp4list, key=os.path.getctime)
    # Read-only binary mode inside a context manager: the handle is always
    # closed and no write permission on the file is required.
    with open(mp4, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    display(HTML(data='''<video alt="test" autoplay loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''.format(encoded.decode('ascii'))))

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Apply orthogonal weights (gain ``std``) and a constant bias to ``layer``.

    The layer is modified in place and returned so the call can wrap a
    constructor expression.
    """
    weight, bias = layer.weight, layer.bias
    torch.nn.init.orthogonal_(weight, gain=std)
    torch.nn.init.constant_(bias, val=bias_const)
    return layer

class ActorCritic(nn.Module):
    """Actor and critic MLPs (two 512-unit Tanh layers each) with a learnable,
    state-independent log standard deviation for the Gaussian policy."""

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        width = 512
        # The critic is created before the actor so parameters are built in
        # the same order as before.
        self.critic = nn.Sequential(
            layer_init(nn.Linear(obs_dim, width)),
            nn.Tanh(),
            layer_init(nn.Linear(width, width)),
            nn.Tanh(),
            layer_init(nn.Linear(width, 1), std=1.0),
        )
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(obs_dim, width)),
            nn.Tanh(),
            layer_init(nn.Linear(width, width)),
            nn.Tanh(),
            layer_init(nn.Linear(width, act_dim), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.full((1, act_dim), -0.5))

    def get_value(self, x):
        """Return the critic output for ``x`` (trailing dimension of size 1 kept)."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Return ``(action, log_prob, entropy, value)`` for observations ``x``.

        An action is sampled when ``action`` is ``None``; otherwise the given
        actions are scored. Log-probability and entropy are summed over
        dimension 1 and the value has dimension 1 squeezed out.
        """
        mean = self.actor_mean(x)
        policy = Normal(mean, self.actor_logstd.expand_as(mean).exp())
        if action is None:
            action = policy.sample()
        log_prob = policy.log_prob(action).sum(1)
        entropy = policy.entropy().sum(1)
        value = self.critic(x).squeeze(1)
        return action, log_prob, entropy, value

class HardcoreAgent:
    """PPO trainer for ``BipedalWalkerHardcore-v3`` with resumable checkpoints.

    The environment is wrapped with episode statistics, action clipping,
    observation normalisation/clipping and reward normalisation/clipping.
    On construction the agent resumes from ``CHECKPOINT`` when that file
    exists; otherwise it initialises from ``BASE_MODEL`` / ``BASE_STATS`` when
    those exist. Training stops early once 20 episodes have reached a return
    of at least 300.
    """

    def __init__(self, device="cuda"):
        """Set hyperparameters, build env/network/optimiser, restore saved state.

        Args:
            device: Torch device for the network and training tensors.
        """
        self.device = device
        self.total_timesteps = 40_000_000
        self.rollout_steps = 4096
        self.learning_rate = 1.0e-4
        # Entropy coefficient at the start of training; train() scales it
        # down linearly with the update count.
        self.ent_coef_start = 0.01
        self.ent_coef_end = 0.00
        self.gamma = 0.999
        self.lam = 0.95
        self.clip_eps = 0.1
        self.train_epochs = 10
        self.minibatch_size = 512
        self.vf_coef = 0.5
        self.max_grad_norm = 0.5

        # Per-episode log: timestep at episode end, episode return, phase tag.
        self.history = {'steps': [], 'returns': [], 'phase': []}

        # Resume counters; overwritten by load_checkpoint().
        self.start_timestep = 0
        self.start_update = 0
        self.success_count = 0

        self.env = gym.make("BipedalWalkerHardcore-v3")
        # RecordEpisodeStatistics is the innermost wrapper, i.e. below the
        # reward normalisation/clipping wrappers added further down.
        self.env = gym.wrappers.RecordEpisodeStatistics(self.env)
        self.env = gym.wrappers.ClipAction(self.env)
        self.env = gym.wrappers.NormalizeObservation(self.env)
        self.env = gym.wrappers.TransformObservation(self.env, lambda obs: np.clip(obs, -10, 10), self.env.observation_space)
        self.env = gym.wrappers.NormalizeReward(self.env, gamma=self.gamma)
        self.env = gym.wrappers.TransformReward(self.env, lambda r: np.clip(r, -10, 10))

        self.obs_dim = self.env.observation_space.shape[0]
        self.act_dim = self.env.action_space.shape[0]

        self.ac = ActorCritic(self.obs_dim, self.act_dim).to(device)
        self.optimizer = optim.Adam(self.ac.parameters(), lr=self.learning_rate, eps=1e-5)
        self.num_updates = self.total_timesteps // self.rollout_steps
        # One scheduler step per update; the LR factor goes linearly from 1.0 to 0.0.
        self.lr_scheduler = torch.optim.lr_scheduler.LinearLR(self.optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.num_updates)

        # Resume a previous run if possible, otherwise start from the base model.
        if os.path.exists(CHECKPOINT):
            self.load_checkpoint(CHECKPOINT)
        elif os.path.exists(BASE_MODEL):
            # NOTE(review): weights_only=False and pickle.load below run the
            # pickle machinery; only load files you produced yourself.
            base_weights = torch.load(BASE_MODEL, map_location=device, weights_only=False)
            self.ac.load_state_dict(base_weights)
            # Overwrite the loaded log-std with -0.5 (the ActorCritic initial value).
            with torch.no_grad(): self.ac.actor_logstd.fill_(-0.5)
            if os.path.exists(BASE_STATS):
                with open(BASE_STATS, "rb") as f: self.set_obs_rms(pickle.load(f))

        # Rollout storage, overwritten in place on every rollout.
        self.buffer = {
            'obs': np.zeros((self.rollout_steps, self.obs_dim), dtype=np.float32),
            'actions': np.zeros((self.rollout_steps, self.act_dim), dtype=np.float32),
            'log_probs': np.zeros(self.rollout_steps, dtype=np.float32),
            'rewards': np.zeros(self.rollout_steps, dtype=np.float32),
            'dones': np.zeros(self.rollout_steps, dtype=np.float32),
            'values': np.zeros(self.rollout_steps, dtype=np.float32),
        }

    def get_obs_rms(self):
        """Return ``obs_rms`` of the first NormalizeObservation wrapper found, else None."""
        ptr = self.env
        # Walk down the wrapper chain through each wrapper's ``env`` attribute.
        while hasattr(ptr, 'env'):
            if isinstance(ptr, gym.wrappers.NormalizeObservation): return ptr.obs_rms
            ptr = ptr.env
        return None

    def set_obs_rms(self, rms):
        """Assign ``rms`` as ``obs_rms`` of the first NormalizeObservation wrapper found."""
        ptr = self.env
        while hasattr(ptr, 'env'):
            if isinstance(ptr, gym.wrappers.NormalizeObservation): ptr.obs_rms = rms; break
            ptr = ptr.env

    def get_return_rms(self):
        """Return ``return_rms`` of the first NormalizeReward wrapper found, else None."""
        ptr = self.env
        while hasattr(ptr, 'env'):
            if isinstance(ptr, gym.wrappers.NormalizeReward): return ptr.return_rms
            ptr = ptr.env
        return None

    def set_return_rms(self, rms):
        """Assign ``rms`` as ``return_rms`` of the first NormalizeReward wrapper found."""
        ptr = self.env
        while hasattr(ptr, 'env'):
            if isinstance(ptr, gym.wrappers.NormalizeReward): ptr.return_rms = rms; break
            ptr = ptr.env

    def save_checkpoint(self, path, update, timestep):
        """Save the full training state to ``path`` and refresh the training plot.

        Args:
            path: Destination file.
            update: Update counter stored as ``update_count``.
            timestep: Timestep counter stored as ``total_timesteps``.
        """
        checkpoint = {
            'model_state_dict': self.ac.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.lr_scheduler.state_dict(),
            'update_count': update,
            'total_timesteps': timestep,
            'success_count': self.success_count,
            'obs_rms': self.get_obs_rms(),
            'return_rms': self.get_return_rms(),
            'history': self.history
        }
        torch.save(checkpoint, path)
        self.plot_history()
        print(f"Checkpoint saved and plot updated: Step {timestep}")

    def load_checkpoint(self, path):
        """Restore network, optimiser, scheduler, counters, history and
        normalisation statistics from a file written by save_checkpoint()."""
        # NOTE(review): weights_only=False unpickles arbitrary objects; only
        # load checkpoints you produced yourself.
        ckpt = torch.load(path, map_location=self.device, weights_only=False)
        self.ac.load_state_dict(ckpt['model_state_dict'])
        self.optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        self.lr_scheduler.load_state_dict(ckpt['scheduler_state_dict'])
        self.start_update = ckpt['update_count']
        self.start_timestep = ckpt['total_timesteps']
        # .get() with defaults tolerates checkpoints that lack these keys.
        self.success_count = ckpt.get('success_count', 0)
        self.history = ckpt.get('history', {'steps': [], 'returns': [], 'phase': []})
        if ckpt.get('obs_rms'): self.set_obs_rms(ckpt['obs_rms'])
        if ckpt.get('return_rms'): self.set_return_rms(ckpt['return_rms'])

    def plot_history(self):
        """Plot logged episode returns, save the figure to DRIVE_FOLDER and show it.

        A 50-episode running average is added once more than 50 episodes are
        logged, and a vertical marker is drawn at the first episode tagged
        with phase 1. Does nothing when no returns are logged.
        """
        if not self.history['returns']: return
        plt.figure(figsize=(12, 6))
        steps = np.array(self.history['steps'])
        returns = np.array(self.history['returns'])
        phases = np.array(self.history['phase'])

        plt.plot(steps, returns, alpha=0.2, color='blue', label='Raw Episode Return')

        if len(returns) > 50:
            avg = np.convolve(returns, np.ones(50)/50, mode='valid')
            # 'valid' mode drops the first 49 points, so the x-axis starts at index 49.
            plt.plot(steps[49:], avg, color='red', linewidth=2, label='Running Avg (50)')

        if 1 in phases:
            idx = np.where(phases == 1)[0][0]
            plt.axvline(x=steps[idx], color='green', linestyle='--', linewidth=2)
            plt.text(steps[idx], plt.ylim()[1], ' Energy Fine-Tuning Started', color='green', rotation=0, verticalalignment='top')

        plt.title("BipedalWalker Hardcore: Full Training Progress")
        plt.xlabel("Environment Timesteps")
        plt.ylabel("Return")
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig(os.path.join(DRIVE_FOLDER, "hardcore_training_plot.png"))
        plt.show()

    def visualize_agent(self, update_count):
        """Record one episode using the actor mean as the action and display the video.

        Args:
            update_count: Current update index; used in the video file prefix.
        """
        print(f"\n--- Visualizing Hardcore Agent ---")
        vis_env = gym.make("BipedalWalkerHardcore-v3", render_mode="rgb_array")
        vis_env = gym.wrappers.RecordVideo(vis_env, video_folder="videos", name_prefix=f"hc_update_{update_count}", disable_logger=True)
        vis_env = gym.wrappers.ClipAction(vis_env)
        vis_norm = gym.wrappers.NormalizeObservation(vis_env)
        # Reuse the training env's observation statistics object when one is found.
        current_obs_rms = self.get_obs_rms()
        if current_obs_rms: vis_norm.obs_rms = current_obs_rms
        vis_env = gym.wrappers.TransformObservation(vis_norm, lambda obs: np.clip(obs, -10, 10), vis_env.observation_space)
        obs, _ = vis_env.reset()
        while True:
            obs_tensor = torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
            with torch.no_grad(): action = self.ac.actor_mean(obs_tensor).squeeze(0).cpu().numpy()
            obs, reward, terminated, truncated, _ = vis_env.step(action)
            if terminated or truncated: break
        vis_env.close()
        show_video("videos")

    def train(self):
        """Run PPO until ``total_timesteps`` is reached or the success criterion is met.

        Each iteration collects ``rollout_steps`` transitions, computes GAE
        over the whole rollout (masking with the stored done flags), and runs
        ``train_epochs`` passes of minibatch updates. Every 100 updates a
        checkpoint is written and a video is recorded. Returns early after
        saving to ``FINAL_SAVE`` once 20 episodes have returned >= 300.
        """
        obs, _ = self.env.reset()
        # Continue from the counters restored by load_checkpoint() (0 on a fresh run).
        update_count = self.start_update
        timesteps_collected = self.start_timestep
        while timesteps_collected < self.total_timesteps:
            # Entropy coefficient decays linearly from ent_coef_start, floored at 0.
            frac = 1.0 - (update_count / self.num_updates)
            current_ent_coef = self.ent_coef_start * max(0, frac)
            # ---- Rollout collection ----
            for i in range(self.rollout_steps):
                with torch.no_grad():
                    obs_t = torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
                    action, log_prob, _, value = self.ac.get_action_and_value(obs_t)
                action_np = action.cpu().numpy().squeeze(0)
                next_obs, reward, terminated, truncated, infos = self.env.step(action_np)
                done = terminated or truncated
                self.buffer['obs'][i], self.buffer['actions'][i], self.buffer['log_probs'][i] = obs, action_np, log_prob.item()
                self.buffer['rewards'][i], self.buffer['dones'][i], self.buffer['values'][i] = reward, done, value.item()
                obs = next_obs
                timesteps_collected += 1
                # An "episode" entry in infos marks a finished episode.
                if "episode" in infos:
                    ret = infos['episode']['r']
                    self.history['steps'].append(timesteps_collected)
                    self.history['returns'].append(ret)
                    self.history['phase'].append(0) # Phase 0
                    print(f"Upd {update_count} | Step {timesteps_collected} | Ret: {ret:.2f}")
                    # success_count is cumulative; it is never reset on a failed episode.
                    if ret >= 300:
                        self.success_count += 1
                        if self.success_count >= 20: self.save_checkpoint(FINAL_SAVE, update_count, timesteps_collected); return
                if done: obs, _ = self.env.reset()
            # ---- GAE over the whole rollout ----
            # Bootstrap value for the step after the rollout; it is multiplied
            # by `nonterminal` below, so it has no effect when the last step was done.
            with torch.no_grad():
                next_val = self.ac.get_value(torch.tensor(next_obs, dtype=torch.float32, device=self.device).unsqueeze(0)).item()
            adv = np.zeros(self.rollout_steps, dtype=np.float32)
            lastgaelam = 0
            for t in reversed(range(self.rollout_steps)):
                nonterminal = 1.0 - self.buffer['dones'][t]
                next_val_t = next_val if t == self.rollout_steps - 1 else self.buffer['values'][t+1]
                delta = self.buffer['rewards'][t] + self.gamma * next_val_t * nonterminal - self.buffer['values'][t]
                lastgaelam = delta + self.gamma * self.lam * nonterminal * lastgaelam
                adv[t] = lastgaelam
            returns = adv + self.buffer['values']
            b_obs = torch.tensor(self.buffer['obs'], device=self.device)
            b_act = torch.tensor(self.buffer['actions'], device=self.device)
            b_log = torch.tensor(self.buffer['log_probs'], device=self.device)
            b_ret = torch.tensor(returns, device=self.device)
            b_adv = torch.tensor(adv, device=self.device)
            # Normalise advantages over the whole rollout.
            b_adv = (b_adv - b_adv.mean()) / (b_adv.std() + 1e-8)
            # ---- PPO minibatch updates ----
            inds = np.arange(self.rollout_steps)
            for _ in range(self.train_epochs):
                np.random.shuffle(inds)
                for start in range(0, self.rollout_steps, self.minibatch_size):
                    end = start + self.minibatch_size
                    mb_inds = inds[start:end]
                    _, new_lp, entropy, new_val = self.ac.get_action_and_value(b_obs[mb_inds], b_act[mb_inds])
                    ratio = torch.exp(new_lp - b_log[mb_inds])
                    surr1, surr2 = ratio * b_adv[mb_inds], torch.clamp(ratio, 1-self.clip_eps, 1+self.clip_eps) * b_adv[mb_inds]
                    # Clipped policy loss + weighted value loss - entropy bonus.
                    loss = -torch.min(surr1, surr2).mean() + self.vf_coef * 0.5 * ((new_val.squeeze() - b_ret[mb_inds])**2).mean() - current_ent_coef * entropy.mean()
                    self.optimizer.zero_grad(); loss.backward(); nn.utils.clip_grad_norm_(self.ac.parameters(), self.max_grad_norm); self.optimizer.step()
            self.lr_scheduler.step(); update_count += 1
            if update_count % 100 == 0: self.save_checkpoint(CHECKPOINT, update_count, timesteps_collected); self.visualize_agent(update_count)

if __name__ == "__main__":
    # Train on the GPU when PyTorch can see one, otherwise on the CPU.
    if torch.cuda.is_available():
        agent = HardcoreAgent(device="cuda")
    else:
        agent = HardcoreAgent(device="cpu")
    agent.train()

### Consistent finisher, part 2: fine-tuning that adds penalties for jerky actions and over-exertion ###

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
import pickle
import glob
import io
import base64
import matplotlib.pyplot as plt
from torch.distributions import Normal
from IPython.display import HTML, display
from google.colab import drive

# --- DRIVE CONFIG ---
# Mount Google Drive only if the mount point does not exist yet.
if not os.path.exists("/content/drive"): drive.mount("/content/drive")
# Project folder on Drive shared by all training stages.
DRIVE_FOLDER = "/content/drive/MyDrive/BipedalWalker_Project"
# Input: checkpoint file name used by the hardcore training stage.
CHECKPOINT_IN = os.path.join(DRIVE_FOLDER, "hardcore_checkpoint.pt")
# Outputs of the energy fine-tuning stage: periodic checkpoint, final model, best model.
CHECKPOINT_OUT = os.path.join(DRIVE_FOLDER, "hardcore_energy_ft_checkpoint.pt")
FINAL_OUT      = os.path.join(DRIVE_FOLDER, "ppo_hardcore_energy_ft_final.pt")
BEST_OUT       = os.path.join(DRIVE_FOLDER, "ppo_hardcore_energy_ft_best.pt")

def show_video(folder="videos"):
    """Embed the most recently created ``.mp4`` from ``folder`` in the notebook.

    Does nothing when the folder contains no mp4 file.

    Args:
        folder: Directory searched (non-recursively) for ``*.mp4`` files.
    """
    mp4list = glob.glob(f"{folder}/*.mp4")
    if not mp4list:
        return

    mp4 = max(mp4list, key=os.path.getctime)
    # Read-only binary mode inside a context manager: the handle is always
    # closed and no write permission on the file is required.
    with open(mp4, "rb") as video_file:
        b64_text = base64.b64encode(video_file.read()).decode("ascii")
    display(HTML(data=f'<video autoplay loop controls style="height: 420px;"><source src="data:video/mp4;base64,{b64_text}" type="video/mp4" /></video>'))

class EnergyPenaltyWrapper(gym.Wrapper):
    """Reward-shaping wrapper that penalises action magnitude and action change.

    Each step's reward is reduced by
    ``a_coef * mean(action**2) + da_coef * mean((action - prev_action)**2)``.
    Running raw and shaped episode returns are tracked and attached to
    ``info`` when the episode ends.
    """

    def __init__(self, env, a_coef=0.015, da_coef=0.035):
        """Wrap ``env``.

        Args:
            env: Environment to wrap.
            a_coef: Weight of the mean squared action term.
            da_coef: Weight of the mean squared action-difference term.
        """
        super().__init__(env)
        self.a_coef = a_coef
        self.da_coef = da_coef
        self.prev_action = None
        self.raw_return = 0.0
        self.shaped_return = 0.0
        self.ep_len = 0

    def reset(self, **kwargs):
        """Reset the wrapped env and clear the per-episode trackers."""
        obs, info = self.env.reset(**kwargs)
        # The first step of an episode is compared against an all-zero action.
        self.prev_action = np.zeros(self.action_space.shape, dtype=np.float32)
        self.raw_return = 0.0
        self.shaped_return = 0.0
        self.ep_len = 0
        return obs, info

    def step(self, action):
        """Step the wrapped env and return the penalised reward.

        On the final step of an episode ``info`` gains ``"episode_raw"`` and
        ``"episode_shaped"`` entries, each with return ``"r"`` and length ``"l"``.
        """
        obs, raw_r, terminated, truncated, info = self.env.step(action)

        effort_term = self.a_coef * np.mean(np.square(action))
        jerk_term = self.da_coef * np.mean(np.square(action - self.prev_action))
        penalty = effort_term + jerk_term
        shaped_r = float(raw_r - penalty)

        self.prev_action = action.copy()
        self.raw_return += float(raw_r)
        self.shaped_return += float(shaped_r)
        self.ep_len += 1

        if terminated or truncated:
            # Copy before adding keys so the inner env's dict is not mutated.
            info = dict(info)
            info["episode_raw"] = {"r": self.raw_return, "l": self.ep_len}
            info["episode_shaped"] = {"r": self.shaped_return, "l": self.ep_len}
        return obs, shaped_r, terminated, truncated, info

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Orthogonally initialise ``layer.weight`` with gain ``std``, fill
    ``layer.bias`` with ``bias_const``, and return the same layer."""
    torch.nn.init.orthogonal_(layer.weight, gain=std)
    torch.nn.init.constant_(layer.bias, val=bias_const)
    return layer

class ActorCritic(nn.Module):
    """Separate actor and critic MLPs (two 512-unit tanh layers each) for a Gaussian policy.

    The policy standard deviation is a learned, state-independent parameter
    (``actor_logstd``), initialised to ``-0.5`` in log space.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        hidden = 512
        self.critic = nn.Sequential(
            layer_init(nn.Linear(obs_dim, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, 1), std=1.0),
        )
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(obs_dim, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, act_dim), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.full((1, act_dim), -0.5))

    def get_value(self, x):
        """Return the critic output for ``x`` with dimension 1 squeezed away."""
        return self.critic(x).squeeze(1)

    def get_action_and_value(self, x, action=None):
        """Return ``(action, log_prob, entropy, value)`` for the observations ``x``.

        When ``action`` is None a fresh action is sampled from the policy;
        otherwise the supplied action is scored. Log-probability and entropy
        are summed over dimension 1.
        """
        mu = self.actor_mean(x)
        sigma = torch.exp(self.actor_logstd.expand_as(mu))
        policy = Normal(mu, sigma)
        if action is None:
            action = policy.sample()
        log_prob = policy.log_prob(action).sum(1)
        entropy = policy.entropy().sum(1)
        value = self.critic(x).squeeze(1)
        return action, log_prob, entropy, value

class HardcoreEnergyFineTuner:
    """Single-environment PPO fine-tuner for ``BipedalWalkerHardcore-v3`` with an energy penalty.

    Rewards seen by PPO are shaped by ``EnergyPenaltyWrapper``; the raw
    (unshaped) episode returns are what is logged to ``self.history``.
    Construction loads model, optimizer and progress state from ``CHECKPOINT_IN``.
    """

    def __init__(self, device="cuda"):
        """Build the wrapped environment, networks, optimizer and rollout buffers.

        Args:
            device: torch device string used for the networks and training tensors.
        """
        self.device = device
        self.total_timesteps, self.rollout_steps = 40_000_000, 4096
        self.gamma, self.lam, self.clip_eps, self.learning_rate = 0.99, 0.95, 0.12, 5e-5
        self.train_epochs, self.minibatch_size, self.vf_coef, self.max_grad_norm = 10, 512, 0.5, 0.5
        # NOTE(review): target_kl is stored but train() never reads it (no KL early stop).
        self.target_kl, self.a_coef, self.da_coef = 0.02, 0.02, 0.05

        self.history = {'steps': [], 'returns': [], 'phase': []}  # replaced by load_checkpoint()

        env = gym.make("BipedalWalkerHardcore-v3")
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env = gym.wrappers.ClipAction(env)
        env = gym.wrappers.NormalizeObservation(env)
        env = gym.wrappers.TransformObservation(env, lambda obs: np.clip(obs, -10, 10), env.observation_space)
        self.env = EnergyPenaltyWrapper(env, a_coef=self.a_coef, da_coef=self.da_coef)

        self.ac = ActorCritic(self.env.observation_space.shape[0], self.env.action_space.shape[0]).to(self.device)
        self.optimizer = optim.Adam(self.ac.parameters(), lr=self.learning_rate, eps=1e-5)
        self.lr_scheduler = torch.optim.lr_scheduler.LinearLR(
            self.optimizer, start_factor=1.0, end_factor=0.0,
            total_iters=self.total_timesteps // self.rollout_steps,
        )

        self.buf_obs = np.zeros((self.rollout_steps, self.env.observation_space.shape[0]), dtype=np.float32)
        self.buf_act = np.zeros((self.rollout_steps, self.env.action_space.shape[0]), dtype=np.float32)
        # One (4, rollout_steps) array unpacked into four per-quantity row views.
        self.buf_logp, self.buf_rew, self.buf_done, self.buf_val = np.zeros((4, self.rollout_steps), dtype=np.float32)

        self.load_checkpoint(CHECKPOINT_IN)

    def _get_obs_rms(self):
        """Return ``obs_rms`` of the NormalizeObservation wrapper in the env chain, or None."""
        ptr = self.env
        while hasattr(ptr, "env"):
            if isinstance(ptr, gym.wrappers.NormalizeObservation):
                return ptr.obs_rms
            ptr = ptr.env
        return None

    def _set_obs_rms(self, rms):
        """Assign ``rms`` as ``obs_rms`` of the NormalizeObservation wrapper, if one is found."""
        ptr = self.env
        while hasattr(ptr, "env"):
            if isinstance(ptr, gym.wrappers.NormalizeObservation):
                ptr.obs_rms = rms
                return
            ptr = ptr.env

    def plot_history(self):
        """Plot logged raw episode returns against timesteps, save the PNG and show it.

        Does nothing when no returns have been logged yet.
        """
        if not self.history['returns']:
            return
        fig = plt.figure(figsize=(12, 6))
        steps, returns, phases = np.array(self.history['steps']), np.array(self.history['returns']), np.array(self.history['phase'])
        plt.plot(steps, returns, alpha=0.2, color='blue', label='Raw Episode Return')
        # A 50-episode window already yields one 'valid' convolution sample at exactly 50 episodes.
        if len(returns) >= 50:
            avg = np.convolve(returns, np.ones(50)/50, mode='valid')
            plt.plot(steps[49:], avg, color='red', linewidth=2, label='Running Avg (50)')

        if 1 in phases:
            # Mark the first episode that was logged with phase == 1.
            idx = np.where(phases == 1)[0][0]
            plt.axvline(x=steps[idx], color='green', linestyle='--', linewidth=2)
            plt.text(steps[idx], plt.ylim()[1]*0.9, ' Energy Fine-Tuning Phase', color='green', fontweight='bold')

        plt.title("Continuous Progress: Hardcore -> Energy Fine-Tuning")
        plt.xlabel("Total Environment Timesteps")
        plt.ylabel("Raw Return")
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig(os.path.join(DRIVE_FOLDER, "hardcore_energy_ft_plot.png"))
        plt.show()
        # This runs on every checkpoint; close the figure so they do not accumulate.
        plt.close(fig)

    def save_checkpoint(self, path, update, timestep):
        """Write model/optimizer/scheduler state, counters, obs statistics and history to ``path``.

        Also refreshes the progress plot.
        """
        ckpt = {
            'model_state_dict': self.ac.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.lr_scheduler.state_dict(),
            'update_count': update,
            'total_timesteps': timestep,
            'obs_rms': self._get_obs_rms(),
            'history': self.history,
        }
        torch.save(ckpt, path)
        self.plot_history()

    def load_checkpoint(self, path):
        """Restore model (and, when present, optimizer, counters, history, obs statistics).

        ``path`` may hold either a full checkpoint dict or a bare model state dict.
        Sets ``self.start_update`` and ``self.start_timestep`` (0 when absent).
        """
        # weights_only=False unpickles arbitrary objects: only load trusted checkpoint files.
        ckpt = torch.load(path, map_location=self.device, weights_only=False)
        self.ac.load_state_dict(ckpt["model_state_dict"] if "model_state_dict" in ckpt else ckpt)
        if "optimizer_state_dict" in ckpt:
            self.optimizer.load_state_dict(ckpt["optimizer_state_dict"])
        # NOTE(review): save_checkpoint() stores 'scheduler_state_dict' but it is not
        # restored here, so the LR schedule restarts on load — confirm this is intended.
        self.start_update, self.start_timestep = int(ckpt.get("update_count", 0)), int(ckpt.get("total_timesteps", 0))

        loaded_history = ckpt.get('history', None)
        history = dict(loaded_history) if loaded_history else {}
        steps = list(history.get('steps', []))
        returns = list(history.get('returns', []))
        phase = list(history.get('phase', []))
        # A history without (or with a shorter) 'phase' list would raise KeyError in
        # train() and misalign plot_history(); pad the missing entries with phase 0.
        phase.extend([0] * (len(steps) - len(phase)))
        history['steps'], history['returns'], history['phase'] = steps, returns, phase
        self.history = history

        if ckpt.get("obs_rms") is not None:
            self._set_obs_rms(ckpt["obs_rms"])

    def train(self):
        """Run PPO until ``total_timesteps`` is reached.

        Each iteration collects ``rollout_steps`` transitions, computes GAE
        advantages and runs ``train_epochs`` passes of clipped-surrogate updates.
        A checkpoint is written to ``CHECKPOINT_OUT`` every 50 updates and once
        more when training ends, together with the model weights in ``FINAL_OUT``.
        """
        obs, _ = self.env.reset()
        update_count, timesteps_collected = self.start_update, self.start_timestep
        while timesteps_collected < self.total_timesteps:
            # ---- rollout collection ----
            for t in range(self.rollout_steps):
                obs_t = torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
                with torch.no_grad():
                    action, logp, _, value = self.ac.get_action_and_value(obs_t)
                act_np = action.squeeze(0).cpu().numpy()
                next_obs, shaped_r, term, trunc, info = self.env.step(act_np)
                done = term or trunc
                self.buf_obs[t], self.buf_act[t], self.buf_logp[t] = obs, act_np, logp.item()
                self.buf_rew[t], self.buf_done[t], self.buf_val[t] = shaped_r, done, value.item()
                obs, timesteps_collected = next_obs, timesteps_collected + 1
                if "episode_raw" in info:
                    ep_return = info["episode_raw"]["r"]
                    self.history['steps'].append(timesteps_collected)
                    self.history['returns'].append(ep_return)
                    self.history['phase'].append(1)  # Phase 1 for Energy FT
                    print(f"FT Upd {update_count} | Step {timesteps_collected} | RAW {ep_return:7.2f}")
                if done:
                    obs, _ = self.env.reset()

            # ---- generalised advantage estimation ----
            with torch.no_grad():
                next_val = self.ac.get_value(torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)).item()
            adv = np.zeros(self.rollout_steps, dtype=np.float32)
            lastgaelam = 0.0
            for t in reversed(range(self.rollout_steps)):
                # Episode ends (terminated or truncated) cut the bootstrap.
                nonterminal = 1.0 - self.buf_done[t]
                next_v = next_val if t == self.rollout_steps-1 else self.buf_val[t+1]
                delta = self.buf_rew[t] + self.gamma * next_v * nonterminal - self.buf_val[t]
                lastgaelam = delta + self.gamma * self.lam * nonterminal * lastgaelam
                adv[t] = lastgaelam
            ret = adv + self.buf_val
            b_obs, b_act, b_log, b_adv, b_ret = [
                torch.tensor(x, dtype=torch.float32, device=self.device)
                for x in [self.buf_obs, self.buf_act, self.buf_logp, adv, ret]
            ]
            b_adv = (b_adv - b_adv.mean()) / (b_adv.std() + 1e-8)

            # ---- PPO update ----
            for _ in range(self.train_epochs):
                inds = np.arange(self.rollout_steps)
                np.random.shuffle(inds)
                for start in range(0, self.rollout_steps, self.minibatch_size):
                    mb = inds[start:start+self.minibatch_size]
                    _, n_lp, ent, n_v = self.ac.get_action_and_value(b_obs[mb], b_act[mb])
                    ratio = torch.exp(n_lp - b_log[mb])
                    surr1, surr2 = ratio * b_adv[mb], torch.clamp(ratio, 1.0-self.clip_eps, 1.0+self.clip_eps) * b_adv[mb]
                    # Value weight comes from self.vf_coef; the entropy bonus weight is 0.0.
                    loss = -torch.min(surr1, surr2).mean() + self.vf_coef * ((n_v - b_ret[mb])**2).mean() - 0.0 * ent.mean()
                    self.optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(self.ac.parameters(), self.max_grad_norm)
                    self.optimizer.step()
            self.lr_scheduler.step()
            update_count += 1
            if update_count % 50 == 0:
                self.save_checkpoint(CHECKPOINT_OUT, update_count, timesteps_collected)

        # Persist whatever was trained since the last periodic checkpoint.
        self.save_checkpoint(CHECKPOINT_OUT, update_count, timesteps_collected)
        torch.save(self.ac.state_dict(), FINAL_OUT)

if __name__ == "__main__":
    # Train on the GPU when one is visible to torch, otherwise on the CPU.
    if torch.cuda.is_available():
        agent = HardcoreEnergyFineTuner(device="cuda")
    else:
        agent = HardcoreEnergyFineTuner(device="cpu")
    agent.train()

### Current version (work in progress): can reach a return of 300

This version runs multiple environments in parallel and oversamples the runs that the agent does badly on. If you run it, let it train until the return plateaus, then manually turn on the smoothness regularisers by setting the median threshold (`smooth_start_median`) to a low value such as 0. I have not yet figured out a good point at which to turn them on automatically.

In [None]:
import os, glob, io, base64, pickle, random
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
from collections import deque
from IPython.display import HTML, display
from google.colab import drive
import matplotlib.pyplot as plt

# Best-effort: allow-list gymnasium's RunningMeanStd for torch's restricted unpickler.
# Skipped when the module/class is missing or torch has no add_safe_globals; only
# those failures are swallowed, so KeyboardInterrupt and real errors still surface.
try:
    import gymnasium.wrappers.utils
    torch.serialization.add_safe_globals([gymnasium.wrappers.utils.RunningMeanStd])
except (ImportError, AttributeError):
    pass

# Mount Google Drive unless the mount point is already present.
if not os.path.exists("/content/drive"):
    drive.mount("/content/drive")

# Project folder on Drive; created on first use.
DRIVE_FOLDER = "/content/drive/MyDrive/BipedalWalker_Project"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

# Model, checkpoint and plot locations inside the project folder.
BASE_NORMAL_MODEL, CKPT_PATH, BEST_PATH, FINAL_PATH, PLOT_PATH = (
    os.path.join(DRIVE_FOLDER, filename)
    for filename in (
        "ppo_bipedal_final.pt",
        "ppo_hardcore_vector_hardseed_checkpoint.pt",
        "ppo_hardcore_vector_hardseed_best.pt",
        "ppo_hardcore_vector_hardseed_final.pt",
        "training_progress.png",
    )
)

def show_video(folder="videos"):
    """Embed the most recently created ``.mp4`` from ``folder`` in the notebook output.

    Args:
        folder: Directory that is searched (non-recursively) for ``*.mp4`` files.

    Returns:
        None. Prints ``"No video found."`` and returns early when the folder
        contains no ``.mp4`` file.
    """
    mp4list = glob.glob(f"{folder}/*.mp4")
    if len(mp4list) == 0:
        print("No video found.")
        return
    mp4 = max(mp4list, key=os.path.getctime)
    # Read-only mode is sufficient, and the context manager guarantees the file
    # handle is released even if reading fails.
    with open(mp4, "rb") as video_file:
        video = video_file.read()
    encoded = base64.b64encode(video)
    display(HTML(data=f"""
    <video autoplay loop controls style="height: 420px;">
      <source src="data:video/mp4;base64,{encoded.decode('ascii')}" type="video/mp4" />
    </video>
    """))

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return it.

    The weight gets an orthogonal initialisation with gain ``std``; every bias
    entry is set to ``bias_const``.
    """
    nn.init.orthogonal_(layer.weight, gain=std)
    nn.init.constant_(layer.bias, val=bias_const)
    return layer

class ActorCritic(nn.Module):
    """Separate actor and critic MLPs (two 512-unit tanh layers each) for a Gaussian policy.

    The policy standard deviation is a learned, state-independent parameter
    (``actor_logstd``), initialised to ``-0.7`` in log space.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        hidden = 512
        self.critic = nn.Sequential(
            layer_init(nn.Linear(obs_dim, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, 1), std=1.0),
        )
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(obs_dim, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, act_dim), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.full((1, act_dim), -0.7))

    def get_value(self, x):
        """Return the critic output for ``x`` with dimension 1 squeezed away."""
        return self.critic(x).squeeze(1)

    def get_dist(self, x):
        """Return ``(Normal(mean, std), mean)`` for the observations ``x``."""
        mu = self.actor_mean(x)
        sigma = self.actor_logstd.expand_as(mu).exp()
        return Normal(mu, sigma), mu

    def get_action_and_value(self, x, action=None):
        """Return ``(action, log_prob, entropy, value, mean)`` for the observations ``x``.

        When ``action`` is None a fresh action is sampled from the policy;
        otherwise the supplied action is scored. Log-probability and entropy
        are summed over dimension 1.
        """
        policy, mu = self.get_dist(x)
        if action is None:
            action = policy.sample()
        log_prob = policy.log_prob(action).sum(1)
        entropy = policy.entropy().sum(1)
        return action, log_prob, entropy, self.get_value(x), mu

def make_one_env(seed=None, render_mode=None):
    """Create one wrapped ``BipedalWalkerHardcore-v3`` environment.

    Wrapper order, innermost first: episode statistics, action clipping,
    observation normalisation, observation clipping to ``[-10, 10]``.
    When ``seed`` is given, the environment is reset once with that seed
    before it is returned.
    """
    base_env = gym.make("BipedalWalkerHardcore-v3", render_mode=render_mode)
    with_stats = gym.wrappers.RecordEpisodeStatistics(base_env)
    clipped_actions = gym.wrappers.ClipAction(with_stats)
    normalized = gym.wrappers.NormalizeObservation(clipped_actions)
    env = gym.wrappers.TransformObservation(
        normalized,
        lambda o: np.clip(o, -10, 10),
        normalized.observation_space,
    )
    if seed is not None:
        env.reset(seed=int(seed))
    return env

def make_vec_env(n, seed_list=None):
    """Build a ``SyncVectorEnv`` of ``n`` environments created by ``make_one_env``.

    When ``seed_list`` is given, sub-environment ``i`` is created with
    ``seed_list[i]``; otherwise the sub-environments are created unseeded.
    """
    env_fns = [
        # Bind the index as a default argument so each factory keeps its own value.
        (lambda idx=i: make_one_env(seed=None if seed_list is None else seed_list[idx]))
        for i in range(n)
    ]
    return gym.vector.SyncVectorEnv(env_fns)

def find_normalize_obs_wrapper(env):
    """Walk the ``.env`` chain from ``env`` and return the first NormalizeObservation wrapper.

    Returns None when no such wrapper is found. Objects without an ``env``
    attribute are not inspected, which ends the walk.
    """
    current = env
    while True:
        if not hasattr(current, "env"):
            return None
        if isinstance(current, gym.wrappers.NormalizeObservation):
            return current
        current = current.env

class HardSeedPool:
    """Bounded FIFO collection of integer seeds; the oldest are dropped beyond ``maxlen``."""

    def __init__(self, maxlen=4000):
        self.maxlen = maxlen
        self.seeds = deque(maxlen=maxlen)

    def add(self, seed):
        """Append ``seed`` (converted to ``int``) to the pool."""
        self.seeds.append(int(seed))

    def sample(self, k):
        """Return up to ``k`` seeds drawn without replacement, or None when the pool is empty."""
        if len(self.seeds) == 0:
            return None
        candidates = list(self.seeds)
        how_many = min(k, len(self.seeds))
        return random.sample(candidates, how_many)

    def state_dict(self):
        """Return a plain-dict snapshot (``maxlen`` and the seed list)."""
        return dict(maxlen=self.maxlen, seeds=list(self.seeds))

    def load_state_dict(self, d):
        """Restore from a ``state_dict()`` snapshot; missing keys keep the current ``maxlen`` / empty seeds."""
        self.maxlen = int(d.get("maxlen", self.maxlen))
        restored = d.get("seeds", [])
        self.seeds = deque(restored, maxlen=self.maxlen)

class PPOHardcoreVector:
    """Vectorised PPO trainer for ``BipedalWalkerHardcore-v3`` with hard-seed replay.

    Runs ``num_envs`` environments in a ``SyncVectorEnv``. Seeds of episodes whose
    return falls below ``hard_seed_threshold`` are stored in a ``HardSeedPool`` and
    re-used for a share of later resets. Once a deterministic evaluation reaches a
    median return of ``smooth_start_median``, an action-smoothness penalty on the
    policy mean is subtracted from the reward.
    """

    def __init__(self, device="cuda", num_envs=16, rollout_steps=2048, total_env_steps=40_000_000, **kwargs):
        """Create environments, networks, optimizer and buffers; resume if a checkpoint exists.

        Args:
            device: torch device string for the networks and training tensors.
            num_envs: number of parallel sub-environments.
            rollout_steps: steps collected per sub-environment per update.
            total_env_steps: total environment steps (summed over sub-environments) to train for.
            **kwargs: only ``smooth_start_median`` (default 120.0) is read.
        """
        self.device = device
        self.num_envs = num_envs
        self.rollout_steps = rollout_steps
        self.total_env_steps = total_env_steps

        self.gamma, self.lam, self.clip_eps = 0.99, 0.95, 0.2
        self.lr, self.train_epochs, self.minibatch_size = 2.5e-4, 10, 4096
        self.vf_coef, self.max_grad_norm, self.target_kl = 0.5, 0.5, 0.03
        self.ent_coef_start, self.ent_coef_end = 0.01, 0.0

        self.smooth_start_median = kwargs.get("smooth_start_median", 120.0)
        self.smooth_a_coef, self.smooth_da_coef = 0.002, 0.006
        self.eval_every_updates, self.eval_episodes = 10, 16
        self.hard_seed_threshold = 230.0

        self.history = {"steps": [], "returns": [], "smooth_on_step": None}
        self.seed_pool = HardSeedPool()
        self.env_seeds = [random.randint(0, 2000000000) for _ in range(num_envs)]
        # NOTE(review): on a fresh start every sub-env keeps its own NormalizeObservation
        # statistics, whereas load() assigns one shared obs_rms object to all of them.
        self.env = make_vec_env(num_envs, seed_list=self.env_seeds)

        self.ac = ActorCritic(self.env.single_observation_space.shape[0], self.env.single_action_space.shape[0]).to(device)
        self.opt = optim.Adam(self.ac.parameters(), lr=self.lr, eps=1e-5)

        self.global_step, self.update, self.best_eval_median, self.enable_smooth = 0, 0, -1e9, False

        if os.path.exists(CKPT_PATH):
            self.load(CKPT_PATH)
        elif os.path.exists(BASE_NORMAL_MODEL):
            # weights_only=False unpickles arbitrary objects: only load trusted files.
            self.ac.load_state_dict(torch.load(BASE_NORMAL_MODEL, map_location=device, weights_only=False))
            with torch.no_grad():
                self.ac.actor_logstd.fill_(-0.7)

        self.buf_obs = np.zeros((rollout_steps, num_envs, self.env.single_observation_space.shape[0]), dtype=np.float32)
        self.buf_act = np.zeros((rollout_steps, num_envs, self.env.single_action_space.shape[0]), dtype=np.float32)
        # One (4, T, N) array unpacked into four per-quantity (T, N) views.
        self.buf_logp, self.buf_rew, self.buf_done, self.buf_val = np.zeros((4, rollout_steps, num_envs), dtype=np.float32)
        self.prev_mean = np.zeros((num_envs, self.env.single_action_space.shape[0]), dtype=np.float32)

    def plot_training_progress(self):
        """Plot logged training returns against steps, save to ``PLOT_PATH`` and show.

        Does nothing while fewer than two episode returns have been logged.
        """
        if len(self.history["returns"]) < 2:
            return
        fig = plt.figure(figsize=(12, 6))
        steps, rets = self.history["steps"], self.history["returns"]
        plt.plot(steps, rets, alpha=0.3, color='royalblue', label='Episode Return')
        if len(rets) >= 50:
            plt.plot(steps[49:], np.convolve(rets, np.ones(50)/50, mode='valid'), color='red', linewidth=2, label='Running Avg (50 eps)')
        # Explicit None check so a recorded step of 0 would still be drawn.
        if self.history.get("smooth_on_step") is not None:
            plt.axvline(x=self.history["smooth_on_step"], color='green', linestyle='--', linewidth=2, label="Smoothness ON")
        plt.title(f"BipedalWalker Progress (Step: {self.global_step})")
        plt.xlabel("Steps")
        plt.ylabel("Return")
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.savefig(PLOT_PATH)
        plt.show()
        # Called on every save; close the figure so they do not accumulate.
        plt.close(fig)

    def visualize_agent(self, update_count):
        """Record one episode driven by the policy mean to ``videos/`` and display it."""
        print(f"\n--- Visualizing Agent at Update {update_count} ---")
        vis_env = make_one_env(render_mode="rgb_array")
        vis_env = gym.wrappers.RecordVideo(vis_env, video_folder="videos", name_prefix=f"hc_upd_{update_count}", disable_logger=True)
        try:
            # Reuse the observation statistics of the first training sub-env.
            train_wno = find_normalize_obs_wrapper(self.env.envs[0])
            vis_wno = find_normalize_obs_wrapper(vis_env)
            if train_wno and vis_wno:
                vis_wno.obs_rms = train_wno.obs_rms

            obs, _ = vis_env.reset()
            ep_ret, done = 0.0, False
            while not done:
                obs_t = torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
                with torch.no_grad():
                    _, m = self.ac.get_dist(obs_t)
                obs, r, term, trunc, _ = vis_env.step(m.squeeze(0).cpu().numpy())
                ep_ret += float(r)
                done = term or trunc
        finally:
            # Always close the env, even if the episode raised.
            vis_env.close()
        print(f"Visualization finished with return: {ep_ret:.2f}")
        show_video("videos")

    def save(self, path):
        """Write the full training state to ``path`` and refresh the progress plot.

        The stored ``obs_rms`` is the one of the first sub-environment.
        """
        wno = find_normalize_obs_wrapper(self.env.envs[0])
        ckpt = {
            "model": self.ac.state_dict(), "opt": self.opt.state_dict(), "global_step": self.global_step,
            "update": self.update, "best_eval_median": self.best_eval_median, "enable_smooth": self.enable_smooth,
            "obs_rms": (wno.obs_rms if wno else None), "seed_pool": self.seed_pool.state_dict(),
            "env_seeds": self.env_seeds, "history": self.history
        }
        torch.save(ckpt, path)
        self.plot_training_progress()

    def load(self, path):
        """Restore training state from ``path`` and rebuild the vector env with the saved seeds."""
        # weights_only=False unpickles arbitrary objects: only load trusted checkpoint files.
        ckpt = torch.load(path, map_location=self.device, weights_only=False)
        self.ac.load_state_dict(ckpt["model"])
        if "opt" in ckpt:
            self.opt.load_state_dict(ckpt["opt"])
        self.global_step, self.update = ckpt["global_step"], ckpt["update"]
        self.best_eval_median = ckpt.get("best_eval_median", -1e9)
        self.enable_smooth = ckpt.get("enable_smooth", False)
        self.history = ckpt.get("history", {"steps": [], "returns": [], "smooth_on_step": None})
        if "smooth_on_step" not in self.history:
            self.history["smooth_on_step"] = self.global_step if self.enable_smooth else None
        if ckpt.get("seed_pool"):
            self.seed_pool.load_state_dict(ckpt["seed_pool"])
        if ckpt.get("env_seeds"):
            self.env_seeds = list(ckpt["env_seeds"])
        # Best-effort close of the env built in __init__; a failure here must not abort the
        # resume, but KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            self.env.close()
        except Exception:
            pass
        self.env = make_vec_env(self.num_envs, seed_list=self.env_seeds)
        if ckpt.get("obs_rms") is not None:
            # Every sub-env receives the same statistics object.
            for e in self.env.envs:
                wno = find_normalize_obs_wrapper(e)
                if wno:
                    wno.obs_rms = ckpt["obs_rms"]
        print(f"Resumed: update={self.update} step={self.global_step}")

    @torch.no_grad()
    def evaluate(self, episodes=16):
        """Run ``episodes`` randomly seeded episodes acting with the policy mean.

        Returns:
            ``(mean, median, max, min)`` of the episode returns.
        """
        env = make_one_env()
        try:
            # NOTE(review): the eval env shares the training obs_rms object; if
            # NormalizeObservation updates its statistics on every step, evaluation
            # episodes also feed the training statistics — confirm this is intended.
            t_wno, e_wno = find_normalize_obs_wrapper(self.env.envs[0]), find_normalize_obs_wrapper(env)
            if t_wno and e_wno:
                e_wno.obs_rms = t_wno.obs_rms
            rets = []
            for _ in range(episodes):
                seed = random.randint(0, 2000000000)
                obs, _ = env.reset(seed=int(seed))
                ep_ret, done = 0.0, False
                while not done:
                    obs_t = torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
                    _, m = self.ac.get_dist(obs_t)
                    obs, r, term, trunc, _ = env.step(m.squeeze(0).cpu().numpy())
                    ep_ret += float(r)
                    done = term or trunc
                rets.append(ep_ret)
        finally:
            # Always close the env, even if an episode raised.
            env.close()
        return np.mean(rets), np.median(rets), np.max(rets), np.min(rets)

    def _compute_gae(self, next_value):
        """Return ``(advantages, returns)`` for the rollout currently held in the buffers.

        Args:
            next_value: value estimates for the observations that follow the last
                buffered step, one per sub-environment.
        """
        adv = np.zeros((self.rollout_steps, self.num_envs), dtype=np.float32)
        last_gae = 0
        for t in reversed(range(self.rollout_steps)):
            # Episode ends (terminated or truncated) cut the bootstrap.
            nonterminal = 1.0 - self.buf_done[t]
            v_next = next_value if t == self.rollout_steps - 1 else self.buf_val[t + 1]
            delta = self.buf_rew[t] + self.gamma * v_next * nonterminal - self.buf_val[t]
            last_gae = delta + self.gamma * self.lam * nonterminal * last_gae
            adv[t] = last_gae
        return adv, adv + self.buf_val

    def train(self):
        """Run PPO until ``total_env_steps`` environment steps have been collected.

        Every ``eval_every_updates`` updates the policy is evaluated, the best
        model (by evaluation median) is written to ``BEST_PATH``, a checkpoint is
        written to ``CKPT_PATH`` and a video is recorded. When training ends, a
        final checkpoint and the model weights (``FINAL_PATH``) are saved.
        """
        obs, _ = self.env.reset()
        self.prev_mean[:] = 0.0
        batch_size = self.num_envs * self.rollout_steps
        while self.global_step < self.total_env_steps:
            # Clamp the policy log-std once per update. The upper bound (-1.2) is below
            # the -0.7 written in __init__, so it takes effect on the first update.
            with torch.no_grad():
                self.ac.actor_logstd.clamp_(-5.0, -1.2)
            # Entropy coefficient decays linearly from ent_coef_start to ent_coef_end.
            ent_coef = self.ent_coef_end + (self.ent_coef_start - self.ent_coef_end) * max(0.0, 1.0 - (self.global_step / self.total_env_steps))

            # ---- rollout collection ----
            for t in range(self.rollout_steps):
                self.buf_obs[t] = obs
                with torch.no_grad():
                    action, logp, _, val, mean = self.ac.get_action_and_value(torch.tensor(obs, dtype=torch.float32, device=self.device))
                act_np, mean_np = action.cpu().numpy(), mean.cpu().numpy()
                next_obs, raw_r, term, trunc, infos = self.env.step(act_np)
                done = np.logical_or(term, trunc)
                shaped = raw_r.astype(np.float32)
                if self.enable_smooth:
                    # Penalise magnitude and step-to-step change of the policy mean.
                    shaped -= (self.smooth_a_coef * np.mean(mean_np**2, axis=1) + self.smooth_da_coef * np.mean((mean_np - self.prev_mean)**2, axis=1)).astype(np.float32)
                self.prev_mean = mean_np
                self.buf_act[t], self.buf_logp[t], self.buf_val[t] = act_np, logp.cpu().numpy(), val.cpu().numpy()
                self.buf_rew[t], self.buf_done[t] = shaped, done.astype(np.float32)
                obs = next_obs
                self.global_step += self.num_envs
                if isinstance(infos, dict) and "episode" in infos:
                    for i in range(self.num_envs):
                        if done[i]:
                            r_val = float(infos["episode"]["r"][i])
                            self.history["steps"].append(self.global_step)
                            self.history["returns"].append(r_val)
                            print(f"[TRAIN EP] step={self.global_step:9d} | upd={self.update:6d} | env={i:2d} | ret={r_val:7.2f} | smooth={'ON' if self.enable_smooth else 'OFF'} | ent={ent_coef:.4f}")
                            if r_val < self.hard_seed_threshold:
                                self.seed_pool.add(self.env_seeds[i])
                done_idxs = np.where(done)[0]
                if len(done_idxs) > 0:
                    hard = self.seed_pool.sample(len(done_idxs))
                    for j, env_i in enumerate(done_idxs):
                        # 30% of resets replay a stored hard seed (when any exist).
                        seed = hard[j % len(hard)] if (hard and random.random() < 0.3) else random.randint(0, 2000000000)
                        self.env_seeds[env_i] = seed
                        self.env.envs[env_i].reset(seed=int(seed))

            # ---- advantages and returns ----
            with torch.no_grad():
                nv = self.ac.get_value(torch.tensor(obs, dtype=torch.float32, device=self.device)).cpu().numpy()
            adv, ret = self._compute_gae(nv)
            b_obs, b_act, b_log, b_adv, b_ret = [
                torch.tensor(x.reshape(batch_size, -1) if len(x.shape)>2 else x.reshape(-1), device=self.device)
                for x in [self.buf_obs, self.buf_act, self.buf_logp, adv, ret]
            ]
            b_adv = (b_adv - b_adv.mean()) / (b_adv.std() + 1e-8)

            # ---- PPO update (hyperparameters come from __init__) ----
            for g in self.opt.param_groups:
                g["lr"] = self.lr
            kl_limit = 1.5 * self.target_kl
            for _ in range(self.train_epochs):
                inds = np.arange(batch_size)
                np.random.shuffle(inds)
                for s in range(0, batch_size, self.minibatch_size):
                    mb = inds[s:s+self.minibatch_size]
                    _, n_lp, ent, n_v, _ = self.ac.get_action_and_value(b_obs[mb], b_act[mb])
                    log_r = n_lp - b_log[mb]
                    ratio = torch.exp(log_r)
                    surr1, surr2 = ratio * b_adv[mb], torch.clamp(ratio, 1.0 - self.clip_eps, 1.0 + self.clip_eps) * b_adv[mb]
                    loss = -torch.min(surr1, surr2).mean() + self.vf_coef*(n_v - b_ret[mb]).pow(2).mean() - ent_coef * ent.mean()
                    self.opt.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(self.ac.parameters(), self.max_grad_norm)
                    self.opt.step()
                    with torch.no_grad():
                        approx_kl = (ratio - 1.0 - log_r).mean().item()
                    if approx_kl > kl_limit:
                        break
                # Stop the remaining epochs too once the KL estimate is too large.
                if approx_kl > kl_limit:
                    break
            self.update += 1
            if self.update % self.eval_every_updates == 0:
                mean_r, med_r, best_r, worst_r = self.evaluate(self.eval_episodes)
                print(f"Deterministic evaluation | mean {mean_r:7.2f} median {med_r:7.2f} best {best_r:7.2f} worst {worst_r:7.2f} | ent {ent_coef:.4f}")
                if (not self.enable_smooth) and (med_r >= self.smooth_start_median):
                    self.enable_smooth, self.history["smooth_on_step"] = True, self.global_step
                if med_r > self.best_eval_median:
                    self.best_eval_median = med_r
                    torch.save(self.ac.state_dict(), BEST_PATH)
                self.save(CKPT_PATH)
                self.visualize_agent(self.update)

        # Persist whatever was trained since the last periodic checkpoint.
        self.save(CKPT_PATH)
        torch.save(self.ac.state_dict(), FINAL_PATH)

if __name__ == "__main__":
    # Train on the GPU when one is visible to torch, otherwise on the CPU.
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    trainer = PPOHardcoreVector(device=device, smooth_start_median=120.0)
    trainer.train()