This notebook implements Curiosity and Skill wrappers for semi-supervised reinforcement learning. It is built on [Stable-Baselines3](https://github.com/DLR-RM/stable-baselines3) and starts from its [getting-started example](https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/stable_baselines_getting_started.ipynb).

In [1]:
pip install stable-baselines3[extra]

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.4.0-py3-none-any.whl.metadata (4.5 kB)
Collecting gymnasium<1.1.0,>=0.29.1 (from stable-baselines3[extra])
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting ale-py>=0.9.0 (from stable-baselines3[extra])
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium<1.1.0,>=0.29.1->stable-baselines3[extra])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading stable_bas

In [2]:
import stable_baselines3
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
import torch
import torch.nn as nn
import torch.optim as optim
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, VecEnvWrapper
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy

In [4]:
# Baseline (no curiosity, no skills): a plain CartPole-v1 environment and a PPO
# agent using the imported MlpPolicy, with verbosity set to 0.
env = gym.make("CartPole-v1")
model = PPO(MlpPolicy, env, verbose=0)

In [5]:
# Use a separate environment for evaluation
eval_env = gym.make("CartPole-v1", render_mode="rgb_array")

# Untrained agent (model.learn has not been called yet): reference score over 100 episodes
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:8.80 +/- 0.66


In [6]:
# Train the baseline agent for 10,000 timesteps
model.learn(total_timesteps=10_000)

<stable_baselines3.ppo.ppo.PPO at 0x7dbdb2e46c80>

In [7]:
# Evaluate the trained baseline agent on the same evaluation environment (100 episodes)
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:327.62 +/- 133.52


In [9]:
class ForwardModel(nn.Module):
    """
    Forward-dynamics network: given the current state and the action taken,
    predict the next state.

    The network is a two-layer MLP. The state and action vectors are concatenated
    along the last dimension, passed through one ReLU hidden layer, and projected
    back to the width of the state.

    Args:
        state_dim: Width of the state vector (also the width of the prediction).
        action_dim: Width of the action vector.
        hidden_dim: Number of units in the hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        input_width = state_dim + action_dim
        self.fc1 = nn.Linear(input_width, hidden_dim)
        # Output layer: predicts the next state representation.
        self.fc2 = nn.Linear(hidden_dim, state_dim)

    def forward(self, state, action):
        """Return the predicted next state for the given state/action tensors."""
        joint_input = torch.cat((state, action), dim=-1)
        hidden = nn.functional.relu(self.fc1(joint_input))
        return self.fc2(hidden)

In [7]:
class CuriosityEnvWrapper(VecEnvWrapper):
    """
    VecEnv wrapper that adds curiosity-driven intrinsic rewards.

    A ``ForwardModel`` is trained online (one Adam step per ``step_wait`` call) to
    predict the next observation from the previous observation and the one-hot
    encoded action. The per-environment mean squared prediction error is added
    to the reward returned by the wrapped environment.

    Only discrete (integer) actions are supported, because actions are one-hot
    encoded. Observations are assumed to be flat vectors of length ``state_dim``.

    Args:
        venv: Vectorized environment to wrap.
        state_dim: Length of a single observation vector.
        action_dim: Number of discrete actions (width of the one-hot encoding).
        learning_rate: Learning rate of the Adam optimizer for the forward model.
    """
    def __init__(self, venv, state_dim, action_dim, learning_rate=1e-3):
        super().__init__(venv)
        # Keep a plain int so the one-hot width always matches the forward
        # model's input, even when a NumPy integer is passed in.
        self.action_dim = int(action_dim)
        self.forward_model = ForwardModel(state_dim, self.action_dim)
        self.optimizer = optim.Adam(self.forward_model.parameters(), lr=learning_rate)

        # Previous observations and actions, used as the forward model's input.
        self.last_obs = None
        self.last_actions = None

    def reset(self):
        """Reset the wrapped environments and remember the initial observations."""
        obs = self.venv.reset()
        self.last_obs = obs
        return obs

    def step_async(self, actions):
        """Store the actions for the curiosity update, then forward them to the wrapped venv."""
        self.last_actions = actions
        self.venv.step_async(actions)

    def step_wait(self):
        """
        Finish the step, add the intrinsic reward and train the forward model.

        Returns:
            ``(obs, total_reward, dones, infos)`` where ``total_reward`` is the
            extrinsic reward plus the forward model's mean squared prediction
            error for each environment.

        Raises:
            RuntimeError: If called before ``reset()`` and ``step_async()``.
        """
        if self.last_obs is None or self.last_actions is None:
            raise RuntimeError("reset() and step_async() must be called before step_wait()")

        obs, rewards, dones, infos = self.venv.step_wait()

        last_obs_tensor = torch.tensor(self.last_obs, dtype=torch.float32)
        # torch.tensor copies, so patching rows below never modifies `obs`.
        next_obs_tensor = torch.tensor(obs, dtype=torch.float32)

        # Stable-Baselines3 VecEnvs auto-reset finished sub-environments: for those,
        # `obs` already holds the first observation of the NEXT episode. The real
        # successor state is reported in infos[i]["terminal_observation"], so use
        # it as the prediction target instead of the unrelated reset state.
        for env_idx, done in enumerate(dones):
            if done:
                terminal_obs = infos[env_idx].get("terminal_observation")
                if terminal_obs is not None:
                    next_obs_tensor[env_idx] = torch.as_tensor(terminal_obs, dtype=torch.float32)

        # One-hot encode the actions stored by step_async.
        action_indices = torch.as_tensor(self.last_actions, dtype=torch.long).reshape(-1)
        actions_one_hot = torch.nn.functional.one_hot(
            action_indices, num_classes=self.action_dim
        ).float()

        pred_next_state = self.forward_model(last_obs_tensor, actions_one_hot)
        squared_error = (pred_next_state - next_obs_tensor) ** 2

        # Intrinsic reward: per-environment prediction error, detached from the graph.
        intrinsic_reward = squared_error.mean(dim=-1).detach().cpu().numpy()

        # combine intrinsic reward with extrinsic reward
        total_reward = rewards + intrinsic_reward

        # One optimisation step on the same transitions.
        self.optimizer.zero_grad()
        loss = squared_error.mean()
        loss.backward()
        self.optimizer.step()

        # The (possibly auto-reset) observation is the input for the next step.
        self.last_obs = obs
        return obs, total_reward, dones, infos

In [10]:
# Make a temporary base environment, used only to read the space dimensions
env_id = "CartPole-v1"
base_env = gym.make(env_id)

# Sizes needed by the forward model: observation vector length and number of actions.
obs_dim = base_env.observation_space.shape[0]
act_dim = base_env.action_space.n
base_env.close()

# Create a vectorized environment
def make_env():
    """Create a new environment instance for the global ``env_id``."""
    return gym.make(env_id)

venv = DummyVecEnv([make_env])

# Wrap with curiosity:
curiosity_venv = CuriosityEnvWrapper(venv, state_dim=obs_dim, action_dim=act_dim)

In [11]:
# New PPO agent that trains on the curiosity-wrapped vectorized environment.
model = PPO("MlpPolicy", curiosity_venv, verbose=1, n_steps=2048, batch_size=64, ent_coef=0.0, learning_rate=3e-4, n_epochs=10)
# Evaluation uses a plain CartPole environment (not curiosity_venv).
eval_env = gym.make("CartPole-v1")
# Untrained agent, before training
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")


Using cuda device




mean_reward:29.28 +/- 9.83


In [12]:
# Train the curiosity agent for 20,000 timesteps
model.learn(total_timesteps=20000)

-----------------------------
| time/              |      |
|    fps             | 391  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 315         |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008498171 |
|    clip_fraction        | 0.0916      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00794    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.74        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0137     |
|    value_loss           | 61.8        |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7ba443f1cf40>

In [13]:
# Evaluate the trained curiosity agent on a fresh, plain CartPole environment (100 episodes)
eval_env = gym.make("CartPole-v1")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:499.43 +/- 3.06


Skill


In [14]:
class SkillWrapper(gym.Wrapper):
    """
    A skill wrapper that augments observations with a skill embedding.

    On every ``reset`` a skill index in ``[0, skill_dim)`` is sampled and kept for
    the whole episode. A one-hot vector for that skill is appended to each
    observation, so the observation length grows by ``skill_dim``.

    The skill is drawn from NumPy's global RNG (``np.random``), so it is
    controlled by ``np.random.seed`` and not by ``reset(seed=...)``.

    Args:
        env: Environment to wrap; its observation space must be a 1-D ``Box``.
        skill_dim: Number of distinct skills (length of the one-hot vector).

    Raises:
        TypeError: If the wrapped observation space is not a 1-D ``Box``.
    """
    def __init__(self, env, skill_dim=4):
        super().__init__(env)
        self.skill_dim = skill_dim
        self.current_skill = None

        orig_obs_space = self.env.observation_space
        if not isinstance(orig_obs_space, gym.spaces.Box) or len(orig_obs_space.shape) != 1:
            raise TypeError(
                "SkillWrapper requires a 1-D Box observation space, got "
                f"{orig_obs_space!r}"
            )

        # Build the skill bounds in the observation dtype. np.zeros/np.ones default
        # to float64, which would promote the concatenated bounds and force Box to
        # cast them back down to the (typically float32) observation dtype.
        obs_dtype = orig_obs_space.dtype
        low = np.concatenate([orig_obs_space.low, np.zeros(self.skill_dim, dtype=obs_dtype)])
        high = np.concatenate([orig_obs_space.high, np.ones(self.skill_dim, dtype=obs_dtype)])
        self.observation_space = gym.spaces.Box(low=low, high=high, dtype=obs_dtype)

    def reset(self, **kwargs):
        """Reset the wrapped env, sample a new skill, and return the augmented observation."""
        obs, info = self.env.reset(**kwargs)
        self.current_skill = np.random.randint(self.skill_dim)
        obs = self._augment_obs(obs, self.current_skill)
        return obs, info

    def step(self, action):
        """Step the wrapped env and append the current episode's skill to the observation."""
        obs, reward, terminated, truncated, info = self.env.step(action)
        obs = self._augment_obs(obs, self.current_skill)
        return obs, reward, terminated, truncated, info

    def _augment_obs(self, obs, skill_idx):
        """Return ``obs`` with a one-hot encoding of ``skill_idx`` appended (same dtype as ``obs``)."""
        skill_vec = np.zeros(self.skill_dim, dtype=obs.dtype)
        skill_vec[skill_idx] = 1.0
        return np.concatenate([obs, skill_vec])


def make_env_skill(env_id="CartPole-v1", skill_dim=4):
    """
    Return a zero-argument factory that builds a skill-augmented environment.

    The environment itself is only created when the returned callable is invoked.

    Args:
        env_id: Environment id passed to ``gym.make``.
        skill_dim: Number of skills passed to ``SkillWrapper``.

    Returns:
        A callable that creates ``gym.make(env_id)`` wrapped in ``SkillWrapper``.
    """
    def _init():
        # Build the base environment and wrap it in a single expression.
        return SkillWrapper(gym.make(env_id), skill_dim=skill_dim)

    return _init

In [15]:
env_id = "CartPole-v1"
skill_dim = 4
# Single skill-augmented environment inside a vectorized wrapper.
venv = DummyVecEnv([make_env_skill(env_id, skill_dim) for _ in range(1)])

# Probe an unwrapped environment for the raw observation and action dimensions.
sample_env = gym.make(env_id)
obs_dim = sample_env.observation_space.shape[0]
# Read the number of actions from the environment instead of hard-coding 2,
# so changing env_id keeps the forward model's input size correct.
act_dim = int(sample_env.action_space.n)
sample_env.close()
# combine dimensions: raw observation plus the one-hot skill vector
aug_obs_dim = obs_dim + skill_dim

# wrap with curiosity
curiosity_venv = CuriosityEnvWrapper(venv, state_dim=aug_obs_dim, action_dim=act_dim)


  gym.logger.warn(
  gym.logger.warn(


In [20]:
# New PPO agent trained on the skill-augmented, curiosity-wrapped environment.
model = PPO("MlpPolicy", curiosity_venv, verbose=1, n_steps=2048, batch_size=64, ent_coef=0.0, learning_rate=3e-4, n_epochs=10)
# The evaluation env is wrapped with SkillWrapper too, so its observations
# carry the same skill vector as the training observations.
eval_env = SkillWrapper(gym.make("CartPole-v1"), skill_dim=4)
# Untrained agent, before training
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Using cuda device


  gym.logger.warn(
  gym.logger.warn(


mean_reward:39.52 +/- 56.39


In [21]:
# Train the skill + curiosity agent for 20,000 timesteps
model.learn(total_timesteps=20000)

-----------------------------
| time/              |      |
|    fps             | 400  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 335        |
|    iterations           | 2          |
|    time_elapsed         | 12         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00980261 |
|    clip_fraction        | 0.104      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.686     |
|    explained_variance   | -0.0025    |
|    learning_rate        | 0.0003     |
|    loss                 | 5.74       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0152    |
|    value_loss           | 39.6       |
----------------------------------------
-----------------------------------------
| time/   

<stable_baselines3.ppo.ppo.PPO at 0x7ba440d46260>

In [22]:
# Evaluate the trained agent on a fresh skill-augmented environment (100 episodes);
# SkillWrapper samples a new skill on every reset.
eval_env = gym.make("CartPole-v1")
eval_env = SkillWrapper(eval_env, skill_dim=4)

mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:431.59 +/- 79.92
