# Random Network Distillation 

In [1]:
# TODO fix notebook reloading
# Load IPython's autoreload extension and select mode 2, so imported modules
# are re-imported before each cell execution (edits to project code are picked
# up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
import os

import gymnasium as gym
# RecordVideo has to come from gymnasium, not the unmaintained `gym` package:
# the environments in this notebook are created through gymnasium's `gym.make`,
# and importing legacy gym also triggers its deprecation warning.
from gymnasium.wrappers import RecordVideo
from IPython.display import Video, display, clear_output
from tqdm import tqdm
import torch
from torch.utils.tensorboard import SummaryWriter

# Select the torch device: Apple MPS if available, otherwise CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Newly created tensors default to the selected device from here on.
torch.set_default_device(device)

# Project import kept after the default-device call, as in the original cell.
from rnd_rl.runner.policy_runner import PPOConfig, PolicyRunner

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [3]:
# @title Visualization code. Used later.

def visualize(agent, video_dir="./videos", max_steps=4096):
    """Roll out one recorded episode with ``agent`` and display the video inline.

    A fresh ``InvertedPendulum-v5`` environment is created in ``rgb_array``
    render mode, wrapped in ``RecordVideo`` (every episode is recorded), and
    stepped with the agent's actions until the episode ends or ``max_steps``
    is reached. The most recently written ``.mp4`` in ``video_dir`` is then
    embedded in the notebook output, replacing any previous cell output.

    Args:
        agent: Object exposing ``get_action(obs)``. It is called with the
            current observation as a float tensor with a leading batch axis
            of size 1 on ``device`` and must return a 2-tuple whose first
            element is the batched action tensor.
        video_dir: Directory where the recording is written (created if
            missing).
        max_steps: Upper bound on the number of environment steps.

    Raises:
        FileNotFoundError: If no ``.mp4`` file exists in ``video_dir`` after
            the rollout.
    """
    os.makedirs(video_dir, exist_ok=True)

    # Create environment with proper render_mode
    env = gym.make("InvertedPendulum-v5", render_mode="rgb_array", reset_noise_scale=0.2)

    try:
        # Apply video recording wrapper (record every episode).
        env = RecordVideo(env, video_folder=video_dir, episode_trigger=lambda episode_id: True)

        obs, _ = env.reset()

        # Pure inference: no autograd graph is needed for the rollout.
        with torch.no_grad():
            for _ in range(max_steps):
                actions, _ = agent.get_action(torch.Tensor(obs)[None, :].to(device))
                obs, _, terminated, truncated, _ = env.step(actions.squeeze(0).cpu().numpy())

                # Stop on either end-of-episode signal. Ignoring `truncated`
                # would keep stepping a time-limited episode without a reset.
                if terminated or truncated:
                    break
    finally:
        # Always close so the recorder finalizes the video file and the
        # renderer is released, even if the rollout raised.
        env.close()

    # Pick the newest video by modification time; a lexicographic sort of the
    # directory listing does not guarantee "latest" and may return non-video files.
    recordings = [
        os.path.join(video_dir, name)
        for name in os.listdir(video_dir)
        if name.endswith(".mp4")
    ]
    if not recordings:
        raise FileNotFoundError(f"No .mp4 recording found in {video_dir!r}")
    video_path = max(recordings, key=os.path.getmtime)

    clear_output(wait=True)
    display(Video(video_path, embed=True))

In [4]:
# Launch TensorBoard
# Load the TensorBoard notebook extension, then open the dashboard inline,
# reading event files from the relative `runs` directory.
%load_ext tensorboard
%tensorboard --logdir runs

Reusing TensorBoard on port 6006 (pid 49336), started 1:03:44 ago. (Use '!kill 49336' to kill it.)

In [5]:
n_envs = 64


def _make_pendulum_env():
    """Build one InvertedPendulum-v5 environment with reset_noise_scale=0.2."""
    return gym.make("InvertedPendulum-v5", reset_noise_scale=0.2)


# SyncVectorEnv takes one factory callable per sub-environment; the same
# factory is reused n_envs times, each call producing a separate env instance.
envs = gym.vector.SyncVectorEnv([_make_pendulum_env] * n_envs)

### PPO baseline

In [6]:
# Baseline configuration: RND disabled (use_rnd=False); all other PPOConfig
# fields keep their library defaults.
ppo_cfg = PPOConfig(use_rnd=False, clip_params=0.2, init_noise_std=1.0)

In [7]:
num_epochs = 250
policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_cfg,
    num_mini_epochs=10,
    device=device,
)
# One rollout followed by one update per epoch; the epoch index is passed to
# rollout() only.
for epoch_idx in tqdm(range(num_epochs)):
    policy_runner.rollout(epoch_idx)
    policy_runner.update()

100%|██████████| 250/250 [03:08<00:00,  1.33it/s]


In [8]:
# Record one episode with the baseline runner's `alg` object and embed the video.
# visualize() clears the cell output first, so the label is printed afterwards.
visualize(policy_runner.alg)
print("PPO trained agent")

PPO trained agent


### PPO with RND

In [9]:
# Same settings as the baseline but with RND enabled (use_rnd=True); no
# normalization flags are passed, so PPOConfig's defaults apply for those.
ppo_rnd_cfg = PPOConfig(use_rnd=True, clip_params=0.2, init_noise_std=1.0)

In [None]:
num_epochs = 250
rnd_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_cfg,
    num_mini_epochs=10,
    device=device,
)
# One rollout followed by one update per epoch; the epoch index is passed to
# rollout() only.
for epoch_idx in tqdm(range(num_epochs)):
    rnd_policy_runner.rollout(epoch_idx)
    rnd_policy_runner.update()

  4%|▎         | 9/250 [00:08<03:48,  1.06it/s]

In [None]:
# Record one episode with the RND runner's `alg` object and embed the video.
# visualize() clears the cell output first, so the label is printed afterwards.
visualize(rnd_policy_runner.alg)
print("RND PPO trained agent")

RND PPO trained agent


### Reward normalization only

In [None]:

# RND enabled, with reward_normalization switched on; obs_normalization is
# not passed and therefore keeps PPOConfig's default.
ppo_rnd_reward_normalization_cfg = PPOConfig(
    use_rnd=True,
    clip_params=0.2,
    init_noise_std=1.0,
    reward_normalization=True,
)


In [None]:
num_epochs = 250
rnd_reward_norm_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_reward_normalization_cfg,
    num_mini_epochs=10,
    device=device,
)
# Replace the runner's writer so this experiment logs to its own run directory.
# (Plain string: the original f-string only wrapped a constant.)
rnd_reward_norm_policy_runner.writer = SummaryWriter(log_dir="runs/RND_reward_normalization")
try:
    # One rollout followed by one update per epoch.
    for epoch in tqdm(range(num_epochs)):
        rnd_reward_norm_policy_runner.rollout(epoch)
        rnd_reward_norm_policy_runner.update()
finally:
    # SummaryWriter buffers events; flush so the final epochs reach disk even
    # when training is interrupted.
    rnd_reward_norm_policy_runner.writer.flush()

100%|██████████| 250/250 [04:10<00:00,  1.00s/it]


In [None]:
# Record one episode with the reward-normalized RND runner's `alg` object and
# embed the video. visualize() clears the cell output first, so the label is
# printed afterwards.
visualize(rnd_reward_norm_policy_runner.alg)
print("RND PPO trained agent with reward normalization")

RND PPO trained agent with reward normalization


### Reward and observation normalization

In [None]:
# RND enabled, with both reward_normalization and obs_normalization switched on.
ppo_rnd_all_normalization_cfg = PPOConfig(
    use_rnd=True,
    clip_params=0.2,
    init_noise_std=1.0,
    reward_normalization=True,
    obs_normalization=True,
)


In [None]:
num_epochs = 250
rnd_all_norm_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_all_normalization_cfg,
    num_mini_epochs=10,
    device=device,
)
# Replace the runner's writer so this experiment logs to its own run directory.
# (Plain string: the original f-string only wrapped a constant.)
rnd_all_norm_policy_runner.writer = SummaryWriter(log_dir="runs/RND_all_normalization")
try:
    # One rollout followed by one update per epoch.
    for epoch in tqdm(range(num_epochs)):
        rnd_all_norm_policy_runner.rollout(epoch)
        rnd_all_norm_policy_runner.update()
finally:
    # SummaryWriter buffers events; flush so the final epochs reach disk even
    # when training is interrupted.
    rnd_all_norm_policy_runner.writer.flush()

100%|██████████| 250/250 [05:03<00:00,  1.21s/it]


In [None]:
# Record one episode with the runner trained under ppo_rnd_all_normalization_cfg
# (reward and observation normalization both enabled) and embed the video.
# visualize() clears the cell output first, so the label is printed afterwards.
visualize(rnd_all_norm_policy_runner.alg)
print("RND PPO trained agent with observation normalization")

RND PPO trained agent with observation normalization


## RND Model Improvements

Based on recent research, here are several methods to improve RND model quality:

### 1. **Better Activation Functions**
- **ELU**: Exponential Linear Unit can provide better gradient flow
- **Swish**: Self-gated activation function often performs better than ReLU
- **LeakyReLU**: Helps prevent dead neurons

### 2. **Normalization Layers**
- **Batch Normalization**: Stabilizes training and speeds up convergence
- **Layer Normalization**: Alternative to batch norm, works well with varying batch sizes

### 3. **Regularization**
- **Dropout**: Prevents overfitting in the predictor network
- **Gradient Clipping**: Prevents exploding gradients

### 4. **Architecture Improvements**
- Deeper networks with proper normalization
- Better initialization strategies

Let's test these improvements systematically:


In [None]:
### Test 1: RND with ELU Activation


In [None]:
# Test 1 configuration: the fully normalized RND setup plus
# rnd_activation="elu" and rnd_gradient_clip=1.0.
ppo_rnd_elu_cfg = PPOConfig(
    use_rnd=True,
    clip_params=0.2,
    init_noise_std=1.0,
    reward_normalization=True,
    obs_normalization=True,
    rnd_activation="elu",
    rnd_gradient_clip=1.0,
)

num_epochs = 250
rnd_elu_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_elu_cfg,
    num_mini_epochs=10,
    device=device,
)
# Replace the runner's writer so this experiment logs to its own run directory.
# (Plain string: the original f-string only wrapped a constant.)
rnd_elu_policy_runner.writer = SummaryWriter(log_dir="runs/RND_ELU")
try:
    # One rollout followed by one update per epoch.
    for epoch in tqdm(range(num_epochs)):
        rnd_elu_policy_runner.rollout(epoch)
        rnd_elu_policy_runner.update()
finally:
    # SummaryWriter buffers events; flush so the final epochs reach disk even
    # when training is interrupted.
    rnd_elu_policy_runner.writer.flush()


NameError: name 'PPOConfig' is not defined

In [None]:
# Record one episode with the ELU-configured RND runner's `alg` object and
# embed the video. visualize() clears the cell output first, so the label is
# printed afterwards.
visualize(rnd_elu_policy_runner.alg)
print("RND PPO trained agent with ELU activation")


In [None]:
### Test 2: RND with Swish Activation


In [None]:
# Test 2 configuration: the fully normalized RND setup plus
# rnd_activation="swish" and rnd_gradient_clip=1.0.
ppo_rnd_swish_cfg = PPOConfig(
    use_rnd=True,
    clip_params=0.2,
    init_noise_std=1.0,
    reward_normalization=True,
    obs_normalization=True,
    rnd_activation="swish",
    rnd_gradient_clip=1.0,
)

num_epochs = 250
rnd_swish_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_swish_cfg,
    num_mini_epochs=10,
    device=device,
)
# Replace the runner's writer so this experiment logs to its own run directory.
# (Plain string: the original f-string only wrapped a constant.)
rnd_swish_policy_runner.writer = SummaryWriter(log_dir="runs/RND_Swish")
try:
    # One rollout followed by one update per epoch.
    for epoch in tqdm(range(num_epochs)):
        rnd_swish_policy_runner.rollout(epoch)
        rnd_swish_policy_runner.update()
finally:
    # SummaryWriter buffers events; flush so the final epochs reach disk even
    # when training is interrupted.
    rnd_swish_policy_runner.writer.flush()


In [None]:
# Record one episode with the Swish-configured RND runner's `alg` object and
# embed the video. visualize() clears the cell output first, so the label is
# printed afterwards.
visualize(rnd_swish_policy_runner.alg)
print("RND PPO trained agent with Swish activation")


In [None]:
### Test 3: RND with Batch Normalization


In [None]:
# Test 3 configuration: the fully normalized RND setup plus
# rnd_activation="elu", rnd_use_batch_norm=True and rnd_gradient_clip=1.0.
ppo_rnd_batchnorm_cfg = PPOConfig(
    use_rnd=True,
    clip_params=0.2,
    init_noise_std=1.0,
    reward_normalization=True,
    obs_normalization=True,
    rnd_activation="elu",
    rnd_use_batch_norm=True,
    rnd_gradient_clip=1.0,
)

num_epochs = 250
rnd_batchnorm_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_batchnorm_cfg,
    num_mini_epochs=10,
    device=device,
)
# Replace the runner's writer so this experiment logs to its own run directory.
# (Plain string: the original f-string only wrapped a constant.)
rnd_batchnorm_policy_runner.writer = SummaryWriter(log_dir="runs/RND_BatchNorm")
try:
    # One rollout followed by one update per epoch.
    for epoch in tqdm(range(num_epochs)):
        rnd_batchnorm_policy_runner.rollout(epoch)
        rnd_batchnorm_policy_runner.update()
finally:
    # SummaryWriter buffers events; flush so the final epochs reach disk even
    # when training is interrupted.
    rnd_batchnorm_policy_runner.writer.flush()


In [None]:
# Record one episode with the batch-norm-configured RND runner's `alg` object
# and embed the video. visualize() clears the cell output first, so the label
# is printed afterwards.
visualize(rnd_batchnorm_policy_runner.alg)
print("RND PPO trained agent with Batch Normalization")


In [None]:
### Test 4: RND with Dropout Regularization


In [None]:
# Test 4 configuration: the fully normalized RND setup plus
# rnd_activation="swish", rnd_dropout_rate=0.1 and rnd_gradient_clip=1.0.
ppo_rnd_dropout_cfg = PPOConfig(
    use_rnd=True,
    clip_params=0.2,
    init_noise_std=1.0,
    reward_normalization=True,
    obs_normalization=True,
    rnd_activation="swish",
    rnd_dropout_rate=0.1,
    rnd_gradient_clip=1.0,
)

num_epochs = 250
rnd_dropout_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_dropout_cfg,
    num_mini_epochs=10,
    device=device,
)
# Replace the runner's writer so this experiment logs to its own run directory.
# (Plain string: the original f-string only wrapped a constant.)
rnd_dropout_policy_runner.writer = SummaryWriter(log_dir="runs/RND_Dropout")
try:
    # One rollout followed by one update per epoch.
    for epoch in tqdm(range(num_epochs)):
        rnd_dropout_policy_runner.rollout(epoch)
        rnd_dropout_policy_runner.update()
finally:
    # SummaryWriter buffers events; flush so the final epochs reach disk even
    # when training is interrupted.
    rnd_dropout_policy_runner.writer.flush()


In [None]:
# Record one episode with the dropout-configured RND runner's `alg` object and
# embed the video. visualize() clears the cell output first, so the label is
# printed afterwards.
visualize(rnd_dropout_policy_runner.alg)
print("RND PPO trained agent with Dropout regularization")


In [None]:
### Test 5: RND with All Improvements Combined


In [None]:
# Test 5 configuration: the fully normalized RND setup plus swish activation,
# layer norm, dropout, gradient clipping and an explicit RND learning rate.
ppo_rnd_combined_cfg = PPOConfig(
    use_rnd=True,
    clip_params=0.2,
    init_noise_std=1.0,
    reward_normalization=True,
    obs_normalization=True,
    rnd_activation="swish",
    rnd_use_layer_norm=True,  # Using layer norm instead of batch norm for better stability
    rnd_dropout_rate=0.1,
    rnd_gradient_clip=1.0,
    rnd_lr=5e-5,  # Slightly lower learning rate for stability
)

num_epochs = 250
rnd_combined_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_combined_cfg,
    num_mini_epochs=10,
    device=device,
)
# Replace the runner's writer so this experiment logs to its own run directory.
# (Plain string: the original f-string only wrapped a constant.)
rnd_combined_policy_runner.writer = SummaryWriter(log_dir="runs/RND_Combined")
try:
    # One rollout followed by one update per epoch.
    for epoch in tqdm(range(num_epochs)):
        rnd_combined_policy_runner.rollout(epoch)
        rnd_combined_policy_runner.update()
finally:
    # SummaryWriter buffers events; flush so the final epochs reach disk even
    # when training is interrupted.
    rnd_combined_policy_runner.writer.flush()


In [None]:
# Record one episode with the combined-improvements RND runner's `alg` object
# and embed the video. visualize() clears the cell output first, so the label
# is printed afterwards.
visualize(rnd_combined_policy_runner.alg)
print("RND PPO trained agent with all improvements combined")


## Summary of RND Improvements

The implemented improvements include:

1. **Better Activation Functions**: ELU, Swish, and LeakyReLU alternatives to ReLU
2. **Normalization Layers**: Batch Normalization and Layer Normalization for training stability
3. **Regularization**: Dropout layers to prevent overfitting
4. **Training Stability**: Gradient clipping to prevent exploding gradients
5. **Architecture Flexibility**: Configurable network depth and normalization strategies

### Key Benefits:
- **ELU**: Better gradient flow, especially for negative inputs
- **Swish**: Self-gated activation often outperforms ReLU in deep networks
- **Layer Normalization**: More stable than batch norm for varying batch sizes
- **Dropout**: Prevents overfitting in the predictor network
- **Gradient Clipping**: Maintains training stability

You can now compare the performance of different configurations using TensorBoard to see which improvements work best for your specific environment and task.
