# Random Network Distillation 

In [108]:
# TODO fix notebook reloading
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [109]:
import os
import gymnasium as gym
from gym.wrappers import RecordVideo
from IPython.display import Video, display, clear_output
from tqdm import tqdm
import torch 
from torch.utils.tensorboard import SummaryWriter
# Choose the torch device by preference (MPS, then CUDA, then CPU) and make it
# the default device for newly created tensors.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
torch.set_default_device(device)

from rnd_rl.runner.policy_runner import PPOConfig, PolicyRunner

In [110]:
# @title Visualization code. Used later.

def visualize(agent, max_steps=4096):
    """Record one evaluation episode of ``agent`` and display the video inline.

    A fresh ``InvertedPendulum-v5`` environment is created with ``rgb_array``
    rendering and wrapped so that every episode is recorded into ``./videos``.
    The environment is stepped with actions from ``agent.get_action`` until the
    episode terminates, is truncated, or ``max_steps`` steps have been taken.
    The newest ``.mp4`` in the video directory then replaces the cell output.

    Args:
        agent: Object exposing ``get_action(obs)``. It is called with a tensor
            holding a single observation behind a leading batch axis, moved to
            ``device``, and must return a two-element result whose first item is
            the action tensor with that same leading batch axis.
        max_steps: Upper bound on the number of environment steps to record.

    Raises:
        FileNotFoundError: If no ``.mp4`` file exists in the video directory
            after the episode.
    """
    video_dir = "./videos"  # Directory to save videos
    os.makedirs(video_dir, exist_ok=True)

    # Create environment with proper render_mode
    env = gym.make("InvertedPendulum-v5", render_mode="rgb_array", reset_noise_scale=0.2)

    # `gym` is the gymnasium alias in this notebook, so take the recorder from the
    # same package as the environment instead of the legacy `gym` distribution.
    env = gym.wrappers.RecordVideo(env, video_folder=video_dir, episode_trigger=lambda x: True)

    # Always close the environment, even if the agent or the simulator raises;
    # closing is also what finalizes the video file.
    try:
        obs, _ = env.reset()
        for _ in range(max_steps):
            actions, _ = agent.get_action(torch.Tensor(obs)[None, :].to(device))
            action = actions.detach().squeeze(0).cpu().numpy()
            obs, _, terminated, truncated, _ = env.step(action)

            # Stop on either end-of-episode signal: stepping an environment
            # after truncation without a reset is not valid.
            if terminated or truncated:
                break
    finally:
        env.close()

    # Pick the most recently written video by modification time; a name sort
    # does not identify the newest file and may pick up non-video files.
    recordings = [
        os.path.join(video_dir, name)
        for name in os.listdir(video_dir)
        if name.endswith(".mp4")
    ]
    if not recordings:
        raise FileNotFoundError(f"No .mp4 recording was written to {video_dir!r}")
    video_path = max(recordings, key=os.path.getmtime)

    clear_output(wait=True)
    display(Video(video_path, embed=True))

In [111]:
# Launch TensorBoard
%load_ext tensorboard
%tensorboard --logdir runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 30803), started 1:16:38 ago. (Use '!kill 30803' to kill it.)

In [112]:
# Build a synchronous vector of n_envs InvertedPendulum-v5 environments.
# The same factory is listed n_envs times; each call creates a separate env.
n_envs = 64
envs = gym.vector.SyncVectorEnv(
    [lambda: gym.make("InvertedPendulum-v5", reset_noise_scale=0.2)] * n_envs
)

### PPO baseline

In [113]:
# Baseline configuration: PPO with use_rnd disabled.
# NOTE(review): this is the only config in the notebook that passes
# safety_layer_enabled; confirm the RND configs are meant to use the default.
ppo_cfg = PPOConfig(
    init_noise_std=1.0,
    clip_params=0.2,
    use_rnd=False,
    safety_layer_enabled=True,
)

In [114]:
# Train the PPO baseline: each epoch calls rollout(epoch) followed by update().
num_epochs = 250
policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_cfg,
    num_mini_epochs=10,
    device=device,
)
for epoch in tqdm(range(num_epochs)):
    policy_runner.rollout(epoch)
    policy_runner.update()

 16%|█▋        | 41/250 [01:36<08:05,  2.32s/it]/Users/gabrielxia/anaconda3/envs/drl/lib/python3.11/site-packages/glfw/__init__.py:917: GLFWError: (65537) b'The GLFW library is not initialized'
 16%|█▋        | 41/250 [01:38<08:05,  2.32s/it]

MoviePy - Building video /Users/gabrielxia/cs8803/DRL_RND/videos/rl-video-episode-0.mp4.
MoviePy - Writing video /Users/gabrielxia/cs8803/DRL_RND/videos/rl-video-episode-0.mp4



 16%|█▋        | 41/250 [01:38<08:05,  2.32s/it]

MoviePy - Done !
MoviePy - video ready /Users/gabrielxia/cs8803/DRL_RND/videos/rl-video-episode-0.mp4
MoviePy - Building video /Users/gabrielxia/cs8803/DRL_RND/videos/rl-video-episode-0.mp4.
MoviePy - Writing video /Users/gabrielxia/cs8803/DRL_RND/videos/rl-video-episode-0.mp4



 16%|█▋        | 41/250 [01:38<08:05,  2.32s/it]

MoviePy - Done !
MoviePy - video ready /Users/gabrielxia/cs8803/DRL_RND/videos/rl-video-episode-0.mp4
MoviePy - Building video /Users/gabrielxia/cs8803/DRL_RND/videos/rl-video-episode-0.mp4.
MoviePy - Writing video /Users/gabrielxia/cs8803/DRL_RND/videos/rl-video-episode-0.mp4



 16%|█▋        | 41/250 [01:38<08:05,  2.32s/it]

MoviePy - Done !
MoviePy - video ready /Users/gabrielxia/cs8803/DRL_RND/videos/rl-video-episode-0.mp4


100%|██████████| 250/250 [09:45<00:00,  2.34s/it]


In [115]:
# Record and show one episode of the baseline runner's `alg`. The caption is
# printed after the call because visualize() clears the cell output before
# displaying the video.
visualize(policy_runner.alg)
print("PPO trained agent")

PPO trained agent


### PPO with RND

In [116]:
# Same clip and noise settings as the baseline config, but with use_rnd enabled.
ppo_rnd_cfg = PPOConfig(
    init_noise_std=1.0,
    clip_params=0.2,
    use_rnd=True,
)

In [117]:
# Train PPO with RND: each epoch calls rollout(epoch) followed by update().
num_epochs = 250
rnd_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_cfg,
    num_mini_epochs=10,
    device=device,
)
for epoch in tqdm(range(num_epochs)):
    rnd_policy_runner.rollout(epoch)
    rnd_policy_runner.update()

100%|██████████| 250/250 [33:41<00:00,  8.08s/it]   


In [118]:
# Record and show one episode of the RND runner's `alg`. The caption is printed
# after the call because visualize() clears the cell output before displaying
# the video.
visualize(rnd_policy_runner.alg)
print("RND PPO trained agent")

RND PPO trained agent


### Reward normalization only

In [119]:

# RND configuration with reward_normalization switched on; the other settings
# match the plain RND config.
ppo_rnd_reward_normalization_cfg = PPOConfig(
    init_noise_std=1.0,
    clip_params=0.2,
    use_rnd=True,
    reward_normalization=True,
)


In [120]:
# Train PPO + RND with reward normalization: each epoch calls rollout(epoch)
# followed by update().
num_epochs = 250
rnd_reward_norm_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_reward_normalization_cfg,
    num_mini_epochs=10,
    device=device,
)
# Point the runner's writer at a named directory under `runs`, the logdir that
# TensorBoard is launched with in this notebook.
# NOTE(review): if PolicyRunner already opens a writer on construction, that
# writer is replaced here without being closed — confirm.
rnd_reward_norm_policy_runner.writer = SummaryWriter(log_dir="runs/RND_reward_normalization")
for epoch in tqdm(range(num_epochs)):
    rnd_reward_norm_policy_runner.rollout(epoch)
    rnd_reward_norm_policy_runner.update()

 17%|█▋        | 43/250 [05:01<24:09,  7.00s/it]   


KeyboardInterrupt: 

In [None]:
# Record and show one episode of the reward-normalization runner's `alg`. The
# caption is printed after the call because visualize() clears the cell output
# before displaying the video.
# NOTE(review): the saved output of the training cell above ends in a
# KeyboardInterrupt at 43/250 epochs, so the agent shown in the saved output was
# only partially trained — re-run training to completion before comparing.
visualize(rnd_reward_norm_policy_runner.alg)
print("RND PPO trained agent with reward normalization")

RND PPO trained agent with reward normalization


### Reward and observation normalization

In [None]:
# RND configuration with both reward_normalization and obs_normalization
# switched on; the other settings match the plain RND config.
ppo_rnd_obs_normalization_cfg = PPOConfig(
    init_noise_std=1.0,
    clip_params=0.2,
    use_rnd=True,
    reward_normalization=True,
    obs_normalization=True,
)


In [None]:
# Train PPO + RND with reward and observation normalization: each epoch calls
# rollout(epoch) followed by update().
num_epochs = 250
rnd_obs_norm_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_obs_normalization_cfg,
    num_mini_epochs=10,
    device=device,
)
# Point the runner's writer at a named directory under `runs`, the logdir that
# TensorBoard is launched with in this notebook.
# NOTE(review): if PolicyRunner already opens a writer on construction, that
# writer is replaced here without being closed — confirm.
rnd_obs_norm_policy_runner.writer = SummaryWriter(log_dir="runs/RND_obs_normalization")
for epoch in tqdm(range(num_epochs)):
    rnd_obs_norm_policy_runner.rollout(epoch)
    rnd_obs_norm_policy_runner.update()

100%|██████████| 250/250 [08:22<00:00,  2.01s/it]


In [None]:
# Record and show one episode of the observation-normalization runner's `alg`.
# The caption is printed after the call because visualize() clears the cell
# output before displaying the video.
visualize(rnd_obs_norm_policy_runner.alg)
print("RND PPO trained agent with observation normalization")

RND PPO trained agent with observation normalization
