# Random Network Distillation 

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import gymnasium as gym
from gym.wrappers import RecordVideo
from IPython.display import Video, display, clear_output
from tqdm import tqdm
import torch 
import gymnasium_robotics

# Select the torch backend in priority order (Apple MPS, then CUDA, then CPU)
# and make it the default device for newly created tensors.
device = torch.device(
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
torch.set_default_device(device)
print(f"Using device: {device}")

from rnd_rl.runner.policy_runner import PPOConfig, PolicyRunner

# Register the gymnasium_robotics environments with Gymnasium so that `gym.make`
# can resolve the PointMaze ids used in the cells below.
gym.register_envs(gymnasium_robotics)

In [None]:
# @title Visualization code. Used later.
# Custom 8x8 layout passed as `maze_map` to the PointMaze environments.
# Cell codes follow the Gymnasium-Robotics maze convention:
#   1 = wall, 0 = free cell, "g" = goal location, "r" = agent reset location.
# The goal (top-left) and the reset cell (bottom-right) are placed far apart
# to force a longer path.
fixed_goal_maze = [
    [1, 1, 1, 1, 1, 1, 1, 1],
    [1, "g", 0, 1, 1, 0, 0, 1],
    [1, 0, 0, 1, 0, 0, 0, 1],
    [1, 1, 0, 0, 0, 1, 1, 1],
    [1, 0, 0, 1, 0, 0, 0, 1],
    [1, 0, 1, 0, 0, 1, 0, 1],
    [1, 0, 0, 0, 1, "r", 0, 1],
    [1, 1, 1, 1, 1, 1, 1, 1],
]

def visualize(agent):
    """Roll out ``agent`` for one episode in the fixed-goal maze and show the video.

    A single ``PointMaze_Medium-v3`` environment is created with
    ``fixed_goal_maze``, wrapped so that every episode is recorded into
    ``./videos``, and stepped with actions from ``agent.get_action`` for at most
    4096 steps or until the episode ends (terminated or truncated). The most
    recently written ``.mp4`` in the video directory is then embedded in the
    notebook output, replacing whatever the cell displayed before.

    Args:
        agent: Object exposing ``get_action(obs)``. It is called with the
            current observation as a tensor on ``device`` with a leading batch
            axis of size 1; the first element of its return value is used as
            the batch of actions.

    Raises:
        FileNotFoundError: If no ``.mp4`` file exists in the video directory
            after the rollout.
    """
    # The environment is created through Gymnasium, so record it with
    # Gymnasium's own wrapper rather than the legacy `gym` one imported at the
    # top of the notebook.
    from gymnasium.wrappers import RecordVideo

    video_dir = "./videos"  # Directory to save videos
    os.makedirs(video_dir, exist_ok=True)

    # `rgb_array` rendering is needed so the recorder can capture frames.
    env = gym.make(
        "PointMaze_Medium-v3",
        render_mode="rgb_array",
        maze_map=fixed_goal_maze,
        continuing_task=False,
    )
    # Record every episode.
    env = RecordVideo(env, video_folder=video_dir, episode_trigger=lambda episode_id: True)

    try:
        obs, _ = env.reset()
        obs = obs["observation"]

        for _ in range(4096):
            # Pure inference: no gradients are needed for playback.
            with torch.no_grad():
                actions, _ = agent.get_action(torch.Tensor(obs)[None, :].to(device))
            obs, _, terminated, truncated, _ = env.step(actions.squeeze(0).cpu().numpy())
            obs = obs["observation"]

            # Stop on either end-of-episode signal; stepping an environment
            # after truncation without a reset is not valid Gymnasium usage.
            if terminated or truncated:
                break
    finally:
        # Always close the env so the recorder finalizes the video file, even
        # if the rollout raised.
        env.close()

    # Pick the newest recording by modification time. A lexicographic sort of
    # the directory listing is not chronological and may select non-video files.
    recordings = [
        os.path.join(video_dir, name)
        for name in os.listdir(video_dir)
        if name.endswith(".mp4")
    ]
    if not recordings:
        raise FileNotFoundError(f"No .mp4 recording found in {video_dir}")
    video_path = max(recordings, key=os.path.getmtime)

    clear_output(wait=True)
    display(Video(video_path, embed=True))

In [None]:
# Number of maze copies bundled into the vectorized training environment.
n_envs = 64


def _make_maze_env():
    """Build one copy of the fixed-goal PointMaze used for training."""
    return gym.make(
        "PointMaze_Medium-v3",
        maze_map=fixed_goal_maze,
        continuing_task=False,
    )


# SyncVectorEnv takes one factory per sub-environment and calls each of them.
envs = gym.vector.SyncVectorEnv([_make_maze_env for _ in range(n_envs)])

### PPO baseline

In [None]:
# Baseline configuration: PPO with the RND option switched off.
ppo_cfg = PPOConfig(
    init_noise_std=1.0,
    clip_params=0.2,
    use_rnd=False,
)

In [None]:
# Train the PPO baseline on the shared vectorized maze environments.
num_epochs = 250
policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_cfg,
    experiment_name="PPO_maze_med_far",
    device=device,
    dict_obs_space=True,
    num_mini_epochs=10,
    # 128 was also tried; per the original note it makes it difficult for the
    # agent to reach the goal.
    num_steps_per_env=256,
)

# Each epoch collects a rollout and then runs an update on it.
for epoch in tqdm(range(num_epochs)):
    policy_runner.rollout(epoch)
    policy_runner.update()

In [None]:
# Record and display one episode driven by the PPO baseline runner's algorithm object.
visualize(policy_runner.alg)
print("PPO trained agent")

### PPO with RND
Finding: if the intrinsic reward always dominates, the agent cannot improve, because it cannot tell which actions lead to the true extrinsic reward. This requires reward normalization (applied only to the intrinsic rewards, zero-centered), and possibly rescaling as well. The original paper worked because game scores are much larger than 1.

### Reward normalization only

In [None]:

# PPO + RND configuration with reward normalization enabled.
ppo_rnd_reward_normalization_cfg = PPOConfig(
    init_noise_std=1.0,
    clip_params=0.2,
    use_rnd=True,
    reward_normalization=True,
    # Kept small: excessive intrinsic reward might degrade training.
    intrinsic_reward_scale=0.1,
)


In [None]:
# Train PPO + RND (reward normalization) on the shared vectorized maze environments.
num_epochs = 250
rnd_reward_norm_policy_runner = PolicyRunner(
    envs=envs,
    policy_cfg=ppo_rnd_reward_normalization_cfg,
    experiment_name="PPO_RND_rew_norm_maze_med_far",
    device=device,
    dict_obs_space=True,
    num_mini_epochs=10,
    # 128 was also tried; per the original note it makes it difficult for the
    # agent to reach the goal.
    num_steps_per_env=256,
)

# Each epoch collects a rollout and then runs an update on it.
for epoch in tqdm(range(num_epochs)):
    rnd_reward_norm_policy_runner.rollout(epoch)
    rnd_reward_norm_policy_runner.update()

In [None]:
# Record and display one episode driven by the RND + reward-normalization runner's algorithm object.
visualize(rnd_reward_norm_policy_runner.alg)
print("RND PPO trained agent with reward normalization")

In [None]:
# episode length
# print(torch.nonzero(rnd_reward_norm_policy_runner.traj_data.extrinsic_rewards)) 


In [None]:
# Euclidean norm of two slices of the stored rollout states from the RND run.
# NOTE(review): `states[2]` / `states[3]` index the FIRST axis of the buffer. If
# the intent is the magnitude of two per-observation components (e.g. the
# velocity entries), this probably needs `states[..., 2]` / `states[..., 3]` —
# confirm against the layout of `traj_data.states`.
magnitude = (
    rnd_reward_norm_policy_runner.traj_data.states[2].pow(2)
    + rnd_reward_norm_policy_runner.traj_data.states[3].pow(2)
).sqrt()

print(magnitude.mean())
print(magnitude.max())

# print(magnitude)

### Reward and observation normalization
Finding: this might work better for the original implementation, whose image-embedding observations have an unknown value range. Here, it only helps if the agent can reach most of the observation space within the first several steps, which is not the case.

### Reward and observation normalization
Finding: this might work better for the original implementation, whose image-embedding observations have an unknown value range. Here, it only helps if the agent can reach most of the observation space within the first several steps, which is not the case.