# IMPORTS

In [27]:
from easypip import easyimport
import functools
import time

easyimport("importlib_metadata==4.13.0")
OmegaConf = easyimport("omegaconf").OmegaConf
bbrl_gym = easyimport("bbrl_gym")
bbrl = easyimport("bbrl>=0.1.6")

import os
import copy
import time
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
from torch.distributions.normal import Normal
from torch.distributions.independent import Independent

import gym
from bbrl.agents.agent import Agent
from bbrl import get_arguments, get_class, instantiate_class
from bbrl.workspace import Workspace
from bbrl.agents import Agents, RemoteAgent, TemporalAgent
from bbrl.agents.gymb import AutoResetGymAgent, NoAutoResetGymAgent
from bbrl.visu.play import load_agent, play
from bbrl.utils.replay_buffer import ReplayBuffer
from bbrl.utils.functionalb import gae

# !rm -rf gym-rocketlander
# !git clone https://github.com/EmbersArc/gym-rocketlander

# !conda uninstall gym-rocketlander
!pip install -e ./gym-rocketlander

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

Obtaining file:///mnt/c/Users/karna/Desktop/Projects/MasterArtificialIntelligencePTs/S3/RLD/TME_bonus/gym-rocketlander
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: gym-rocketlander
  Attempting uninstall: gym-rocketlander
    Found existing installation: gym-rocketlander 0.0.1
    Uninstalling gym-rocketlander-0.0.1:
      Successfully uninstalled gym-rocketlander-0.0.1
  Running setup.py develop for gym-rocketlander
Successfully installed gym-rocketlander-0.0.1


# ROCKETLANDER ENV

In [28]:
class RocketLanderWrapper(gym.Wrapper):
    """
    Specific wrapper to shape the reward of the rocket lander environment
    """
    def __init__(self, env):
        super(RocketLanderWrapper, self).__init__(env)
        self.env = env
        self.prev_shaping = None
        
    def reset(self):
        self.prev_shaping = None
        return self.env.reset()

    def step(self, action):
        d = 1
        next_state, reward, done, info = self.env.step(action)
        # reward shaping
        """
        shaping = -0.5 * (self.env.distance + self.env.speed + abs(self.env.angle) ** 2)
        shaping += 0.1 * (
            self.env.legs[0].ground_contact + self.env.legs[1].ground_contact
        )
        if self.prev_shaping is not None:
            reward += shaping - self.prev_shaping
        self.prev_shaping = shaping
        """
        # print ("distance", self.env.distance)
        
        # shaping = 0.02
        shaping = 0.008 * (1 - self.env.distance)
        # shaping = 0.1 * (self.env.groundcontact - self.env.speed)
        if (
            self.env.legs[0].ground_contact > 0
            and self.env.legs[1].ground_contact > 0
            and self.env.speed < 0.1
        ):
            d = d * 2
            print("landed !")
            print ("speed", self.env.speed)
            shaping += 6.0 * d / self.env.speed
        else:
          d = 1
        reward += shaping
        return next_state, reward, done, info

    def old_step(self, action):
        next_state, reward, done, info = self.env.step(action)
        # reward shaping
        # shaping = -0.5 * (self.env.distance + self.env.speed + abs(self.env.angle) ** 2)
        # shaping += 0.1 * (self.env.legs[0].ground_contact + self.env.legs[1].ground_contact)
        shaping = 0
        if self.prev_shaping is not None:
            reward += shaping - self.prev_shaping
        self.prev_shaping = shaping

        return next_state, reward, done, info



class FrameSkip(gym.Wrapper):
    """
    Return only every ``skip``-th frame (frameskipping)
    :param env: the environment
    :param skip: number of ``skip``-th frame
    """

    def __init__(self, env: gym.Env, skip: int = 1):
        super().__init__(env)
        self._skip = skip

    def step(self, action: np.ndarray):
        """
        Step the environment with the given action
        Repeat action, sum reward, and max over last observations.
        :param action: the action
        :return: observation, reward, done, information
        """
        total_reward = 0.0
        done = None
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break

        return obs, total_reward, done, info

    def reset(self):
        return self.env.reset()

In [31]:
env = gym.make("RocketLander-v0")
env = RocketLanderWrapper(env)
env = FrameSkip(env, skip=1)
obs_space = env.observation_space
action_space = env.action_space
print("The observation space: {}".format(obs_space))
print("The action space: {}".format(action_space))

# env = gym.make("LunarLander-v2")
# obs_space = env.observation_space
# action_space = env.action_space
# print("The observation space: {}".format(obs_space))
# print("The action space: {}".format(action_space))

# env = gym.make("LunarLanderContinuous-v2")
# obs_space = env.observation_space
# action_space = env.action_space
# print("The observation space: {}".format(obs_space))
# print("The action space: {}".format(action_space))

The observation space: Box([ -1.  -1.  -1.  -1.  -1.  -1.  -1. -inf -inf -inf], [ 1.  1.  1.  1.  1.  1.  1. inf inf inf], (10,), float32)
The action space: Box([-1. -1. -1.], [1. 1. 1.], (3,), float32)


# GENERAL FUNCTIONS

In [27]:
def build_mlp(sizes, activation, output_activation=nn.Identity()):
    layers = []
    for j in range(len(sizes) - 1):
        act = activation if j < len(sizes) - 2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j + 1]), act]
    return nn.Sequential(*layers)

def make_gym_env(env_name):
	rocket_env = gym.make(env_name)
	rocket_env = RocketLanderWrapper(rocket_env)
	return FrameSkip(rocket_env, skip=1)

def get_env_agents(cfg):
    train_env_agent = AutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        cfg.algorithm.n_envs,
        cfg.algorithm.seed,
    )
    eval_env_agent = NoAutoResetGymAgent(
    get_class(cfg.gym_env),
    get_arguments(cfg.gym_env),
    cfg.algorithm.nb_evals,
    cfg.algorithm.seed,
    )
    return train_env_agent, eval_env_agent

# LOGGER

In [28]:
class Logger():
  def __init__(self, cfg, log_string):
    self.logger = instantiate_class(cfg.logger)

  def add_log(self, log_string, loss, epoch):
    self.logger.add_scalar(log_string, loss.item(), epoch)

  def add_q_norms(self, q_norm, target_q_norm, epoch):
    # my_summary_writer.add_scalars(f'loss/check_info', {
    #                               'score': score[iteration],
    #                               'score_nf': score_nf[iteration],
    #                           }, iteration)
    # self.logger.add_scalar(log_string, q_norm, epoch)

    # self.logger.add_scalar(f'q_value/q_norm',    q_norm,    epoch)
    # self.logger.add_scalar(f'q_value/target_q_norm', target_q_norm, epoch)

    self.logger.add_scalars(f'q_value', {
        'q_norm': q_norm,
        'target_q_norm': target_q_norm,
    }, epoch)
    
  def add_q_norms2(self, q_norm1, q_norm2, target, epoch):
    self.logger.add_scalars(f'q_value', {
        'q_norm1': q_norm1,
        'q_norm2': q_norm2,
        'target_q_norm': target,
    }, epoch)

  # def log_losses(self, cfg, epoch, critic_loss, entropy_loss, a2c_loss):
  #   self.add_log("critic_loss", critic_loss, epoch)
  #   self.add_log("entropy_loss", entropy_loss, epoch)
  #   self.add_log("a2c_loss", a2c_loss, epoch)

  def log_losses(self, epoch, critic_loss, entropy_loss, actor_loss):
    self.add_log("critic_loss", critic_loss, epoch)
    self.add_log("entropy_loss", entropy_loss, epoch)
    self.add_log("actor_loss", actor_loss, epoch)

# PPO

In [29]:
class BaseActor(Agent):
    def copy_parameters(self, other):
        """Copy parameters from other agent"""
        for self_p, other_p in zip(self.parameters(), other.parameters()):
            self_p.data.copy_(other_p)

        

class DiscreteActor(BaseActor):
    def __init__(self, state_dim, hidden_size, n_actions):
        super().__init__()
        self.model = build_mlp(
            [state_dim] + list(hidden_size) + [n_actions], activation=nn.ReLU()
        )
        
    def dist(self, obs):
        scores = self.model(obs)
        probs = torch.softmax(scores, dim=-1)
        return torch.distributions.Categorical(probs)

    def forward(self, t, *, stochastic=True, predict_proba=False, compute_entropy=False, **kwargs):
        """
        Compute the action given either a time step (looking into the workspace)
        or an observation (in kwargs)
        """
        if "observation" in kwargs:
            observation = kwargs["observation"]
        else:
            observation = self.get(("env/env_obs", t))
        scores = self.model(observation)
        probs = torch.softmax(scores, dim=-1)

        if predict_proba:
            action = self.get(("action", t))
            log_prob = probs[torch.arange(probs.size()[0]), action].log()
            self.set(("logprob_predict", t), log_prob)
        else:
            if stochastic:
                action = torch.distributions.Categorical(probs).sample()
            else:
                action = scores.argmax(1)

            log_probs = probs[torch.arange(probs.size()[0]), action].log()

            self.set(("action", t), action)
            self.set(("action_logprobs", t), log_probs)

        if compute_entropy:
            entropy = torch.distributions.Categorical(probs).entropy()
            self.set(("entropy", t), entropy)

    def predict_action(self, obs, stochastic):
        scores = self.model(obs)

        if stochastic:
            probs = torch.softmax(scores, dim=-1)
            action = torch.distributions.Categorical(probs).sample()
        else:
            action = scores.argmax(0)
        return action



class TunableVarianceContinuousActor(BaseActor):
    def __init__(self, state_dim, hidden_layers, action_dim):
        super().__init__()
        layers = [state_dim] + list(hidden_layers) + [action_dim]
        self.model = build_mlp(layers, activation=nn.ReLU())

        # The standard deviation associated with each dimension
        self.std_param = nn.parameter.Parameter(torch.randn(action_dim, 1))
        
        # We use the softplus function to compute the variance for the normal
        # The base version computes exp(1+log(x)) component-wise
        # https://pytorch.org/docs/stable/generated/torch.nn.Softplus.html
        self.soft_plus = torch.nn.Softplus()

    def dist(self, obs: torch.Tensor):
        mean = self.model(obs)
        return Independent(Normal(mean, self.soft_plus(self.std_param)), 1)    
            
    def forward(self, t, *, stochastic=True, predict_proba=False, compute_entropy=False, **kwargs):
        obs = self.get(("env/env_obs", t))
        dist = self.dist(obs)

        if predict_proba:
            action = self.get(("action", t))
            self.set(("logprob_predict", t), dist.log_prob(action))
        else:
            action = dist.sample() if stochastic else dist.mean
            logp_pi = dist.log_prob(action)

            self.set(("action", t), action)
            self.set(("action_logprobs", t), logp_pi)

        if compute_entropy:
            self.set(("entropy", t), dist.entropy())

    def predict_action(self, obs, stochastic):
        """Predict just one action (without using the workspace)"""
        dist = self.dist(obs)
        action = dist.sample() if stochastic else dist.mean
        return action



class VAgent(Agent):
    def __init__(self, state_dim, hidden_layers):
        super().__init__()
        self.is_q_function = False
        self.model = build_mlp(
            [state_dim] + list(hidden_layers) + [1], activation=nn.ReLU()
        )

    def forward(self, t, **kwargs):
        observation = self.get(("env/env_obs", t))
        critic = self.model(observation).squeeze(-1)
        self.set(("v_value", t), critic)



class KLAgent(Agent):
    def __init__(self, model_1, model_2):
        super().__init__()
        self.model_1 = model_1
        self.model_2 = model_2

    def forward(self, t, **kwargs):
        obs = self.get(("env/env_obs", t))
        
        dist_1 = self.model_1.dist(obs)
        dist_2 = self.model_2.dist(obs)
        kl = torch.distributions.kl.kl_divergence(dist_1, dist_2)
        self.set(("kl", t), kl)




def create_ppo_agent(cfg, train_env_agent, eval_env_agent, needs_kl=None):
    obs_size, act_size = train_env_agent.get_obs_and_actions_sizes()

    if train_env_agent.is_continuous_action():
        action_agent = TunableVarianceContinuousActor(
            obs_size, cfg.algorithm.architecture.actor_hidden_size, act_size
        )
    else:
        action_agent = DiscreteActor(
            obs_size, cfg.algorithm.architecture.actor_hidden_size, act_size
        )

    tr_agent = Agents(train_env_agent, action_agent)
    ev_agent = Agents(eval_env_agent, action_agent)

    critic_agent = TemporalAgent(
        VAgent(obs_size, cfg.algorithm.architecture.critic_hidden_size)
    )

    train_agent = TemporalAgent(tr_agent)
    eval_agent = TemporalAgent(ev_agent)
    train_agent.seed(cfg.algorithm.seed)

    old_policy = copy.deepcopy(action_agent)
    old_critic_agent = copy.deepcopy(critic_agent)
    
    kl_agent = None
    if needs_kl:
        kl_agent = TemporalAgent(KLAgent(old_policy, action_agent))

    return action_agent, train_agent, eval_agent, critic_agent, old_policy, old_critic_agent, kl_agent




def compute_advantage_loss(cfg, reward, must_bootstrap, v_value):
    # Compute temporal difference with GAE
    advantage = gae(
        v_value,
        reward,
        must_bootstrap,
        cfg.algorithm.discount_factor,
        cfg.algorithm.gae,
    )
    # Compute critic loss
    td_error = advantage**2
    critic_loss = td_error.mean()
    return critic_loss, advantage




def compute_clip_agent_loss(cfg, advantage, ratio, kl):
    """Computes the PPO CLIP loss
    """
    clip_range = cfg.clip_range
    actor_loss = torch.min(
        advantage * ratio,
        advantage * torch.clamp(ratio, 1 - clip_range, 1 + clip_range)
    ).mean()
    return actor_loss




def setup_optimizer(cfg, action_agent, critic_agent):
    optimizer_args = get_arguments(cfg.optimizer)
    parameters = nn.Sequential(action_agent, critic_agent).parameters()
    optimizer = get_class(cfg.optimizer)(parameters, **optimizer_args)
    return optimizer




def run_ppo(cfg, log_string, variant="clip", compute_actor_loss=compute_clip_agent_loss, needs_kl=False, verbose=False):
    # 1)  Build the  logger
    logger = Logger(cfg, log_string)
    best_reward = -10e9

    # 2) Create the environment agent
    train_env_agent = AutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        cfg.algorithm.n_envs,
        cfg.algorithm.seed,
    )
    
    eval_env_agent = NoAutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        cfg.algorithm.nb_evals,
        cfg.algorithm.seed,
    )

    (
        policy,
        train_agent,
        eval_agent,
        critic_agent,
        old_policy,
        old_critic_agent,
        kl_agent
    ) = create_ppo_agent(cfg, train_env_agent, eval_env_agent, needs_kl=needs_kl)
    
    
    action_agent = TemporalAgent(policy)
    old_train_agent = TemporalAgent(old_policy)
    train_workspace = Workspace()

    # Configure the optimizer
    optimizer = setup_optimizer(cfg, train_agent, critic_agent)
    nb_steps = 0
    tmp_steps = 0

    # Training loop
    for epoch in tqdm(range(cfg.algorithm.max_epochs)):
        # Execute the agent in the workspace
        
        # Handles continuation
        delta_t = 0
        if epoch > 0:
            train_workspace.zero_grad()
            delta_t = 1
            train_workspace.copy_n_last_steps(delta_t)

        # Run the train/old_train agents
        train_agent(
            train_workspace,
            t=delta_t,
            n_steps=cfg.algorithm.n_steps - delta_t,
            stochastic=True,
            predict_proba=False,
            compute_entropy=False
        )
        old_train_agent(
            train_workspace,
            t=delta_t,
            n_steps=cfg.algorithm.n_steps - delta_t,
            # Just computes the probability
            predict_proba=True,
        )

        # Compute the critic value over the whole workspace
        critic_agent(train_workspace, n_steps=cfg.algorithm.n_steps)

        transition_workspace = train_workspace.get_transitions()
        done, truncated, reward, action, action_logp, v_value = transition_workspace[
            "env/done",
            "env/truncated",
            "env/reward",
            "action",
            "action_logprobs",
            "v_value",
        ]

        nb_steps += action[0].shape[0]

        # Determines whether values of the critic should be propagated
        # True if the episode reached a time limit or if the task was not done
        # See https://colab.research.google.com/drive/1W9Y-3fa6LsPeR6cBC1vgwBjKfgMwZvP5?usp=sharing
        must_bootstrap = torch.logical_or(~done[1], truncated[1])

        with torch.no_grad():
            old_critic_agent(train_workspace, n_steps=cfg.algorithm.n_steps)
        old_action_logp = transition_workspace["logprob_predict"].detach()
        old_v_value = transition_workspace["v_value"]
        if cfg.algorithm.clip_range_vf > 0:
            # Clip the difference between old and new values
            # NOTE: this depends on the reward scaling
            v_value = old_v_value + torch.clamp(
                v_value - old_v_value,
                -cfg.algorithm.clip_range_vf,
                cfg.algorithm.clip_range_vf,
            )
            
        critic_loss, advantage = compute_advantage_loss(
            cfg, reward, must_bootstrap, v_value
        )
        
        # We store the advantage into the transition_workspace
        advantage = advantage.detach().squeeze(0)
        transition_workspace.set("advantage", 0, advantage)
        transition_workspace.set("advantage", 1, torch.zeros_like(advantage))
        transition_workspace.set_full("old_action_logprobs", transition_workspace["logprob_predict"].detach())
        transition_workspace.clear("logprob_predict")
    
        for opt_epoch in range(cfg.algorithm.opt_epochs):
            if cfg.algorithm.minibatch_size > 0:
                sample_workspace = transition_workspace.sample_subworkspace(1, cfg.algorithm.minibatch_size, 2)
            else:
                sample_workspace = transition_workspace
                                 
            if opt_epoch > 0:
                critic_loss = 0. # We don't want to optimize the critic after the first mini-epoch

            action_agent(sample_workspace, t=0, n_steps=1, compute_entropy=True, predict_proba=True)

            advantage, action_logp, old_action_logp, entropy = sample_workspace[
                "advantage",
                "logprob_predict",
                "old_action_logprobs",
                "entropy"
            ]
            advantage = advantage[0]
            act_diff = action_logp[0] - old_action_logp[0]
            ratios = act_diff.exp()

            kl = None
            if kl_agent:
                kl_agent(sample_workspace, t=0, n_steps=1)
                kl = sample_workspace["kl"][0]

            actor_loss = compute_actor_loss(
                cfg.algorithm, advantage, ratios, kl
            )
                            
            # Entropy loss favor exploration
            entropy_loss = torch.mean(entropy[0])

            # Store the losses for tensorboard display
            if opt_epoch == 0:
                # Just for the first epoch
                logger.log_losses(nb_steps, critic_loss, entropy_loss, actor_loss)

            loss = (
                cfg.algorithm.critic_coef * critic_loss
                - cfg.algorithm.actor_coef * actor_loss
                - cfg.algorithm.entropy_coef * entropy_loss
            )
            

            old_policy.copy_parameters(policy)
            old_critic_agent = copy.deepcopy(critic_agent)


            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                critic_agent.parameters(), cfg.algorithm.max_grad_norm
            )
            torch.nn.utils.clip_grad_norm_(
                train_agent.parameters(), cfg.algorithm.max_grad_norm
            )
            optimizer.step() 

        # Evaluate if enough steps have been performed
        if nb_steps - tmp_steps > cfg.algorithm.eval_interval:
            tmp_steps = nb_steps
            eval_workspace = Workspace()  # Used for evaluation
            eval_agent(
                eval_workspace,
                t=0,
                stop_variable="env/done",
                stochastic=True,
                predict_proba=False,
            )
            rewards = eval_workspace["env/cumulated_reward"][-1]
            mean = rewards.mean()
            logger.add_log("reward_mean", mean, nb_steps)
            logger.add_log("reward_max", rewards.max(), nb_steps)
            logger.add_log("reward_min", rewards.min(), nb_steps)
            logger.add_log("reward_std", rewards.std(), nb_steps)
            if verbose: print(f"nb_steps: {nb_steps}, reward: {mean}")
            if cfg.save_best and mean > best_reward:
                best_reward = mean
                directory = f"./ppo_agent/{cfg.gym_env.env_name}/{variant}/"
                if not os.path.exists(directory):
                    os.makedirs(directory)
                filename = directory + "ppo_" + str(mean.item()) + ".agt"
                policy.save_model(filename)

# SAC

# ROCKETLANDER / PPO

In [30]:
ENV = "RocketLander-v0"

In [31]:
# %reload_ext tensorboard
%tensorboard --logdir ./runs/rocketlander/discrete

UsageError: Line magic function `%tensorboard` not found.


In [32]:
# n_envss = [8, 16]
gae_lambdas = [0.90, 0.94, 0.98]
gammas = [0.95, 0.99, 0.999]
# ent_coefs = [1e-4, 1e-3, 1e-2]
archs = [[64, 64], [128, 128]]
lrs = [1e-4, 1e-3]

for gae_lambda in gae_lambdas:
	for gamma in gammas:
		for arch in archs:
			for lr in lrs:
				log_string = "PPO_gae_lambda_"+str(gae_lambda)+"_gamma_"+str(gamma)+"_arch_"+str(arch)+"_lr_"+str(lr)

				params_ppo={
				"save_best": True,
				"plot_policy": True,

				"logger":{
					"classname": "bbrl.utils.logger.TFLogger",
					"log_dir": f"{os.getcwd()}/runs/{log_string}",
					"cache_size": 10000,
					"every_n_seconds": 10,
					"verbose": False,    
					},

				"algorithm":{
					"seed": 4,
					"n_envs": 8,
					"max_grad_norm": 0.5,
					"nb_evals":10,
					"n_steps": 20,
					"eval_interval": 1000,
					"max_epochs": 3000, # 3000
					"discount_factor": gamma,
					"entropy_coef": 2.55e-5,
					"beta_kl": 1,
					"critic_coef": 0.6,
					"actor_coef": 1.0,
					"gae": gae_lambda,
					"clip_range": 0.2,
					"clip_range_vf": 0,
					"opt_epochs": 1,
					"minibatch_size": 0,
					"architecture":{
					"actor_hidden_size": arch,
					"critic_hidden_size": arch,
					},
				},
				"gym_env":{
					"classname": "__main__.make_gym_env",
					"env_name": ENV,
					},
				"optimizer":{
					"classname": "torch.optim.Adam",
					"lr": lr,
				}
				}

				config_ppo=OmegaConf.create(params_ppo)
				torch.manual_seed(config_ppo.algorithm.seed)

				print("--> Testing HP")
				print("       gae_lambda:", gae_lambda)
				print("       gamma:", gamma)
				print("       arch:", arch)
				print("       lr:", lr)
				start = time.time()
				run_ppo(config_ppo, log_string, verbose=False)
				print("--> Done in:", time.time() - start, "s")

# 36 fits, 15min per fits --> roughly 9 hours run

--> Testing HP
       gae_lambda: 0.9
       gamma: 0.95
       arch: [64, 64]
       lr: 0.0001


  0%|          | 0/3000 [00:00<?, ?it/s]


RuntimeError: The size of tensor a (8) must match the size of tensor b (3) at non-singleton dimension 0