In [2]:
!pip install easypip

from easypip import easyimport
import functools
import time

easyimport("importlib_metadata==4.13.0")
OmegaConf = easyimport("omegaconf").OmegaConf
bbrl_gym = easyimport("bbrl_gym")
bbrl = easyimport("bbrl>=0.1.6")

import os
import copy
import time
from tqdm import tqdm
import numpy as np

from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
from torch.distributions.normal import Normal
from torch.distributions.independent import Independent

import gym
from bbrl.agents.agent import Agent
from bbrl import get_arguments, get_class, instantiate_class
from bbrl.workspace import Workspace
from bbrl.agents import Agents, RemoteAgent, TemporalAgent
from bbrl.agents.gymb import AutoResetGymAgent, NoAutoResetGymAgent
from bbrl.visu.play import load_agent, play
from bbrl.utils.replay_buffer import ReplayBuffer
from bbrl.utils.functionalb import gae

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'



In [3]:
def build_mlp(sizes, activation, output_activation=nn.Identity()):
    """Assemble a fully connected network as an ``nn.Sequential``.

    One ``nn.Linear`` is created for every consecutive pair of entries in
    ``sizes``. Each linear layer except the last is followed by
    ``activation``; the last one is followed by ``output_activation``.
    The activation objects are inserted as given, so the same instance is
    reused at every position where it appears.

    Args:
        sizes: sequence of layer widths, input width first, output width last.
        activation: module placed after every hidden linear layer.
        output_activation: module placed after the final linear layer.

    Returns:
        The ``nn.Sequential`` holding the alternating linear/activation layers
        (empty when ``sizes`` has fewer than two entries).
    """
    final_index = len(sizes) - 2
    modules = []
    for index, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        modules.append(nn.Linear(n_in, n_out))
        modules.append(activation if index < final_index else output_activation)
    return nn.Sequential(*modules)

def make_gym_env(env_name):
    """Create the gym environment registered under ``env_name``.

    This is the factory referenced by the ``gym_env.classname`` entry of the
    configuration built later in this notebook.
    """
    env = gym.make(env_name)
    return env

def get_env_agents(cfg):
    """Build the training and evaluation environment agents described by ``cfg``.

    Both agents use the environment factory and arguments found under
    ``cfg.gym_env`` and the seed ``cfg.algorithm.seed``. The training agent is
    an ``AutoResetGymAgent`` sized by ``cfg.algorithm.n_envs``; the evaluation
    agent is a ``NoAutoResetGymAgent`` sized by ``cfg.algorithm.nb_evals``.

    Returns:
        A ``(train_env_agent, eval_env_agent)`` tuple.
    """

    def _make(agent_cls, n_instances):
        # The factory and its arguments are resolved afresh for each agent,
        # so the two agents never share the same argument object.
        return agent_cls(
            get_class(cfg.gym_env),
            get_arguments(cfg.gym_env),
            n_instances,
            cfg.algorithm.seed,
        )

    train_env_agent = _make(AutoResetGymAgent, cfg.algorithm.n_envs)
    eval_env_agent = _make(NoAutoResetGymAgent, cfg.algorithm.nb_evals)
    return train_env_agent, eval_env_agent

In [4]:
class Logger():
    """Thin wrapper around the scalar logger instantiated from ``cfg.logger``."""

    def __init__(self, cfg, log_string):
        # ``log_string`` is accepted but not used by this class.
        self.logger = instantiate_class(cfg.logger)

    def add_log(self, log_string, loss, epoch):
        """Record ``loss.item()`` under the tag ``log_string`` at step ``epoch``."""
        value = loss.item()
        self.logger.add_scalar(log_string, value, epoch)

    def add_eps(self, log_string, eps, epoch):
        """Record the plain number ``eps`` under the tag ``log_string`` at step ``epoch``."""
        self.logger.add_scalar(log_string, eps, epoch)

    def log_losses(self, epoch, critic_loss, entropy_loss, actor_loss):
        """Record the critic, entropy and actor losses (in that order) at step ``epoch``."""
        tagged_losses = (
            ("critic_loss", critic_loss),
            ("entropy_loss", entropy_loss),
            ("actor_loss", actor_loss),
        )
        for tag, loss in tagged_losses:
            self.add_log(tag, loss, epoch)

In [10]:
class DiscreteQAgent(Agent):
    """BBRL critic for discrete actions: an MLP mapping observations to Q-values."""

    def __init__(self, state_dim, hidden_layers, action_dim):
        super().__init__()
        layer_sizes = [state_dim, *hidden_layers, action_dim]
        self.model = build_mlp(layer_sizes, activation=nn.ReLU())

    def forward(self, t: int, choose_action=True, **kwargs):
        """Write the Q-values (and optionally the greedy action) for time ``t``.

        Reads ``("env/env_obs", t)`` from the workspace, runs the MLP on it and
        stores the result under ``("q_values", t)`` (one value per action).

        When ``choose_action`` is true the arg-max action is also stored under
        ``("action", t)``. Pass ``choose_action=False`` when another agent
        (e.g. an epsilon-greedy selector) is responsible for picking the action.
        """
        observation = self.get(("env/env_obs", t))
        scores = self.model(observation)
        self.set(("q_values", t), scores)

        if not choose_action:
            return
        self.set(("action", t), scores.argmax(1))


class RandomAgent(Agent):
    """Agent that picks a uniformly random discrete action for every observation."""

    def __init__(self, action_dim):
        super().__init__()
        self.action_dim = action_dim

    def forward(self, t: int, choose_action=True, **kwargs):
        """Store one random action in ``[0, action_dim)`` per observation at time ``t``.

        ``choose_action`` is accepted but not consulted; an action is always written.
        """
        batch_size = len(self.get(("env/env_obs", t)))
        sampled = torch.randint(0, self.action_dim, (batch_size, ))
        self.set(("action", t), sampled)


# class EGreedyActionSelector(Agent):
# 		def __init__(self, epsilon, decay_rate, min_eps, logger):
# 				super().__init__()
# 				self.epsilon = epsilon
# 				self.min_eps = min_eps
# 				self.decay_rate = decay_rate
# 				self.nb_epoch = 0
# 				self.logger = logger

# 		def forward(self, t, **kwargs):
# 				self.epsilon = (1/(1+self.decay_rate*self.nb_epoch)) * self.epsilon
# 				if self.epsilon < self.min_eps:
# 					self.epsilon = self.min_eps
# 				self.logger.add_eps("epsilon", self.epsilon, self.nb_epoch*500)
# 				q_values = self.get(("q_values", t))
# 				nb_actions = q_values.size()[1]
# 				size = q_values.size()[0]
# 				is_random = torch.rand(size).lt(self.epsilon).float()
# 				random_action = torch.randint(low=0, high=nb_actions, size=(size,))
# 				max_action = q_values.max(1)[1]
# 				action = is_random * random_action + (1 - is_random) * max_action
# 				action = action.long()
# 				self.set(("action", t), action)


class EGreedyActionSelector(Agent):
    """Epsilon-greedy action picker working from the Q-values stored in the workspace."""

    def __init__(self, epsilon):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, t: int, **kwargs):
        """Choose an action per row of ``("q_values", t)`` and store it under ``("action", t)``.

        For each row, a uniform random action is taken with probability
        ``epsilon``; otherwise the action with the highest Q-value is taken.
        """
        # One row per environment, one column per action.
        q_values = self.get(("q_values", t))
        batch_size, n_actions = q_values.shape

        # 1.0 where the row explores, 0.0 where it exploits.
        explore_mask = torch.rand(batch_size).lt(self.epsilon).float()
        uniform_choice = torch.randint(low=0, high=n_actions, size=(batch_size,))
        greedy_choice = q_values.max(1)[1]

        # Blend the two candidates according to the mask.
        blended = explore_mask * uniform_choice + (1 - explore_mask) * greedy_choice

        # The arithmetic above yields floats; actions are stored as integers.
        self.set(("action", t), blended.long())


# Configure the optimizer over the q agent
def setup_optimizers(cfg, q_agent):
    """Instantiate the optimizer described by ``cfg.optimizer`` over ``q_agent``'s parameters.

    The optimizer class and its keyword arguments are both resolved from
    ``cfg.optimizer`` via ``get_class`` / ``get_arguments``.
    """
    optimizer_kwargs = get_arguments(cfg.optimizer)
    trainable = q_agent.parameters()
    return get_class(cfg.optimizer)(trainable, **optimizer_kwargs)


def create_dqn_agent(cfg, train_env_agent, eval_env_agent):
    """Assemble the agents used by the DQN training loop.

    Returns:
        A ``(train_agent, eval_agent, q_agent, target_q_agent)`` tuple where

        * ``train_agent`` chains the training environment, the critic and an
          epsilon-greedy selector, and is seeded with ``cfg.algorithm.seed``;
        * ``eval_agent`` chains the evaluation environment and the same critic;
        * ``q_agent`` wraps the critic alone, for replaying stored transitions;
        * ``target_q_agent`` wraps a deep copy of the critic taken at creation.
    """
    obs_size, act_size = train_env_agent.get_obs_and_actions_sizes()
    hidden_sizes = cfg.algorithm.architecture.hidden_size

    # Online critic and its target copy.
    critic = DiscreteQAgent(obs_size, hidden_sizes, act_size)
    target_critic = copy.deepcopy(critic)

    # Training pipeline: environment -> critic -> epsilon-greedy selector.
    explorer = EGreedyActionSelector(cfg.algorithm.epsilon)
    train_agent = TemporalAgent(Agents(train_env_agent, critic, explorer))

    # Temporal wrappers used only to re-evaluate the critics on buffered transitions.
    q_agent = TemporalAgent(critic)
    target_q_agent = TemporalAgent(target_critic)

    # Evaluation pipeline: environment -> critic.
    eval_agent = TemporalAgent(Agents(eval_env_agent, critic))

    train_agent.seed(cfg.algorithm.seed)

    return train_agent, eval_agent, q_agent, target_q_agent
		

def compute_ddqn_loss(cfg, reward, must_bootstrap, q_values, target_q_values, action):
    """Compute the Double DQN critic loss on a batch of two-step transitions.

    The next action is *selected* by the online critic (``q_values[1]``) and
    *evaluated* by the target critic (``target_q_values``). Taking the max
    directly over the target critic's values would be the plain DQN target.

    Args:
        cfg: configuration; only ``cfg.algorithm.discount_factor`` is read.
        reward: rewards indexed by time step first; ``reward[:-1]`` is used.
        must_bootstrap: boolean mask, true where the next-state value must be
            added to the target.
        q_values: online critic outputs indexed by time step first;
            ``q_values[0]`` and ``q_values[1]`` are (batch, actions) matrices.
        target_q_values: target critic outputs for the next step, a
            (batch, actions) matrix (``run_dqn`` passes index 1 of the
            target critic's output).
        action: actions indexed by time step first; ``action[0]`` holds the
            integer index of the action taken in each transition.

    Returns:
        The mean squared temporal-difference error as a scalar tensor.
    """
    # Double DQN: the online critic chooses the next action ...
    next_action = q_values[1].max(1)[1]
    # ... and the target critic provides the value of that action.
    next_q = (
        target_q_values.gather(1, next_action.unsqueeze(-1)).squeeze(-1).detach()
    )

    target = (
        reward[:-1] + cfg.algorithm.discount_factor * next_q * must_bootstrap.int()
    )

    # Q-value of the action that was actually taken at the first step.
    qvals = q_values[0].gather(1, action[0].unsqueeze(-1)).squeeze(-1)

    # Mean squared TD error.
    td = target - qvals
    td_error = td**2
    critic_loss = td_error.mean()

    return critic_loss


def run_dqn(cfg, compute_critic_loss, log_string):
    """Train a DQN-style critic and periodically evaluate its greedy policy.

    Args:
        cfg: configuration object. This function reads ``cfg.save_best`` and,
            under ``cfg.algorithm``: ``buffer_size``, ``max_epochs``,
            ``n_steps``, ``n_updates``, ``batch_size``, ``learning_starts``,
            ``max_grad_norm``, ``target_critic_update`` and ``eval_interval``.
            The helpers it calls read further entries.
        compute_critic_loss: callable with the signature
            ``(cfg, reward, must_bootstrap, q_values, target_q_values, action)``
            returning the scalar loss to minimise.
        log_string: label forwarded to ``Logger``.

    Returns:
        ``(list_mean, list_std)``: the mean and the standard deviation of the
        evaluation cumulated rewards, one entry per evaluation, stored as
        returned by ``.mean()`` / ``.std()``.
    """
    # 1) Build the logger
    logger = Logger(cfg, log_string)
    best_reward = -10e9

    # 2) Create the environment agents
    train_env_agent, eval_env_agent = get_env_agents(cfg)

    # 3) Create the DQN-like agents
    train_agent, eval_agent, q_agent, target_q_agent = create_dqn_agent(
        cfg, train_env_agent, eval_env_agent
    )

    # 4) Create the training workspace and the replay buffer.
    # No parameter is needed to create the workspace: the agents receive it
    # as an argument each time they are called in the training loop.
    train_workspace = Workspace()  # Used for training
    rb = ReplayBuffer(max_size=cfg.algorithm.buffer_size)

    # 5) Configure the optimizer over the q agent
    optimizer = setup_optimizers(cfg, q_agent)
    nb_steps = 0
    last_eval_step = 0
    last_critic_update_step = 0

    list_mean = []
    list_std = []

    # 6) Training loop
    for epoch in tqdm(range(cfg.algorithm.max_epochs)):
        # Execute the agent in the workspace
        if epoch > 0:
            train_workspace.zero_grad()
            # Presumably carries the last step over to index 0 so that the
            # new rollout continues from it -- confirm against the BBRL docs.
            train_workspace.copy_n_last_steps(1)
            train_agent(
                train_workspace, t=1, n_steps=cfg.algorithm.n_steps - 1, stochastic=True
            )
        else:
            train_agent(
                train_workspace, t=0, n_steps=cfg.algorithm.n_steps, stochastic=True
            )

        # Get the transitions
        transition_workspace = train_workspace.get_transitions()

        action = transition_workspace["action"]
        nb_steps += action[0].shape[0]

        # Adds the transitions to the replay buffer
        rb.put(transition_workspace)
        for _ in range(cfg.algorithm.n_updates):
            rb_workspace = rb.get_shuffled(cfg.algorithm.batch_size)

            # The q agent needs to be executed on the rb_workspace workspace
            # (gradients are removed in workspace)
            q_agent(rb_workspace, t=0, n_steps=2, choose_action=False)
            q_values, done, truncated, reward, action = rb_workspace[
                "q_values", "env/done", "env/truncated", "env/reward", "action"
            ]

            # choose_action=False: the target critic must only produce
            # Q-values. With the default it would also write its own greedy
            # action into ("action", t) of the replay workspace, i.e. over the
            # actions that were actually taken and are used by the loss.
            with torch.no_grad():
                target_q_agent(
                    rb_workspace, t=0, n_steps=2, stochastic=True, choose_action=False
                )
            target_q_values = rb_workspace["q_values"]

            # Determines whether values of the critic should be propagated
            # True if the episode reached a time limit or if the task was not done
            # See https://colab.research.google.com/drive/1erLbRKvdkdDy0Zn1X_JhC01s1QAt4BBj
            must_bootstrap = torch.logical_or(~done[1], truncated[1])

            # Only start learning once the buffer holds enough transitions
            if rb.size() > cfg.algorithm.learning_starts:
                # Compute critic loss
                critic_loss = compute_critic_loss(
                    cfg, reward, must_bootstrap, q_values, target_q_values[1], action
                )
                # Store the loss for tensorboard display
                logger.add_log("critic_loss", critic_loss, nb_steps)

                optimizer.zero_grad()
                critic_loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    q_agent.parameters(), cfg.algorithm.max_grad_norm
                )
                optimizer.step()

        # Periodically refresh the target critic with a copy of the online critic
        if nb_steps - last_critic_update_step > cfg.algorithm.target_critic_update:
            last_critic_update_step = nb_steps
            target_q_agent.agent = copy.deepcopy(q_agent.agent)

        # Periodically evaluate the greedy policy
        if nb_steps - last_eval_step > cfg.algorithm.eval_interval:
            last_eval_step = nb_steps
            eval_workspace = Workspace()  # Used for evaluation
            eval_agent(
                eval_workspace, t=0, stop_variable="env/done", choose_action=True
            )
            rewards = eval_workspace["env/cumulated_reward"][-1]
            mean = rewards.mean()
            list_mean.append(mean)
            list_std.append(rewards.std())

            logger.add_log("reward", mean, nb_steps)
            if cfg.save_best and mean > best_reward:
                best_reward = mean
                directory = "./dqn_critic/"
                # exist_ok avoids the check-then-create race of a separate
                # os.path.exists() test.
                os.makedirs(directory, exist_ok=True)
                filename = directory + "dqn0_" + str(mean.item()) + ".agt"
                eval_agent.save_model(filename)

    return list_mean, list_std


In [11]:
# %reload_ext tensorboard
# %tensorboard --logdir tmp/

In [12]:
# Hyper-parameter sweep: one training run per combination of the lists below.
from itertools import product

gammas = [0.99]
archs = [[128,64,32]]
lrs = [0.0003]
n_envss = [8]
n_updates = [32]
grad_norms = [0.5]
epsilons = [0.02]
minibatchs = [64]

# product() iterates with the rightmost list varying fastest, which is the
# same order as nesting one loop per list from left to right.
sweep = product(
    grad_norms, gammas, archs, lrs, epsilons, n_envss, n_updates, minibatchs
)

for grad_norm, gamma, arch, lr, epsilon, n_envs, n_update, batch in sweep:
    # The label previously used `opt_epochs`, a name that is never defined in
    # this notebook (NameError on a fresh kernel). `n_update` is the only
    # swept value that was missing from the label, so it is used instead.
    log_string = (
        "DDQN_opt_" + str(n_update)
        + "_batch_" + str(batch)
        + "_nenvs_" + str(n_envs)
        + "_gamma_" + str(gamma)
        + "_arch_" + str(arch)
        + "_lr_" + str(lr)
        + "_grad_norm_" + str(grad_norm)
        + "_eps_" + str(epsilon)
    )
    params = {
        "save_best": False,
        "logger": {
            "classname": "bbrl.utils.logger.TFLogger",
            # A fresh, timestamped directory for every run
            "log_dir": "./tmp/dqn-buffer-" + str(time.time()),
            "cache_size": 10000,
            "every_n_seconds": 10,
            "verbose": False,
        },
        "algorithm": {
            "seed": 4,
            "n_updates": n_update,
            "max_grad_norm": grad_norm,
            "epsilon": epsilon,
            # "min_eps": 0.1,
            # "epsilon_decay": 5e-6,
            # NOTE(review): epsilon_decay is not read by the code visible in
            # this notebook; the active selector uses a constant epsilon.
            "epsilon_decay": 5e-8,
            "n_envs": n_envs,
            "n_steps": 32,
            "eval_interval": 2000,
            "learning_starts": 2000,
            "nb_evals": 10,
            "buffer_size": 50000,
            "batch_size": batch,
            "target_critic_update": 5000,
            "max_epochs": 20000,
            "discount_factor": gamma,
            "architecture": {"hidden_size": arch},
        },
        "gym_env": {
            "classname": "__main__.make_gym_env",
            "env_name": "LunarLander-v2",
        },
        "optimizer": {
            "classname": "torch.optim.Adam",
            "lr": lr,
        },
    }

    config = OmegaConf.create(params)
    torch.manual_seed(config.algorithm.seed)
    run_dqn(config, compute_ddqn_loss, log_string)

  2%|▏         | 472/20000 [09:45<6:44:00,  1.24s/it] 


KeyboardInterrupt: 