# IMPORTS

In [1]:
from easypip import easyimport
import functools
import time

easyimport("importlib_metadata==4.13.0")
OmegaConf = easyimport("omegaconf").OmegaConf
bbrl_gym = easyimport("bbrl_gym")
bbrl = easyimport("bbrl>=0.1.6")

import os
import copy
import time
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
from torch.distributions.normal import Normal
from torch.distributions.independent import Independent

import gym
from bbrl.agents.agent import Agent
from bbrl import get_arguments, get_class, instantiate_class
from bbrl.workspace import Workspace
from bbrl.agents import Agents, RemoteAgent, TemporalAgent
from bbrl.agents.gymb import AutoResetGymAgent, NoAutoResetGymAgent
from bbrl.visu.play import load_agent, play
from bbrl.utils.replay_buffer import ReplayBuffer
from bbrl.utils.functionalb import gae

# !rm -rf gym-rocketlander
# !git clone https://github.com/EmbersArc/gym-rocketlander

# !conda uninstall gym-rocketlander
!pip install -e ./gym-rocketlander

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

Obtaining file:///home/hanzopgp/projects/MasterArtificialIntelligencePTs/S3/RLD/TME_bonus/gym-rocketlander
[31mERROR: file:///home/hanzopgp/projects/MasterArtificialIntelligencePTs/S3/RLD/TME_bonus/gym-rocketlander does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

# ROCKETLANDER ENV

In [2]:
class RocketLanderWrapper(gym.Wrapper):
    """
    Reward-shaping wrapper for the rocket lander environment.

    ``step`` adds a small distance-based term to the environment reward at
    every step, plus a bonus when both legs touch the ground at low speed.
    The wrapped environment is expected to expose ``distance``, ``speed``
    and ``legs`` attributes.
    """

    # Lower bound applied to the speed before dividing by it, so that a
    # perfectly still landing yields a large but finite bonus instead of a
    # division by zero / infinite reward.
    _MIN_LANDING_SPEED = 1e-6

    def __init__(self, env):
        super(RocketLanderWrapper, self).__init__(env)
        self.env = env
        # Only used by ``old_step`` (difference-based shaping).
        self.prev_shaping = None

    def reset(self):
        """Reset the shaping state and the wrapped environment."""
        self.prev_shaping = None
        return self.env.reset()

    def step(self, action):
        """Step the wrapped environment and add the shaping terms to the reward.

        :param action: the action forwarded to the wrapped environment
        :return: next_state, shaped reward, done, info
        """
        next_state, reward, done, info = self.env.step(action)

        # Previous shaping attempt, kept for reference:
        #   shaping = -0.5 * (self.env.distance + self.env.speed + abs(self.env.angle) ** 2)
        #   shaping += 0.1 * (
        #       self.env.legs[0].ground_contact + self.env.legs[1].ground_contact
        #   )
        #   if self.prev_shaping is not None:
        #       reward += shaping - self.prev_shaping
        #   self.prev_shaping = shaping

        # Dense term: the smaller the distance, the larger the reward.
        shaping = 0.008 * (1 - self.env.distance)

        both_legs_down = (
            self.env.legs[0].ground_contact > 0
            and self.env.legs[1].ground_contact > 0
        )
        if both_legs_down and self.env.speed < 0.1:
            print("landed !")
            print ("speed", self.env.speed)
            # Landing bonus inversely proportional to the touchdown speed
            # (12.0 == 6.0 * 2, the value the original expression produced).
            safe_speed = max(self.env.speed, self._MIN_LANDING_SPEED)
            shaping += 12.0 / safe_speed

        reward += shaping
        return next_state, reward, done, info

    def old_step(self, action):
        """Earlier shaping variant based on the change in shaping between steps.

        With ``shaping`` fixed to 0 the reward is left unchanged; kept for
        reference and not called by ``step``.
        """
        next_state, reward, done, info = self.env.step(action)
        # reward shaping
        # shaping = -0.5 * (self.env.distance + self.env.speed + abs(self.env.angle) ** 2)
        # shaping += 0.1 * (self.env.legs[0].ground_contact + self.env.legs[1].ground_contact)
        shaping = 0
        if self.prev_shaping is not None:
            reward += shaping - self.prev_shaping
        self.prev_shaping = shaping

        return next_state, reward, done, info


class FrameSkip(gym.Wrapper):
    """
    Repeat each action for ``skip`` consecutive environment steps
    (frame skipping) and return only the last observation.

    :param env: the environment
    :param skip: number of environment steps per action (must be >= 1)
    :raises ValueError: if ``skip`` is smaller than 1
    """

    def __init__(self, env: gym.Env, skip: int = 1):
        super().__init__(env)
        # With skip < 1 the loop in ``step`` would never run and the method
        # would fail on unbound locals, so reject it up front.
        if skip < 1:
            raise ValueError(f"skip must be >= 1, got {skip}")
        self._skip = skip

    def step(self, action: np.ndarray):
        """
        Step the environment ``skip`` times with the given action.

        Rewards are summed over the repeated steps; the observation, done
        flag and info of the last executed step are returned. The repetition
        stops early as soon as the environment reports ``done``.

        :param action: the action
        :return: observation, reward, done, information
        """
        total_reward = 0.0
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break

        return obs, total_reward, done, info

    def reset(self):
        """Reset the wrapped environment."""
        return self.env.reset()

In [3]:
# Smoke test: build one wrapped environment (reward shaping, then frame
# skipping with skip=1) and display its observation and action spaces.
env = gym.make("RocketLander-v0")
env = RocketLanderWrapper(env)
env = FrameSkip(env, skip=1)
obs_space = env.observation_space
action_space = env.action_space
print("The observation space: {}".format(obs_space))
print("The action space: {}".format(action_space))

# env = gym.make("LunarLander-v2")
# obs_space = env.observation_space
# action_space = env.action_space
# print("The observation space: {}".format(obs_space))
# print("The action space: {}".format(action_space))

# env = gym.make("LunarLanderContinuous-v2")
# obs_space = env.observation_space
# action_space = env.action_space
# print("The observation space: {}".format(obs_space))
# print("The action space: {}".format(action_space))

Matplotlib backend: module://matplotlib_inline.backend_inline
The observation space: Box([ -1.  -1.  -1.  -1.  -1.  -1.  -1. -inf -inf -inf], [ 1.  1.  1.  1.  1.  1.  1. inf inf inf], (10,), float32)
The action space: Box([-1. -1. -1.], [1. 1. 1.], (3,), float32)


# GENERAL FUNCTIONS

In [4]:
def build_mlp(sizes, activation, output_activation=nn.Identity()):
    """Build a fully connected network as an ``nn.Sequential``.

    One ``nn.Linear`` is created for each consecutive pair in ``sizes``.
    Every linear layer is followed by ``activation``, except the last one,
    which is followed by ``output_activation``. The activation objects are
    reused as-is (the same instance is inserted after each layer).
    """
    last_index = len(sizes) - 2
    modules = []
    for index in range(len(sizes) - 1):
        modules.append(nn.Linear(sizes[index], sizes[index + 1]))
        if index < last_index:
            modules.append(activation)
        else:
            modules.append(output_activation)
    return nn.Sequential(*modules)

def build_backbone(sizes, activation):
    """Return a flat list ``[Linear, activation, Linear, activation, ...]``.

    A linear layer is created for every consecutive pair in ``sizes``
    except the last pair: the final entry of ``sizes`` is never used as a
    layer width. The same ``activation`` instance follows every layer.
    """
    n_layers = len(sizes) - 2
    pairs = [
        (nn.Linear(sizes[k], sizes[k + 1]), activation) for k in range(n_layers)
    ]
    return [module for pair in pairs for module in pair]

def make_gym_env(env_name):
    """Create the gym environment ``env_name`` wrapped for training.

    The raw environment is wrapped with ``RocketLanderWrapper`` (reward
    shaping) and then with ``FrameSkip`` using ``skip=1``.
    """
    shaped_env = RocketLanderWrapper(gym.make(env_name))
    return FrameSkip(shaped_env, skip=1)

def get_env_agents(cfg):
    """Build the training and evaluation environment agents from ``cfg``.

    Both agents are built from ``cfg.gym_env`` and share
    ``cfg.algorithm.seed``. The training agent is an ``AutoResetGymAgent``
    with ``cfg.algorithm.n_envs`` environments, the evaluation agent a
    ``NoAutoResetGymAgent`` with ``cfg.algorithm.nb_evals`` environments.

    :return: (train_env_agent, eval_env_agent)
    """
    algo_cfg = cfg.algorithm

    # The class and its arguments are resolved separately for each agent so
    # that the two agents never share the same arguments object.
    train_env_agent = AutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        algo_cfg.n_envs,
        algo_cfg.seed,
    )
    eval_env_agent = NoAutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        algo_cfg.nb_evals,
        algo_cfg.seed,
    )
    return train_env_agent, eval_env_agent

# LOGGER

In [5]:
class Logger():
  """Thin wrapper around the logger instantiated from ``cfg.logger``."""

  def __init__(self, cfg, log_string):
    # ``log_string`` is accepted for the caller's convenience but not used here.
    self.logger = instantiate_class(cfg.logger)

  def add_log(self, log_string, loss, epoch):
    """Log the scalar held by ``loss`` (read with ``.item()``) under ``log_string``."""
    scalar = loss.item()
    self.logger.add_scalar(log_string, scalar, epoch)

  def log_losses(self, epoch, critic_loss, entropy_loss, actor_loss):
    """Log the critic, entropy and actor losses, in that order, at ``epoch``."""
    named_losses = (
        ("critic_loss", critic_loss),
        ("entropy_loss", entropy_loss),
        ("actor_loss", actor_loss),
    )
    for tag, value in named_losses:
      self.add_log(tag, value, epoch)

# SAC

In [6]:
from bbrl.utils.distributions import SquashedDiagGaussianDistribution

class SquashedGaussianActor(Agent):
    """Stochastic actor producing a squashed diagonal Gaussian over actions.

    A shared backbone feeds two linear heads: one for the mean and one for
    the (softplus-activated, hence positive) standard deviation.

    :param state_dim: size of the observation vector
    :param hidden_layers: widths of the hidden layers of the backbone
    :param action_dim: size of the action vector
    """

    def __init__(self, state_dim, hidden_layers, action_dim):
        super().__init__()
        backbone_dim = [state_dim] + list(hidden_layers)
        # build_backbone never uses the final entry of the size list as a
        # layer width (it stops one pair early). Passing only
        # [state_dim] + hidden_layers therefore dropped the last hidden layer
        # and, for unequal widths, left the backbone output narrower/wider
        # than the hidden_layers[-1] inputs expected by the heads below.
        # Appending a trailing entry makes every configured hidden layer get
        # built, so the backbone output width is hidden_layers[-1].
        self.layers = build_backbone(
            backbone_dim + [action_dim], activation=nn.ReLU()
        )
        self.backbone = nn.Sequential(*self.layers)
        self.last_mean_layer = nn.Linear(hidden_layers[-1], action_dim)
        self.last_std_layer = nn.Linear(hidden_layers[-1], action_dim)
        self.action_dist = SquashedDiagGaussianDistribution(action_dim)
        # std must be positive
        self.std_layer = nn.Softplus()

    def dist(self, obs: torch.Tensor):
        """Computes action distributions given observation(s)"""
        backbone_output = self.backbone(obs)
        mean = self.last_mean_layer(backbone_output)
        std_out = self.last_std_layer(backbone_output)
        std = self.std_layer(std_out)
        return self.action_dist.make_distribution(mean, std)

    def forward(self, t, stochastic):
        """Write the action and its log-probability at time ``t``.

        Reads ``env/env_obs`` at ``t``; samples from the distribution when
        ``stochastic`` is true, otherwise takes its mode. Writes ``action``
        and ``action_logprobs`` at ``t``.
        """
        action_dist = self.dist(self.get(("env/env_obs", t)))
        action = action_dist.sample() if stochastic else action_dist.mode()

        log_prob = action_dist.log_prob(action)
        self.set(("action", t), action)
        self.set(("action_logprobs", t), log_prob)

    def predict_action(self, obs, stochastic: bool):
        """Return an action for ``obs`` without going through a workspace."""
        action_dist = self.dist(obs)
        action = action_dist.sample() if stochastic else action_dist.mode()
        return action


class ContinuousQAgent(Agent):
    """Q-function critic for continuous actions.

    The observation and the action are concatenated and fed to an MLP with
    a single output.

    :param state_dim: size of the observation vector
    :param hidden_layers: widths of the hidden layers of the MLP
    :param action_dim: size of the action vector
    """

    def __init__(self, state_dim, hidden_layers, action_dim):
        super().__init__()
        self.is_q_function = True
        self.model = build_mlp(
            [state_dim + action_dim] + list(hidden_layers) + [1], activation=nn.ReLU()
        )

    def forward(self, t, detach_actions=False):
        """Write ``q_value`` at time ``t`` from ``env/env_obs`` and ``action`` at ``t``.

        :param detach_actions: if true, the action is detached so that no
            gradient flows back through it
        """
        obs = self.get(("env/env_obs", t))
        action = self.get(("action", t))
        if detach_actions:
            action = action.detach()
        osb_act = torch.cat((obs, action), dim=1)
        q_value = self.model(osb_act)
        self.set(("q_value", t), q_value)

    def predict_value(self, obs, action):
        """Return Q(obs, action) without going through a workspace.

        The concatenation is done along the last (feature) dimension, which
        is equivalent to ``dim=0`` for single 1-D observation/action vectors
        and also works for batched ``(batch, dim)`` inputs, consistently
        with ``forward``.
        """
        osb_act = torch.cat((obs, action), dim=-1)
        q_value = self.model(osb_act)
        return q_value



# Create the SAC Agent
def create_sac_agent(cfg, train_env_agent, eval_env_agent):
    """Build the actor, the two critics with their targets, and the temporal agents.

    The actor is shared by the training and evaluation agents. Each target
    critic starts as a deep copy of its critic. Only the training agent is
    seeded here, with ``cfg.algorithm.seed``.

    :return: (train_agent, eval_agent, actor, critic_1, target_critic_1,
        critic_2, target_critic_2)
    """
    obs_size, act_size = train_env_agent.get_obs_and_actions_sizes()
    assert (
        train_env_agent.is_continuous_action()
    ), "SAC code dedicated to continuous actions"

    architecture = cfg.algorithm.architecture

    # Actor, shared between the train and eval pipelines
    actor = SquashedGaussianActor(obs_size, architecture.actor_hidden_size, act_size)
    tr_agent = Agents(train_env_agent, actor)
    ev_agent = Agents(eval_env_agent, actor)

    # Twin critics, each paired with a target copy (built in order: 1 then 2)
    critics = []
    targets = []
    for _ in range(2):
        critic = ContinuousQAgent(
            obs_size, architecture.critic_hidden_size, act_size
        )
        critics.append(critic)
        targets.append(copy.deepcopy(critic))
    critic_1, critic_2 = critics
    target_critic_1, target_critic_2 = targets

    train_agent = TemporalAgent(tr_agent)
    eval_agent = TemporalAgent(ev_agent)
    train_agent.seed(cfg.algorithm.seed)

    return (
        train_agent,
        eval_agent,
        actor,
        critic_1,
        target_critic_1,
        critic_2,
        target_critic_2,
    )


# Configure the optimizer for the actor and critic
def setup_optimizers(cfg, actor, critic_1, critic_2):
    """Instantiate the actor optimizer and a single optimizer for both critics.

    The optimizer classes and keyword arguments come from
    ``cfg.actor_optimizer`` and ``cfg.critic_optimizer``.

    :return: (actor_optimizer, critic_optimizer)
    """
    actor_kwargs = get_arguments(cfg.actor_optimizer)
    actor_optimizer_class = get_class(cfg.actor_optimizer)
    actor_optimizer = actor_optimizer_class(actor.parameters(), **actor_kwargs)

    critic_kwargs = get_arguments(cfg.critic_optimizer)
    # Wrapping both critics in a Sequential is only a way to iterate over
    # the parameters of the two networks with one optimizer.
    both_critics = nn.Sequential(critic_1, critic_2)
    critic_optimizer_class = get_class(cfg.critic_optimizer)
    critic_optimizer = critic_optimizer_class(
        both_critics.parameters(), **critic_kwargs
    )

    return actor_optimizer, critic_optimizer


def setup_entropy_optimizers(cfg):
    """Create the optimizer of the (log) entropy coefficient, if it is learned.

    When ``cfg.algorithm.target_entropy`` is ``"auto"``, a trainable tensor
    holding the log of ``cfg.algorithm.entropy_coef`` is created together
    with its optimizer (built from ``cfg.entropy_coef_optimizer``).
    Otherwise no optimizer is created and the returned log-coefficient is 0.

    :return: (entropy_coef_optimizer, log_entropy_coef)
    """
    if cfg.algorithm.target_entropy != "auto":
        return None, 0

    optimizer_kwargs = get_arguments(cfg.entropy_coef_optimizer)
    # Note: we optimize the log of the entropy coefficient which is slightly different from the paper
    # as discussed in https://github.com/rail-berkeley/softlearning/issues/37
    # Comment and code taken from the SB3 version of SAC
    initial_coef = torch.ones(1) * cfg.algorithm.entropy_coef
    log_entropy_coef = torch.log(initial_coef).requires_grad_(True)
    optimizer_class = get_class(cfg.entropy_coef_optimizer)
    entropy_coef_optimizer = optimizer_class([log_entropy_coef], **optimizer_kwargs)
    return entropy_coef_optimizer, log_entropy_coef


def compute_critic_loss(
    cfg, reward, must_bootstrap,
    t_actor, 
    q_agent_1, q_agent_2, 
    target_q_agent_1, target_q_agent_2, 
    rb_workspace,
    ent_coef
):
    """Computes the critic loss for a set of $S$ transition samples

    Args:
        cfg: The experimental configuration (``cfg.algorithm.discount_factor``
            is read)
        reward: Time-major reward tensor of the sampled transitions; the
            reward used in the target is ``reward[:-1][0]``
        must_bootstrap: Boolean tensor of indicators (one per sample) telling
            whether the next-state value must be propagated
        t_actor: The actor agent (as a TemporalAgent)
        q_agent_1: The first critic (as a TemporalAgent)
        q_agent_2: The second critic (as a TemporalAgent)
        target_q_agent_1: The target of the first critic
        target_q_agent_2: The target of the second critic
        rb_workspace: The transition workspace
        ent_coef: The entropy coefficient

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The two critic losses (scalars)
    """
    # Compute q_values from both critics with the actions present in the buffer:
    # at t, we have Q(s,a) from the (s,a) in the RB
    q_agent_1(rb_workspace, t=0, n_steps=1)
    q_values_rb_1 = rb_workspace["q_value"]

    q_agent_2(rb_workspace, t=0, n_steps=1)
    q_values_rb_2 = rb_workspace["q_value"]

    with torch.no_grad():
        # Replay the current actor on the replay buffer to get actions of the
        # current policy
        t_actor(rb_workspace, t=1, n_steps=1, stochastic=True)
        # Keep only the log-probabilities at t+1. The workspace variable is
        # time-major and its t=0 slice still holds the log-probabilities
        # stored in the buffer; using the whole tensor would broadcast the
        # target to two rows and average in a target built from those stale
        # values.
        action_logprobs_next = rb_workspace["action_logprobs"][1]

        # Compute target q_values from both target critics: at t+1, we have
        # Q(s+1,a+1) from the (s+1,a+1) where a+1 has been replaced in the RB

        target_q_agent_1(rb_workspace, t=1, n_steps=1)
        post_q_values_1 = rb_workspace["q_value"]

        target_q_agent_2(rb_workspace, t=1, n_steps=1)
        post_q_values_2 = rb_workspace["q_value"]

    # Clipped double-Q: take the smaller of the two target estimates
    post_q_values = torch.min(post_q_values_1, post_q_values_2).squeeze(-1)

    # Soft state value at t+1, one entry per sample
    v_phi = post_q_values[1] - ent_coef * action_logprobs_next
    target = (
        reward[:-1][0] + cfg.algorithm.discount_factor * v_phi * must_bootstrap.int()
    )
    td_1 = target - q_values_rb_1[0].squeeze(-1)
    td_2 = target - q_values_rb_2[0].squeeze(-1)
    td_error_1 = td_1**2
    td_error_2 = td_2**2
    critic_loss_1 = td_error_1.mean()
    critic_loss_2 = td_error_2.mean()

    return critic_loss_1, critic_loss_2


def soft_update_params(net, target_net, tau):
    """Move each parameter of ``target_net`` towards ``net`` by a factor ``tau``.

    Each target parameter becomes ``tau * source + (1 - tau) * target``,
    updated in place; ``net`` is left untouched.
    """
    paired_params = zip(net.parameters(), target_net.parameters())
    for source, target in paired_params:
        blended = tau * source.data + (1 - tau) * target.data
        target.data.copy_(blended)


def compute_actor_loss(ent_coef, t_actor, q_agent_1, q_agent_2, rb_workspace):
    """Actor loss computation

    The actor is replayed at t=0 so that the critics evaluate actions of the
    current policy, not the actions stored in the buffer. The loss is
    the mean over samples of ``ent_coef * log_prob - min(Q1, Q2)``.

    :param ent_coef: The entropy coefficient (alpha)
    :param t_actor: The actor agent (temporal agent)
    :param q_agent_1: The first critic (temporal agent)
    :param q_agent_2: The second critic (temporal agent)
    :param rb_workspace: The replay buffer (2 time steps, t and t+1)
    """
    # Fresh actions (and their log-probabilities) from the current policy
    t_actor(rb_workspace, t=0, n_steps=1, stochastic=True)
    fresh_logprobs = rb_workspace["action_logprobs"]

    # Evaluate those actions with both critics, first critic first
    critic_outputs = []
    for critic in (q_agent_1, q_agent_2):
        critic(rb_workspace, t=0, n_steps=1)
        critic_outputs.append(rb_workspace["q_value"])

    min_q_values = torch.min(critic_outputs[0], critic_outputs[1]).squeeze(-1)

    per_sample_loss = ent_coef * fresh_logprobs[0] - min_q_values[0]
    return per_sample_loss.mean()



def run_sac(cfg, log_string):
    """Train a SAC agent according to ``cfg`` and log the results.

    Each epoch collects transitions with the training agent and stores them
    in the replay buffer. Once more than ``cfg.algorithm.learning_starts``
    steps have been collected, each epoch also performs one update of the
    entropy coefficient (if learned), the actor and the critics, followed by
    a soft update of the target critics. The agent is evaluated about every
    ``cfg.algorithm.eval_interval`` steps; when ``cfg.save_best`` is set, the
    actor is saved each time the mean evaluation reward improves.

    :param cfg: the experimental configuration
    :param log_string: identifier of the run, used as the name of the
        directory in which the best actors are saved
    """
    # 1)  Build the  logger
    logger = Logger(cfg, log_string)
    best_reward = -10e9
    ent_coef = cfg.algorithm.entropy_coef

    # 2) Create the environment agents
    train_env_agent, eval_env_agent = get_env_agents(cfg)

    # 3) Create the SAC Agent
    (
        train_agent,
        eval_agent,
        actor,
        critic_1,
        target_critic_1,
        critic_2,
        target_critic_2,
    ) = create_sac_agent(cfg, train_env_agent, eval_env_agent)

    t_actor = TemporalAgent(actor)
    q_agent_1 = TemporalAgent(critic_1)
    target_q_agent_1 = TemporalAgent(target_critic_1)
    q_agent_2 = TemporalAgent(critic_2)
    target_q_agent_2 = TemporalAgent(target_critic_2)
    train_workspace = Workspace()

    # Creates a replay buffer
    rb = ReplayBuffer(max_size=cfg.algorithm.buffer_size)

    # Configure the optimizer
    actor_optimizer, critic_optimizer = setup_optimizers(cfg, actor, critic_1, critic_2)
    entropy_coef_optimizer, log_entropy_coef = setup_entropy_optimizers(cfg)
    nb_steps = 0
    tmp_steps = 0

    # Initial value of the entropy coef alpha. If target_entropy is not auto,
    # will remain fixed
    if cfg.algorithm.target_entropy == "auto":
        target_entropy = -np.prod(train_env_agent.action_space.shape).astype(np.float32)
    else:
        target_entropy = cfg.algorithm.target_entropy

    # Training loop
    for epoch in tqdm(range(cfg.algorithm.max_epochs)):
        # Execute the agent in the workspace
        if epoch > 0:
            # Keep the last step of the previous rollout as step 0 of the new one
            train_workspace.zero_grad()
            train_workspace.copy_n_last_steps(1)
            train_agent(
                train_workspace,
                t=1,
                n_steps=cfg.algorithm.n_steps - 1,
                stochastic=True,
            )
        else:
            train_agent(
                train_workspace,
                t=0,
                n_steps=cfg.algorithm.n_steps,
                stochastic=True,
            )

        transition_workspace = train_workspace.get_transitions()
        action = transition_workspace["action"]
        nb_steps += action[0].shape[0]
        rb.put(transition_workspace)

        if nb_steps > cfg.algorithm.learning_starts:
            # Get a sample from the workspace
            rb_workspace = rb.get_shuffled(cfg.algorithm.batch_size)

            done, truncated, reward, action_logprobs_rb = rb_workspace[
                "env/done", "env/truncated", "env/reward", "action_logprobs"
            ]

            # Determines whether values of the critic should be propagated
            # True if the episode reached a time limit or if the task was not done
            # See https://colab.research.google.com/drive/1erLbRKvdkdDy0Zn1X_JhC01s1QAt4BBj?usp=sharing
            must_bootstrap = torch.logical_or(~done[1], truncated[1])

            (
                critic_loss_1, critic_loss_2
            ) = compute_critic_loss(
                cfg, 
                reward, 
                must_bootstrap,
                t_actor,
                q_agent_1,
                q_agent_2,
                target_q_agent_1,
                target_q_agent_2,
                rb_workspace,
                ent_coef
            )

            logger.add_log("critic_loss_1", critic_loss_1, nb_steps)
            logger.add_log("critic_loss_2", critic_loss_2, nb_steps)
            critic_loss = critic_loss_1 + critic_loss_2

            actor_loss = compute_actor_loss(
                ent_coef, t_actor, q_agent_1, q_agent_2, rb_workspace
            )
            logger.add_log("actor_loss", actor_loss, nb_steps)

            # Entropy coef update part #####################################################
            if entropy_coef_optimizer is not None:
                # Important: detach the variable from the graph
                # so that we don't change it with other losses
                # see https://github.com/rail-berkeley/softlearning/issues/60
                ent_coef = torch.exp(log_entropy_coef.detach())
                # See Eq. (17) of the SAC and Applications paper
                entropy_coef_loss = -(
                    log_entropy_coef * (action_logprobs_rb + target_entropy)
                ).mean()
                entropy_coef_optimizer.zero_grad()
                # We need to retain the graph because we reuse the
                # action_logprobs are used to compute both the actor loss and
                # the critic loss
                entropy_coef_loss.backward(retain_graph=True)
                entropy_coef_optimizer.step()
                logger.add_log("entropy_coef_loss", entropy_coef_loss, nb_steps)
                logger.add_log("entropy_coef", ent_coef, nb_steps)

            # Actor update part ###############################
            actor_optimizer.zero_grad()
            actor_loss.backward()
            torch.nn.utils.clip_grad_norm_(
                actor.parameters(), cfg.algorithm.max_grad_norm
            )
            actor_optimizer.step()


            # Critic update part ###############################
            # zero_grad also discards the critic gradients accumulated by
            # actor_loss.backward() above
            critic_optimizer.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(
                critic_1.parameters(), cfg.algorithm.max_grad_norm
            )
            torch.nn.utils.clip_grad_norm_(
                critic_2.parameters(), cfg.algorithm.max_grad_norm
            )
            critic_optimizer.step()
            ####################################################

            # Soft update of target q function
            tau = cfg.algorithm.tau_target
            soft_update_params(critic_1, target_critic_1, tau)
            soft_update_params(critic_2, target_critic_2, tau)
            # soft_update_params(actor, target_actor, tau)

        # Evaluate ###########################################
        if nb_steps - tmp_steps > cfg.algorithm.eval_interval:
            tmp_steps = nb_steps
            eval_workspace = Workspace()  # Used for evaluation
            eval_agent(
                eval_workspace,
                t=0,
                stop_variable="env/done",
                stochastic=False,
            )
            rewards = eval_workspace["env/cumulated_reward"][-1]
            mean = rewards.mean()
            logger.add_log("reward/mean", mean, nb_steps)
            logger.add_log("reward/max", rewards.max(), nb_steps)
            logger.add_log("reward/min", rewards.min(), nb_steps)
            # logger.add_log("reward/med", rewards.median(), nb_steps)

            # print(f"nb_steps: {nb_steps}, reward: {mean}")
            # print("ent_coef", ent_coef)
            if cfg.save_best and mean > best_reward:
                best_reward = mean
                directory = os.path.join("./agents/rocketlander/sac", log_string)
                os.makedirs(directory, exist_ok=True)
                # Join with a path separator so that the file is written
                # inside the run directory created above (plain string
                # concatenation put it next to that directory instead).
                filename = os.path.join(
                    directory, "sac_" + str(mean.item()) + ".agt"
                )
                actor.save_model(filename)

# ROCKETLANDER / SAC

In [7]:
# Gym environment id, passed as `env_name` in the `gym_env` section of the sweep configuration.
ENV = "RocketLander-v0"

In [8]:
# %reload_ext tensorboard
%tensorboard --logdir ./runs/rocketlander/

UsageError: Line magic function `%tensorboard` not found.


In [9]:
import itertools

# Hyper-parameter grid: every combination of the values below is trained once.
batches = [256]
grad_norms = [0.5]
gammas = [0.9] 
ent_coefs = [1e-7, 1e-6, 1e-5]
archs_actor = [[128,128]] 
archs_critic = [[256,256], [400,400]]
lrs_actor = [1e-4]
lrs_critic = [1e-4, 5e-5] 
lrs_ent = [1e-4] 

# first run
# 1e-3 critic lr works only if actor is 128,128 atleast
# 1e-4 critic lr seems to work better

# second run
# 0.9 seems better than 0.99 for gamma
# 1e-4 best lr for critic
# 128,128 seems a bit better for actor

# third run
# 400,400 seems to be way better for critic arch
# works well with 5e-5 critic lr, maybe try to lower actor and ent lrs as well ?
# ent coef was the best at 1e-7

# batches = [256]
# grad_norms = [0.5]
# gammas = [0.9] 
# ent_coefs = [1e-7]
# archs_actor = [[128,128], [256,256]] 
# archs_critic = [[400,400]]
# lrs_actor = [1e-4, 5e-5]
# lrs_critic = [5e-5] 
# lrs_ent = [1e-4, 5e-5]

# fourth run

# itertools.product iterates with the last list varying fastest, i.e. in the
# same order as nested loops written in this order.
sweep = itertools.product(
    batches,
    grad_norms,
    gammas,
    ent_coefs,
    archs_actor,
    archs_critic,
    lrs_actor,
    lrs_critic,
    lrs_ent,
)

for (
    batch,
    grad_norm,
    gamma,
    ent_coef,
    arch_actor,
    arch_critic,
    lr_actor,
    lr_critic,
    lr_ent,
) in sweep:
    log_string = "SAC_" + "_".join(
        str(value)
        for value in (
            batch,
            grad_norm,
            gamma,
            ent_coef,
            arch_actor,
            arch_critic,
            lr_actor,
            lr_critic,
            lr_ent,
        )
    )
    log_dir = "./runs/RocketLander-v0/" + log_string

    # Avoid doing the same run twice: a run is identified by its log
    # directory. Note that an interrupted run also leaves a directory behind
    # and is therefore skipped too; delete its directory to redo it.
    if os.path.isdir(log_dir):
        print("Skipping this one :", log_string)
        continue

    params = {
        "save_best": True,
        "logger": {
            "classname": "bbrl.utils.logger.TFLogger",
            "log_dir": log_dir,
            "cache_size": 10000,
            "every_n_seconds": 10,
            "verbose": False,
        },
        "algorithm": {
            "seed": 1,
            "n_envs": 8,
            "n_steps": 32,
            "buffer_size": 1e6,
            "batch_size": batch,  # 256
            "max_grad_norm": grad_norm,  # 0.5
            "nb_evals": 10,
            "eval_interval": 2000,
            "learning_starts": 2000,
            "max_epochs": 5000,
            "discount_factor": gamma,  # 0.98
            "entropy_coef": ent_coef,  # 1e-7
            "target_entropy": "auto",
            "tau_target": 0.05,
            "architecture": {
                "actor_hidden_size": arch_actor,  # 32,32
                "critic_hidden_size": arch_critic,  # 256,256
            },
        },
        "gym_env": {
            "classname": "__main__.make_gym_env",
            "env_name": ENV,
        },
        "actor_optimizer": {
            "classname": "torch.optim.Adam",
            "lr": lr_actor,  # 1e-3
        },
        "critic_optimizer": {
            "classname": "torch.optim.Adam",
            "lr": lr_critic,  # 1e-3
        },
        "entropy_coef_optimizer": {
            "classname": "torch.optim.Adam",
            "lr": lr_ent,  # 1e-3
        },
    }

    config = OmegaConf.create(params)
    torch.manual_seed(config.algorithm.seed)
    run_sac(config, log_string)

100%|██████████| 5000/5000 [37:21<00:00,  2.23it/s]  
100%|██████████| 5000/5000 [40:19<00:00,  2.07it/s]  
100%|██████████| 5000/5000 [37:51<00:00,  2.20it/s]  
100%|██████████| 5000/5000 [46:43<00:00,  1.78it/s]  
100%|██████████| 5000/5000 [41:39<00:00,  2.00it/s]  
100%|██████████| 5000/5000 [47:22<00:00,  1.76it/s]  
100%|██████████| 5000/5000 [50:04<00:00,  1.66it/s]  
100%|██████████| 5000/5000 [49:16<00:00,  1.69it/s]  
100%|██████████| 5000/5000 [50:08<00:00,  1.66it/s]  
100%|██████████| 5000/5000 [54:28<00:00,  1.53it/s]  
100%|██████████| 5000/5000 [50:07<00:00,  1.66it/s]  
100%|██████████| 5000/5000 [46:47<00:00,  1.78it/s]  
