In [None]:
!pip install mujoco

Collecting mujoco
  Downloading mujoco-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco)
  Downloading glfw-2.6.4-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: glfw, mujoco
Successfully installed glfw-2.6.4 mujoco-3.1.0


In [None]:
#@title Set up rendering, check installation

from google.colab import files

import distutils.util
import os
import subprocess
if subprocess.run('nvidia-smi').returncode:
  raise RuntimeError(
      'Cannot communicate with GPU. '
      'Make sure you are using a GPU Colab runtime. '
      'Go to the Runtime menu and select Choose runtime type.')

# Add an ICD config so that glvnd can pick up the Nvidia EGL driver.
# This is usually installed as part of an Nvidia driver package, but the Colab
# kernel doesn't install its driver via APT, and as a result the ICD is missing.
# (https://github.com/NVIDIA/libglvnd/blob/master/src/EGL/icd_enumeration.md)
NVIDIA_ICD_CONFIG_PATH = '/usr/share/glvnd/egl_vendor.d/10_nvidia.json'
if not os.path.exists(NVIDIA_ICD_CONFIG_PATH):
  with open(NVIDIA_ICD_CONFIG_PATH, 'w') as f:
    f.write("""{
    "file_format_version" : "1.0.0",
    "ICD" : {
        "library_path" : "libEGL_nvidia.so.0"
    }
}
""")

# Configure MuJoCo to use the EGL rendering backend (requires GPU)
print('Setting environment variable to use GPU rendering:')
%env MUJOCO_GL=egl

try:
  print('Checking that the installation succeeded:')
  import mujoco
  mujoco.MjModel.from_xml_string('<mujoco/>')
except Exception as e:
  raise e from RuntimeError(
      'Something went wrong during installation. Check the shell output above '
      'for more information.\n'
      'If using a hosted Colab runtime, make sure you enable GPU acceleration '
      'by going to the Runtime menu and selecting "Choose runtime type".')

print('Installation successful.')

Setting environment variable to use GPU rendering:
env: MUJOCO_GL=egl
Checking that the installation succeeded:
Installation successful.


In [None]:
!pip install gym[classic_control,mujoco,atari,accept-rom-license]==0.25.2

Collecting pygame==2.1.0 (from gym[accept-rom-license,atari,classic_control,mujoco]==0.25.2)
  Downloading pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ale-py~=0.7.5 (from gym[accept-rom-license,atari,classic_control,mujoco]==0.25.2)
  Downloading ale_py-0.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autorom[accept-rom-license]~=0.4.2 (from gym[accept-rom-license,atari,classic_control,mujoco]==0.25.2)
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting mujoco==2.2.0 (from gym[accept-rom-license,atari,classic_control,mujoco]==0.25.2)
  Downloading mujoco-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━

In [None]:
from typing import Sequence, Callable, Tuple, Optional, Union, List, Dict

import numpy as np
import torch
from torch import nn
from torch import distributions
import cv2

import gym
from gym import wrappers
from gym.wrappers.record_episode_statistics import RecordEpisodeStatistics
from gym.wrappers.rescale_action import RescaleAction
from gym.wrappers.clip_action import ClipAction

import time
import argparse
import os

import tqdm

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import animation
from IPython.display import display, clear_output, HTML
import numpy as np

In [None]:
# from mujoco_py import GlfwContext
# GlfwContext(offscreen=True)  # Create a window to init GLFW.

  and should_run_async(code)


In [None]:
# Device used for every tensor and network below: the GPU when PyTorch can
# see one, otherwise the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"

In [None]:
def from_numpy(data: Union[np.ndarray, dict], **kwargs):
    """
    Convert a NumPy array (or a possibly nested dict of arrays) to tensors on `device`.

    float64 arrays are down-cast to float32; every other dtype is kept.
    `kwargs` are passed to `torch.from_numpy` for a top-level array only; they
    are not forwarded to the values of a dict.
    """
    if not isinstance(data, dict):
        tensor = torch.from_numpy(data, **kwargs)
        # Down-cast double precision input to single precision.
        if tensor.dtype == torch.float64:
            tensor = tensor.float()
        return tensor.to(device)

    # Dicts are converted value by value, keeping the same keys.
    converted = {}
    for key, value in data.items():
        converted[key] = from_numpy(value)
    return converted

def to_numpy(tensor: Union[torch.Tensor, dict]):
    """
    Convert a tensor (or a possibly nested dict of tensors) to NumPy arrays.

    Each tensor is moved to the CPU and detached from the autograd graph
    before the conversion.
    """
    if not isinstance(tensor, dict):
        on_cpu = tensor.to("cpu")
        return on_cpu.detach().numpy()

    return {name: to_numpy(item) for name, item in tensor.items()}

In [None]:
class SoftActorCritic(nn.Module):
    """
    Actor-critic agent with an ensemble of Q-critics and an optional entropy bonus.

    The actor maps a batch of observations to a `torch.distributions.Distribution`
    over actions. Each critic maps an (observation, action) batch to one Q-value
    per element. A second set of "target" critics, kept in sync with the critics
    by hard or soft updates, provides the bootstrap values for the Bellman backup.

    Args:
        observation_shape: shape of a single observation, forwarded to the factories.
        action_dim: dimensionality of a single action.
        make_actor / make_critic: factories called as `f(observation_shape, action_dim)`.
        make_actor_optimizer / make_critic_optimizer: factories called with the
            parameters of the actor / of all critics.
        make_actor_schedule / make_critic_schedule: factories called with the
            corresponding optimizer; the schedulers are stepped once per `update`.
        discount: discount factor used in the Bellman target.
        target_update_period: if not None, copy critics into target critics every
            this many steps (hard update).
        soft_target_update_rate: interpolation rate used when
            `target_update_period` is None (soft update).
        actor_gradient_type: "reinforce" or "reparametrize".
        num_actor_samples: actions sampled per observation for the REINFORCE loss.
        num_critic_updates: critic gradient steps per call to `update`.
        num_critic_networks: size of the critic ensemble.
        target_critic_backup_type: how the target critics are combined into
            bootstrap values, see `q_backup_strategy`.
        use_entropy_bonus: add `temperature * entropy` to the actor objective.
        temperature: weight of the entropy terms.
        backup_entropy: also add the entropy term to the critic targets
            (only when `use_entropy_bonus` is set).
    """

    # Number of target critics drawn at random for the "redq" backup.
    REDQ_SUBSET_SIZE = 2

    def __init__(
        self,
        observation_shape: Sequence[int],
        action_dim: int,
        make_actor: Callable[[Tuple[int, ...], int], nn.Module],
        make_actor_optimizer: Callable[[torch.nn.ParameterList], torch.optim.Optimizer],
        make_actor_schedule: Callable[
            [torch.optim.Optimizer], torch.optim.lr_scheduler._LRScheduler
        ],
        make_critic: Callable[[Tuple[int, ...], int], nn.Module],
        make_critic_optimizer: Callable[
            [torch.nn.ParameterList], torch.optim.Optimizer
        ],
        make_critic_schedule: Callable[
            [torch.optim.Optimizer], torch.optim.lr_scheduler._LRScheduler
        ],
        discount: float,
        target_update_period: Optional[int] = None,
        soft_target_update_rate: Optional[float] = None,
        # Actor-critic configuration
        actor_gradient_type: str = "reinforce",  # One of "reinforce" or "reparametrize"
        num_actor_samples: int = 1,
        num_critic_updates: int = 1,
        # Settings for multiple critics
        num_critic_networks: int = 1,
        target_critic_backup_type: str = "mean",  # One of "doubleq", "min", "redq", or "mean"
        # Soft actor-critic
        use_entropy_bonus: bool = False,
        temperature: float = 0.0,
        backup_entropy: bool = True,
    ):
        super().__init__()

        assert target_critic_backup_type in [
            "doubleq",
            "min",
            "mean",
            "redq",
        ], f"{target_critic_backup_type} is not a valid target critic backup type"

        assert actor_gradient_type in [
            "reinforce",
            "reparametrize",
        ], f"{actor_gradient_type} is not a valid type of actor gradient update"

        assert (
            target_update_period is not None or soft_target_update_rate is not None
        ), "Must specify either target_update_period or soft_target_update_rate"

        self.actor = make_actor(observation_shape, action_dim)
        self.actor_optimizer = make_actor_optimizer(self.actor.parameters())
        self.actor_lr_scheduler = make_actor_schedule(self.actor_optimizer)

        self.critics = nn.ModuleList(
            [
                make_critic(observation_shape, action_dim)
                for _ in range(num_critic_networks)
            ]
        )

        # One optimizer (and schedule) drives every critic in the ensemble.
        self.critic_optimizer = make_critic_optimizer(self.critics.parameters())
        self.critic_lr_scheduler = make_critic_schedule(self.critic_optimizer)
        self.target_critics = nn.ModuleList(
            [
                make_critic(observation_shape, action_dim)
                for _ in range(num_critic_networks)
            ]
        )
        # Start with target critics that are exact copies of the critics.
        self.update_target_critic()

        self.observation_shape = observation_shape
        self.action_dim = action_dim
        self.discount = discount
        self.target_update_period = target_update_period
        self.target_critic_backup_type = target_critic_backup_type
        self.num_critic_networks = num_critic_networks
        self.use_entropy_bonus = use_entropy_bonus
        self.temperature = temperature
        self.actor_gradient_type = actor_gradient_type
        self.num_actor_samples = num_actor_samples
        self.num_critic_updates = num_critic_updates
        self.soft_target_update_rate = soft_target_update_rate
        self.backup_entropy = backup_entropy

        self.critic_loss = nn.MSELoss()

    def get_action(self, observation: np.ndarray) -> np.ndarray:
        """
        Sample one action for a single (un-batched) observation.

        Returns a NumPy array of shape `(action_dim,)`.
        """
        with torch.no_grad():
            # Add a leading batch dimension of size one for the actor.
            observation = from_numpy(observation)[None]

            action_distribution: torch.distributions.Distribution = self.actor(observation)
            action: torch.Tensor = action_distribution.sample()

            assert action.shape == (1, self.action_dim), action.shape
            return to_numpy(action).squeeze(0)

    def critic(self, obs: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
        """
        Compute the (ensembled) Q-values for the given state-action pair.

        The outputs of all critics are stacked along a new leading dimension.
        """
        return torch.stack([critic(obs, action) for critic in self.critics], dim=0)

    def target_critic(self, obs: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
        """
        Compute the (ensembled) target Q-values for the given state-action pair.

        The outputs of all target critics are stacked along a new leading dimension.
        """
        return torch.stack(
            [critic(obs, action) for critic in self.target_critics], dim=0
        )

    def q_backup_strategy(self, next_qs: torch.Tensor) -> torch.Tensor:
        """
        Combine the target critics' next-state Q-values into bootstrap values.

        Args:
            next_qs: tensor of shape `(num_critic_networks, batch_size)`.

        Returns:
            A tensor of the same shape holding, for every critic, the value it
            should bootstrap from, according to `target_critic_backup_type`:

            - "mean": the ensemble mean (shared by all critics).
            - "min": the ensemble minimum (shared by all critics).
            - "doubleq": the estimate of a different target critic (the ensemble
              rolled by one position; a plain swap for two critics).
            - "redq": the minimum over a random subset of `REDQ_SUBSET_SIZE`
              target critics (shared by all critics).
        """
        assert (
            next_qs.ndim == 2
        ), f"next_qs should have shape (num_critics, batch_size) but got {next_qs.shape}"
        num_critic_networks, batch_size = next_qs.shape
        assert num_critic_networks == self.num_critic_networks, next_qs.shape

        if self.target_critic_backup_type == "doubleq":
            next_qs = torch.roll(next_qs, shifts=1, dims=0)
        elif self.target_critic_backup_type == "min":
            next_qs = next_qs.min(dim=0).values
        elif self.target_critic_backup_type == "redq":
            subset_size = min(self.REDQ_SUBSET_SIZE, num_critic_networks)
            subset = torch.randperm(num_critic_networks, device=next_qs.device)[
                :subset_size
            ]
            next_qs = next_qs[subset].min(dim=0).values
        else:  # "mean"
            next_qs = next_qs.mean(dim=0)

        # Reductions drop the ensemble dimension; restore it so that every
        # critic receives the same target.
        if next_qs.shape == (batch_size,):
            next_qs = (
                next_qs[None].expand((num_critic_networks, batch_size)).contiguous()
            )

        return next_qs

    def update_critic(
        self,
        obs: torch.Tensor,
        action: torch.Tensor,
        reward: torch.Tensor,
        next_obs: torch.Tensor,
        done: torch.Tensor,
    ):
        """
        Update the critic networks by computing target values and minimizing Bellman error.

        `reward` must be one-dimensional; its length defines the batch size.
        Returns a dict with the scalar "critic_loss", mean "q_values" and mean
        "target_values" of this step.
        """
        (batch_size,) = reward.shape

        # Compute target values
        # Important: we don't need gradients for target values!
        with torch.no_grad():
            # Sample next actions from the current actor
            next_action_distribution: torch.distributions.Distribution = self.actor(next_obs)
            next_action = next_action_distribution.sample()

            # Compute the next Q-values for the sampled actions
            next_qs = self.target_critic(next_obs, next_action)

            # Handle Q-values from multiple different target critic networks
            # (mean, min, double-Q, REDQ)
            next_qs = self.q_backup_strategy(next_qs)

            assert next_qs.shape == (
                self.num_critic_networks,
                batch_size,
            ), next_qs.shape

            if self.use_entropy_bonus and self.backup_entropy:
                # Soft value: add the (sampled) entropy of the next-action
                # distribution, weighted by the temperature.
                next_action_entropy = self.entropy(next_action_distribution)
                next_qs = next_qs + self.temperature * next_action_entropy

            # Compute the target Q-value; no bootstrapping past terminal states.
            target_values = reward + self.discount * (1 - done.type(next_qs.dtype)) * next_qs

            assert target_values.shape == (
                self.num_critic_networks,
                batch_size
            )

        # Predict Q-values
        q_values = self.critic(obs, action)
        assert q_values.shape == (self.num_critic_networks, batch_size), q_values.shape

        # Compute loss
        loss = self.critic_loss(q_values, target_values)

        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

        return {
            "critic_loss": loss.item(),
            "q_values": q_values.mean().item(),
            "target_values": target_values.mean().item(),
        }

    def entropy(self, action_distribution: torch.distributions.Distribution):
        """
        Compute the (approximate) entropy of the action distribution for each batch element.

        This is a single-sample estimate: the negative log-probability of one
        action drawn with `rsample()`, so the estimate stays differentiable with
        respect to the distribution parameters.
        """
        samples = action_distribution.rsample()
        return -action_distribution.log_prob(samples)

    def actor_loss_reinforce(self, obs: torch.Tensor):
        """
        Score-function (REINFORCE) actor loss.

        Draws `num_actor_samples` actions per observation, scores them with the
        mean of the critic ensemble (no gradient through the critics or the
        samples) and weights their log-probabilities by those scores.

        Returns `(loss, mean_entropy)`.
        """
        batch_size = obs.shape[0]

        action_distribution: torch.distributions.Distribution = self.actor(obs)

        with torch.no_grad():
            # Draw num_actor_samples samples from the action distribution for each batch element
            action = action_distribution.sample((self.num_actor_samples,))
            assert action.shape == (
                self.num_actor_samples,
                batch_size,
                self.action_dim,
            ), action.shape

            # Compute Q-values for every sampled action; the observations are
            # broadcast along the new leading sample dimension.
            q_values = self.critic(obs[None].expand((self.num_actor_samples,
                                                      batch_size,
                                                      -1)), action)
            assert q_values.shape == (
                self.num_critic_networks,
                self.num_actor_samples,
                batch_size,
            ), q_values.shape

            # Our best guess of the Q-values is the mean of the ensemble
            q_values = torch.mean(q_values, dim=0)
            advantage = q_values

        # Do REINFORCE: calculate log-probs and use the Q-values
        log_probs = action_distribution.log_prob(action)
        loss = -torch.mean(log_probs * advantage)

        return loss, torch.mean(self.entropy(action_distribution))

    def actor_loss_reparametrize(self, obs: torch.Tensor):
        """
        Pathwise (reparametrized) actor loss.

        Draws one action per observation with `rsample()` so that the gradient
        of the critics' Q-values flows back into the actor.

        Returns `(loss, mean_entropy)`.
        """
        # Sample from the actor
        action_distribution: torch.distributions.Distribution = self.actor(obs)
        action = action_distribution.rsample()

        # Compute Q-values for the sampled state-action pair
        q_values = self.critic(obs, action)

        # Maximise the Q-values, i.e. minimise their negation
        loss = -torch.mean(q_values)

        return loss, torch.mean(self.entropy(action_distribution))

    def update_actor(self, obs: torch.Tensor):
        """
        Update the actor by one gradient step using either REPARAMETRIZE or REINFORCE.

        Returns a dict with the scalar "actor_loss" and "entropy" of this step.
        """

        if self.actor_gradient_type == "reparametrize":
            loss, entropy = self.actor_loss_reparametrize(obs)
        elif self.actor_gradient_type == "reinforce":
            loss, entropy = self.actor_loss_reinforce(obs)
        else:
            # Only reachable if the attribute was modified after construction.
            raise ValueError(
                f"{self.actor_gradient_type} is not a valid type of actor gradient update"
            )

        # Add entropy if necessary
        if self.use_entropy_bonus:
            loss = loss - self.temperature * entropy

        self.actor_optimizer.zero_grad()
        loss.backward()
        self.actor_optimizer.step()

        return {"actor_loss": loss.item(), "entropy": entropy.item()}

    def update_target_critic(self):
        """Hard update: copy every critic's parameters into its target critic."""
        self.soft_update_target_critic(1.0)

    def soft_update_target_critic(self, tau):
        """
        Move every target-critic parameter towards its critic counterpart:
        `target <- (1 - tau) * target + tau * critic`.
        """
        for target_critic, critic in zip(self.target_critics, self.critics):
            for target_param, param in zip(
                target_critic.parameters(), critic.parameters()
            ):
                target_param.data.copy_(
                    target_param.data * (1.0 - tau) + param.data * tau
                )

    def update(
        self,
        observations: torch.Tensor,
        actions: torch.Tensor,
        rewards: torch.Tensor,
        next_observations: torch.Tensor,
        dones: torch.Tensor,
        step: int,
    ):
        """
        Update the actor and critic networks.

        Runs `num_critic_updates` critic steps and one actor step on the batch,
        synchronises the target critics (soft update every call, or hard update
        every `target_update_period` steps) and advances both LR schedules.

        Returns the merged actor/critic statistics plus the current learning rates.
        """

        critic_infos = []
        for _ in range(self.num_critic_updates):
            critic_infos.append(self.update_critic(observations,
                                                   actions,
                                                   rewards,
                                                   next_observations,
                                                   dones))

        actor_info = self.update_actor(observations)

        # Perform either hard or soft target updates.
        if self.target_update_period is None:
            self.soft_update_target_critic(self.soft_target_update_rate)
        elif step % self.target_update_period == 0:
            self.update_target_critic()

        # Average the critic info over all of the steps (empty when no critic
        # update was requested).
        critic_info = (
            {k: np.mean([info[k] for info in critic_infos]) for k in critic_infos[0]}
            if critic_infos
            else {}
        )

        # Deal with LR scheduling
        self.actor_lr_scheduler.step()
        self.critic_lr_scheduler.step()

        return {
            **actor_info,
            **critic_info,
            "actor_lr": self.actor_lr_scheduler.get_last_lr()[0],
            "critic_lr": self.critic_lr_scheduler.get_last_lr()[0],
        }

In [None]:
class ReplayBuffer:
    """
    Fixed-capacity ring buffer of environment transitions.

    Storage arrays are allocated lazily on the first `insert`, using the shape
    and dtype of that first transition. Once `capacity` transitions have been
    stored, new ones overwrite the oldest.
    """

    def __init__(self, capacity=1000000):
        self.max_size = capacity
        # Total number of insertions so far; keeps growing past `max_size`.
        self.size = 0
        self.observations = None
        self.actions = None
        self.rewards = None
        self.next_observations = None
        self.dones = None

    def sample(self, batch_size):
        """
        Draw `batch_size` stored transitions uniformly at random (with replacement).

        Returns a dict of arrays keyed by "observations", "actions", "rewards",
        "next_observations" and "dones".

        Raises:
            ValueError: if nothing has been inserted yet.
        """
        if self.size == 0:
            raise ValueError("Cannot sample from an empty replay buffer.")

        # Only `max_size` slots exist. Drawing directly from the occupied slots
        # keeps sampling uniform after the buffer has wrapped around, whereas
        # drawing from [0, size) and reducing modulo `max_size` over-represents
        # the low slots.
        num_stored = min(self.size, self.max_size)
        rand_indices = np.random.randint(0, num_stored, size=(batch_size,))
        return {
            "observations": self.observations[rand_indices],
            "actions": self.actions[rand_indices],
            "rewards": self.rewards[rand_indices],
            "next_observations": self.next_observations[rand_indices],
            "dones": self.dones[rand_indices],
        }

    def __len__(self):
        """Number of transitions inserted so far (not capped at the capacity)."""
        return self.size

    def insert(
        self,
        /,
        observation: np.ndarray,
        action: np.ndarray,
        reward: np.ndarray,
        next_observation: np.ndarray,
        done: np.ndarray,
    ):
        """
        Insert a single transition into the replay buffer.

        Plain Python scalars for `reward`, `done` and `action` are wrapped in
        zero-dimensional arrays first.

        Use like:
            replay_buffer.insert(
                observation=observation,
                action=action,
                reward=reward,
                next_observation=next_observation,
                done=done,
            )
        """
        if isinstance(reward, (float, int)):
            reward = np.array(reward)
        if isinstance(done, bool):
            done = np.array(done)
        if isinstance(action, int):
            action = np.array(action, dtype=np.int64)

        if self.observations is None:
            # First insertion: allocate storage matching this transition.
            self.observations = np.empty(
                (self.max_size, *observation.shape), dtype=observation.dtype
            )
            self.actions = np.empty((self.max_size, *action.shape), dtype=action.dtype)
            self.rewards = np.empty((self.max_size, *reward.shape), dtype=reward.dtype)
            self.next_observations = np.empty(
                (self.max_size, *next_observation.shape), dtype=next_observation.dtype
            )
            self.dones = np.empty((self.max_size, *done.shape), dtype=done.dtype)

        assert observation.shape == self.observations.shape[1:]
        assert action.shape == self.actions.shape[1:]
        assert reward.shape == ()
        assert next_observation.shape == self.next_observations.shape[1:]
        assert done.shape == ()

        # Ring-buffer position: wraps around and overwrites the oldest entry.
        slot = self.size % self.max_size
        self.observations[slot] = observation
        self.actions[slot] = action
        self.rewards[slot] = reward
        self.next_observations[slot] = next_observation
        self.dones[slot] = done

        self.size += 1

In [None]:
def sample_trajectory(
    env: gym.Env, policy, max_length: int, render: bool = False
) -> Dict[str, np.ndarray]:
    """
    Sample a rollout in the environment from a policy.

    The rollout stops when the environment reports `done` or after `max_length`
    transitions, whichever comes first (`max_length=None` disables the cap).
    The environment is closed before returning.

    Args:
        env: environment using the 4-tuple `step` API (`obs, reward, done, info`).
        policy: object with a `get_action(observation)` method.
        max_length: maximum number of transitions in the rollout.
        render: if True, also record one 250x250 frame per step.

    Returns:
        Dict with float32 arrays "observation", "reward", "action",
        "next_observation", "terminal", the uint8 array "image_obs" (empty when
        not rendering) and "episode_statistics" with the length "l" and the
        return "r", updated from `info["episode"]` when the environment provides it.
    """
    ob = env.reset()
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0

    while True:
        # render an image
        if render:
            if hasattr(env, "sim"):
                img = env.sim.render(camera_name="track", height=500, width=500)[::-1]
            else:
                img = env.render(mode="rgb_array")

            if isinstance(img, list):
                img = img[0]

            image_obs.append(
                cv2.resize(img, dsize=(250, 250), interpolation=cv2.INTER_CUBIC)
            )

        ac = policy.get_action(ob)

        next_ob, rew, done, info = env.step(ac)

        steps += 1
        # `>=` so that a rollout never holds more than `max_length` transitions.
        hit_length_cap = max_length is not None and steps >= max_length
        rollout_done = bool(done) or hit_length_cap

        # record result of taking that action
        obs.append(ob)
        acs.append(ac)
        rewards.append(rew)
        next_obs.append(next_ob)
        terminals.append(rollout_done)

        ob = next_ob  # jump to next timestep

        # end the rollout if the rollout ended
        if rollout_done:
            break

    episode_statistics = {"l": steps, "r": np.sum(rewards)}
    # Prefer the statistics recorded by the environment, when available.
    if "episode" in info:
        episode_statistics.update(info["episode"])

    env.close()

    return {
        "observation": np.array(obs, dtype=np.float32),
        "image_obs": np.array(image_obs, dtype=np.uint8),
        "reward": np.array(rewards, dtype=np.float32),
        "action": np.array(acs, dtype=np.float32),
        "next_observation": np.array(next_obs, dtype=np.float32),
        "terminal": np.array(terminals, dtype=np.float32),
        "episode_statistics": episode_statistics,
    }

def sample_n_trajectories(
    env: gym.Env, policy, ntraj: int, max_length: int, render: bool = False
):
    """
    Collect `ntraj` rollouts from the same environment and policy.

    Returns the list of rollout dicts produced by `sample_trajectory`, in
    collection order.
    """
    return [
        sample_trajectory(env, policy, max_length, render)
        for _ in range(ntraj)
    ]

def log_paths_as_videos(paths, max_videos_to_save=2):
    """
    Stack the recorded frames of several rollouts into one array of equal-length videos.

    Takes the "image_obs" frames of the first `max_videos_to_save` rollouts,
    pads the shorter ones by repeating their last frame, and stacks them along
    a new leading axis.
    """
    clips = [path['image_obs'] for path in paths]

    # Never keep more clips than were recorded.
    n_keep = np.min([max_videos_to_save, len(clips)])

    # Common length: the longest kept clip, and never less than the first clip.
    target_len = max(
        [clips[0].shape[0]] + [clips[k].shape[0] for k in range(n_keep)]
    )

    # Pad the shorter clips by freezing on their final frame.
    for k in range(n_keep):
        missing = target_len - clips[k].shape[0]
        if missing > 0:
            frozen_tail = np.tile([clips[k][-1]], (missing, 1, 1, 1))
            clips[k] = np.concatenate([clips[k], frozen_tail], 0)

    return np.stack(clips[:n_keep], 0)

def plot_trajectories(videos):
    """
    Show several rollouts side by side as a single inline HTML5 animation.

    `videos` is indexed as `videos[trajectory, frame, ...]`; every trajectory
    gets its own subplot and all subplots advance frame by frame together.
    """
    fig = plt.figure()
    n_trajs = videos.shape[0]

    # One image artist per trajectory, initialised with its first frame.
    panels = []
    for idx in range(n_trajs):
        fig.add_subplot(1, n_trajs, idx + 1)
        panels.append(plt.imshow(videos[idx, 0, ...]))

    plt.close() # this is required to not display the generated image

    def _show_frame(frame_idx):
        # Point every panel at the requested frame of its own trajectory.
        for idx, panel in enumerate(panels):
            panel.set_data(videos[idx, frame_idx, ...])

        return panels

    def init():
        return _show_frame(0)

    def animate(i):
        return _show_frame(i)

    anim = animation.FuncAnimation(fig,
                                   animate,
                                   init_func=init,
                                   frames=videos.shape[1],
                                   interval=25,
                                   repeat=False)

    # Replace the cell's previous output with the rendered video.
    clear_output(True)
    display(HTML(anim.to_html5_video()))

def run_training_loop(config,
                      seed=42):
    """
    Train a SoftActorCritic agent and optionally render a few final rollouts.

    Args:
        config: dict with the keys
            "make_env" (factory; also called with `render=True`),
            "agent_kwargs" (forwarded to SoftActorCritic),
            "replay_buffer_capacity", "total_steps", "random_steps",
            "learning_starts", "batch_size", "log_interval", "eval_interval",
            "num_eval_trajectories" and "num_render_trajectories".
        seed: seed for NumPy, PyTorch, the training/evaluation environments
            and the action space used for the initial random exploration.
    """
    # set random seeds
    np.random.seed(seed)
    torch.manual_seed(seed)

    env = config["make_env"]()
    eval_env = config["make_env"]()
    render_env = config["make_env"](render=True)

    # make the gym environment
    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    assert (
        not discrete
    ), "Our actor-critic implementation only supports continuous action spaces. (This isn't a fundamental limitation, just a current implementation decision.)"

    ob_shape = env.observation_space.shape
    ac_dim = env.action_space.shape[0]

    agent = SoftActorCritic(
        ob_shape,
        ac_dim,
        **config["agent_kwargs"],
    )

    ep_len = env.spec.max_episode_steps

    # The environments and the action space own their random generators, which
    # the global NumPy/PyTorch seeds above do not reach. Seed them explicitly
    # so that the random exploration phase and the initial states are
    # reproducible as well.
    env.action_space.seed(seed)
    eval_env.reset(seed=seed + 1)
    observation = env.reset(seed=seed)

    replay_buffer = ReplayBuffer(config["replay_buffer_capacity"])

    stats = {}

    t = tqdm.trange(config["total_steps"], dynamic_ncols=True)

    for step in t:
        if step < config["random_steps"]:
            # Warm-up: uniformly random actions to fill the replay buffer
            action = env.action_space.sample()
        else:
            action = agent.get_action(observation)

        # Step the environment and add the data to the replay buffer
        next_observation, reward, done, info = env.step(action)
        replay_buffer.insert(
            observation=observation,
            action=action,
            reward=reward,
            next_observation=next_observation,
            # A time-limit truncation is not a true terminal state, so the
            # critic should still bootstrap from the next observation.
            done=done and not info.get("TimeLimit.truncated", False),
        )

        # Handle episode termination
        if done:
            observation = env.reset()
        else:
            observation = next_observation

        # Main actor-critic training step
        if step >= config["learning_starts"]:
            # Sample config["batch_size"] transitions from the replay buffer
            batch = replay_buffer.sample(config["batch_size"])

            # Convert to PyTorch tensors
            batch = from_numpy(batch)
            # The returned statistics already include the current learning rates.
            update_info = agent.update(batch["observations"],
                                       batch["actions"],
                                       batch["rewards"],
                                       batch["next_observations"],
                                       batch["dones"],
                                       step)

            # Logging code
            if step % config["log_interval"] == 0:
                for k, v in update_info.items():
                    stats[k] = v

                t.set_postfix(stats, refresh=True)

        if step % config["eval_interval"] == 0:
            # Evaluate
            trajectories = sample_n_trajectories(
                eval_env,
                agent,
                config["num_eval_trajectories"],
                ep_len,
            )
            returns = [traj["episode_statistics"]["r"] for traj in trajectories]
            ep_lens = [traj["episode_statistics"]["l"] for traj in trajectories]

            stats["eval_return"] = np.mean(returns)
            stats["eval_ep_len"] = np.mean(ep_lens)

            # Spread statistics only make sense for more than one rollout
            if len(returns) > 1:
                stats["eval/return_std"] = np.std(returns)
                stats["eval/return_max"] = np.max(returns)
                stats["eval/return_min"] = np.min(returns)
                stats["eval/ep_len_std"] = np.std(ep_lens)
                stats["eval/ep_len_max"] = np.max(ep_lens)
                stats["eval/ep_len_min"] = np.min(ep_lens)

            # Show the fresh evaluation results even for a single rollout
            t.set_postfix(stats, refresh=True)

    if config["num_render_trajectories"] > 0:
        video_trajectories = sample_n_trajectories(
            render_env,
            agent,
            config["num_render_trajectories"],
            ep_len,
            render=True,
        )

        videos = log_paths_as_videos(video_trajectories,
                                        max_videos_to_save=5)

        plot_trajectories(videos)


In [None]:
# Lookup table from activation names to ready-made activation modules,
# used by build_mlp to resolve string arguments.
_str_to_activation = dict(
    relu=nn.ReLU(),
    tanh=nn.Tanh(),
    leaky_relu=nn.LeakyReLU(),
    sigmoid=nn.Sigmoid(),
    selu=nn.SELU(),
    softplus=nn.Softplus(),
    identity=nn.Identity(),
)

def build_mlp(
    input_size: int,
    output_size: int,
    n_layers: int,
    size: int,
    activation = "tanh",
    output_activation = "identity",
):
    """
    Build a feedforward network and move it to `device`.

    arguments:
        input_size: size of the input layer
        output_size: size of the output layer
        n_layers: number of hidden layers
        size: dimension of each hidden layer
        activation: activation after each hidden layer, either a module or a
            key of `_str_to_activation`
        output_activation: activation after the output layer, same convention

    returns:
        an `nn.Sequential` made of `n_layers` (Linear, activation) pairs
        followed by the output Linear layer and the output activation
    """
    hidden_act = (
        _str_to_activation[activation] if isinstance(activation, str) else activation
    )
    final_act = (
        _str_to_activation[output_activation]
        if isinstance(output_activation, str)
        else output_activation
    )

    # Input width of every Linear layer: the input size first, then `size`
    # for each layer that follows a hidden layer.
    widths = [input_size] + [size] * n_layers

    layers = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        layers.extend((nn.Linear(fan_in, fan_out), hidden_act))
    layers.extend((nn.Linear(widths[-1], output_size), final_act))

    mlp = nn.Sequential(*layers)
    mlp.to(device)
    return mlp

class MLPPolicy(nn.Module):
    """
    MLP policy mapping a (batched) observation to a distribution over actions.

    The network emits two values per action dimension: the mean and a raw
    standard-deviation parameter of a Gaussian. `forward()` squashes that
    Gaussian through `tanh` and returns it as a distribution.
    """

    def __init__(
        self,
        ac_dim: int,
        ob_dim: int,
        n_layers: int,
        layer_size: int
    ):
        super().__init__()

        # Twice the action dimension: one half for means, one half for stds.
        backbone = build_mlp(
            input_size=ob_dim,
            output_size=2*ac_dim,
            n_layers=n_layers,
            size=layer_size,
        )
        self.net = backbone.to(device)

    def _std_like(
        self, mean: torch.Tensor, std: Union[float, torch.Tensor]
    ) -> torch.Tensor:
        """Turn a float or scalar-tensor `std` into a tensor shaped like `mean`."""
        if isinstance(std, float):
            std = torch.tensor(std, device=mean.device)

        if std.shape == ():
            std = std.expand(mean.shape)

        return std

    def make_tanh_transformed(
        self, mean: torch.Tensor, std: Union[float, torch.Tensor]
    ) -> distributions.Distribution:
        """
        Diagonal Gaussian squashed through `tanh`, with the last dimension
        treated as the event dimension.
        """
        squashed = distributions.TransformedDistribution(
            base_distribution=distributions.Normal(mean, self._std_like(mean, std)),
            transforms=[distributions.TanhTransform(cache_size=1)],
        )
        return distributions.Independent(squashed, reinterpreted_batch_ndims=1)

    def make_multi_normal(
        self, mean: torch.Tensor, std: Union[float, torch.Tensor]
    ) -> distributions.Distribution:
        """
        Plain diagonal Gaussian, with the last dimension treated as the event
        dimension.
        """
        gaussian = distributions.Normal(mean, self._std_like(mean, std))
        return distributions.Independent(gaussian, reinterpreted_batch_ndims=1)

    def forward(self, obs: torch.FloatTensor):
        """
        Run the network on `obs` and return the tanh-squashed Gaussian action
        distribution it parametrises.
        """
        raw = self.net(obs)
        mean, raw_std = torch.chunk(raw, 2, dim=-1)
        # softplus keeps the std positive; the offset bounds it away from zero.
        std = torch.nn.functional.softplus(raw_std) + 1e-2

        return self.make_tanh_transformed(mean, std)


In [None]:
class StateActionCritic(nn.Module):
    """Q-function: an MLP mapping an (observation, action) pair to one scalar."""

    def __init__(self, ob_dim, ac_dim, n_layers, size):
        super().__init__()
        # Observation and action are concatenated, hence the summed input width.
        q_network = build_mlp(
            input_size=ob_dim + ac_dim,
            output_size=1,
            n_layers=n_layers,
            size=size,
        )
        self.net = q_network.to(device)

    def forward(self, obs, acs):
        """Return the Q-values with the trailing singleton dimension removed."""
        joint_input = torch.cat([obs, acs], dim=-1)
        q_values = self.net(joint_input)
        return q_values.squeeze(-1)


def make_critic(observation_shape, action_dim) -> nn.Module:
    """Create a Q-network with 3 hidden layers of 128 units over the flattened observation."""
    flat_ob_dim = np.prod(observation_shape)
    return StateActionCritic(
        ob_dim=flat_ob_dim,
        ac_dim=action_dim,
        n_layers=3,
        size=128,
    )

def make_actor(observation_shape, action_dim) -> nn.Module:
    """
    Create the policy network with 3 hidden layers of 128 units.

    Only flat (one-dimensional) observations are supported.
    """
    assert len(observation_shape) == 1
    flat_ob_dim = np.prod(observation_shape)
    return MLPPolicy(
        ac_dim=action_dim,
        ob_dim=flat_ob_dim,
        n_layers=3,
        layer_size=128
    )

def make_actor_optimizer(params: torch.nn.ParameterList) -> torch.optim.Optimizer:
    """Adam optimizer for the actor parameters with a learning rate of 3e-4."""
    actor_lr = 3e-4
    return torch.optim.Adam(params, lr=actor_lr)

def make_critic_optimizer(params: torch.nn.ParameterList) -> torch.optim.Optimizer:
    """Adam optimizer for the critic parameters with a learning rate of 3e-4."""
    critic_lr = 3e-4
    return torch.optim.Adam(params, lr=critic_lr)

def make_lr_schedule(
    optimizer: torch.optim.Optimizer,
) -> torch.optim.lr_scheduler._LRScheduler:
    """Schedule that keeps the optimizer's learning rate unchanged (factor 1.0)."""
    scheduler_cls = torch.optim.lr_scheduler.ConstantLR
    return scheduler_cls(optimizer, factor=1.0)

def make_env(render: bool = False):
    """
    Build the HalfCheetah-v4 environment with the wrappers used for training.

    Actions are rescaled to [-1, 1], then clipped, and episode statistics are
    recorded. With `render=True` the environment is created with the
    "single_rgb_array" render mode.
    """
    render_mode = "single_rgb_array" if render else None
    base_env = gym.make("HalfCheetah-v4", render_mode=render_mode)  #InvertedPendulum-v4
    rescaled_env = RescaleAction(base_env, -1, 1)
    clipped_env = ClipAction(rescaled_env)
    return RecordEpisodeStatistics(clipped_env)

# Keyword arguments forwarded to SoftActorCritic by run_training_loop.
# (Each key appears exactly once; a repeated key in a dict literal is silently
# overridden by its last occurrence.)
agent_kwargs = {
    # Network / optimizer factories
    "make_critic": make_critic,
    "make_critic_optimizer": make_critic_optimizer,
    "make_critic_schedule": make_lr_schedule,
    "make_actor": make_actor,
    "make_actor_optimizer": make_actor_optimizer,
    "make_actor_schedule": make_lr_schedule,
    # Actor-critic settings
    "discount": 0.99,
    "actor_gradient_type": "reinforce",
    "num_actor_samples": 10,
    "num_critic_updates": 1,
    "num_critic_networks": 1,
    "target_critic_backup_type": "mean",
    # Entropy regularisation
    "use_entropy_bonus": True,
    "backup_entropy": True,
    "temperature": 0.2,
    # Soft target updates (no hard update period)
    "target_update_period": None,
    "soft_target_update_rate": 0.005,
}

# Number of environment steps taken by the training loop.
total_steps = 1000000

# Settings read by run_training_loop.
config = {
    "total_steps": total_steps,
    "num_render_trajectories": 3,
    "num_eval_trajectories": 10,
    "log_interval": 1000,
    "eval_interval": 5000,
    "learning_starts": 10000,
    "batch_size": 128,
    "replay_buffer_capacity": 1000000,
    "random_steps": 5000,
    "make_env": make_env,
    "agent_kwargs": agent_kwargs,
}

In [None]:
# Train with the settings in `config`; training and evaluation statistics are
# shown in the progress bar, and rendered rollouts are displayed at the end.
run_training_loop(config)

  deprecation(
  deprecation(
 71%|███████   | 711071/1000000 [2:38:53<51:56, 92.70it/s, eval_return=980, eval_ep_len=1e+3, eval/return_std=386, eval/return_max=1.26e+3, eval/return_min=-148, eval/ep_len_std=0, eval/ep_len_max=1000, eval/ep_len_min=1000, actor_loss=-38.2, entropy=-.347, critic_loss=6.42, q_values=90.2, target_values=89.9, actor_lr=0.0003, critic_lr=0.0003]