# Homework 4: Soft Actor Critic (10 Pts)

All homeworks are self-contained. They can be completed in their respective notebooks.
To edit and re-run code, you can therefore simply edit and restart the code cells below.
There is a timeout of about 12 hours with Colab while it is active (and less if you close your browser window).
This file should automatically be synced with your Google Drive. We also save all recordings and logs in it by default so that you won't lose your work in the event of an instance timeout.
 However, you will need to re-mount your Google Drive and re-install packages with every new instance.

In [None]:
# Your work will be stored in a folder called `drl_ws22` by default to prevent Colab
# instance timeouts from deleting your edits.
# We do this by mounting your google drive on the virtual machine created in this colab
# session. For this, you will likely need to sign in to your Google account and copy a
# passcode into a field below

import os
from google.colab import drive

# Make the Google Drive contents available below /content/gdrive.
# This has to be repeated for every new Colab instance.
drive.mount('/content/gdrive')

In [None]:
# Create paths in your google drive
# DRIVE_PATH keeps the shell-escaped spelling (backslash before the space in `My Drive`).
# It is a raw string so that the backslash is not read as an (invalid) Python escape sequence.
DRIVE_PATH = r'/content/gdrive/My\ Drive/drl_ws22'
# the same location without the shell escaping, for use with Python file APIs
DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
os.makedirs(DRIVE_PYTHON_PATH, exist_ok=True)

# the space in `My Drive` causes some issues,
# make a symlink to avoid this
SYM_PATH = '/content/drl_ws22'
# lexists also detects a dangling link (e.g. left over while the drive was not mounted),
# for which creating the link again would fail
if not os.path.lexists(SYM_PATH):
    os.symlink(DRIVE_PYTHON_PATH, SYM_PATH)
# NOTE: no directory change is done here. A `! cd` would only affect a throw-away subshell,
# and all paths used further down are absolute.

In [None]:
# Install **python** and **system** packages
# (packages have to be re-installed for every new Colab instance)

# install required system dependencies
# (presumably needed for the virtual display that is started further below - TODO confirm)
!apt-get install -y xvfb x11-utils

# install required python dependencies
# gym, pyvirtualdisplay and PyOpenGL are pinned to a fixed minor version
!pip install matplotlib numpy tqdm torch stable_baselines3 gym==0.21.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*

We start by importing all the necessary python modules and defining some helper
functions which you do not need to change. Still, make sure you are aware of
what they do.

In [None]:
# Imports and utility
# Progress bar

import os
import gym
import copy
import tqdm
import time
import torch
import numpy as np
import torch.nn as nn
import collections, random
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from typing import Tuple, Dict, List
from torch.distributions import Normal
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import VecVideoRecorder

# Set random seeds for every random number generator used in this notebook
random.seed(0)  # python's `random` draws the mini-batches in ReplayBuffer.sample
np.random.seed(0)
torch.manual_seed(0)  # torch initializes the network weights and samples the policy's actions

class ProgressBar:
    """
    Console progress display made of two tqdm lines: one line that shows the most recent scalar values
    and one line with the actual iteration progress bar.
    With verbose=False no tqdm objects are created and calling the instance does nothing.
    """

    def __init__(self, num_iterations: int, verbose: bool = True):
        """
        :param num_iterations: Total number of iterations that the bar represents
        :param verbose: Whether to display anything at all
        """
        self.scalar_tracker = None
        self.progress_bar = None
        if not verbose:
            return

        # first line: only a description that gets overwritten with the current scalars
        self.scalar_tracker = tqdm.tqdm(total=num_iterations, desc="Scalars", bar_format="{desc}",
                                        position=0, leave=True)
        # second line: the bar itself. The counter is padded to the width of the total
        counter_width = len(str(num_iterations))
        bar_layout = '{desc} {n_fmt:' + str(counter_width) + '}/{total_fmt}|{bar}|{elapsed}<{remaining}'
        self.progress_bar = tqdm.tqdm(total=num_iterations, desc='Iteration', bar_format=bar_layout,
                                      position=1, leave=True)

    def __call__(self, _steps: int = 1, **kwargs):
        """
        Advance the bar and show the given scalars.
        :param _steps: Number of iterations to advance the bar by
        :param kwargs: Named scalars to display. For a list, its last entry is shown
        :return: None
        """
        if self.progress_bar is None:
            return

        rendered = {}
        for name, value in kwargs.items():
            latest = value[-1] if isinstance(value, list) else value
            rendered[name] = "{:.3e}".format(latest)

        if rendered:
            description = "Scalars: " + ", ".join(f"{name}={text}" for name, text in rendered.items())
        else:
            description = "Scalars"
        self.scalar_tracker.set_description(description)
        self.progress_bar.update(_steps)

# Recordings of this run are saved to a sub-folder of the exercise directory
# that is named after the current date and time.
data_path = os.path.join('/content/drl_ws22/exercise_4', time.strftime("%d-%m-%Y_%H-%M"))
if not os.path.exists(data_path):
    os.makedirs(data_path)

# Start an invisible virtual display instead of a real one to prevent errors with colab
from pyvirtualdisplay import Display
_display = Display(visible=False, size=(1400, 900))
_ = _display.start()

# this function will automatically save your figure into your google drive folder (if correctly mounted!)
def save_figure(save_name: str) -> None:
    """
    Saves the current matplotlib figure as a .png file into the recording folder of this run
    (which lies in your google drive folder if that is correctly mounted).
    :param save_name: File name without extension
    :return: None
    """
    assert save_name is not None, "Need to provide a filename to save to"
    file_name = save_name + ".png"
    target_file = os.path.join(data_path, file_name)
    plt.savefig(target_file)


def evaluate_rollout(evaluation_environment: gym.Env, soft_actor_critic) -> float:
    """
    Performs a full rollout using the mean of the current policy, i.e., without sampling actions.
    No training happens here and no gradients are tracked.
    :param evaluation_environment: The environment used for evaluation. In our case, a Pendulum environment
    :param soft_actor_critic: An instance of the SoftActorCritic class. Only its policy is used
    :return: The total score for the rollout
    """
    done = False
    score = 0
    state = evaluation_environment.reset()
    # the policy acts in a normalized action range, the environment expects actions scaled to its own limits
    action_scale = evaluation_environment.action_space.high[0]
    while not done:  # step through the environment until the rollout is over
        with torch.no_grad():  # pure inference, no need to build a computation graph
            action, _ = soft_actor_critic.policy(torch.from_numpy(np.array(state)).float(), deterministic=True)
        scaled_action = action_scale * action.item()
        # need to wrap action in a list because of the video recording
        next_state, reward, done, _ = evaluation_environment.step([[scaled_action]])
        state = next_state  # go to the next environment step
        score += reward  # keep track of cumulative reward for recording
    return score


def v_function_visualization(evaluation_environment: gym.Env,
                             soft_actor_critic,
                             current_step: int = 0,
                             resolution: int = 100):
    """
    Visualizes a numerical approximation of the value function by evaluating the Q-Function for a wide range
    of actions on a grid over the (angle, angular velocity) state space and taking the maximum over the actions.
    The resulting contour plot is saved via save_figure.
    :param evaluation_environment: Unused, kept for interface compatibility. The Q-Networks are trained on the
      normalized actions of the policy (which are what gets stored in the replay buffer), so the action grid
      covers [-1, 1] rather than the action limits of the environment
    :param soft_actor_critic: Instance of SAC whose two Q-Networks are evaluated
    :param current_step: Current training step. Used for the plot title and the file name
    :param resolution: Number of grid points per state dimension
    :return: None
    """
    plt.clf()

    max_speed = 8
    x = np.linspace(-np.pi, np.pi, num=resolution)
    y = np.linspace(-max_speed, max_speed, num=resolution)
    # all (angle, velocity) combinations. The angle varies fastest, so that a reshape to
    # (resolution, resolution) gives rows of constant velocity
    state_evaluation_grid = np.transpose([np.tile(x, len(y)), np.repeat(y, len(x))])
    # convert the internal state to the (cos, sin, velocity) observation that the networks expect
    input_observations = torch.Tensor(np.array([np.cos(state_evaluation_grid[:, 0]),
                                                np.sin(state_evaluation_grid[:, 0]),
                                                state_evaluation_grid[:, 1]])).T

    evaluations = []
    num_grid_points = len(state_evaluation_grid)

    with torch.no_grad():  # pure inference, no need to build a computation graph
        # only query the Q-Networks for actions in the range that they are trained on
        for action in np.linspace(-1.0, 1.0, 50):
            action_tensor = torch.full((num_grid_points, 1), float(action))

            q1_val = soft_actor_critic.q_net_1(input_observations, action_tensor)
            q2_val = soft_actor_critic.q_net_2(input_observations, action_tensor)
            # pessimistic estimate: element-wise minimum of both Q-Networks
            reward_evaluation_grid = torch.min(q1_val, q2_val).reshape((resolution, resolution))

            evaluations.append(reward_evaluation_grid.numpy())

    plt.title(f"Numerically integrated V-function at step {current_step}")
    plt.xlabel(r"$\theta$")
    plt.ylabel(r"$\dot{\theta}$")
    # the value of a state is approximated by the best of the evaluated actions
    heatmap = plt.contourf(x, y, np.array(evaluations).max(axis=0), levels=100,
                           cmap=plt.get_cmap("jet"), zorder=0)
    plt.colorbar(heatmap)
    save_figure(save_name=f"numerical_v_function_{current_step:04d}")


def plot_metrics(metrics: Dict[str, List[float]]):
    """
    Plots all recorded training metrics into one figure with one subplot row per metric
    and saves it as "training_metrics". Does nothing if there are no metrics.
    :param metrics: Dictionary {metric name: list of recorded values}
    :return: None
    """
    if len(metrics) == 0:
        return

    plt.clf()
    plt.figure(figsize=(16, 9))
    num_rows = len(metrics)
    for row, (name, history) in enumerate(metrics.items(), start=1):
        plt.subplot(num_rows, 1, row)
        plt.plot(range(len(history)), np.array(history))
        plt.ylabel(name.title())
    plt.xlabel("Recorded Steps")  # ends up below the last subplot only
    plt.tight_layout()
    save_figure("training_metrics")
    # clean up the figure after saving it
    plt.clf()
    plt.close()


def evaluate(evaluation_environment: gym.Env, soft_actor_critic,
             num_evaluation_rollouts: int = 10):
    """
    Perform num_evaluation_rollouts rollouts on the evaluation environment using the current policy and average over
    the achieved scores.
    :param evaluation_environment: The environment to evaluate. Will perform num_evaluation_rollouts full rollouts on
      this environment
    :param soft_actor_critic: Instance of SAC used to determine the actions
    :param num_evaluation_rollouts: Number of rollouts to evaluate for
    :return: A dictionary {"score": mean score over all rollouts}
    """
    scores = [evaluate_rollout(evaluation_environment=evaluation_environment,
                               soft_actor_critic=soft_actor_critic)
              for _ in range(num_evaluation_rollouts)]
    return {"score": np.mean(scores)}


def visualize_rollout(soft_actor_critic, step: int):
    """
    Records a video of a single evaluation rollout of the current policy and saves it to the recording folder.
    :param soft_actor_critic: Instance of SAC used to determine the actions
    :param step: Current training step. Used for the name of the video file
    :return: None
    """
    from stable_baselines3.common.env_util import make_vec_env
    # create a fresh environment for every recording
    evaluation_environment = make_vec_env('Pendulum-v1')
    visualization_environment = VecVideoRecorder(evaluation_environment,
                                                 video_folder=data_path,
                                                 record_video_trigger=lambda x: x == 0,
                                                 video_length=200,  # 200 steps per rollout
                                                 name_prefix=f"Pendulum_{step:05d}")
    try:
        evaluate_rollout(evaluation_environment=visualization_environment, soft_actor_critic=soft_actor_critic)
    finally:
        # this function is called repeatedly during training. Close the recorder and the environment that it
        # wraps so that they do not pile up
        visualization_environment.close()


# **Soft Actor Critic**

In this exercise, we will re-implement the Soft Actor Critic (SAC) algorithm. SAC is an off-policy actor-critic method
that is widely used in the community due to its sample efficiency and (relative) stability. It acts under a maximum
entropy principle to ensure sufficient exploration during training, and also employs tricks like the reparameterization
trick, polyak-updates of target Q networks and Twin-Delayed Q-Functions.


## Pendulum
We will showcase the SAC algorithm on the very classic [Pendulum](https://www.gymlibrary.dev/environments/classic_control/pendulum/) control environment.
The goal in this environment is to actuate a pendulum such that it stays upward without too much movement.
It has a one-dimensional action space that represents the torque acting on the pendulum,
and a 2d internal *state* that is the angle $\theta$ and the angular velocity $\dot{\theta}$ of the pendulum at the current time step.
The *external* state or observation is a 3-tuple ($\cos(\theta)$, $\sin(\theta)$, $\dot{\theta}$).


## Replay Buffer
We start by defining our replay buffer which is used to store samples seen during the rollouts that can then
be used for training later on. You do *not* need to implement anything here.

In [None]:
# Functional code
class ReplayBuffer:
    """
    First-in-first-out buffer of (s, a, r, s', done) transitions with uniform mini-batch sampling.
    """

    def __init__(self, buffer_limit: int, batch_size: int):
        """
        :param buffer_limit: Maximum number of transitions. The oldest ones are dropped once the limit is reached
        :param batch_size: Number of transitions per sampled mini-batch
        """
        self.batch_size = batch_size
        self.buffer = collections.deque(maxlen=buffer_limit)  # use a dequeue as a buffer

    def put(self, transition: tuple) -> None:
        """
        Adds a transition to the buffer.
        :param transition: (s, a, r, s', done) pair sampled by having the policy act on the environment
        :return: None
        """
        self.buffer.append(transition)

    def sample(self) -> tuple:
        """
        Draws self.batch_size transitions uniformly at random (without replacement).
        :return: A tuple (states, actions, rewards, next_states, dones) of float tensors. The first dimension
          of each tensor is the batch dimension. Actions, rewards and dones get a trailing dimension of size 1
        :raises ValueError: If the buffer holds fewer than self.batch_size transitions
        """
        mini_batch = random.sample(self.buffer, self.batch_size)  # get self.batch_size random samples from the buffer

        # Stack the states into one numpy array each before handing them to torch.
        # Building a tensor directly from a list of numpy arrays is very slow.
        states = np.asarray([transition[0] for transition in mini_batch], dtype=np.float32)
        next_states = np.asarray([transition[3] for transition in mini_batch], dtype=np.float32)
        # the scalar entries are wrapped in a list each to get column vectors
        actions = [[transition[1]] for transition in mini_batch]
        rewards = [[transition[2]] for transition in mini_batch]
        dones = [[float(transition[4])] for transition in mini_batch]

        return torch.from_numpy(states), \
               torch.tensor(actions, dtype=torch.float), \
               torch.tensor(rewards, dtype=torch.float), \
               torch.from_numpy(next_states), \
               torch.tensor(dones, dtype=torch.float)

    def size(self) -> int:
        """
        :return: The number of transitions that are currently stored
        """
        return len(self.buffer)

## **TASK 1: Actor/Policy network** (2+2=4 Points)

Next, we will set up the actor/policy.

### Task 1.1: Tanh Squashing (2 Points)
The original SAC implementation squashes its actions into the $[-1, 1]$ range using a *tanh* activation.
To ensure proper probabilities for these actions, it also squashes the log probabilities of each action accordingly.
You can show using the change of variables theorem that

\begin{align}
    \log \pi(\boldsymbol{a}|\boldsymbol{s}) = \log \mu(\boldsymbol{u}|\boldsymbol{s})-\sum_{i=1}^D \log\left(1-\tanh^2(u_i)\right)
\end{align}

for (squashed) actions $\boldsymbol{a}$, states $\boldsymbol{s}$, proposed (unsquashed) actions $\boldsymbol{u}$ and a policy
distribution $\mu$ (which you shouldn't confuse with the mean, which is sometimes also called $\mu$...)
In our case, the dimensionality $D=1$.
You will need to squash the action itself, as well as its log probability.

### Task 1.2: Training from 2 Q Networks (2 Points)
As SAC is using Twin-Delayed Q-Networks to prevent the overestimation bias in the Q-Values,
the actor/policy needs to choose the minimum of both available Q-Networks for its loss function.
For this, you will need to evaluate the action using both Q-Networks, and then simply choose their minimum.

In [None]:
class PolicyNet(nn.Module):
    """
    Gaussian policy whose actions are squashed into [-1, 1] with a tanh.
    Maps a 3-dimensional observation to a 1-dimensional action.
    """

    def __init__(self, learning_rate: float, entropy_alpha: float):
        """
        :param learning_rate: Learning rate of the Adam optimizer
        :param entropy_alpha: Weight of the entropy term in the policy loss
        """
        super(PolicyNet, self).__init__()  # make sure that the policy network is registered as a pytorch module

        # specify neurons per layer: a shared hidden layer followed by separate heads for mean and std
        self.common_mlp = nn.Linear(3, 128)
        self.mean_mlp = nn.Linear(128, 1)
        self.std_mlp = nn.Linear(128, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)  # use adam optimizer

        self.entropy_alpha = entropy_alpha  # weight of the entropy term

    def forward(self, state: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Choose an action and calculate its log probability for the current state.
        :param state: The state to choose the action for
        :param deterministic: Whether to use the mean of the policy distribution as the action instead of
          drawing a (reparameterized) sample from it
        :return: A tuple (action, log probability). The action is squashed into [-1, 1] and the
          log probability is corrected for this squashing
        """
        state = F.relu(self.common_mlp(state))
        mean = self.mean_mlp(state)
        std = F.softplus(self.std_mlp(state))  # we need the standard deviation to be >0
        normal_distribution = Normal(mean, std)

        if deterministic:
            action = mean
        else:
            action = normal_distribution.rsample()  # rsample keeps the sample differentiable w.r.t. mean and std
        log_probabilities = normal_distribution.log_prob(action)

        # the original SAC implementation also squishes the action into [-1, 1] using a tanh activation.
        # to keep the probabilities correct, they account for this using the update below.
        real_action = torch.tanh(action)
        # change of variables: log pi(a|s) = log mu(u|s) - log(1 - tanh^2(u)).
        # The small constant prevents log(0) for saturated actions. No sum is needed for a single action dimension.
        real_log_probabilities = log_probabilities - torch.log(1.0 - real_action.pow(2) + 1e-7)
        return real_action, real_log_probabilities

    def train_step(self, q_net_1, q_net_2, mini_batch: tuple) -> Dict[str, float]:
        """
        Update the policy for a single mini-batch by maximizing the Q-value of its actions plus the entropy bonus.
        :param q_net_1: First Q-Network. Called as q_net_1(states, actions)
        :param q_net_2: Second Q-Network. Called as q_net_2(states, actions)
        :param mini_batch: A tuple (state, action, reward, next_state, done). For this update,
          only the states are needed
        :return: A dictionary with the mean entropy term and the policy loss of this update step
        """
        states, _, _, _, _ = mini_batch
        actions, log_probabilities = self.forward(states)
        entropy = -self.entropy_alpha * log_probabilities

        # evaluate both q-networks for the current state-action pair
        # and use their minimum (see Twin-Delayed Q functions)
        min_q = torch.min(q_net_1(states, actions), q_net_2(states, actions))

        loss = -(min_q + entropy).mean()  # "-" for gradient ascent
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()  # only the policy parameters are registered in this optimizer

        return {"entropy": entropy.mean().item(),
                "policy_loss": loss.item()}


Next, we implement our Q-Network. This will be used to evaluate ("criticize") the actions that the actor proposes.
Note that this is only *one* Q-Network, and that the SAC class below will use multiple of those for the Twin-Delayed
Q-Functions.

## Task 2: Polyak Updates ( 2 Points)
For increasing stability in the update of the Q Networks, SAC uses polyak updates of each Q Network.
The update is given as
\begin{align}
    \beta'_i = (1-\tau)\beta'_i+(\tau)\beta_i
\end{align}

for Q-Network parameters $\beta_1$, $\beta_2$ and an update rate $\tau$. Note that the slides use a reverse
order of $(1-\tau)$ and $\tau$, which corresponds to values of $\tau$ close to $1$ rather than close to $0$
as done in the code.

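Hint: Use the `parameters()` method of `nn.Module` to iterate over the parameters of a network, and the `.data`
attribute of each parameter to access its values. You can overwrite these values in-place using `parameter.data.copy_(...)`.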

In [None]:
class QNet(nn.Module):
    """
    A single Q-Network that maps a (3-dimensional state, 1-dimensional action) pair to a scalar Q-value.
    """

    def __init__(self, learning_rate: float, q_net_update_rate: float):
        """
        :param learning_rate: Learning rate of the Adam optimizer
        :param q_net_update_rate: Rate tau of the polyak update of the target network
        """
        super(QNet, self).__init__()  # make sure that the Q Network is registered as a pytorch module
        # specify network parameters. "fc" is short for "fully_connected layer".
        self.state_layer = nn.Linear(3, 64)
        self.action_layer = nn.Linear(1, 64)
        self.common_layer = nn.Linear(128, 32)
        self.fc_out = nn.Linear(32, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)  # use adam optimizer
        self.q_net_update_rate = q_net_update_rate

    def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
        """
        Evaluate the Q-Function for a batch of state-action pairs.
        :param state: Batch of states. The last dimension must have size 3
        :param action: Batch of actions. The last dimension must have size 1
        :return: One Q-value per state-action pair
        """
        # embed state and action separately before processing them jointly
        h1 = F.relu(self.state_layer(state))
        h2 = F.relu(self.action_layer(action))
        cat = torch.cat([h1, h2], dim=1)
        q_evaluation = F.relu(self.common_layer(cat))
        q_evaluation = self.fc_out(q_evaluation)
        return q_evaluation

    def train_step(self, target_values: torch.Tensor, mini_batch: tuple) -> float:
        """
        Train the network for a single mini-batch update
        :param target_values: The target values to regress to
        :param mini_batch: A tuple (state, action, reward, next_state, done). For this update,
          only the action and state are needed
        :return: The mean loss for this update step
        """
        states, actions, _, _, _ = mini_batch  # get action and state from current mini_batch
        evaluation = self.forward(states, actions)

        # calculate the loss and its gradients; update the network based on them
        loss = F.smooth_l1_loss(evaluation, target_values).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def polyak_update(self, target_network):
        """
        Soft update the target network with the parameters of this network, i.e.,
        target = (1 - tau) * target + tau * online with tau = self.q_net_update_rate.
        The parameters of this network are not changed.
        :param target_network: The target network to update in-place. Needs the same architecture as this network
        :return: None
        """
        update_rate = self.q_net_update_rate
        with torch.no_grad():  # the update is not part of any computation graph
            for target_parameter, parameter in zip(target_network.parameters(), self.parameters()):
                target_parameter.mul_(1.0 - update_rate).add_(parameter, alpha=update_rate)

Finally, we combine the Q-Network(s) and the Actor to build our SoftActorCritic class.
This class (roughly) implements the Pseudo-code shown in Slide Set 7, Slide 40.

## Task 3: Q-Network Targets (4 Points)
For SAC, you will need to implement the targets of the Q-Networks. These are calculated using the rule on Slide Set 7, Slide 35, i.e.,
\begin{align}
y_t = r_t +\gamma\left(Q_{\beta'}(\boldsymbol{s}'_t, \boldsymbol{a}') - \alpha \log \pi(\boldsymbol{a}'|\boldsymbol{s}'_t)\right), \qquad \text{where } \boldsymbol{a}'\sim \pi(\cdot|\boldsymbol{s}_t')
\end{align}

Hints:
* Remember that $\beta'$ are the parameters of the **target** Q-Network
* In our case, $Q_{\beta'}$ needs to be evaluated as the **minimum** of **both** target Q-Networks
* The next-step term (everything that is multiplied by $\gamma$) only exists if there is a next step, i.e., if the environment is **not** done. For transitions that are done, the target is just the reward $r_t$.

In [None]:
class SoftActorCritic:
    """
    Soft Actor Critic agent: a policy network, two Q-Networks with one target network each, and a replay buffer.
    """

    def __init__(self, args):
        """
        :param args: Hyperparameters. Needs to support args[key] for the keys "learning_rate",
          "q_net_update_rate", "batch_size", "discount_factor", "buffer_limit" and "entropy_alpha"
        """
        learning_rate = args["learning_rate"]
        q_net_update_rate = args["q_net_update_rate"]
        batch_size = args["batch_size"]
        self.discount_factor = args["discount_factor"]

        # initialize two Q-networks and their respective target networks
        self.q_net_1 = QNet(learning_rate, q_net_update_rate)
        self.q_net_2 = QNet(learning_rate, q_net_update_rate)

        # the target networks start as exact copies of their Q-networks
        self.q_net_1_target = copy.deepcopy(self.q_net_1)
        self.q_net_2_target = copy.deepcopy(self.q_net_2)

        # get a replay buffer and a policy network
        self.memory = ReplayBuffer(buffer_limit=args["buffer_limit"], batch_size=batch_size)

        self.policy = PolicyNet(learning_rate=learning_rate,
                                entropy_alpha=args["entropy_alpha"])

    def train_step(self) -> Dict[str, float]:
        """
        Perform one update of both Q-Networks, the policy and the target networks
        on a mini-batch sampled from the replay buffer.
        :return: A dictionary with the losses of both Q-Networks and the metrics of the policy update
        """
        mini_batch = self.memory.sample()
        q_targets = self.calculate_q_targets(mini_batch)

        # update both q networks. They regress to the same targets
        q_net_1_loss = self.q_net_1.train_step(q_targets, mini_batch)
        q_net_2_loss = self.q_net_2.train_step(q_targets, mini_batch)

        # update the policy
        policy_metrics = self.policy.train_step(self.q_net_1, self.q_net_2, mini_batch)

        # polyak updates for the target q networks
        self.q_net_1.polyak_update(self.q_net_1_target)
        self.q_net_2.polyak_update(self.q_net_2_target)

        return {"q_net_1_loss": q_net_1_loss,
                "q_net_2_loss": q_net_2_loss,
                **policy_metrics}

    def calculate_q_targets(self, mini_batch: tuple) -> torch.Tensor:
        """
        Calculate the regression targets of the Q-Networks, i.e.,
        y = r + gamma * (1 - done) * (min(Q1'(s', a'), Q2'(s', a')) - alpha * log pi(a'|s'))
        for the target networks Q1', Q2' and an action a' sampled from the current policy.
        :param mini_batch: A tuple (state, action, reward, next_state, done). For the targets,
          only the reward, next state and done flag are needed
        :return: One target value per transition. The targets do not carry gradients
        """
        _, _, rewards, next_states, dones = mini_batch

        with torch.no_grad():  # targets are constants for the regression
            # sample the next action from the current policy
            next_actions, next_log_probabilities = self.policy(next_states)

            # evaluate both target networks and use their minimum
            min_next_q = torch.min(self.q_net_1_target(next_states, next_actions),
                                   self.q_net_2_target(next_states, next_actions))
            soft_next_value = min_next_q - self.policy.entropy_alpha * next_log_probabilities

            # there is no next step to bootstrap from if the environment is done
            target = rewards + self.discount_factor * (1.0 - dones) * soft_next_value
        return target

# Running the algorithm

That's it for SAC. The code below defines arguments/hyperparameters, as well as a general training loop. You do
*not* need to change any code here (unless you want to fiddle with the parameters). If everything is implemented
correctly, you should see training improvements after a couple thousand steps, and have a converged solution after
30000-40000 steps.
The code will save
* a couple of training metrics,
* a contour plot of a numerical integration of the value function,
i.e., the maximum of a number of Q-function evaluations on a grid of the state-space
* a .mp4 video of the pendulum swinging.

For the homework, you only need to send in the last of each, i.e., the one at iteration 50000, but you can also
turn in all of the plots as usual.

In [None]:
class Args:
    """
    Hyperparameters of the training run. They are stored as class attributes and can additionally
    be read and written like dictionary entries, e.g., args["batch_size"].
    """
    num_training_steps = 51000  # @param {type: "integer"}
    learning_rate = 3.0e-4  # @param {type: "number"}
    entropy_alpha = 0.02  # @param {type: "number"}
    discount_factor = 0.98  # @param {type: "number"}
    batch_size = 64  # @param {type: "integer"}
    buffer_limit = 100000  # @param {type: "integer"}
    reward_scale = 0.1  # @param {type: "number"}
    q_net_update_rate = 0.002  # @param {type: "number"}

    def __setitem__(self, key, val):
        # args[key] = val sets the attribute on this instance only
        setattr(self, key, val)

    def __getitem__(self, key):
        # args[key] is the same as args.key
        return getattr(self, key)

def main(args: Args):
    """
    Trains a Soft Actor Critic agent on the Pendulum environment. Alternates between collecting one environment
    step and one mini-batch update, and regularly evaluates, logs and plots the training progress.
    :param args: Hyperparameters of the run
    :return: None
    """
    environment = gym.make('Pendulum-v1')
    # keep a second environment for evaluation purposes.
    # We wrap it in a Dummy Vector Environment for compatibility with
    # the visualization utility
    evaluation_environment = DummyVecEnv([lambda: gym.make('Pendulum-v1')])

    try:
        soft_actor_critic = SoftActorCritic(args=args)

        reward_scale = args["reward_scale"]
        num_training_steps = args["num_training_steps"]
        # the policy acts in a normalized action range, the environment expects actions scaled to its own limits
        action_scale = environment.action_space.high[0]

        # logging utility
        logging_frequency = 100  # log progress every 100 steps
        plot_frequency = 5000
        progress_bar = ProgressBar(num_iterations=num_training_steps)

        state = environment.reset()  # restart the environment, i.e., go back to some initial state
        full_metrics = {"score": []}
        train_step_metrics = {}  # stays empty until the first training step has been performed
        for current_step in range(num_training_steps):
            if current_step % logging_frequency == 0:  # log every logging_frequency steps
                for key, value in train_step_metrics.items():
                    if key not in full_metrics:
                        full_metrics[key] = []
                    full_metrics[key].append(value)

                evaluation_recordings = evaluate(evaluation_environment=evaluation_environment,
                                                 soft_actor_critic=soft_actor_critic)

                progress_bar(_steps=logging_frequency,
                             score=evaluation_recordings.get("score"),
                             **train_step_metrics)

                full_metrics["score"].append(evaluation_recordings.get("score"))

            if current_step % plot_frequency == 0:  # plot visualizations
                v_function_visualization(evaluation_environment=evaluation_environment,
                                         soft_actor_critic=soft_actor_critic,
                                         current_step=current_step)
                visualize_rollout(soft_actor_critic=soft_actor_critic,
                                  step=current_step)
                plot_metrics(full_metrics)

            # alternate between collecting one step of data and updating SAC with one mini-batch
            with torch.no_grad():  # acting does not need gradients, only the value of the action is used
                action, _ = soft_actor_critic.policy(torch.from_numpy(np.array(state)).float())
            next_state, reward, done, info = environment.step([action_scale * action.item()])

            # save (s, a, r, s', done) tuple in memory buffer.
            # Note that the unscaled action of the policy and the scaled reward are stored
            soft_actor_critic.memory.put((state, action.item(), reward * reward_scale, next_state, done))

            if done:
                state = environment.reset()
            else:
                state = next_state

            # wait until there are enough rollouts in the memory buffer before starting the training
            if soft_actor_critic.memory.size() > 1000:
                train_step_metrics = soft_actor_critic.train_step()
                train_step_metrics["buffer_size"] = soft_actor_critic.memory.size()
    finally:
        # release both environments, also if the training is interrupted
        environment.close()
        evaluation_environment.close()


# Create the hyperparameters with their default values and start the training.
# This runs as soon as the cell is executed.
args = Args()
main(args=args)
