In [None]:
import sys, os
if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):
    !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash

    !pip -q install gymnasium

    !touch .setup_complete

# This code creates a virtual display to draw game images on.
# It will have no effect if your machine has a monitor.
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    os.environ['DISPLAY'] = ':1'

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gymnasium as gym
import matplotlib.pyplot as plt

In [None]:
# Create the Pendulum environment with off-screen (array) rendering, reset it
# so there is a frame to draw, and show that first frame.
env = gym.make("Pendulum-v1", render_mode="rgb_array")
env.reset()
plt.imshow(env.render())

# Lengths of the action and observation vectors; the networks are sized from these.
action_dim, state_dim = env.action_space.shape[0], env.observation_space.shape[0]

The **Deep Deterministic Policy Gradient (DDPG)** algorithm is a model-free, off-policy actor-critic algorithm designed for environments with continuous action spaces. It integrates the concepts of Q-learning and policy improvement to provide a framework for learning optimal policies.

For a policy $\mu_{\theta}(s)$ parameterized by $\theta$, the gradient of the expected return $J$ with respect to the actor parameters $\theta$ is given by:
\begin{equation}
\nabla_{\theta} J = \mathbb{E}_{s \sim d^\mu} \left[\nabla_{\theta} \mu_{\theta}(s) \nabla_{a} Q^{\mu}(s, a) \big|_{a=\mu_{\theta}(s)}\right]
\end{equation}
where $Q^{\mu}(s, a)$ is the action-value function under policy $\mu$, and $d^\mu$ is the state distribution under policy $\mu$.

**Actor-Critic Architecture:**

DDPG uses an actor-critic architecture, where the actor network approximates the optimal policy deterministically, outputting the best believed action for any given state. The critic evaluates the expected return of the state-action pair, providing a gradient for updating the actor's policy.

In [None]:
class Actor(nn.Module):
    """Deterministic policy network: maps a state to one bounded action.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of components in an action vector.
        max_action: Scalar bound; every output component lies in
            ``[-max_action, max_action]``.
        hidden_dim: Width of the two hidden layers.
    """

    def __init__(self, state_dim, action_dim, max_action, hidden_dim=256):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            # Squash to [-1, 1] so that scaling by max_action bounds the output.
            nn.Tanh(),
        )
        self.max_action = max_action

    def forward(self, state):
        """Return the action for ``state`` (shape ``(batch, state_dim)``)."""
        return self.max_action * self.model(state)


class Critic(nn.Module):
    """Action-value network: estimates Q(s, a) for a state-action pair.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of components in an action vector.
        hidden_dim: Width of the two hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, state, action):
        """Return Q-values of shape ``(batch, 1)``.

        ``state`` and ``action`` are concatenated along the last dimension, so
        gradients flow back into ``action`` (needed for the actor update).
        """
        return self.model(torch.cat([state, action], dim=-1))

**Experience Replay:**

To break the correlation between consecutive samples and to utilize the learning data more efficiently, DDPG implements experience replay. This technique stores a buffer of previous state-action-reward-next state tuples, sampling from this buffer randomly to train the networks. This process not only stabilizes training but also allows for the reuse of past experiences, improving sample efficiency.

In [None]:
class ReplayBuffer:
    """Fixed-capacity store of ``(state, action, reward, next_state, done)`` tuples.

    Once ``max_size`` transitions are stored, each new transition overwrites
    the oldest one (ring buffer).

    Args:
        max_size: Maximum number of transitions kept; must be positive.

    Raises:
        ValueError: If ``max_size`` is not positive.
    """

    def __init__(self, max_size=int(1e6)):
        if max_size <= 0:
            raise ValueError("max_size must be positive")
        self.storage = []
        self.max_size = max_size
        # Slot of the oldest transition; only advances once the buffer is full.
        self.ptr = 0

    def __len__(self):
        return len(self.storage)

    def add(self, transition):
        """Store ``transition``, evicting the oldest entry when full."""
        if len(self.storage) == self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays; ``rewards`` and ``dones`` are reshaped to ``(batch_size, 1)``.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty replay buffer")

        indices = np.random.randint(0, len(self.storage), size=batch_size)
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for i in indices:
            state, action, reward, next_state, done = self.storage[i]
            # np.asarray avoids a copy when possible but, unlike
            # np.array(..., copy=False) on NumPy 2.x, does not raise when a
            # copy is unavoidable (e.g. for Python scalars).
            states.append(np.asarray(state))
            actions.append(np.asarray(action))
            rewards.append(np.asarray(reward))
            next_states.append(np.asarray(next_state))
            dones.append(np.asarray(done))
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards).reshape(-1, 1),
            np.array(next_states),
            np.array(dones).reshape(-1, 1),
        )

**Target Networks:**

DDPG employs target networks for both the actor and the critic to stabilize training. These target networks are copies of the actor and critic networks that are slowly updated towards the learned networks. By using target networks, DDPG mitigates the risk of the moving targets problem, where updates are based on moving estimates, thereby enhancing learning stability.

The target networks are updated using a soft update strategy, which slowly tracks the learned networks. The update rule for the target networks is given by:
\begin{equation}
\theta^- = \tau \theta + (1 - \tau) \theta^-
\end{equation}
where $\theta^-$ are the parameters of the target network, $\theta$ are the parameters of the corresponding learned network, and $\tau \ll 1$ controls the rate of update.

**Exploration:**

For exploration, noise is added to the actor's output:
\begin{equation}
a = \mu_{\theta}(s) + \mathcal{N}
\end{equation}
where $\mathcal{N}$ is a noise process: either uncorrelated Gaussian noise or an Ornstein-Uhlenbeck process. The latter produces noise that is temporally correlated across successive actions, which can make exploration in continuous action spaces more efficient.

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Upper bound of the first action component, as a plain Python float.
max_action = float(env.action_space.high[0])

# Learned actor plus a target copy that starts from identical weights.
actor = Actor(state_dim, action_dim, max_action).to(device)
target_actor = Actor(state_dim, action_dim, max_action).to(device)
target_actor.load_state_dict(actor.state_dict())

# Learned critic plus a target copy that starts from identical weights.
critic = Critic(state_dim, action_dim).to(device)
target_critic = Critic(state_dim, action_dim).to(device)
target_critic.load_state_dict(critic.state_dict())

# Only the learned networks are optimized; the targets have no optimizer here.
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-4, weight_decay=1e-2)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e-3, weight_decay=1e-2)


def get_action(actor, state, low, high, std_noise=0):
    """Return the actor's action for one state, optionally with exploration noise.

    Args:
        actor: Policy network called on a ``(1, state_dim)`` float tensor.
        state: NumPy array holding a single state.
        low: Lower clipping bound for every action component.
        high: Upper clipping bound for every action component.
        std_noise: Standard deviation of the zero-mean Gaussian noise added to
            the action. With ``0`` (the default) the action is deterministic.

    Returns:
        1-D ``float32`` NumPy array clipped to ``[low, high]``.
    """
    state = torch.tensor(state.reshape(1, -1), dtype=torch.float32, device=device)
    # Acting never needs gradients, so skip building the autograd graph.
    with torch.no_grad():
        action = actor(state).cpu().numpy().flatten()
    if std_noise > 0:
        action = action + np.random.normal(0.0, std_noise, size=action.shape)
    # Clip after adding noise so the result stays within the given bounds.
    return np.clip(action, low, high).astype(np.float32)

**Critic update:**

The critic is updated by minimizing the MSE between the predicted Q-values and the target Q-values. The target Q-value for a given state-action pair $(s, a)$ is computed as:

$y = r + \gamma (1 - d)\, Q_{\phi^-}(s', \mu_{\theta^-}(s'))$

where $r$ is the reward received after executing action $a$ in state $s$, $\gamma$ is the discount factor, $s'$ is the next state, $d \in \{0, 1\}$ indicates whether $s'$ is terminal (there is nothing to bootstrap from after a terminal state), $\mu_{\theta^-}$ is the target policy, and $Q_{\phi^-}$ is the target critic network. The loss for the critic is then:

$L(\phi) = (y - Q_\phi(s, a))^2 \to \min_\phi$

**Actor update:**

The actor network is updated by maximising the critic's estimation:

$J(\theta) = Q_\phi(s, \mu_\theta(s)) \to \max_\theta$

Differentiating this objective with respect to $\theta$, applying the chain rule through the critic, yields the deterministic policy gradient stated above.

In [None]:
def train_step(batch_size=100, gamma=0.99, tau=0.005):
    """Run one DDPG update of the critic, the actor and both target networks.

    Samples a batch from the module-level ``replay_buffer`` and updates the
    module-level ``critic``/``actor`` with their optimizers, then moves
    ``target_critic``/``target_actor`` towards them.

    Args:
        batch_size: Number of transitions sampled from the replay buffer.
        gamma: Discount factor used in the target Q-value.
        tau: Soft-update rate for the target networks.

    Returns:
        Tuple ``(actor_loss, critic_loss)`` as Python floats.
    """
    # Sample replay buffer
    state, action, reward, next_state, is_done = replay_buffer.sample(batch_size)

    state = torch.tensor(state, dtype=torch.float32, device=device)
    action = torch.tensor(action, dtype=torch.float32, device=device)
    next_state = torch.tensor(next_state, dtype=torch.float32, device=device)
    is_not_done = torch.tensor(1 - is_done, dtype=torch.float32, device=device)
    reward = torch.tensor(reward, dtype=torch.float32, device=device)

    # Compute the target Q value; targets are constants for the critic loss,
    # so no gradient is tracked through the target networks.
    with torch.no_grad():
        next_action = target_actor(next_state)
        next_Q = target_critic(next_state, next_action)
        # is_not_done zeroes the bootstrap term for transitions flagged as done.
        target_Q = reward + gamma * is_not_done * next_Q

    # Get current Q estimate
    pred_Q = critic(state, action)

    # Compute critic loss; the shape check guards against silent broadcasting.
    assert target_Q.shape == pred_Q.shape
    critic_loss = F.mse_loss(pred_Q, target_Q)

    # Optimize the critic
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Compute actor loss: maximising Q(s, mu(s)) == minimising its negation.
    actor_loss = -critic(state, actor(state)).mean()

    # Optimize the actor
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Update the frozen target models: theta_target <- tau*theta + (1-tau)*theta_target
    with torch.no_grad():
        for param, target_param in zip(critic.parameters(), target_critic.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        for param, target_param in zip(actor.parameters(), target_actor.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    return actor_loss.item(), critic_loss.item()

In [None]:
def evaluate(env, actor, low, high, n_games=1, t_max=10000):
    """Play ``n_games`` episodes without exploration noise and average the returns.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        actor: Policy network passed through to ``get_action``.
        low: Lower action bound passed through to ``get_action``.
        high: Upper action bound passed through to ``get_action``.
        n_games: Number of episodes to play.
        t_max: Maximum number of steps per episode.

    Returns:
        Mean of the undiscounted episode returns.
    """

    def play_episode():
        # Roll out one episode and sum its rewards.
        observation, _ = env.reset()
        episode_return = 0
        for _step in range(t_max):
            chosen = get_action(actor, observation, low, high)
            observation, step_reward, terminated, truncated, _ = env.step(chosen)
            episode_return += step_reward
            if terminated or truncated:
                break
        return episode_return

    return np.mean([play_episode() for _game in range(n_games)])

In [None]:
from tqdm import trange
from IPython.display import clear_output

# Turn the grid on by default for axes created after this point.
plt.rcParams["axes.grid"] = True

In [None]:
# Per-update losses and per-evaluation mean returns, kept for plotting.
actor_losses = []
critic_losses = []
episode_rewards = []

replay_buffer = ReplayBuffer(int(1e6))

total_steps = 100000
batch_size = 100
eval_freq = 500
tau = 0.005
std_noise = 0.1
gamma = 0.99

low, high = env.action_space.low[0], env.action_space.high[0]

max_action = env.action_space.high

# One render-free environment reused for every periodic evaluation, instead of
# creating (and never closing) a fresh one each time.
eval_env = gym.make("Pendulum-v1")

state, _ = env.reset()
for step in trange(1, total_steps + 1):

    action = get_action(actor, state, low, high, std_noise=std_noise)
    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    # Store only true termination as the done flag: a time-limit truncation
    # ends the episode but the next state still has value, so the critic
    # target must keep bootstrapping from it.
    replay_buffer.add((state, action, reward, next_state, terminated))
    state = next_state

    # Start learning as soon as one full batch is available.
    if len(replay_buffer) >= batch_size:
        actor_loss, critic_loss = train_step(batch_size, gamma, tau)
        actor_losses.append(actor_loss)
        critic_losses.append(critic_loss)

    # The episode restarts on termination and on truncation alike.
    if done:
        state, _ = env.reset()

    if step % eval_freq == 0:
        episode_rewards.append(evaluate(eval_env, actor, low, high, n_games=10))

        # Plotting the results
        clear_output(wait=True)
        plt.figure(figsize=(20, 5))

        plt.subplot(1, 3, 1)
        plt.plot(episode_rewards)
        plt.title("Episode Reward")
        plt.xlabel("Episode")
        plt.ylabel("Reward")

        plt.subplot(1, 3, 2)
        plt.plot(actor_losses)
        plt.title("Actor Loss")
        plt.xlabel("Step")
        plt.ylabel("Loss")

        plt.subplot(1, 3, 3)
        plt.plot(critic_losses)
        plt.title("Critic Loss")
        plt.xlabel("Step")
        plt.ylabel("Loss")

        plt.tight_layout()
        plt.show()

eval_env.close()

In [None]:
from gymnasium.wrappers import RecordVideo

# Record evaluation episodes of the trained actor into ./videos.
# Both the base environment and the recording wrapper are used as context
# managers, so leaving the blocks exits each of them in turn.
with gym.make("Pendulum-v1", render_mode="rgb_array") as env:
    with RecordVideo(env=env, video_folder="./videos") as env_monitor:
        evaluate(env_monitor, actor, low, high, n_games=10)

In [None]:
# Show video. This may not work in some setups. If it doesn't
# work for you, you can download the videos and view them locally.

from base64 import b64encode  # used by the Colab branch below
from pathlib import Path
import sys

from IPython.display import HTML

video_paths = sorted(
    [path for path in Path("videos").iterdir() if path.suffix == ".mp4"]
)
video_path = video_paths[-1]  # You can also try other indices

if "google.colab" in sys.modules:
    # Colab cannot serve local files to the player, so the video is embedded
    # inline as a base64 data URL.
    # https://stackoverflow.com/a/57378660/1214547
    with video_path.open("rb") as fp:
        mp4 = fp.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
else:
    data_url = str(video_path)

# Keep this as the cell's last expression so the notebook displays the player.
HTML(
    """
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format(
        data_url
    )
)