# TD3 algorithm

In [None]:
!pip install gymnasium
!pip install box2d
!pip install mediapy
!pip install gym==0.23.1



In [None]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy
import mediapy as media

# Create Env and visualization

In [None]:
# Roll out one episode with uniformly random actions and show it as a video.
# This is only a sanity check that the environment and rendering work.
RAM_ENV_NAME = 'BipedalWalker-v3'
env = gym.make(RAM_ENV_NAME)

images = []       # one RGB frame per environment step
total_reward = 0  # undiscounted return of the random episode
state = env.reset()

while True:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    img = env.render(mode='rgb_array')
    images.append(img)
    total_reward += reward
    if done:
        break

env.close()

# The last two frames are left out of the clip.
media.show_video(images[:-2], fps=30)
print('total_reward:', total_reward)

  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


0
This browser does not support the video tag.


total_reward: -107.735985


# Create Actor & Critic

In [None]:
class Actor(nn.Module):
    """Deterministic policy network: maps a batch of states to bounded actions.

    Two ReLU hidden layers are followed by a tanh output scaled by
    ``max_action``, so every action component lies in
    ``[-max_action, max_action]``.
    """

    def __init__(self, state_dim, action_dim, hidden_width, max_action):
        super().__init__()
        self.max_action = max_action
        # The attribute names l1..l3 become the state_dict keys.
        self.l1 = nn.Linear(state_dim, hidden_width)
        self.l2 = nn.Linear(hidden_width, hidden_width)
        self.l3 = nn.Linear(hidden_width, action_dim)

    def forward(self, s):
        """Return the action tensor for the state batch ``s``."""
        hidden = s
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        squashed = torch.tanh(self.l3(hidden))  # in [-1, 1]
        return self.max_action * squashed  # in [-max_action, max_action]


class Critic(nn.Module):
    """Twin Q-networks: two independent estimates Q1(s, a) and Q2(s, a).

    Both heads take the concatenation of state and action and pass it through
    two ReLU hidden layers followed by a linear layer with a single output.
    """

    def __init__(self, state_dim, action_dim, hidden_width):
        super().__init__()
        # Q1 head (attribute names l1..l6 become the state_dict keys).
        self.l1 = nn.Linear(state_dim + action_dim, hidden_width)
        self.l2 = nn.Linear(hidden_width, hidden_width)
        self.l3 = nn.Linear(hidden_width, 1)
        # Q2 head
        self.l4 = nn.Linear(state_dim + action_dim, hidden_width)
        self.l5 = nn.Linear(hidden_width, hidden_width)
        self.l6 = nn.Linear(hidden_width, 1)

    def forward(self, s, a):
        """Return the tuple ``(q1, q2)`` for the batch of state-action pairs."""
        s_a = torch.cat([s, a], 1)
        return self._q1_head(s_a), self._q2_head(s_a)

    def Q1(self, s, a):
        """Return only the first head's estimate Q1(s, a)."""
        return self._q1_head(torch.cat([s, a], 1))

    def _q1_head(self, s_a):
        """Apply the Q1 layers (l1 -> l2 -> l3) to the concatenated input."""
        q1 = F.relu(self.l1(s_a))
        q1 = F.relu(self.l2(q1))
        return self.l3(q1)

    def _q2_head(self, s_a):
        """Apply the Q2 layers (l4 -> l5 -> l6) to the concatenated input."""
        q2 = F.relu(self.l4(s_a))
        q2 = F.relu(self.l5(q2))
        return self.l6(q2)

# Create ReplayBuffer

In [None]:
class ReplayBuffer(object):
    """Fixed-capacity ring buffer of transitions ``(s, a, r, s_, dw)``.

    Once ``max_size`` transitions have been stored, new ones overwrite the
    oldest. Data is kept as float32 because ``sample`` always returns float32
    tensors; this halves the memory of the default float64 arrays without
    changing any sampled value.

    Args:
        state_dim: Length of a state vector.
        action_dim: Length of an action vector.
        max_size: Capacity in transitions (defaults to one million).

    Raises:
        ValueError: If ``max_size`` is not positive.
    """

    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = int(max_size)
        if self.max_size <= 0:
            raise ValueError("max_size must be positive, got {}".format(max_size))
        self.count = 0  # index of the slot the next transition is written to
        self.size = 0   # number of valid transitions currently stored
        self.s = np.zeros((self.max_size, state_dim), dtype=np.float32)
        self.a = np.zeros((self.max_size, action_dim), dtype=np.float32)
        self.r = np.zeros((self.max_size, 1), dtype=np.float32)
        self.s_ = np.zeros((self.max_size, state_dim), dtype=np.float32)
        self.dw = np.zeros((self.max_size, 1), dtype=np.float32)

    def store(self, s, a, r, s_, dw):
        """Write one transition into the current slot and advance the cursor."""
        self.s[self.count] = s
        self.a[self.count] = a
        self.r[self.count] = r
        self.s_[self.count] = s_
        self.dw[self.count] = dw
        # Wrap around to slot 0 once the end of the storage is reached.
        self.count = (self.count + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly with replacement.

        Returns:
            Tuple ``(batch_s, batch_a, batch_r, batch_s_, batch_dw)`` of
            float32 tensors whose first dimension is ``batch_size``.

        Raises:
            ValueError: If the buffer holds no transitions yet.
        """
        if self.size == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        index = np.random.choice(self.size, size=batch_size)
        fields = (self.s, self.a, self.r, self.s_, self.dw)
        return tuple(torch.tensor(field[index], dtype=torch.float) for field in fields)

# Create TD3 Agent

In [None]:

class TD3(object):
    """Twin Delayed DDPG (TD3) agent.

    Owns an actor, a twin critic and a target copy of each. ``learn`` performs
    one gradient step using the three TD3 tricks: target policy smoothing,
    clipped double Q-learning and delayed policy/target updates.

    Args:
        state_dim: Length of a state vector.
        action_dim: Length of an action vector.
        max_action: Bound of each action component; actions are kept in
            ``[-max_action, max_action]``.
        hidden_width: Number of neurons in each hidden layer.
        batch_size: Number of transitions sampled per ``learn`` call.
        gamma: Discount factor (stored as ``GAMMA``).
        tau: Soft-update coefficient for the target networks (stored as ``TAU``).
        lr: Adam learning rate for both actor and critic.
        policy_noise: Std of the target-smoothing noise, as a fraction of
            ``max_action``.
        noise_clip: Clipping range of that noise, as a fraction of
            ``max_action``.
        policy_freq: The actor and the targets are updated once every
            ``policy_freq`` critic updates.
    """

    def __init__(self, state_dim, action_dim, max_action, *, hidden_width=256, batch_size=256,
                 gamma=0.99, tau=0.005, lr=3e-4, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        self.max_action = max_action
        self.hidden_width = hidden_width
        self.batch_size = batch_size
        self.GAMMA = gamma
        self.TAU = tau
        self.lr = lr
        self.policy_noise = policy_noise * max_action
        self.noise_clip = noise_clip * max_action
        self.policy_freq = policy_freq
        self.actor_pointer = 0  # counts calls to learn(); drives the delayed updates

        self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action)
        self.actor_target = copy.deepcopy(self.actor)
        self.critic = Critic(state_dim, action_dim, self.hidden_width)
        self.critic_target = copy.deepcopy(self.critic)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr)

    def choose_action(self, s):
        """Return the actor's action for a single state as a flat NumPy array.

        No exploration noise is added here; callers add it when needed.
        """
        s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
        # Inference only: skip building an autograd graph.
        with torch.no_grad():
            a = self.actor(s)
        return a.cpu().numpy().flatten()

    def learn(self, relay_buffer):
        """Run one TD3 update using a batch sampled from ``relay_buffer``.

        The critic is updated on every call; the actor and both target
        networks only on every ``policy_freq``-th call.
        """
        self.actor_pointer += 1
        batch_s, batch_a, batch_r, batch_s_, batch_dw = relay_buffer.sample(self.batch_size)

        # Compute the target Q; no gradient flows through the target.
        with torch.no_grad():
            # Trick 1: target policy smoothing.
            # torch.randn_like draws N(0, 1) noise with the same shape as 'batch_a'.
            noise = (torch.randn_like(batch_a) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(batch_s_) + noise).clamp(-self.max_action, self.max_action)

            # Trick 2: clipped double Q-learning.
            target_Q1, target_Q2 = self.critic_target(batch_s_, next_action)
            # (1 - dw) removes the bootstrap term for terminal transitions.
            target_Q = batch_r + self.GAMMA * (1 - batch_dw) * torch.min(target_Q1, target_Q2)

        # Regress both critic heads onto the shared target.
        current_Q1, current_Q2 = self.critic(batch_s, batch_a)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Trick 3: delayed policy updates.
        if self.actor_pointer % self.policy_freq != 0:
            return

        # Freeze the critic so the actor step computes no critic gradients;
        # the finally clause guarantees it is unfrozen even if the step fails.
        for params in self.critic.parameters():
            params.requires_grad = False
        try:
            actor_loss = -self.critic.Q1(batch_s, self.actor(batch_s)).mean()  # Only use Q1
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
        finally:
            for params in self.critic.parameters():
                params.requires_grad = True

        # Softly update the target networks.
        self._soft_update(self.critic, self.critic_target)
        self._soft_update(self.actor, self.actor_target)

    def _soft_update(self, net, target_net):
        """Move ``target_net`` toward ``net``: target <- TAU * net + (1 - TAU) * target."""
        with torch.no_grad():
            for param, target_param in zip(net.parameters(), target_net.parameters()):
                target_param.copy_(self.TAU * param + (1 - self.TAU) * target_param)

# Running the training pipeline

In [None]:
def evaluate_policy(env, agent, evaluation_num, times=3, video_every=20):
    """Return the agent's average episode return, truncated to an int.

    Runs ``times`` episodes using ``agent.choose_action`` without exploration
    noise. When ``evaluation_num`` is a multiple of ``video_every``, one extra
    episode of the same agent is rendered and shown as a video.

    Args:
        env: Environment with the gym 0.23-style API (``reset`` returns the
            state, ``step`` returns ``(state, reward, done, info)``).
        agent: Object exposing ``choose_action(state)``.
        evaluation_num: 1-based index of this evaluation.
        times: Number of episodes to average over.
        video_every: Show a video on every ``video_every``-th evaluation.

    Returns:
        ``int(total_return / times)``, i.e. the mean truncated toward zero.
    """
    evaluate_reward = 0
    for _ in range(times):
        s = env.reset()
        done = False
        episode_reward = 0
        while not done:
            a = agent.choose_action(s)  # We do not add noise when evaluating
            s, r, done, _ = env.step(a)
            episode_reward += r
        evaluate_reward += episode_reward

    if evaluation_num % video_every == 0:
        _show_policy_video(env, agent)

    return int(evaluate_reward / times)


def _show_policy_video(env, agent, fps=30):
    """Render one episode driven by ``agent`` and display it with mediapy."""
    s = env.reset()
    images = [env.render(mode='rgb_array')]
    done = False
    while not done:
        # Act with the evaluated policy so the clip shows the agent itself
        # rather than random actions.
        s, _, done, _ = env.step(agent.choose_action(s))
        img = env.render(mode='rgb_array')
        # Keep only frames of the expected 400x600 size.
        if img.shape[:2] == (400, 600):
            images.append(img)

    # The last five frames are dropped, as before; the reason is not evident
    # from this file -- TODO confirm.
    frames = images[:-5]
    if frames:  # an episode this short would leave nothing to show
        media.show_video(frames, fps=fps)


def reward_adapter(r, env_index):
    """Rescale the raw reward ``r`` according to the environment index.

    * ``env_index == 0`` (Pendulum-v1): returns ``(r + 8) / 8``.
    * ``env_index == 1`` (BipedalWalker-v3): a reward of -100 or lower is
      replaced by -1; any other reward is returned unchanged.
    * Any other index: ``r`` is returned unchanged.
    """
    if env_index == 0:  # Pendulum-v1
        return (r + 8) / 8
    if env_index == 1 and r <= -100:  # BipedalWalker-v3
        return -1
    return r


if __name__ == '__main__':
    # Train a TD3 agent on BipedalWalker-v3, evaluating it periodically.

    RAM_ENV_NAME = 'BipedalWalker-v3'
    env = gym.make(RAM_ENV_NAME)
    env_evaluate = gym.make(RAM_ENV_NAME)

    # try/finally so both environments are closed even when training is
    # interrupted or fails part-way.
    try:
        # Set random seed
        seed = 0
        env.seed(seed)
        env.action_space.seed(seed)
        env_evaluate.seed(seed)
        env_evaluate.action_space.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        print(env.action_space)
        # Uses the first component's upper bound as the bound for every action component.
        max_action = float(env.action_space.high[0])
        max_episode_steps = env._max_episode_steps  # Maximum number of steps per episode

        print("state_dim={}".format(state_dim))
        print("action_dim={}".format(action_dim))
        print("max_action={}".format(max_action))
        print("max_episode_steps={}".format(max_episode_steps))

        agent = TD3(state_dim, action_dim, max_action)
        replay_buffer = ReplayBuffer(state_dim, action_dim)

        noise_std = 0.1 * max_action  # the std of Gaussian noise for exploration
        max_train_steps = 3000000  # Maximum number of training steps
        random_steps = 25000  # Take random actions in the beginning for better exploration
        evaluate_freq = 5000  # Evaluate the policy every 'evaluate_freq' steps
        evaluate_num = 0  # Record the number of evaluations
        evaluate_rewards = []  # Record the rewards during the evaluating
        total_steps = 0  # Record the total steps during the training

        while total_steps < max_train_steps:
            s = env.reset()
            episode_steps = 0
            done = False
            while not done:
                episode_steps += 1
                if total_steps < random_steps:  # Take random actions in the beginning for better exploration
                    a = env.action_space.sample()
                else:
                    # Add Gaussian noise to the action for exploration
                    a = agent.choose_action(s)
                    a = (a + np.random.normal(0, noise_std, size=action_dim)).clip(-max_action, max_action)
                s_, r, done, _ = env.step(a)
                r = reward_adapter(r, 1)  # Adjust rewards for better performance (index 1 = BipedalWalker-v3)
                # 'done' is True when the agent dies, wins, or hits max_episode_steps.
                # dw ("dead or win") marks only real terminations, which have no next state s';
                # an episode cut off at max_episode_steps still has one, so its dw stays False.
                dw = done and episode_steps != max_episode_steps
                replay_buffer.store(s, a, r, s_, dw)  # Store the transition
                s = s_

                # Update one step
                if total_steps >= random_steps:
                    agent.learn(replay_buffer)

                # Evaluate the policy every 'evaluate_freq' steps
                if (total_steps + 1) % evaluate_freq == 0:
                    evaluate_num += 1
                    evaluate_reward = evaluate_policy(env_evaluate, agent, evaluate_num)
                    evaluate_rewards.append(evaluate_reward)
                    print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward))

                total_steps += 1
    finally:
        env.close()
        env_evaluate.close()

  deprecation(


Box(-1.0, 1.0, (4,), float32)
state_dim=24
action_dim=4
max_action=1.0
max_episode_steps=1600
evaluate_num:1 	 evaluate_reward:-4
evaluate_num:2 	 evaluate_reward:-3
evaluate_num:3 	 evaluate_reward:-3
evaluate_num:4 	 evaluate_reward:-3
evaluate_num:5 	 evaluate_reward:-3
evaluate_num:6 	 evaluate_reward:-54
evaluate_num:7 	 evaluate_reward:-77
evaluate_num:8 	 evaluate_reward:-98
evaluate_num:9 	 evaluate_reward:-97
evaluate_num:10 	 evaluate_reward:-96
evaluate_num:11 	 evaluate_reward:-84
evaluate_num:12 	 evaluate_reward:-63
evaluate_num:13 	 evaluate_reward:-81
evaluate_num:14 	 evaluate_reward:-19
evaluate_num:15 	 evaluate_reward:-16
evaluate_num:16 	 evaluate_reward:-73
evaluate_num:17 	 evaluate_reward:-2
evaluate_num:18 	 evaluate_reward:150
evaluate_num:19 	 evaluate_reward:197


0
This browser does not support the video tag.


evaluate_num:20 	 evaluate_reward:178
evaluate_num:21 	 evaluate_reward:56
evaluate_num:22 	 evaluate_reward:199
evaluate_num:23 	 evaluate_reward:309
evaluate_num:24 	 evaluate_reward:97
evaluate_num:25 	 evaluate_reward:308
evaluate_num:26 	 evaluate_reward:149
evaluate_num:27 	 evaluate_reward:308
evaluate_num:28 	 evaluate_reward:218
evaluate_num:29 	 evaluate_reward:314
evaluate_num:30 	 evaluate_reward:240
evaluate_num:31 	 evaluate_reward:313
evaluate_num:32 	 evaluate_reward:311
evaluate_num:33 	 evaluate_reward:267
evaluate_num:34 	 evaluate_reward:254
evaluate_num:35 	 evaluate_reward:265
evaluate_num:36 	 evaluate_reward:237
evaluate_num:37 	 evaluate_reward:314
evaluate_num:38 	 evaluate_reward:277
evaluate_num:39 	 evaluate_reward:315


0
This browser does not support the video tag.


evaluate_num:40 	 evaluate_reward:216
evaluate_num:41 	 evaluate_reward:317
evaluate_num:42 	 evaluate_reward:315
evaluate_num:43 	 evaluate_reward:314
evaluate_num:44 	 evaluate_reward:314
evaluate_num:45 	 evaluate_reward:319
evaluate_num:46 	 evaluate_reward:317
evaluate_num:47 	 evaluate_reward:317
evaluate_num:48 	 evaluate_reward:316
evaluate_num:49 	 evaluate_reward:315
evaluate_num:50 	 evaluate_reward:319
evaluate_num:51 	 evaluate_reward:316
evaluate_num:52 	 evaluate_reward:265
evaluate_num:53 	 evaluate_reward:317
evaluate_num:54 	 evaluate_reward:318
evaluate_num:55 	 evaluate_reward:316
evaluate_num:56 	 evaluate_reward:317
evaluate_num:57 	 evaluate_reward:317
evaluate_num:58 	 evaluate_reward:318
evaluate_num:59 	 evaluate_reward:317


0
This browser does not support the video tag.


evaluate_num:60 	 evaluate_reward:315
evaluate_num:61 	 evaluate_reward:315
evaluate_num:62 	 evaluate_reward:316
evaluate_num:63 	 evaluate_reward:318
evaluate_num:64 	 evaluate_reward:317
evaluate_num:65 	 evaluate_reward:316
evaluate_num:66 	 evaluate_reward:234
evaluate_num:67 	 evaluate_reward:317
evaluate_num:68 	 evaluate_reward:317
