In [1]:
import copy

import gym
import matplotlib.pyplot as plt
import numpy as np
import rl_utils
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [3]:
class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        # initialize the class PolicyNet with the following arguments
        super(PolicyNet, self).__init__()
        # define input layer
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        # define output layer
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        # define forward pass
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=1)


# define a class ValueNet
class ValueNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim):
        # initialize the class ValueNet with the following arguments
        super(ValueNet, self).__init__()
        # define input layer
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        # define output layer
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # define forward pass
        x = F.relu(self.fc1(x))
        return self.fc2(x)


In [4]:
class TRPO:
    """ TRPO算法 """

    def __init__(self, hidden_dim, state_space, action_space, lmbda,
                 kl_constraint, alpha, critic_lr, gamma, device):
        state_dim = state_space.shape[0]
        action_dim = action_space.n
        # 策略网络参数不需要优化器更新
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        self.critic = ValueNet(state_dim, hidden_dim).to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        self.gamma = gamma
        self.lmbda = lmbda  # GAE参数
        self.kl_constraint = kl_constraint  # KL距离最大限制
        self.alpha = alpha  # 线性搜索参数
        self.device = device

    def take_action(self, state):
        # 选择动作
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()

    def hessian_matrix_vector_product(self, states, old_action_dists, vector):
        # 计算黑塞矩阵和一个向量的乘积
        # 计算平均KL距离
        new_action_dists = torch.distributions.Categorical(self.actor(states))
        kl = torch.mean(
            torch.distributions.kl.kl_divergence(old_action_dists,
                                                 new_action_dists))
        # 计算KL距离的梯度
        kl_grad = torch.autograd.grad(kl,
                                      self.actor.parameters(),
                                      create_graph=True)
        # 将梯度向量展平
        kl_grad_vector = torch.cat([grad.view(-1) for grad in kl_grad])
        # KL距离的梯度先和向量进行点积运算
        kl_grad_vector_product = torch.dot(kl_grad_vector, vector)
        # 再计算点积运算的梯度
        grad2 = torch.autograd.grad(kl_grad_vector_product,
                                    self.actor.parameters())
        # 将梯度向量展平
        grad2_vector = torch.cat([grad.view(-1) for grad in grad2])
        return grad2_vector

    def conjugate_gradient(self, grad, states, old_action_dists):  # 共轭梯度法求解方程
        x = torch.zeros_like(grad)  # 初始化搜索方向为0
        r = grad.clone()  # 初始化残差为梯度
        p = grad.clone()  # 初始化搜索方向为梯度
        rdotr = torch.dot(r, r)  # 计算残差的内积
        for i in range(10):  # 共轭梯度主循环
            Hp = self.hessian_matrix_vector_product(states, old_action_dists, p)  # 计算Hessian矩阵与向量的乘积
            alpha = rdotr / torch.dot(p, Hp)  # 计算步长alpha
            x += alpha * p
            r -= alpha * Hp
            new_rdotr = torch.dot(r, r)
            if new_rdotr < 1e-10:  # 如果残差的内积小于1e-10，停止迭代
                break
            beta = new_rdotr / rdotr
            p = r + beta * p
            rdotr = new_rdotr
        return x

    def compute_surrogate_obj(self, states, actions, advantage, old_log_probs,
                              actor):  # 计算策略目标
        log_probs = torch.log(actor(states).gather(1, actions))  # 计算旧策略的log_prob
        ratio = torch.exp(log_probs - old_log_probs)  # 计算比值
        return torch.mean(ratio * advantage)  # 计算策略目标的期望

    def line_search(self, states, actions, advantage, old_log_probs,
                    old_action_dists, max_vec):  # 线性搜索
        old_para = torch.nn.utils.convert_parameters.parameters_to_vector(
            self.actor.parameters())  # 保存旧参数
        old_obj = self.compute_surrogate_obj(states, actions, advantage,
                                             old_log_probs, self.actor)  # 保存旧目标值
        for i in range(15):  # 线性搜索主循环
            coef = self.alpha ** i
            new_para = old_para + coef * max_vec  # 更新参数
            new_actor = copy.deepcopy(self.actor)  # 更新网络
            torch.nn.utils.convert_parameters.vector_to_parameters(
                new_para, new_actor.parameters())
            new_action_dists = torch.distributions.Categorical(
                new_actor(states))  # 更新分布
            kl_div = torch.mean(
                torch.distributions.kl.kl_divergence(old_action_dists,
                                                     new_action_dists))  # 计算KL散度
            new_obj = self.compute_surrogate_obj(states, actions, advantage,
                                                 old_log_probs, new_actor)  # 更新目标值
            if new_obj > old_obj and kl_div < self.kl_constraint:
                return new_para
        return old_para

    def policy_learn(self, states, actions, old_action_dists, old_log_probs,
                     advantage):  # 更新策略函数
        # 计算对数似然比
        surrogate_obj = self.compute_surrogate_obj(states, actions, advantage,
                                                   old_log_probs, self.actor)
        # 计算梯度
        grads = torch.autograd.grad(surrogate_obj, self.actor.parameters())
        # 将梯度转换为向量
        obj_grad = torch.cat([grad.view(-1) for grad in grads]).detach()
        # 用共轭梯度法计算x = H^(-1)g
        descent_direction = self.conjugate_gradient(obj_grad, states,
                                                    old_action_dists)

        Hd = self.hessian_matrix_vector_product(states, old_action_dists,
                                                descent_direction)
        # 计算最大的步长
        max_coef = torch.sqrt(2 * self.kl_constraint /
                              (torch.dot(descent_direction, Hd) + 1e-8))
        # 线性搜索
        new_para = self.line_search(states, actions, advantage, old_log_probs,
                                    old_action_dists,
                                    descent_direction * max_coef)
        # 用线性搜索后的参数更新策略
        torch.nn.utils.convert_parameters.vector_to_parameters(
            new_para, self.actor.parameters())

    def update(self, transition_dict):
        # 从transition_dict中取出数据
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(
            self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        # 计算TD目标
        td_target = rewards + self.gamma * self.critic(next_states) * (1 -
                                                                       dones)
        # 计算TD误差
        td_delta = td_target - self.critic(states)
        # 计算advantage
        advantage = compute_advantage(self.gamma, self.lmbda,
                                      td_delta.cpu()).to(self.device)
        # 计算旧的log概率
        old_log_probs = torch.log(self.actor(states).gather(1,
                                                            actions)).detach()
        # 计算旧的动作分布
        old_action_dists = torch.distributions.Categorical(
            self.actor(states).detach())
        # 计算价值函数的损失
        critic_loss = torch.mean(
            F.mse_loss(self.critic(states), td_target.detach()))
        # 优化价值函数
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # 优化策略函数
        self.policy_learn(states, actions, old_action_dists, old_log_probs,
                          advantage)

In [5]:
# Import dependencies
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

import rl_utils

# Hyperparameters
num_episodes = 500
hidden_dim = 128
gamma = 0.98
lmbda = 0.95
critic_lr = 1e-2
kl_constraint = 0.0005
alpha = 0.5
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

# Initialize environment and set seeds
env_name = 'CartPole-v0'
env = gym.make(env_name)
env.seed(0)
torch.manual_seed(0)

# Initialize agent
agent = TRPO(hidden_dim, env.observation_space, env.action_space, lmbda,
             kl_constraint, alpha, critic_lr, gamma, device)

# Train agent and plot results
return_list = rl_utils.train_on_policy_agent(env, agent, num_episodes)
episodes_list = list(range(len(return_list)))
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('TRPO on {}'.format(env_name))
plt.show()
mv_return = rl_utils.moving_average(return_list, 9)
plt.plot(episodes_list, mv_return)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('TRPO on {}'.format(env_name))
plt.show()

  f"The environment {id} is out of date. You should consider "


AttributeError: 'CartPoleEnv' object has no attribute 'seed'

In [6]:
class PolicyNetContinuous(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(PolicyNetContinuous, self).__init__()
        # 网络的第一层全连接层，输入状态的维度，隐藏层的维度
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        # 网络的第二层全连接层，隐藏层的维度，动作的维度
        self.fc_mu = torch.nn.Linear(hidden_dim, action_dim)
        # 网络的第三层全连接层，隐藏层的维度，动作的维度
        self.fc_std = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        # 将状态输入第一层全连接层
        x = F.relu(self.fc1(x))
        # 将第一层全连接层的输出输入第二层全连接层，得到动作的均值
        mu = 2.0 * torch.tanh(self.fc_mu(x))
        # 将第一层全连接层的输出输入第三层全连接层，得到动作的标准差
        std = F.softplus(self.fc_std(x))  # softplus函数可以保证标准差为正数，公式为ln(1+e^x)
        # 返回动作的均值和标准差
        return mu, std


class TRPOContinuous:
    """ 处理连续动作的TRPO算法 """

    def __init__(self, hidden_dim, state_space, action_space, lmbda,
                 kl_constraint, alpha, critic_lr, gamma, device):
        state_dim = state_space.shape[0]
        action_dim = action_space.shape[0]
        # initialize the actor network
        self.actor = PolicyNetContinuous(state_dim, hidden_dim,
                                         action_dim).to(device)
        # initialize the critic network
        self.critic = ValueNet(state_dim, hidden_dim).to(device)
        # initialize the critic optimizer
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        # discount rate
        self.gamma = gamma
        # generalized advantage estimation parameter
        self.lmbda = lmbda
        # KL divergence constraint
        self.kl_constraint = kl_constraint
        # entropy regularization coefficient
        self.alpha = alpha
        self.device = device

    def take_action(self, state):
        # Convert the state into a tensor
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        # Get the mean and standard deviation of the action distribution
        mu, std = self.actor(state)
        # Create a normal distribution from the mean and standard deviation
        action_dist = torch.distributions.Normal(mu, std)
        # Sample an action from the distribution
        action = action_dist.sample()
        # Return the sampled action
        return [action.item()]

    # This function is used to compute the approximate Hessian-vector product.
    # The input vector is a flattened vector of the same shape as the
    # parameters.
    def hessian_matrix_vector_product(self,
                                      states,
                                      old_action_dists,
                                      vector,
                                      damping=0.1):
        # Compute the KL divergence between the new action distribution and the old action distribution.
        mu, std = self.actor(states)
        new_action_dists = torch.distributions.Normal(mu, std)
        kl = torch.mean(
            torch.distributions.kl.kl_divergence(old_action_dists,
                                                 new_action_dists))
        # Compute the gradient of the KL divergence with respect to the actor parameters.
        kl_grad = torch.autograd.grad(kl,
                                      self.actor.parameters(),
                                      create_graph=True)
        # Flatten the gradient vector.
        kl_grad_vector = torch.cat([grad.view(-1) for grad in kl_grad])
        # Compute the dot product between the flattened gradient vector and the input vector.
        kl_grad_vector_product = torch.dot(kl_grad_vector, vector)
        # Compute the second order gradient of the dot product with respect to the actor parameters.
        grad2 = torch.autograd.grad(kl_grad_vector_product,
                                    self.actor.parameters())
        # Flatten the second order gradient vector.
        grad2_vector = torch.cat(
            [grad.contiguous().view(-1) for grad in grad2])
        # Return the flattened second order gradient vector.
        return grad2_vector + damping * vector

    def conjugate_gradient(self, grad, states, old_action_dists):
        # initialize x and r
        x = torch.zeros_like(grad)
        r = grad.clone()
        # initialize p
        p = grad.clone()
        # compute r dot r
        rdotr = torch.dot(r, r)
        for i in range(10):
            # compute Hessian matrix vector product
            Hp = self.hessian_matrix_vector_product(states, old_action_dists, p)
            # compute alpha
            alpha = rdotr / torch.dot(p, Hp)
            # update x
            x += alpha * p
            # update r
            r -= alpha * Hp
            # compute new r dot r
            new_rdotr = torch.dot(r, r)
            # check for convergence
            if new_rdotr < 1e-10:
                break
            # compute beta
            beta = new_rdotr / rdotr
            # update p
            p = r + beta * p
            # update r dot r
            rdotr = new_rdotr
        return x

    def compute_surrogate_obj(self, states, actions, advantage, old_log_probs,
                              actor):
        # Compute new log probabilities
        mu, std = actor(states)
        action_dists = torch.distributions.Normal(mu, std)
        log_probs = action_dists.log_prob(actions)

        # Compute the ratio
        ratio = torch.exp(log_probs - old_log_probs)

        # Compute the surrogate objective
        return torch.mean(ratio * advantage)

    def line_search(self, states, actions, advantage, old_log_probs,
                    old_action_dists, max_vec):
        # get current parameters and compute surrogate objective
        old_para = torch.nn.utils.convert_parameters.parameters_to_vector(
            self.actor.parameters())
        old_obj = self.compute_surrogate_obj(states, actions, advantage,
                                             old_log_probs, self.actor)
        for i in range(15):
            # the coefficient is the step size for the line search
            coef = self.alpha ** i
            # compute new parameters
            new_para = old_para + coef * max_vec
            # copy the current actor network
            new_actor = copy.deepcopy(self.actor)
            # load the new parameters to the copied network
            torch.nn.utils.convert_parameters.vector_to_parameters(
                new_para, new_actor.parameters())
            # compute the KL divergence between old and new action distributions
            mu, std = new_actor(states)
            new_action_dists = torch.distributions.Normal(mu, std)
            kl_div = torch.mean(
                torch.distributions.kl.kl_divergence(old_action_dists,
                                                     new_action_dists))
            # compute the new surrogate objective
            new_obj = self.compute_surrogate_obj(states, actions, advantage,
                                                 old_log_probs, new_actor)
            # check if new objective is larger than old objective and if KL divergence is smaller than the constraint
            if new_obj > old_obj and kl_div < self.kl_constraint:
                return new_para
        return old_para

    def policy_learn(self, states, actions, old_action_dists, old_log_probs,
                     advantage):
        # Compute the surrogate objective
        surrogate_obj = self.compute_surrogate_obj(states, actions, advantage,
                                                   old_log_probs, self.actor)
        # Compute the gradient of the surrogate objective
        grads = torch.autograd.grad(surrogate_obj, self.actor.parameters())
        # Concatenate the gradient of the surrogate objective
        obj_grad = torch.cat([grad.view(-1) for grad in grads]).detach()
        # Compute the conjugate gradient descent direction
        descent_direction = self.conjugate_gradient(obj_grad, states,
                                                    old_action_dists)
        # Compute the Hessian matrix vector product
        Hd = self.hessian_matrix_vector_product(states, old_action_dists,
                                                descent_direction)
        # Compute the maximum coefficient
        max_coef = torch.sqrt(2 * self.kl_constraint /
                              (torch.dot(descent_direction, Hd) + 1e-8))
        # Compute the new parameters
        new_para = self.line_search(states, actions, advantage, old_log_probs,
                                    old_action_dists,
                                    descent_direction * max_coef)
        # Update the parameters of the actor
        torch.nn.utils.convert_parameters.vector_to_parameters(
            new_para, self.actor.parameters())

    def update(self, transition_dict):
        # get the data from the buffer
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        # modify the rewards
        rewards = (rewards + 8.0) / 8.0
        # calculate the td target
        td_target = rewards + self.gamma * self.critic(next_states) * (1 - dones)
        # calculate the td delta
        td_delta = td_target - self.critic(states)
        # calculate the advantage
        advantage = compute_advantage(self.gamma, self.lmbda,
                                      td_delta.cpu()).to(self.device)
        # get the action distribution
        mu, std = self.actor(states)
        # get the old action distribution
        old_action_dists = torch.distributions.Normal(mu.detach(),
                                                      std.detach())
        # get the old log probability
        old_log_probs = old_action_dists.log_prob(actions)
        # calculate the critic loss
        critic_loss = torch.mean(
            F.mse_loss(self.critic(states), td_target.detach()))
        # update the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # update the actor
        self.policy_learn(states, actions, old_action_dists, old_log_probs,
                          advantage)

In [7]:
# define number of episodes
num_episodes = 2000
# define hidden dimension
hidden_dim = 128
# define gamma
gamma = 0.9
# define lambda
lmbda = 0.9
# define critic learning rate
critic_lr = 1e-2
# define KL constraint
kl_constraint = 0.00005
# define alpha
alpha = 0.5
# define device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

# define environment name
env_name = 'Pendulum-v0'
# environment
env = gym.make(env_name)
# seed environment
env.seed(0)
# seed torch
torch.manual_seed(0)
# define agent
agent = TRPOContinuous(hidden_dim, env.observation_space, env.action_space,
                       lmbda, kl_constraint, alpha, critic_lr, gamma, device)
# train agent
return_list = rl_utils.train_on_policy_agent(env, agent, num_episodes)

# define episodes list
episodes_list = list(range(len(return_list)))
# plot episode returns
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('TRPO on {}'.format(env_name))
plt.show()

# plot moving average of returns
mv_return = rl_utils.moving_average(return_list, 9)
plt.plot(episodes_list, mv_return)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('TRPO on {}'.format(env_name))
plt.show()

  f"The environment {id} is out of date. You should consider "


DeprecatedEnv: Environment version v0 for `Pendulum` is deprecated. Please use `Pendulum-v1` instead.