In [1]:
import random
import gym
import numpy as np
from tqdm import tqdm
import torch
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import rl_utils

In [2]:
class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound):
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)  # 输入状态维度，输出hidden_dim维度的向量
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)  # 输入hidden_dim维度的向量，输出动作维度的向量
        self.action_bound = action_bound  # action_bound是环境可以接受的动作最大值

    def forward(self, x):
        x = F.relu(self.fc1(x))  # 输入状态，输出hidden_dim维度的向量
        return torch.tanh(self.fc2(x)) * self.action_bound  # 输入hidden_dim维度的向量，输出动作维度的向量


class QValueNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(QValueNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim)  # 输入状态和动作拼接后的维度，输出hidden_dim维度的向量
        self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim)  # 输入hidden_dim维度的向量，输出hidden_dim维度的向量
        self.fc_out = torch.nn.Linear(hidden_dim, 1)  # 输入hidden_dim维度的向量，输出1维度的向量

    def forward(self, x, a):
        cat = torch.cat([x, a], dim=1)  # 拼接状态和动作
        x = F.relu(self.fc1(cat))  # 输入状态和动作拼接后的维度，输出hidden_dim维度的向量
        x = F.relu(self.fc2(x))  # 输入hidden_dim维度的向量，输出hidden_dim维度的向量
        return self.fc_out(x)  # 输入hidden_dim维度的向量，输出1维度的向量

In [3]:
class DDPG:
    ''' DDPG algorithm '''
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device):
        # Initialize the actor network
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device)
        # Initialize the critic network
        self.critic = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        # Initialize the target actor network and set the parameters to the same as the actor network
        self.target_actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device)
        # Initialize the target value network and set the parameters to the same as the value network
        self.target_critic = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        # Initialize the target value network and set the parameters to the same as the value network
        self.target_critic.load_state_dict(self.critic.state_dict())
        # Initialize the target policy network and set the parameters to the same as the policy
        self.target_actor.load_state_dict(self.actor.state_dict())
        # Initialize the actor network optimizer
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        # Initialize the critic network optimizer
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)
        # Discount factor
        self.gamma = gamma
        # Standard deviation of Gaussian noise, mean directly set to 0
        self.sigma = sigma
        # Target network soft update parameter
        self.tau = tau
        # Action dimension
        self.action_dim = action_dim
        # Device
        self.device = device

    def take_action(self, state):
        # Convert state to tensor
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        # Get the action from the state according to the actor network
        action = self.actor(state).item()
        # Add noise to the action to increase exploration
        action = action + self.sigma * np.random.randn(self.action_dim)
        return action

    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(), net.parameters()):
            param_target.data.copy_(param_target.data * (1.0 - self.tau) + param.data * self.tau)

    def update(self, transition_dict):
        # 从字典中取出数据
        states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions'], dtype=torch.float).view(-1, 1).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1, 1).to(self.device)

        # 计算当前状态下，下一状态的价值
        next_q_values = self.target_critic(next_states, self.target_actor(next_states))
        # 计算当前状态下，当前状态的价值
        q_targets = rewards + self.gamma * next_q_values * (1 - dones)
        # 计算当前状态下，当前状态的价值与下一状态的价值之间的差距
        critic_loss = torch.mean(F.mse_loss(self.critic(states, actions), q_targets))
        # 将价值网络的梯度清零
        self.critic_optimizer.zero_grad()
        # 反向传播
        critic_loss.backward()
        # 更新价值网络
        self.critic_optimizer.step()

        # 计算当前状态下，策略网络的价值
        actor_loss = -torch.mean(self.critic(states, self.actor(states)))
        # 将策略网络的梯度清零
        self.actor_optimizer.zero_grad()
        # 反向传播
        actor_loss.backward()
        # 更新策略网络
        self.actor_optimizer.step()

        # 软更新策略网络
        self.soft_update(self.actor, self.target_actor)
        # 软更新价值网络
        self.soft_update(self.critic, self.target_critic)

In [4]:
actor_lr = 3e-4
critic_lr = 3e-3
num_episodes = 200
hidden_dim = 64
gamma = 0.98
tau = 0.005  # 软更新参数
buffer_size = 10000
minimal_size = 1000
batch_size = 64
sigma = 0.01  # 高斯噪声标准差
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

env_name = 'Pendulum-v0'
env = gym.make(env_name)
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)
replay_buffer = rl_utils.ReplayBuffer(buffer_size)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high[0]  # 动作最大值
agent = DDPG(state_dim, hidden_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device)

return_list = rl_utils.train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size)

  f"The environment {id} is out of date. You should consider "


DeprecatedEnv: Environment version v0 for `Pendulum` is deprecated. Please use `Pendulum-v1` instead.

In [5]:
episodes_list = list(range(len(return_list)))
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('DDPG on {}'.format(env_name))
plt.show()

mv_return = rl_utils.moving_average(return_list, 9)
plt.plot(episodes_list, mv_return)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('DDPG on {}'.format(env_name))
plt.show()

NameError: name 'return_list' is not defined