In [6]:
import random
import gym
import numpy as np
from tqdm import tqdm
import torch
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt
import rl_utils

In [7]:
class PolicyNetContinuous(torch.nn.Module):
    # Initialize the network
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound):
        super(PolicyNetContinuous, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc_mu = torch.nn.Linear(hidden_dim, action_dim)
        self.fc_std = torch.nn.Linear(hidden_dim, action_dim)
        self.action_bound = action_bound

    # Forward propagation
    def forward(self, x):
        # The first layer is a fully connected layer
        x = F.relu(self.fc1(x))
        # The output layer is a fully connected layer
        mu = self.fc_mu(x)
        std = F.softplus(self.fc_std(x))
        # Create a normal distribution
        dist = Normal(mu, std)
        # Create a sample from the normal distribution
        normal_sample = dist.rsample()  # rsample() is reparameterization sampling
        # Calculate the log probability of the normal distribution
        log_prob = dist.log_prob(normal_sample)
        # Calculate the action
        action = torch.tanh(normal_sample)
        # Calculate the log probability density of the tanh_normal distribution
        log_prob = log_prob - torch.log(1 - torch.tanh(action).pow(2) + 1e-7)
        # Scale the action
        action = action * self.action_bound
        return action, log_prob


class QValueNetContinuous(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(QValueNetContinuous, self).__init__()
        # Create the first layer of the network
        self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim)
        # Create the second layer of the network
        self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim)
        # Create the output layer of the network
        self.fc_out = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x, a):
        # Concatenate the state and action along the first dimension (batch dimension)
        cat = torch.cat([x, a], dim=1)
        # Run the input through the first layer, using the relu activation function
        x = F.relu(self.fc1(cat))
        # Run the input through the second layer, using the relu activation function
        x = F.relu(self.fc2(x))
        # Run the input through the output layer, and return the output
        return self.fc_out(x)

In [8]:
class SACContinuous:
    ''' 处理连续动作的SAC算法 '''
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound,
                 actor_lr, critic_lr, alpha_lr, target_entropy, tau, gamma,
                 device):
        self.actor = PolicyNetContinuous(state_dim, hidden_dim, action_dim,
                                         action_bound).to(device)  # 策略网络
        self.critic_1 = QValueNetContinuous(state_dim, hidden_dim,
                                            action_dim).to(device)  # 第一个Q网络
        self.critic_2 = QValueNetContinuous(state_dim, hidden_dim,
                                            action_dim).to(device)  # 第二个Q网络
        self.target_critic_1 = QValueNetContinuous(state_dim,
                                                   hidden_dim, action_dim).to(
                                                       device)  # 第一个目标Q网络
        self.target_critic_2 = QValueNetContinuous(state_dim,
                                                   hidden_dim, action_dim).to(
                                                       device)  # 第二个目标Q网络
        # 令目标Q网络的初始参数和Q网络一样
        self.target_critic_1.load_state_dict(self.critic_1.state_dict())
        self.target_critic_2.load_state_dict(self.critic_2.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(),
                                                   lr=critic_lr)
        self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(),
                                                   lr=critic_lr)
        # 使用alpha的log值,可以使训练结果比较稳定
        self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float)
        self.log_alpha.requires_grad = True  # 可以对alpha求梯度
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                    lr=alpha_lr)
        self.target_entropy = target_entropy  # 目标熵的大小
        self.gamma = gamma
        self.tau = tau
        self.device = device

    def take_action(self, state):
        # Convert the state to a tensor
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        # Get the action from the actor network
        action = self.actor(state)[0]
        # Convert the action to a Python number and return it
        return [action.item()]

    def calc_target(self, rewards, next_states, dones):  # 计算目标Q值
        next_actions, log_prob = self.actor(next_states)  # 根据状态值计算动作值
        entropy = -log_prob  # 计算熵
        q1_value = self.target_critic_1(next_states, next_actions)  # 计算Q1值
        q2_value = self.target_critic_2(next_states, next_actions)  # 计算Q2值
        next_value = torch.min(q1_value,
                               q2_value) + self.log_alpha.exp() * entropy  # 计算下一步的价值
        td_target = rewards + self.gamma * next_value * (1 - dones)  # 计算目标价值
        return td_target

    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(),
                                       net.parameters()):
            # Update the target network parameters by:
            # tau * current_parameters + (1-tau) * target_parameters
            param_target.data.copy_(param_target.data * (1.0 - self.tau) +
                                    param.data * self.tau)

    def update(self, transition_dict):
        # 从经验池中采样出一批经验
        states = torch.tensor(transition_dict['states'],dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions'],dtype=torch.float).view(-1, 1).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'],dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],dtype=torch.float).view(-1, 1).to(self.device)
        # 和之前章节一样,对倒立摆环境的奖励进行重塑以便训练
        rewards = (rewards + 8.0) / 8.0
        # 更新两个Q网络
        # 计算目标Q值
        td_target = self.calc_target(rewards, next_states, dones)
        # 计算当前Q值
        critic_1_loss = torch.mean(F.mse_loss(self.critic_1(states, actions), td_target.detach()))
        critic_2_loss = torch.mean(F.mse_loss(self.critic_2(states, actions), td_target.detach()))
        # 更新Q网络
        self.critic_1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic_1_optimizer.step()
        self.critic_2_optimizer.zero_grad()
        critic_2_loss.backward()
        self.critic_2_optimizer.step()
        # 更新策略网络
        # 计算策略网络的损失
        new_actions, log_prob = self.actor(states)
        entropy = -log_prob
        q1_value = self.critic_1(states, new_actions)
        q2_value = self.critic_2(states, new_actions)
        actor_loss = torch.mean(-self.log_alpha.exp() * entropy -torch.min(q1_value, q2_value))
        # 更新策略网络
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # 更新alpha值
        # 计算alpha的损失
        alpha_loss = torch.mean((entropy - self.target_entropy).detach() * self.log_alpha.exp())
        # 更新alpha
        self.log_alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()
        # 更新Q网络的目标参数
        self.soft_update(self.critic_1, self.target_critic_1)
        self.soft_update(self.critic_2, self.target_critic_2)

In [9]:
env_name = 'Pendulum-v0'  # 环境名称
env = gym.make(env_name)  # 创建环境
state_dim = env.observation_space.shape[0]  # 状态空间维度
action_dim = env.action_space.shape[0]  # 动作空间维度
action_bound = env.action_space.high[0]  # 动作最大值
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)

actor_lr = 3e-4
critic_lr = 3e-3
alpha_lr = 3e-4
num_episodes = 100
hidden_dim = 128
gamma = 0.99
tau = 0.005  # 软更新参数
buffer_size = 100000
minimal_size = 1000
batch_size = 64
target_entropy = -env.action_space.shape[0]
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

# Initialize replay buffer
replay_buffer = rl_utils.ReplayBuffer(buffer_size)
# Initialize agent
agent = SACContinuous(state_dim, hidden_dim, action_dim, action_bound,
                      actor_lr, critic_lr, alpha_lr, target_entropy, tau,
                      gamma, device)
# Train the agent
return_list = rl_utils.train_off_policy_agent(env, agent, num_episodes,
                                              replay_buffer, minimal_size,
                                              batch_size)

  f"The environment {id} is out of date. You should consider "


DeprecatedEnv: Environment version v0 for `Pendulum` is deprecated. Please use `Pendulum-v1` instead.

In [None]:
episodes_list = list(range(len(return_list)))
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('SAC on {}'.format(env_name))
plt.show()

mv_return = rl_utils.moving_average(return_list, 9)
plt.plot(episodes_list, mv_return)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('SAC on {}'.format(env_name))
plt.show()

NameError: name 'return_list' is not defined

In [None]:
class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(PolicyNet, self).__init__()
        # First fully connected layer with ReLU activation
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        # Second fully connected layer with softmax activation
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        # Forward pass through first fully connected layer
        x = F.relu(self.fc1(x))
        # Forward pass through second fully connected layer
        return F.softmax(self.fc2(x), dim=1)


class QValueNet(torch.nn.Module):
    ''' 只有一层隐藏层的Q网络 '''
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(QValueNet, self).__init__()
        # 输入层到隐藏层的全连接层
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        # 隐藏层到输出层的全连接层
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        # 用激活函数relu对输入x进行非线性变换，然后再进行线性变换
        x = F.relu(self.fc1(x))
        return self.fc2(x)

NameError: name 'torch' is not defined

In [None]:
class SAC:
    ''' 处理离散动作的SAC算法 '''
    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                 alpha_lr, target_entropy, tau, gamma, device):
        # 策略网络
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        # 第一个Q网络
        self.critic_1 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        # 第二个Q网络
        self.critic_2 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        self.target_critic_1 = QValueNet(state_dim, hidden_dim,
                                         action_dim).to(device)  # 第一个目标Q网络
        self.target_critic_2 = QValueNet(state_dim, hidden_dim,
                                         action_dim).to(device)  # 第二个目标Q网络
        # 令目标Q网络的初始参数和Q网络一样
        self.target_critic_1.load_state_dict(self.critic_1.state_dict())
        self.target_critic_2.load_state_dict(self.critic_2.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(),
                                                   lr=critic_lr)
        self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(),
                                                   lr=critic_lr)
        # 使用alpha的log值,可以使训练结果比较稳定
        self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float)
        self.log_alpha.requires_grad = True  # 可以对alpha求梯度
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                    lr=alpha_lr)
        self.target_entropy = target_entropy  # 目标熵的大小
        self.gamma = gamma
        self.tau = tau
        self.device = device

    def take_action(self, state):
        # Convert the state to a tensor and send it to the correct device.
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        # Get the action probabilities from the actor network.
        probs = self.actor(state)
        # Create a distribution object from the probabilities.
        action_dist = torch.distributions.Categorical(probs)
        # Sample an action from the distribution.
        action = action_dist.sample()
        # Return the sampled action.
        return action.item()

    # 计算目标Q值,直接用策略网络的输出概率进行期望计算
    def calc_target(self, rewards, next_states, dones):
        # 计算策略网络输出的概率
        next_probs = self.actor(next_states)
        # 计算策略网络输出概率的对数值
        next_log_probs = torch.log(next_probs + 1e-8)
        # 计算熵
        entropy = -torch.sum(next_probs * next_log_probs, dim=1, keepdim=True)
        # 计算两个Q网络的输出值
        q1_value = self.target_critic_1(next_states)
        q2_value = self.target_critic_2(next_states)
        # 计算两个Q网络输出值的最小值
        min_qvalue = torch.sum(next_probs * torch.min(q1_value, q2_value),
                               dim=1,
                               keepdim=True)
        # 计算策略网络输出概率的期望Q值
        next_value = min_qvalue + self.log_alpha.exp() * entropy
        # 计算目标Q值
        td_target = rewards + self.gamma * next_value * (1 - dones)
        return td_target

    def soft_update(self, net, target_net):
        # Copy parameters from net to target_net
        for param_target, param in zip(target_net.parameters(),
                                    net.parameters()):
            # Update the target parameter by linearly interpolating
            # between the current value and the new value
            param_target.data.copy_(param_target.data * (1.0 - self.tau) +
                                    param.data * self.tau)

    def update(self, transition_dict):
        # 从transition_dict中获取状态、动作、奖励、下一状态和游戏是否结束的信息
        states = torch.tensor(transition_dict['states'],dtype=torch.float).to(self.device)
        # 转换为torch的tensor，并指定数据类型为float，然后转移到device上
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(self.device)  # 动作不再是float类型
        rewards = torch.tensor(transition_dict['rewards'],dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],dtype=torch.float).view(-1, 1).to(self.device)

        # 更新两个Q网络
        # 计算td_target
        td_target = self.calc_target(rewards, next_states, dones)
        # 获取critic_1的Q值
        critic_1_q_values = self.critic_1(states).gather(1, actions)
        # 计算critic_1的loss
        critic_1_loss = torch.mean(F.mse_loss(critic_1_q_values, td_target.detach()))
        # 获取critic_2的Q值
        critic_2_q_values = self.critic_2(states).gather(1, actions)
        # 计算critic_2的loss
        critic_2_loss = torch.mean(F.mse_loss(critic_2_q_values, td_target.detach()))
        # 梯度清零
        self.critic_1_optimizer.zero_grad()
        # 反向传播
        critic_1_loss.backward()
        # 更新critic_1的参数
        self.critic_1_optimizer.step()
        # 梯度清零
        self.critic_2_optimizer.zero_grad()
        # 反向传播
        critic_2_loss.backward()
        # 更新critic_2的参数
        self.critic_2_optimizer.step()

        # 更新策略网络
        # 1.获取当前状态的概率分布
        probs = self.actor(states)
        # 2.根据概率分布计算log概率
        log_probs = torch.log(probs + 1e-8)
        # 3.直接根据概率计算熵
        entropy = -torch.sum(probs * log_probs, dim=1, keepdim=True)  #
        # 4.获取Q值
        q1_value = self.critic_1(states)
        q2_value = self.critic_2(states)
        # 5.根据概率计算期望Q值
        min_qvalue = torch.sum(probs * torch.min(q1_value, q2_value),dim=1,keepdim=True)  # 直接根据概率计算期望
        # 6.计算actor loss
        actor_loss = torch.mean(-self.log_alpha.exp() * entropy - min_qvalue)
        # 7.优化
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # 更新alpha值
        alpha_loss = torch.mean((entropy - target_entropy).detach() * self.log_alpha.exp())
        self.log_alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

        self.soft_update(self.critic_1, self.target_critic_1)
        self.soft_update(self.critic_2, self.target_critic_2)

In [None]:
actor_lr = 1e-3
critic_lr = 1e-2
alpha_lr = 1e-2
num_episodes = 200
hidden_dim = 128
gamma = 0.98
tau = 0.005  # 软更新参数
buffer_size = 10000
minimal_size = 500
batch_size = 64
target_entropy = -1
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

env_name = 'CartPole-v0'
env = gym.make(env_name)
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)
replay_buffer = rl_utils.ReplayBuffer(buffer_size)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = SAC(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, alpha_lr,
            target_entropy, tau, gamma, device)

return_list = rl_utils.train_off_policy_agent(env, agent, num_episodes,
                                              replay_buffer, minimal_size,
                                              batch_size)

NameError: name 'torch' is not defined

In [None]:
episodes_list = list(range(len(return_list)))
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('SAC on {}'.format(env_name))
plt.show()

mv_return = rl_utils.moving_average(return_list, 9)
plt.plot(episodes_list, mv_return)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('SAC on {}'.format(env_name))
plt.show()

NameError: name 'return_list' is not defined