In [1]:
import gym
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import rl_utils


class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(PolicyNet, self).__init__() # Initialize the parent class
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim) # First layer
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim) # Second layer

    def forward(self, x):
        x = F.relu(self.fc1(x)) # First layer
        return F.softmax(self.fc2(x), dim=1) # Second layer


class ValueNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim):
        super(ValueNet, self).__init__() # Initialize the parent class
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim) # First layer
        self.fc2 = torch.nn.Linear(hidden_dim, 1) # Second layer

    def forward(self, x):
        x = F.relu(self.fc1(x)) # First layer
        return self.fc2(x) # Second layer


class PPO:
    ''' PPO算法,采用截断方式 '''
    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                 lmbda, epochs, eps, gamma, device):
        # 创建actor和critic网络
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        self.critic = ValueNet(state_dim, hidden_dim).to(device)
        # 创建actor和critic的优化器
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        # 定义超参数
        self.gamma = gamma
        self.lmbda = lmbda
        self.epochs = epochs  # 一条序列的数据用来训练轮数
        self.eps = eps  # PPO中截断范围的参数
        self.device = device

    def take_action(self, state):
        # 将state转换为tensor
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        # 获取概率分布
        probs = self.actor(state)
        # 根据概率分布选取动作
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()

    def update(self, transition_dict):
        # 从字典中读取转移数据
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(
            self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        # 计算TD误差
        td_target = rewards + self.gamma * self.critic(next_states) * (1 -
                                                                       dones)
        td_delta = td_target - self.critic(states)
        # 计算优势值
        advantage = rl_utils.compute_advantage(self.gamma, self.lmbda,
                                               td_delta.cpu()).to(self.device)
        # 计算log_probs
        old_log_probs = torch.log(self.actor(states).gather(1,
                                                            actions)).detach()
        # 训练
        for _ in range(self.epochs):
            # 计算log概率
            log_probs = torch.log(self.actor(states).gather(1, actions))
            # 计算比率
            ratio = torch.exp(log_probs - old_log_probs)
            # 计算surr1
            surr1 = ratio * advantage
            # 计算surr2
            surr2 = torch.clamp(ratio, 1 - self.eps,
                                1 + self.eps) * advantage# 截断
            # 计算actor_loss
            actor_loss = torch.mean(-torch.min(surr1, surr2))# PPO损失函数
            # 计算critic_loss
            critic_loss = torch.mean(
                F.mse_loss(self.critic(states), td_target.detach()))
            # 优化actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # 优化critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

actor_lr = 1e-3  # Learning rate for the actor
critic_lr = 1e-2  # Learning rate for the critic
num_episodes = 250  # Number of episodes to train for
hidden_dim = 128  # Number of neurons in each hidden layer
gamma = 0.98  # Discount factor
lmbda = 0.95  # Lambda for generalized advantage estimation
epochs = 10  # Number of epochs to train for
eps = 0.2  # Epsilon for clipping
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")  # Device to use

env_name = 'CartPole-v0'  # Environment to use
env = gym.make(env_name)  # Create the environment
env.seed(0)  # Set seed for reproducibility
torch.manual_seed(0)  # Set seed for reproducibility
state_dim = env.observation_space.shape[0]  # Get state dimension
action_dim = env.action_space.n  # Get action dimension
agent = PPO(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, lmbda,
            epochs, eps, gamma, device)  # Create the agent

return_list = rl_utils.train_on_policy_agent(env, agent, num_episodes)

  f"The environment {id} is out of date. You should consider "


AttributeError: 'CartPoleEnv' object has no attribute 'seed'

In [2]:
# generate n_episode of expert data
# each episode will be a list of states and actions
def sample_expert_data(n_episode):
    # initialize empty list of states and actions
    states = []
    actions = []
    # loop through each episode
    for episode in range(n_episode):
        # reset environment for each episode
        state = env.reset()
        # initialize flag for episode completion
        done = False
        # loop through each step
        while not done:
            # take an action from the trained policy
            action = ppo_agent.take_action(state)
            # append state and action to list
            states.append(state)
            actions.append(action)
            # take a step in the environment based on the action
            next_state, reward, done, _ = env.step(action)
            # update state
            state = next_state
    # convert lists to numpy arrays
    return np.array(states), np.array(actions)


env.seed(0)  # 设定环境随机种子
torch.manual_seed(0)  # 设定torch随机种子
random.seed(0)  # 设定random随机种子
n_episode = 1  # 设定采样的episode数
expert_s, expert_a = sample_expert_data(n_episode)  # 采样expert数据

n_samples = 30  # 采样30个数据
random_index = random.sample(range(expert_s.shape[0]), n_samples)  # 从expert_s.shape[0]中随机采样30个数据
expert_s = expert_s[random_index]  # 根据随机采样的数据，获取对应的expert状态
expert_a = expert_a[random_index]  # 根据随机采样的数据，获取对应的expert动作

AttributeError: 'CartPoleEnv' object has no attribute 'seed'

In [3]:
class BehaviorClone:
    def __init__(self, state_dim, hidden_dim, action_dim, lr):
        self.policy = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

    def learn(self, states, actions):
        # states: [[s_1], [s_2], ..., [s_n]]
        # actions: [a_1, a_2, ..., a_n]
        # 这里的states是一个二维数组，actions是一个一维数组，都是numpy数组
        states = torch.tensor(states, dtype=torch.float).to(device)  # 将states转换成PyTorch的tensor，并且移动到device上
        # torch.view(-1, 1)相当于numpy的reshape(-1, 1)，将actions转换成二维数组
        actions = torch.tensor(actions).view(-1, 1).to(device)  
        # torch.log(1e-8) = -23.025850929940457，为了防止概率乘积为0，这里取log
        log_probs = torch.log(self.policy(states).gather(1, actions))# gather(1, actions)表示取出每一行对应actions的概率
        # log_probs: [[log(p1)], [log(p2)], ..., [log(pn)]]
        # 这里的p1, p2, ..., pn是pi(a_i|s_i)的概率
        bc_loss = torch.mean(-log_probs)  # 最大似然估计

        self.optimizer.zero_grad()
        bc_loss.backward()
        self.optimizer.step()

    def take_action(self, state):
        state = torch.tensor([state], dtype=torch.float).to(device)
        probs = self.policy(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()


def test_agent(agent, env, n_episode):
    # Initialize the list of returns
    return_list = []
    # Perform n_episode episodes
    for episode in range(n_episode):
        # Initialize the episode return
        episode_return = 0
        # Reset the environment
        state = env.reset()
        # Initialize the boolean "done" to False
        done = False
        # Play the episode until the end
        while not done:
            # Get the action from the agent
            action = agent.take_action(state)
            # Play the action in the environment and get the next state, reward and done
            next_state, reward, done, _ = env.step(action)
            # Update the state
            state = next_state
            # Update the episode return
            episode_return += reward
        # Add the episode return to the list of returns
        return_list.append(episode_return)
    # Return the average return
    return np.mean(return_list)


# Set random seeds for reproducibility
env.seed(0)
torch.manual_seed(0)
np.random.seed(0)

# Hyperparameters
lr = 1e-3
bc_agent = BehaviorClone(state_dim, hidden_dim, action_dim, lr)
n_iterations = 1000
batch_size = 64
test_returns = []

# 创建进度条
with tqdm(total=n_iterations, desc="进度条") as pbar:
    # 迭代
    for i in range(n_iterations):
        # 从专家数据中随机抽取 batch_size 个样本
        sample_indices = np.random.randint(low=0,
                                           high=expert_s.shape[0],
                                           size=batch_size)
        # 更新策略
        bc_agent.learn(expert_s[sample_indices], expert_a[sample_indices])
        # 评估当前策略
        current_return = test_agent(bc_agent, env, 5)
        # 记录回报
        test_returns.append(current_return)
        # 每 10 个回合打印一次策略评估的结果
        if (i + 1) % 10 == 0:
            # 计算最近 10 个回报的平均值，作为当前回报
            pbar.set_postfix({'return': '%.3f' % np.mean(test_returns[-10:])})
        pbar.update(1)

AttributeError: 'CartPoleEnv' object has no attribute 'seed'

In [4]:
# Create a list of iterations, each iteration is a number between 0 and the number of test_returns
iteration_list = list(range(len(test_returns)))
# Plot the iterations against the test_returns
plt.plot(iteration_list, test_returns)
# Label the x-axis
plt.xlabel('Iterations')
# Label the y-axis
plt.ylabel('Returns')
# Add a title to the plot
plt.title('BC on {}'.format(env_name))
# Display the plot
plt.show()

NameError: name 'test_returns' is not defined

In [5]:
class Discriminator(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Discriminator, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x, a):
        # Concatenate state and action
        cat = torch.cat([x, a], dim=1)
        # Pass through first layer
        x = F.relu(self.fc1(cat))
        # Pass through second layer
        return torch.sigmoid(self.fc2(x))

In [6]:
class GAIL:
    def __init__(self, agent, state_dim, action_dim, hidden_dim, lr_d):
        # Create discriminator network
        self.discriminator = Discriminator(state_dim, hidden_dim,
                                           action_dim).to(device)
        # Create optimizer for discriminator network
        self.discriminator_optimizer = torch.optim.Adam(
            self.discriminator.parameters(), lr=lr_d)
        # Store agent
        self.agent = agent

    def learn(self, expert_s, expert_a, agent_s, agent_a, next_s, dones):
        # Convert expert data to tensors
        expert_states = torch.tensor(expert_s, dtype=torch.float).to(device)
        expert_actions = torch.tensor(expert_a).to(device)
        # Convert agent data to tensors
        agent_states = torch.tensor(agent_s, dtype=torch.float).to(device)
        agent_actions = torch.tensor(agent_a).to(device)
        # Convert actions to one-hot tensors
        expert_actions = F.one_hot(expert_actions, num_classes=2).float()
        agent_actions = F.one_hot(agent_actions, num_classes=2).float()

        # Get discriminator prediction for expert data
        expert_prob = self.discriminator(expert_states, expert_actions)
        # Get discriminator prediction for agent data
        agent_prob = self.discriminator(agent_states, agent_actions)
        # Calculate discriminator loss
        discriminator_loss = nn.BCELoss()(
            agent_prob, torch.ones_like(agent_prob)) + nn.BCELoss()(
                expert_prob, torch.zeros_like(expert_prob))
        # Optimize discriminator
        self.discriminator_optimizer.zero_grad()
        discriminator_loss.backward()
        self.discriminator_optimizer.step()

        # Calculate rewards
        rewards = -torch.log(agent_prob).detach().cpu().numpy()# detach()防止梯度传播，cpu()防止显存溢出
        # Create transition dict
        transition_dict = {
            'states': agent_s,
            'actions': agent_a,
            'rewards': rewards,
            'next_states': next_s,
            'dones': dones
        }
        # Update agent
        self.agent.update(transition_dict)


# Set the seed for reproducibility
env.seed(0)
torch.manual_seed(0)

# Learning rate for the discriminator
lr_d = 1e-3

# Initialize the agent and GAIL
agent = PPO(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, lmbda,
            epochs, eps, gamma, device)
gail = GAIL(agent, state_dim, action_dim, hidden_dim, lr_d)

# Number of episodes to train
n_episode = 500

# List to store the return
return_list = []

with tqdm(total=n_episode, desc="进度条") as pbar:
    for i in range(n_episode):
        # 记录每个episode的reward
        episode_return = 0
        # 重置环境
        state = env.reset()
        # 判断episode是否结束
        done = False
        # 记录每个episode的状态、动作、下一步的状态、done
        state_list = []
        action_list = []
        next_state_list = []
        done_list = []
        while not done:
            # 根据当前状态选择动作
            action = agent.take_action(state)
            # 执行动作，得到下一步的状态、reward、done
            next_state, reward, done, _ = env.step(action)
            # 记录状态、动作、下一步的状态、done
            state_list.append(state)
            action_list.append(action)
            next_state_list.append(next_state)
            done_list.append(done)
            # 更新当前状态
            state = next_state
            # 记录reward
            episode_return += reward
        # 记录每个episode的reward
        return_list.append(episode_return)
        # 根据状态、动作、下一步的状态、done等信息更新GAIL的参数
        gail.learn(expert_s, expert_a, state_list, action_list,
                   next_state_list, done_list)
        # 打印每10个episode的平均reward
        if (i + 1) % 10 == 0:
            pbar.set_postfix({'return': '%.3f' % np.mean(return_list[-10:])})
        pbar.update(1)

AttributeError: 'CartPoleEnv' object has no attribute 'seed'

In [7]:
iteration_list = list(range(len(return_list)))
plt.plot(iteration_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('GAIL on {}'.format(env_name))
plt.show()

NameError: name 'return_list' is not defined