### 7.3.1 经验回放
在一般的有监督学习中，假设训练数据是独立同分布的，我们每次训练神经网络的时候从训练数据中随机采样一个或若干个数据来进行梯度下降，随着学习的不断进行，每一个训练数据会被使用多次。在原来的 Q-learning 算法中，每一个数据只会用来更新一次值。为了更好地将 Q-learning 和深度神经网络结合，DQN 算法采用了经验回放（experience replay）方法，具体做法为维护一个回放缓冲区，将每次从环境中采样得到的四元组数据（状态、动作、奖励、下一状态）存储到回放缓冲区中，训练 Q 网络的时候再从回放缓冲区中随机采样若干数据来进行训练。这么做可以起到以下两个作用。

（1）使样本满足独立假设。在 MDP 中交互采样得到的数据本身不满足独立假设，因为这一时刻的状态和上一时刻的状态有关。非独立同分布的数据对训练神经网络有很大的影响，会使神经网络拟合到最近训练的数据上。采用经验回放可以打破样本之间的相关性，让其满足独立假设。

（2）提高样本效率。每一个样本可以被使用多次，十分适合深度神经网络的梯度学习。

![image](image/屏幕截图%202025-02-23%20185551.png)

In [7]:
import random
import gymnasium as gym
import numpy as np
import collections
from tqdm import tqdm
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import rl_utils

In [8]:
class ReplayBuffer:
    '''
    经验回放池
    1. 保证数据的独立同分布
    2. 提升样本利用效率
    '''
    def __init__(self,capacity):
        self.buffer = collections.deque(maxlen=capacity) #初始化一个双端队列.

    def add(self,state,action,reward,next_state,done):
        #将数据加入buffer
        self.buffer.append((state,action,reward,next_state,done))

    def sample(self,batch_size):
        # obtain sample form buffer, size is batch_size
        transition = random.sample(self.buffer,batch_size)
        #按列解包, state 数据通常是多维数据
        state_tuple, action_tuple, reward_tuple,next_state_tuple, done_tuple = zip(*transition)
        return np.array(state_tuple), action_tuple, reward_tuple, np.array(next_state_tuple), done_tuple

    def size(self):
        return len(self.buffer)

In [9]:
class Qnet(torch.nn.Module):
    ''' 只有一个隐藏层的Q网络 '''
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Qnet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [21]:
class DQN:
    ''' DQN算法 '''
    def __init__(self, state_dim, hidden_dim, action_dim, learning_rate,
                gamma, epsilon, target_update, device):
        self.action_dim = action_dim
        self.q_net = Qnet(state_dim,hidden_dim,action_dim).to(device)
        '''
        目标网络: 由于TD误差目标持续在被优化, 为了训练的稳定性,需要延迟更新TD误差目标,
        于是维护两个网络, 一个训练网络训练Q function, 另一个是目标网络用于延迟更新
        '''
        self.target_q_net = Qnet(state_dim,hidden_dim,action_dim).to(device)

        #Adam优化器
        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=learning_rate)

        self.gamma = gamma
        self.epsilon = epsilon
        self.target_update = target_update
        self.count = 0
        self.device = device

    def take_action(self,state):
        if np.random.rand() < self.epsilon:
            action = np.random.randint(self.action_dim)
        else:
            #这里需要神经网络输出,不再是Q-table输出. state是一个四维向量, action是一个二维向量
            state = torch.tensor(list([state]), dtype=torch.float).to(self.device) #向量转张量
            action_space_tensor = self.q_net(state)
            action_idx_tensor = np.argmax(action_space_tensor) #选出Q值大的动作(左右)
            action = action_idx_tensor.item() #转换为整数
        return action
    
    def update(self,transition_dict): #transition_dict是采样样本的参数
        #[batch_size,states_dim]
        states  = torch.tensor(transition_dict['states'], dtype=float).to(self.device)
        #[batch_size,actions_dim]
        actions = torch.tensor(transition_dict['actions']).view(-1,1).to(self.device)
        #[batch_size,1]
        rewards = torch.tensor(transition_dict['rewards'],dtype=float).view(-1,1).to(self.device)
        #[batch_size,states_dim]
        next_states = torch.tensor(transition_dict['next_states'],dtype=torch.float).to(self.device)
        #[batch_size,1]
        dones = torch.tensor(transition_dict['dones'],dtype=torch.float).view(-1, 1).to(self.device)

        #dim=1 提取网络输出的数据, 索引是actions
        q_values = self.q_net(states).gather(1,actions) #[batch_size,1]
        #下一个状态的最大Q值
        max_next_q_values = self.target_q_net(next_states).max(1)[0].view(-1,1) #[batch_size,1]
        #计算TD误差目标
        q_targets = rewards + self.gamma * max_next_q_values * (1 - dones)

        dqn_loss = torch.mean(F.mse_loss(q_values,q_targets))

        self.optimizer.zero_gard()
        dqn_loss.backward()
        self.optimizer.step()

        if self.count % self.target_update == 0:
            self.target_q_net.load_state_dict(self.q_net.state_dict()) #将q_net的参数 load 到target_q中

        self.count += 1


In [22]:
lr = 2e-3
num_episodes = 500
hidden_dim = 128
gamma = 0.98
epsilon = 0.1
target_update = 10
buffer_size = 10000
minimal_size = 64
batch_size = 64
device = torch.device("cuda")

env_name = 'CartPole-v1' #环境版本更新 TODO
env = gym.make(env_name)  
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
replay_buffer = ReplayBuffer(buffer_size)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DQN(state_dim,hidden_dim,action_dim,lr,gamma,epsilon,target_update,device)

return_list = []
for i in range(10):
    with tqdm(total= int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
        for i_episode in range(int(num_episodes / 10)):
            episode_return = 0
            state = env.reset(seed=0) #版本更新 TODO
            done = False
            while not done:
                action = agent.take_action(state)
                next_state, reward, done, _ = env.step(action)
                replay_buffer.add(state,action,reward,next_state,done)
                state = next_state
                episode_return += reward
                # 当buffer数据的量超过一定值后,进行Q网络训练
                if replay_buffer.size() > minimal_size:
                    b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                    transition_dict = {
                        'states': b_s,
                        'actions': b_a,
                        'next_states': b_ns,
                        'dones': b_d
                    }
                    agent.update(transition_dict)
                return_list.append(episode_return)
                if(i_episode + 1) % 10 == 0:
                    pbar.set_postfix({
                        'episode': '%d' % (num_episodes / 10 * i + i_episode + 1),
                        'return': '%.3f' % np.mean(return_list[-10:])
                    })
                pbar.update(1)



Iteration 0:   0%|          | 0/50 [00:00<?, ?it/s]


ValueError: expected sequence of length 4 at dim 2 (got 0)