# 09.DQN-n-steps

1）使用随机权重 $w$ 初始化网络 $Q(s, a, w)$ 和目标网络 $\hat Q(s, a, w)$，令 $\hat Q$ 的权重与 $Q$ 相同；设置探索率初始值（如 $\epsilon \leftarrow 1.0$），并清空回放缓冲区。

2）以概率 $\epsilon$ 选择一个随机动作 $a$，否则 $a=\arg\max_{a} Q(s, a, w)$。

3）在模拟器中执行动作a，观察奖励r和下一个状态s'。

4）将转移过程 $(s, a, r, s')$ 存储在回放缓冲区中；其中 $r$ 用 $n$ 步折扣奖励之和表示，即 $r=\sum_{i=0}^{n-1}\gamma^{i} r_{t+i}$，$s'$ 为 $n$ 步之后的状态。

5）从回放缓冲区中采样一个随机的小批量转移过程。

6）对于小批量中的每个转移过程，如果片段在此步结束，则计算目标 $y=r$，否则计算 $y=r+\gamma^{n}\max_{a'}\hat Q(s', a', w)$（$r$ 为 $n$ 步折扣奖励之和，因此自举项的折扣系数为 $\gamma^{n}$）。

7）计算损失：$L=(Q(s, a, w)-y)^2$。

8）固定网络 $\hat Q(s, a, w)$ 不变，使用 SGD 算法最小化关于模型参数 $w$ 的损失，从而更新 $Q(s, a, w)$。

9）每 $N$ 步，将权重从网络 $Q$ 复制到目标网络 $\hat Q$。

10）从步骤2开始重复，直到收敛为止。


In [1]:
import collections
import copy
import math
import random
import time
from collections import defaultdict

import gym
import gym.spaces
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gym.envs.toy_text import frozen_lake
from torch.utils.tensorboard import SummaryWriter

In [2]:
class Net(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        obs_size: Number of input features of the first linear layer.
        hidden_size: Width of the single hidden layer.
        q_table_size: Number of outputs of the final linear layer.
    """

    def __init__(self, obs_size, hidden_size, q_table_size):
        super().__init__()

        # The input is a state vector (one sample is 1 x obs_size).
        # Layers are created in application order so parameter names
        # (net.0.*, net.2.*) and initialisation order stay unchanged.
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            # nn.Linear(hidden_size, hidden_size),
            # nn.ReLU(),
            nn.Linear(hidden_size, q_table_size),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Run ``state`` through the layer stack and return its output."""
        q_values = self.net(state)
        return q_values


class DiscreteOneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that replaces a discrete index with a one-hot vector.

    The wrapped environment must expose a ``gym.spaces.Discrete`` observation
    space; the wrapper advertises a float32 ``Box`` in [0, 1] with one entry
    per discrete value.
    """

    def __init__(self, env):
        super().__init__(env)
        wrapped_space = env.observation_space
        assert isinstance(wrapped_space, gym.spaces.Discrete)
        one_hot_shape = (wrapped_space.n,)
        self.observation_space = gym.spaces.Box(
            0.0, 1.0, one_hot_shape, dtype=np.float32
        )

    def observation(self, observation):
        """Return a copy of the Box lower bound with index ``observation`` set to 1."""
        one_hot = np.copy(self.observation_space.low)
        one_hot[observation] = 1.0
        return one_hot


def discount_reward(r_history, gamma):
    """Return the discounted sum of a sequence of rewards.

    Computes ``r_history[0] + gamma * r_history[1] + gamma**2 * r_history[2]
    + ...`` by accumulating the terms left to right.

    Args:
        r_history: Iterable of rewards, earliest step first. Any iterable is
            accepted (list, tuple, generator, ...).
        gamma: Discount factor applied once per step.

    Returns:
        The discounted sum, or ``0`` when ``r_history`` is empty.
    """
    dr = 0
    # enumerate() pairs each reward with its step offset without indexing,
    # so the input does not need to support len() or __getitem__.
    for i, reward in enumerate(r_history):
        dr += gamma**i * reward
    return dr

# ReplayBuffer

In [3]:
class ReplayBuffer:
    """Replay buffer that stores whole episodes and refills itself on demand.

    Each stored episode is a list of transitions
    ``[state, action, next_state, reward, terminated]`` in time order.

    Args:
        episode_size: Maximum number of episodes kept in ``queue``; older
            episodes are dropped first.
        replay_time: Number of episodes collected per collection round.
            At least one episode is always collected per round.
    """

    def __init__(self, episode_size, replay_time):
        # Stored episodes, oldest first.
        self.queue = []
        self.queue_size = episode_size
        self.replay_time = replay_time

    def _collect_episodes(self, env, action_trigger, epsilon):
        """Play ``self.replay_time`` episodes and append each one to ``queue``."""
        state, _ = env.reset()
        episode = []
        finished = 0

        while True:
            # Epsilon-greedy: explore with probability epsilon, otherwise
            # follow the policy supplied by the caller.
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = action_trigger(state)

            next_state, reward, terminated, truncated, _ = env.step(action)
            # Only `terminated` is stored: a truncated episode is cut short
            # by a time limit, so its last state is not a true terminal
            # state and may still be bootstrapped from.
            episode.append([state, action, next_state, reward, terminated])
            state = next_state

            if terminated or truncated:
                self.queue.append(episode)
                episode = []
                finished += 1
                # Use the instance attribute (not a module-level global) and
                # stop right away so no partial episode is stored.
                if finished >= self.replay_time:
                    return
                state, _ = env.reset()

    def get_batch_queue(self, env, action_trigger, batch_size, epsilon):
        """Collect fresh episodes and return a random batch of episodes.

        Runs collection rounds until the buffer holds at least
        ``queue_size`` episodes (always at least one round), then one more
        round, trims the buffer to the newest ``queue_size`` episodes and
        samples from it without replacement.

        Args:
            env: Environment whose ``reset()`` returns ``(state, info)`` and
                whose ``step(action)`` returns
                ``(next_state, reward, terminated, truncated, info)``; it
                must also provide ``action_space.sample()``.
            action_trigger: Callable mapping a state to the greedy action.
            batch_size: Number of episodes to return.
            epsilon: Probability of taking a random action at each step.

        Returns:
            A list of ``batch_size`` distinct episodes.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                episodes (raised by ``random.sample``).
        """
        self._collect_episodes(env, action_trigger, epsilon)
        while len(self.queue) < self.queue_size:
            self._collect_episodes(env, action_trigger, epsilon)

        # One extra round so every call adds fresh experience even when the
        # buffer was already full.
        self._collect_episodes(env, action_trigger, epsilon)
        self.queue = self.queue[-self.queue_size :]

        return random.sample(self.queue, batch_size)

# DQN

In [4]:
class DQN:
    """n-step DQN agent holding two networks of identical architecture.

    Naming note: in this class ``tgt_net`` is the network whose predictions
    are regressed (``y_hat``) and which picks greedy actions during data
    collection, while ``net`` is the lagging copy used to bootstrap the
    targets and to evaluate the policy. ``update_net_parameters`` copies
    ``tgt_net`` into ``net``.

    Args:
        env: Environment used by ``predict_reward``.
        obs_size: Length of the state vector.
        hidden_size: Hidden layer width of both networks.
        q_table_size: Number of actions (network outputs).
    """

    def __init__(self, env, obs_size, hidden_size, q_table_size):
        self.env = env
        self.net = Net(obs_size, hidden_size, q_table_size)
        self.tgt_net = Net(obs_size, hidden_size, q_table_size)
        # Both networks must start from the same weights; without this the
        # bootstrap network is an unrelated random initialisation until the
        # first explicit sync.
        self.net.load_state_dict(self.tgt_net.state_dict())

    # Refresh the bootstrap network from the trained network.
    def update_net_parameters(self, update=True):
        """Copy the weights of ``tgt_net`` into ``net``.

        ``update`` is unused and kept only for backward compatibility.
        """
        self.net.load_state_dict(self.tgt_net.state_dict())

    def get_action_trigger(self, state):
        """Return the greedy action index of ``tgt_net`` for ``state``."""
        state = torch.as_tensor(state, dtype=torch.float32)
        with torch.no_grad():
            return int(torch.argmax(self.tgt_net(state)))

    # Compute predictions (y_hat) and n-step targets (y).
    def calculate_y_hat_and_y(self, batch, gamma):
        """Build predictions and n-step targets for a batch of episodes.

        For every episode a random suffix of ``n`` transitions
        (``1 <= n <= len(episode)``) is taken. The target is the discounted
        sum of the suffix rewards; if the last transition is not terminal,
        ``gamma * max_a net(next_state)`` is added to its reward first, so
        the bootstrap term ends up discounted by ``gamma**n``.

        Args:
            batch: Iterable of episodes; each episode is a sequence of
                ``(state, action, next_state, reward, terminated)``.
            gamma: Discount factor.

        Returns:
            Tuple ``(y_hat, y)`` of 1-D float tensors: ``y_hat`` is the
            ``tgt_net`` output for the first state/action of each suffix
            (with gradients), ``y`` holds the matching targets.
        """
        state_space = []
        action_space = []
        y = []

        for episode in batch:
            if not episode:
                # Nothing to learn from an empty episode.
                continue
            # Suffix length drawn uniformly from 1..len(episode).
            n_steps = np.random.randint(1, len(episode) + 1)
            suffix = episode[-n_steps:]
            rewards = [transition[3] for transition in suffix]

            _, _, last_next_state, _, last_terminated = suffix[-1]
            if not last_terminated:
                with torch.no_grad():
                    q_next = self.net(
                        torch.as_tensor(last_next_state, dtype=torch.float32)
                    )
                rewards[-1] = rewards[-1] + gamma * float(torch.max(q_next))

            y.append(discount_reward(rewards, gamma))
            first_state, first_action = suffix[0][0], suffix[0][1]
            state_space.append(first_state)
            action_space.append(first_action)

        states = torch.as_tensor(np.array(state_space), dtype=torch.float32)
        actions = torch.as_tensor(action_space, dtype=torch.int64).reshape(-1, 1)
        y_hat = self.tgt_net(states).gather(1, actions)
        # Explicit float32 keeps the dtype aligned with y_hat even when all
        # targets happen to be Python ints.
        return y_hat.reshape(-1), torch.tensor(y, dtype=torch.float32)

    def predict_reward(self):
        """Return the mean per-step reward of a greedy rollout with ``net``.

        Runs on ``self.env`` for at least 100 steps, resetting whenever an
        episode terminates; the rollout stops at the first non-terminal step
        whose index is >= 100.
        """
        state, _ = self.env.reset()
        step = 0
        reward_space = []

        while True:
            step += 1
            state = torch.as_tensor(state, dtype=torch.float32)
            with torch.no_grad():
                action = int(torch.argmax(self.net(state)))
            next_state, reward, terminated, _, _ = self.env.step(action)
            reward_space.append(reward)
            state = next_state
            if terminated:
                state, _ = self.env.reset()
                continue
            if step >= 100:
                break
        return float(np.mean(reward_space))

## 训练

In [5]:
# Width of the hidden layer of both DQN networks.
hidden_size = 64
# Maximum number of episodes kept in the replay buffer.
queue_size = 500
# Second constructor argument of ReplayBuffer (stored as its replay_time).
replay_time = 50

## Initialize the environment
# Deterministic FrozenLake (is_slippery=False), capped at 100 steps per
# episode and wrapped so observations become one-hot float vectors.
env = frozen_lake.FrozenLakeEnv(is_slippery=False)
env.spec = gym.spec("FrozenLake-v1")
env = gym.wrappers.TimeLimit(env, max_episode_steps=100)
env = DiscreteOneHotWrapper(env)

## Initialize the replay buffer
replay_buffer = ReplayBuffer(queue_size, replay_time)

## Initialize the DQN agent
# Network input size is the one-hot vector length; output size is the
# number of discrete actions.
obs_size = env.observation_space.shape[0]
q_table_size = env.action_space.n
dqn = DQN(env, obs_size, hidden_size, q_table_size)

# Define the optimizer
# NOTE: the optimizer updates dqn.tgt_net, i.e. tgt_net is the network being
# trained; dqn.net only changes through dqn.update_net_parameters().
opt = optim.Adam(dqn.tgt_net.parameters(), lr=0.01)

# Define the loss function
loss = nn.MSELoss()

# NOTE(review): per the SummaryWriter documentation `comment` has no effect
# when `log_dir` is given, so "test1" is likely ignored here — confirm.
writer = SummaryWriter(log_dir="logs/DQN/n_steps_FrozenLake", comment="test1")

In [6]:
# Number of episodes sampled from the replay buffer per training step.
batch_size = 256
# Exploration parameter passed to ReplayBuffer.get_batch_queue; it is
# constant for the whole run (no decay schedule).
epsilon = 0.8
# Number of training iterations (one sampled batch and one optimizer step each).
epochs = 200
# Discount factor used for the n-step return.
gamma = 0.9

In [7]:
# Main training loop: each epoch collects new episodes, samples a batch of
# episodes, performs one gradient step, and logs loss and evaluation reward.
for epoch in range(epochs):
    # Refill the buffer with fresh episodes and draw `batch_size` of them.
    batch = replay_buffer.get_batch_queue(
        env, dqn.get_action_trigger, batch_size, epsilon
    )
    # y_hat: predictions of dqn.tgt_net; y: n-step targets (no gradient).
    y_hat, y = dqn.calculate_y_hat_and_y(batch, gamma)
    l = loss(y_hat, y)

    # Backpropagation and parameter update
    opt.zero_grad()
    l.backward()
    opt.step()

    # Every 10 epochs (skipping epoch 0) copy the trained weights
    # (dqn.tgt_net) into dqn.net.
    if epoch % 10 == 0 and epoch != 0:
        dqn.update_net_parameters()

    # Mean per-step reward of a greedy rollout with dqn.net.
    predict_reward = dqn.predict_reward()
    writer.add_scalars(
        "MSE", {"loss": l.item(), "predict_reward": predict_reward}, epoch
    )

    print(
        "epoch:{},  MSE: {}, epsilon: {}, 100 steps reward: {}".format(
            epoch, l, epsilon, predict_reward
        )
    )

epoch:0,  MSE: 0.01869170367717743, epsilon: 0.8, 100 steps reward: 0.0
epoch:1,  MSE: 0.0029035082552582026, epsilon: 0.8, 100 steps reward: 0.0
epoch:2,  MSE: 0.011312279850244522, epsilon: 0.8, 100 steps reward: 0.0
epoch:3,  MSE: 0.005949886050075293, epsilon: 0.8, 100 steps reward: 0.0
epoch:4,  MSE: 0.005503702908754349, epsilon: 0.8, 100 steps reward: 0.0
epoch:5,  MSE: 0.0059446669183671474, epsilon: 0.8, 100 steps reward: 0.0
epoch:6,  MSE: 0.0030409502796828747, epsilon: 0.8, 100 steps reward: 0.0
epoch:7,  MSE: 0.002284332411363721, epsilon: 0.8, 100 steps reward: 0.0
epoch:8,  MSE: 0.002898065373301506, epsilon: 0.8, 100 steps reward: 0.0
epoch:9,  MSE: 0.004358756355941296, epsilon: 0.8, 100 steps reward: 0.0
epoch:10,  MSE: 0.053469061851501465, epsilon: 0.8, 100 steps reward: 0.0
epoch:11,  MSE: 0.04703954607248306, epsilon: 0.8, 100 steps reward: 0.0
epoch:12,  MSE: 0.05461917817592621, epsilon: 0.8, 100 steps reward: 0.0
epoch:13,  MSE: 0.060953542590141296, epsilon: 0

# 可视化预测

In [8]:
# DQN_Q = dqn.net

# env = frozen_lake.FrozenLakeEnv(is_slippery=False, render_mode="human")
# env.spec = gym.spec("FrozenLake-v1")
# # display_size = 512
# # env.window_size = (display_size, display_size)
# # env.cell_size = (
# #     env.window_size[0] // env.ncol,
# #     env.window_size[1] // env.nrow,
# # )
# env = gym.wrappers.RecordVideo(env, video_folder="video")

# env = DiscreteOneHotWrapper(env)

# state, info = env.reset()
# total_rewards = 0

# while True:
#     action = int(torch.argmax(DQN_Q(torch.Tensor(state))))
#     state, reward, terminated, truncted, info = env.step(action)
#     print(action)
#     if terminated:
#         break
# env.close()