## DDPG 算法理解性代码

In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

## 如何做到连续控制
$$
\mu(s;\theta^{\mu})
$$
将连续的状态张量输入神经网络，计算得到输出后用 tanh 将其压缩到 $[-1,1]$，再映射到真实动作空间，从而完成连续控制输出。

In [2]:
class BaselineActor(nn.Module):
    """Policy network that maps a state tensor to a tanh-bounded action.

    Two ReLU hidden layers of equal width feed a linear head; the head's
    output is squashed with ``tanh`` so every component lies in [-1, 1].

    Args:
        state_size: Number of input features of the state.
        action_size: Number of action components produced.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, state_size: int, action_size: int, hidden_size: int = 400) -> None:
        super().__init__()
        # Attribute names and creation order are kept stable: they determine
        # the state_dict keys and the order in which the RNG is consumed.
        self.fc1 = nn.Linear(in_features=state_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.out = nn.Linear(in_features=hidden_size, out_features=action_size)

    def forward(self, state: torch.Tensor) -> torch.Tensor:
        """Return the action for ``state``, each component in [-1, 1]."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # tanh bounds the raw head output to the normalised action range.
        return torch.tanh(self.out(hidden))

In [41]:
# Demo: build an actor for a 3-feature state and a 4-component action,
# then run a single random state through it.
actor = BaselineActor(3, 4)
state = torch.rand(1, 3)  # batch of one state, values uniform in [0, 1)
# Call the module itself instead of .forward() so that nn.Module.__call__
# runs (this is what triggers any registered hooks).
actor(state)

tensor([[ 0.0835, -0.0681,  0.1415, -0.0943]], grad_fn=<TanhBackward0>)

## 如何评价
$$
Q(s,a;\theta^{q})
$$
将 state 和 action 拼接后输入全连接网络，输出一个标量来近似 $Q$ 值

In [42]:

class BaselineCritic(nn.Module):
    """Action-value network that scores a (state, action) pair with one scalar.

    The state and action are concatenated along the feature (last) axis and
    passed through two ReLU hidden layers of equal width, followed by a
    linear head with a single unbounded output.

    Args:
        state_size: Number of features of the state input.
        action_size: Number of features of the action input.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, state_size: int, action_size: int, hidden_size: int = 400) -> None:
        super().__init__()
        self.fc1 = nn.Linear(state_size + action_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
        """Return the value estimate for ``state`` and ``action``.

        Args:
            state: Tensor whose last dimension is ``state_size``.
            action: Tensor whose last dimension is ``action_size`` and whose
                leading dimensions match those of ``state``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 1.
        """
        # Join on the last axis rather than axis 1: identical for the usual
        # (batch, features) input, and it also handles unbatched 1-D inputs
        # and inputs with extra leading dimensions, since nn.Linear acts on
        # the last dimension only.
        x = torch.cat((state, action), dim=-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        val = self.out(x)
        return val

In [54]:
# Demo: let the actor pick an action for one random state, then let the
# critic score that (state, action) pair.
actor = BaselineActor(3, 4)
critic = BaselineCritic(3, 4)
state = torch.rand(1, 3)  # batch of one state, values uniform in [0, 1)
# Call the module itself instead of .forward() so that nn.Module.__call__
# runs (this is what triggers any registered hooks).
action = actor(state)
value = critic(state, action)
value

tensor([[-0.0444]], grad_fn=<AddmmBackward0>)

# 如何训练
## 训练 critic
$$
TD_{target}=\hat{y}_t=r_t+\gamma\, q(s_{t+1},a_{t+1})\\
L=\frac{1}{2}{(\hat{y}_t-q(s_t,a_t))}^2
$$
## 训练actor
$$
L=-q(s_t,\mu(s_t))
$$

# 经验回放数组
`ReplayBuffer`
- s_stack，保存当前状态
- action_stack，根据当前状态所做的动作
- reward_stack，做出动作后环境的反馈
- s1_stack，下一个状态
- done_stack，当前状态是否完成任务

在训练的时候，会从数组中随机选取 batch_size 大小的数据进行训练。


# 目标网络
> 用目标网络计算 TD 目标，从而缓解bootstrap带来的偏差

使用目标网络来计算TD目标
$$
\theta^{Q'} \leftarrow \tau \theta^{Q}+(1-\tau)\theta^{Q'}\\
\theta^{\mu'} \leftarrow \tau \theta^{\mu}+(1-\tau)\theta^{\mu'}
$$