In [1]:
import torch
import gym
import random
import numpy as np


class SAC:
    """Soft Actor-Critic agent with a tanh-squashed Gaussian policy and one Q-network.

    The networks are sized for 3-dimensional states and 1-dimensional actions.
    Hyper-parameters are hard-coded: discount 0.99, entropy weight 0.005 and a
    soft target-update rate of 0.005.
    """

    class ModelAction(torch.nn.Module):
        """Policy network: state -> (sampled action scaled by 2, per-sample entropy term)."""

        def __init__(self):
            super().__init__()
            self.fc_state = torch.nn.Sequential(
                torch.nn.Linear(3, 128),
                torch.nn.ReLU(),
            )
            self.fc_mu = torch.nn.Linear(128, 1)
            #Softplus keeps the predicted standard deviation strictly positive
            self.fc_std = torch.nn.Sequential(
                torch.nn.Linear(128, 1),
                torch.nn.Softplus(),
            )

        def forward(self, state):
            """Sample one action per state.

            Args:
                state: float tensor of shape [b, 3].

            Returns:
                A pair ``(action, entropy)``, both of shape [b, 1]. ``action`` is
                ``2 * tanh(u)`` with ``u`` drawn from Normal(mu, std). ``entropy``
                is the negative log-probability of the tanh-squashed sample (the
                constant factor 2 of the final scaling is not included).
            """
            #[b, 3] -> [b, 128]
            state = self.fc_state(state)

            #[b, 128] -> [b, 1]
            mu = self.fc_mu(state)

            #[b, 128] -> [b, 1]
            std = self.fc_std(state)

            #One normal distribution per batch row
            dist = torch.distributions.Normal(mu, std)

            #rsample is the reparameterised draw (mu + std * eps), so gradients
            #flow back into mu and std
            sample = dist.rsample()

            #Squash the sample into (-1, 1)
            action = torch.tanh(sample)

            #Log-density of the unsquashed sample
            log_prob = dist.log_prob(sample)

            #Change of variables for a = tanh(u):
            #  log pi(a) = log N(u) - log(1 - tanh(u)^2)
            #`action` already equals tanh(u), so it must not be passed through
            #tanh a second time. 1e-7 guards against log(0).
            log_prob = log_prob - (1 - action**2 + 1e-7).log()
            entropy = -log_prob

            return action * 2, entropy

    class ModelValue(torch.nn.Module):
        """Q-network: (state, action) -> scalar value."""

        def __init__(self):
            super().__init__()
            self.sequential = torch.nn.Sequential(
                torch.nn.Linear(4, 128),
                torch.nn.ReLU(),
                torch.nn.Linear(128, 128),
                torch.nn.ReLU(),
                torch.nn.Linear(128, 1),
            )

        def forward(self, state, action):
            """Evaluate ``action`` ([b, 1]) in ``state`` ([b, 3]); returns [b, 1]."""
            #[b, 3] + [b, 1] -> [b, 4]
            state = torch.cat([state, action], dim=1)

            #[b, 4] -> [b, 1]
            return self.sequential(state)

    def __init__(self):
        self.model_action = self.ModelAction()

        self.model_value = self.ModelValue()
        self.model_value_next = self.ModelValue()

        #The target network starts as an exact copy of the online Q-network
        self.model_value_next.load_state_dict(self.model_value.state_dict())

        self.optimizer_action = torch.optim.Adam(
            self.model_action.parameters(), lr=3e-4)
        self.optimizer_value = torch.optim.Adam(self.model_value.parameters(),
                                                lr=3e-3)

        self.mse_loss = torch.nn.MSELoss()

    def get_action(self, state):
        """Sample an action for a single state (any 3-element sequence); returns a float."""
        state = torch.FloatTensor(state).reshape(1, 3)
        action, _ = self.model_action(state)
        return action.item()

    def _get_target(self, reward, next_state, over):
        """Build the soft Bellman target ``reward + 0.99 * (1 - over) * (Q' + 0.005 * entropy)``.

        All arguments and the result have a leading batch dimension; reward and
        over are [b, 1], next_state is [b, 3]. The result is not detached here.
        """
        #Sample the next action and its entropy term with the current policy
        #[b, 3] -> [b, 1],[b, 1]
        action, entropy = self.model_action(next_state)

        #Value of next_state under the target network
        #[b, 3],[b, 1] -> [b, 1]
        target = self.model_value_next(next_state, action)

        #Entropy bonus
        #[b, 1] + [b, 1] -> [b, 1]
        target += 0.005 * entropy

        #Discount, zero out terminal transitions, then add the reward
        target *= 0.99
        target *= (1 - over)
        target += reward

        return target

    def _get_loss_action(self, state):
        """Policy loss: mean of ``-0.005 * entropy - Q(state, pi(state))``."""
        #[b, 3] -> [b, 1],[b, 1]
        action, entropy = self.model_action(state)

        #Score the sampled action with the online Q-network
        #[b, 3],[b, 1] -> [b, 1]
        value = self.model_value(state, action)

        #Higher entropy is better, hence the negative sign in the loss
        loss_action = -0.005 * entropy

        #Higher value is better, so subtract it
        loss_action -= value

        return loss_action.mean()

    def train(self, state, action, reward, next_state, over):
        """Run one critic update, one actor update and one soft target update.

        Args:
            state: [b, 3] float tensor.
            action: [b, 1] float tensor.
            reward: [b, 1] float tensor of raw rewards (rescaled internally).
            next_state: [b, 3] float tensor.
            over: [b, 1] integer tensor, 1 where the episode ended.
        """
        #Shift and scale the reward to make training easier
        reward = (reward + 8) / 8

        #The target already contains the entropy bonus; it is treated as a constant
        #[b, 1]
        target = self._get_target(reward, next_state, over)
        target = target.detach()

        #Current estimate
        value = self.model_value(state, action)

        #The critic regresses onto the target
        loss_value = self.mse_loss(value, target)

        self.optimizer_value.zero_grad()
        loss_value.backward()
        self.optimizer_value.step()

        #Actor update through the freshly updated critic
        loss_action = self._get_loss_action(state)
        self.optimizer_action.zero_grad()
        loss_action.backward()
        self.optimizer_action.step()

        #Soft (Polyak) update of the target network
        for param, param_next in zip(self.model_value.parameters(),
                                     self.model_value_next.parameters()):
            blended = param_next.data * 0.995 + param.data * 0.005
            param_next.data.copy_(blended)


sac = SAC()

#Smoke test: one update on a random batch of 64 transitions, none of them terminal
sac.train(
    state=torch.randn(64, 3),
    action=torch.randn(64, 1),
    reward=torch.randn(64, 1),
    next_state=torch.randn(64, 3),
    over=torch.zeros(64, 1, dtype=torch.long),
)

#Bare expression: the sampled action is shown as the cell output
sac.get_action([1, 2, 3])

  "Gym minimally supports python 3.6 as the python foundation not longer supports the version, please update your version to 3.7+"


1.8579914569854736

In [2]:
#Create the environment
env = gym.make('Pendulum-v1')

#Bare expression: shows the environment's repr as the cell output
env

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


<TimeLimit<OrderEnforcing<StepAPICompatibility<PassiveEnvChecker<PendulumEnv<Pendulum-v1>>>>>>

In [3]:
class Pool:
    """Bounded first-in-first-out store of (state, action, reward, next_state, over) tuples."""

    def __init__(self, limit):
        #Maximum number of stored transitions
        self.limit = limit
        #Stored transitions, oldest first
        self.datas = []

    def __len__(self):
        return len(self.datas)

    def add(self, state, action, reward, next_state, over):
        """Append one transition, evicting the oldest entries once the limit is exceeded.

        Array/tensor states are flattened to plain 3-element lists; action and
        reward are stored as floats and ``over`` as a bool.
        """
        array_like = (np.ndarray, torch.Tensor)

        if isinstance(state, array_like):
            state = state.reshape(3).tolist()
        action = float(action)
        reward = float(reward)
        if isinstance(next_state, array_like):
            next_state = next_state.reshape(3).tolist()
        over = bool(over)

        self.datas.append((state, action, reward, next_state, over))

        #Drop from the front until the pool fits its limit again
        while len(self.datas) > self.limit:
            del self.datas[0]

    def get_sample(self, size=None):
        """Draw a random batch without replacement.

        Args:
            size: requested batch size; ``None`` means the whole pool. Values
                larger than the pool are clipped to the pool size.

        Returns:
            ``(state, action, reward, next_state, over)`` tensors; the states are
            float [b, 3], action and reward are float [b, 1], over is long [b, 1].
        """
        available = len(self)
        count = available if size is None else min(size, available)

        batch = random.sample(self.datas, count)

        def column(index):
            #Collect one field of every sampled transition
            return [row[index] for row in batch]

        state = torch.FloatTensor(column(0))
        action = torch.FloatTensor(column(1)).reshape(-1, 1)
        reward = torch.FloatTensor(column(2)).reshape(-1, 1)
        next_state = torch.FloatTensor(column(3))
        over = torch.LongTensor(column(4)).reshape(-1, 1)

        return state, action, reward, next_state, over


#Pool for transitions collected from the real environment, capped at 10000 entries
env_pool = Pool(10000)


#Seed env_pool with the transitions of one full episode
def _():
    """Play one episode with the current ``sac`` policy and store every step in ``env_pool``."""
    state = env.reset()

    while True:
        #Ask the policy for an action in the current state
        action = sac.get_action(state)

        #Apply it and observe the outcome
        next_state, reward, done, info = env.step([action])

        #Record the transition
        env_pool.add(state, action, reward, next_state, done)

        #Stop once the environment reports the episode is over
        if done:
            break

        state = next_state


#Run the episode defined above so env_pool is not empty
_()

#Cell output: pool size, the oldest stored transition, and a random batch of two
len(env_pool), env_pool.datas[0], env_pool.get_sample(2)

(200,
 ([-0.7426962852478027, 0.6696284413337708, -0.9164497256278992],
  -1.4821420907974243,
  -5.884091277596468,
  [-0.7210111021995544, 0.6929234862327576, -0.636549711227417],
  False),
 (tensor([[-0.3513,  0.9362,  7.9284],
          [-0.4564, -0.8898,  7.8133]]),
  tensor([[ 1.4688],
          [-1.9468]]),
  tensor([[-10.0123],
          [-10.2895]]),
  tensor([[-0.6882,  0.7255,  8.0000],
          [-0.1309, -0.9914,  6.8539]]),
  tensor([[0],
          [0]])))

In [4]:
#定义主模型
class Model(torch.nn.Module):
    """Ensemble of 5 probabilistic MLPs predicting (reward, state delta) from (state, action).

    Inputs are [5, b, 4] (3 state values + 1 action value, one copy per ensemble
    member). The network outputs a mean and a bounded log-variance, each
    [5, b, 4] (1 reward value + 3 state-delta values).
    """

    class Swish(torch.nn.Module):
        """Swish activation: ``x * sigmoid(x)``."""

        def __init__(self):
            super().__init__()

        def forward(self, x):
            return x * torch.sigmoid(x)

    class FCLayer(torch.nn.Module):
        """Five independent fully connected layers evaluated in one batched matmul."""

        def __init__(self, in_size, out_size):
            super().__init__()
            self.in_size = in_size

            #Weights are drawn from N(0, std^2) with std = 1 / (2 * sqrt(in_size))
            std = in_size**0.5
            std *= 2
            std = 1 / std

            weight = torch.empty(5, in_size, out_size)
            torch.nn.init.normal_(weight, mean=0.0, std=std)

            #[5, in, out]
            self.weight = torch.nn.Parameter(weight)

            #[5, 1, out]
            self.bias = torch.nn.Parameter(torch.zeros(5, 1, out_size))

        def forward(self, x):
            #x -> [5, b, in]

            #[5, b, in] @ [5, in, out] -> [5, b, out]
            x = torch.bmm(x, self.weight)

            #[5, b, out] + [5, 1, out] -> [5, b, out]
            x = x + self.bias

            return x

    def __init__(self):
        super().__init__()

        self.sequential = torch.nn.Sequential(
            self.FCLayer(4, 200),
            self.Swish(),
            self.FCLayer(200, 200),
            self.Swish(),
            self.FCLayer(200, 200),
            self.Swish(),
            self.FCLayer(200, 200),
            self.Swish(),
            self.FCLayer(200, 8),
            torch.nn.Identity(),
        )

        self.softplus = torch.nn.Softplus()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)

    def forward(self, x):
        """Map [5, b, 4] inputs to ``(mean, logvar)``, each of shape [5, b, 4]."""
        #[5, b, 4] -> [5, b, 8]
        x = self.sequential(x)

        #First half of the outputs is the mean, second half the raw log-variance
        mean = x[..., :4]
        logvar = x[..., 4:]

        #Soft upper bound at 0.5: logvar <- 0.5 - softplus(0.5 - logvar)
        logvar = 0.5 - logvar
        logvar = 0.5 - self.softplus(logvar)

        #Soft lower bound at -10: logvar <- softplus(logvar + 10) - 10
        logvar = logvar + 10
        logvar = self.softplus(logvar) - 10

        #[5, b, 4],[5, b, 4]
        return mean, logvar

    def train(self, mode=None):
        """Fit the model, or switch training/eval mode when ``mode`` is given.

        This method shadows ``torch.nn.Module.train``. ``eval()`` calls
        ``train(False)`` internally, so the ``mode`` argument must be accepted.

        Args:
            mode: ``None`` (default) fits the ensemble on the contents of
                ``env_pool`` and returns ``None``. A bool is forwarded to
                ``torch.nn.Module.train`` and the module itself is returned.
        """
        if mode is not None:
            return super().train(mode)
        self.fit()

    def fit(self):
        """Fit the ensemble on every transition currently stored in ``env_pool``."""
        state, action, reward, next_state, _ = env_pool.get_sample()

        #inputs -> [n, 4], labels -> [n, 4]
        inputs = torch.cat([state, action], dim=1)
        labels = torch.cat([reward, next_state - state], dim=1)

        #20 passes' worth of 64-sample minibatches
        for _ in range(len(inputs) // 64 * 20):
            #Each ensemble member gets its own random minibatch of up to 64 rows
            #[5, 64]
            select = torch.stack(
                [torch.randperm(len(inputs))[:64] for _ in range(5)])

            #[5, b, 4],[5, b, 4]
            input_select = inputs[select]
            label_select = labels[select]

            #Use this instance rather than a module-level variable
            #[5, b, 4] -> [5, b, 4],[5, b, 4]
            mean, logvar = self(input_select)

            #Squared error weighted by the inverse predicted variance
            mse_loss = (mean - label_select)**2 * (-logvar).exp()
            mse_loss = mse_loss.mean(dim=1).mean()

            #Penalise large predicted variance
            var_loss = logvar.mean(dim=1).mean()

            loss = mse_loss + var_loss

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()


model = Model()

#model.train()

#Shape check on random input: both outputs should be [5, 64, 4]
a, b = model(torch.randn(5, 64, 4))
a.shape, b.shape

(torch.Size([5, 64, 4]), torch.Size([5, 64, 4]))

In [5]:
class MBPO():
    """Generates synthetic transitions by querying the learned dynamics ``model``."""

    def _fake_step(self, state, action):
        """Simulate one environment step with the learned model.

        Args:
            state: values reshaped to [b, 3].
            action: value(s) reshaped to [b, 1].

        Returns:
            ``(reward, next_state)`` with shapes [b, 1] and [b, 3], taken for
            each row from one randomly chosen ensemble member.
        """
        state = torch.FloatTensor(state).reshape(-1, 3)
        action = torch.FloatTensor([action]).reshape(-1, 1)

        #[b, 3] + [b, 1] -> [b, 4] -> [1, b, 4] -> [5, b, 4], one copy per member
        ensemble_input = torch.cat([state, action], dim=1)
        ensemble_input = ensemble_input.unsqueeze(dim=0).repeat([5, 1, 1])

        #[5, b, 4] -> [5, b, 4],[5, b, 4]
        with torch.no_grad():
            mean, logvar = model(ensemble_input)
        variance = logvar.exp()

        #The last 3 columns are predicted state deltas: add the current state
        mean[:, :, 1:] += state

        #Draw from N(mean, variance) via standard-normal noise
        #[5, b, 4]
        noise = torch.distributions.Normal(0, 1).sample(mean.shape)
        prediction = mean + noise * variance**0.5

        #Pick one ensemble member (0-4) at random for every batch row
        batch = mean.shape[1]
        member = [random.choice(range(5)) for _ in range(batch)]

        #[5, b, 4] -> [b, 4]
        chosen = prediction[member, range(batch)]

        #Column 0 is the reward, the remaining columns are the next state
        return chosen[:, :1], chosen[:, 1:]

    def rollout(self):
        """Fill ``model_pool`` with one simulated step from up to 1000 sampled real states."""
        states, *_ = env_pool.get_sample(1000)
        for state in states:
            action = sac.get_action(state)
            reward, next_state = self._fake_step(state, action)

            #Simulated transitions are always stored as non-terminal
            model_pool.add(state, action, reward, next_state, False)


#Pool for model-generated transitions, capped at 1000 entries
model_pool = Pool(1000)

mbpo = MBPO()
#Shape check of one simulated step for a single state/action pair
a, b = mbpo._fake_step([1, 2, 3], 1)
print(a.shape, b.shape)

#mbpo.rollout()

torch.Size([1, 1]) torch.Size([1, 3])


In [6]:
for i in range(20):
    reward_sum = 0
    state = env.reset()
    over = False

    step = 0
    while not over:
        #Every 50 steps, refit the dynamics model and regenerate simulated data
        if step % 50 == 0:
            model.train()
            mbpo.rollout()

        #Ask sac for an action
        action = sac.get_action(state)

        #Apply the action
        next_state, reward, over, _ = env.step([action])

        #Accumulate the episode reward
        reward_sum += reward

        #Store the real transition
        env_pool.add(state, action, reward, next_state, over)

        #Advance to the next state
        state = next_state

        #Update the agent: 10 updates per environment step, each on a batch made
        #of 32 real and 32 simulated transitions
        for _ in range(10):
            sample = []
            sample_env = env_pool.get_sample(32)
            sample_model = model_pool.get_sample(32)

            #Concatenate the two batches field by field along the batch dimension
            for (i1, i2) in zip(sample_env, sample_model):
                i3 = torch.cat([i1, i2], dim=0)
                sample.append(i3)

            sac.train(*sample)

        step += 1
    #Episode index, pool sizes and the episode's total reward
    print(i, len(env_pool), len(model_pool), reward_sum)

0 400 1000 -1663.363921885937
1 600 1000 -1391.066172121398
2 800 1000 -1246.3640911783252
3 1000 1000 -872.2960833431164
4 1200 1000 -259.00131777906756
5 1400 1000 -135.7633717557256
6 1600 1000 -1.0835938078447904
7 1800 1000 -315.033962548136
8 2000 1000 -116.18777735203085
9 2200 1000 -3.2155846586321877
10 2400 1000 -6.778235912098387
11 2600 1000 -126.66412390397129
12 2800 1000 -259.3294395158037
13 3000 1000 -132.52840118784707
14 3200 1000 -250.0659581298676
15 3400 1000 -135.46098770378336
16 3600 1000 -128.36676704318737
17 3800 1000 -237.8167829122683
18 4000 1000 -184.5584103370651
19 4200 1000 -269.6876077884121
