# 交叉熵方法的实践

1）使用当前的模型和环境产生N次片段。

2）计算每个片段的总奖励，并确定奖励边界。边界通常取所有片段总奖励的某个百分位，例如第50或第70百分位。


3）将奖励在边界之下的片段丢掉。


4）用剩余的“精英”片段训练模型：以观察值作为输入、智能体产生的动作作为目标输出。


5）从第1步开始重复，直到得到满意的结果。

## 记录条

In [1]:
from collections import namedtuple

# One finished episode: its accumulated reward and the ordered list of steps.
Episode = namedtuple("Episode", ("reward", "steps"))
# One agent decision: the observation it saw and the action it picked.
EpisodeStep = namedtuple("EpisodeStep", ("observation", "action"))

## 实验过程

In [2]:
import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [3]:
class Net(nn.Module):
    """Single-hidden-layer policy network producing raw action scores.

    The output is NOT passed through a softmax; callers apply one (or use a
    loss that does so internally).

    Args:
        obs_size: Number of input features per observation.
        hidden_size: Width of the hidden layer.
        n_actions: Number of output scores (one per action).
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are created in forward order so parameter initialisation
        # draws from the RNG in the same sequence as the layer stack.
        input_layer = nn.Linear(obs_size, hidden_size)
        output_layer = nn.Linear(hidden_size, n_actions)
        # NOTE: deeper variants (extra hidden Linear + ReLU pairs between the
        # two layers) were experimented with and are currently disabled.
        self.net = nn.Sequential(input_layer, nn.ReLU(), output_layer)

    def forward(self, X):
        """Return the action scores for the batch of observations ``X``."""
        return self.net(X)

In [4]:
# Batch generation
def iterate_batches(env, net, batch_size):
    """Yield batches of complete episodes played with the current policy.

    This is an infinite generator: it keeps stepping ``env``, sampling every
    action from the softmax of ``net``'s output, and yields a list each time
    ``batch_size`` episodes have finished. Because ``net`` is consulted on
    every step, weight updates made by the caller between batches take effect
    immediately.

    Args:
        env: Environment using the five-value step API, i.e.
            ``reset() -> (obs, info)`` and
            ``step(action) -> (obs, reward, terminated, truncated, info)``.
        net: Callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of unnormalised action scores.
        batch_size: Number of finished episodes per yielded batch.

    Yields:
        list: ``Episode`` tuples, each holding the episode's total reward and
        its list of ``EpisodeStep(observation, action)`` records.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    # Initial observation of the first episode.
    obs, _ = env.reset()
    sm = nn.Softmax(dim=1)
    while True:
        obs_v = torch.FloatTensor(obs.reshape(1, -1))
        # Sampling only needs the probabilities; skip building an autograd graph.
        with torch.no_grad():
            act_probs = sm(net(obs_v)).numpy()[0]
        # Sample over however many actions the network outputs instead of
        # assuming a fixed two-action space.
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward

        # Record the observation the action was chosen FOR, not its result.
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        # An episode ends either by reaching a terminal state or by being cut
        # off (e.g. a time limit); both must close the episode and reset.
        if terminated or truncated:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs, _ = env.reset()
            if len(batch) >= batch_size:
                yield batch
                batch = []

        obs = next_obs

In [5]:
# Elite-episode selection
def filter_batch(batch, percentile):
    """Keep the best episodes of a batch and flatten them into training tensors.

    Args:
        batch: Sequence of episode records exposing ``reward`` and ``steps``;
            every step exposes ``observation`` and ``action``.
        percentile: Percentile (0-100) of the episode rewards used as the
            cut-off; episodes scoring below it are discarded.

    Returns:
        A 4-tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
        a float tensor of the kept observations, a long tensor of the matching
        actions, the reward cut-off, and the mean reward of the whole batch.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    # Flatten the steps of every surviving episode, preserving batch order.
    # Episodes are dropped only when strictly below the bound.
    elite_steps = [
        step
        for episode in batch
        if not (episode.reward < reward_bound)
        for step in episode.steps
    ]
    observations = [step.observation for step in elite_steps]
    actions = [step.action for step in elite_steps]

    train_obs_v = torch.FloatTensor(np.array(observations))
    train_act_v = torch.LongTensor(np.array(actions))
    return train_obs_v, train_act_v, reward_bound, reward_mean

## 训练

In [6]:
# Initialization: hyper-parameters, environment, model, loss, optimizer, logger.
HIDDEN_SIZE = 256  # hidden-layer width handed to Net
BATCH_SIZE = 64  # episodes per batch requested from iterate_batches
PERCENTILE = 80  # reward percentile handed to filter_batch as the elite cut-off
env = gym.make("CartPole-v1", render_mode="rgb_array")
# Optional video recording of the environment (currently disabled):
# env = gym.wrappers.RecordVideo(env, video_folder="video", name_prefix="mario", video_length=200)
# Network input/output sizes are read from the environment's spaces.
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(obs_size, HIDDEN_SIZE, n_actions)
# CrossEntropyLoss is fed the raw network outputs and the recorded action indices.
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.01)
# NOTE(review): per the SummaryWriter docs, `comment` only affects the
# auto-generated log directory name and is ignored when `log_dir` is given — confirm.
writer = SummaryWriter(log_dir="logs/plot_4", comment="test1")

In [7]:
# idx = 0
# for i in range(len(batch_data)):
#     for j in range(len(batch_data[i])):

#         print(idx,(len(batch_data[i][j].steps)))
#         idx += 1

In [8]:
# Cross-entropy-method training loop: sample a batch of episodes, keep the
# elite ones, and fit the network to reproduce their actions.
# Every sampled batch is kept here for interactive inspection afterwards.
# NOTE: this list grows for as long as training runs.
batch_data = []
try:
    for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        batch_data.append(batch)
        obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)

        # One supervised step: elite observations -> elite actions.
        optimizer.zero_grad()
        action_scores_v = net(obs_v)
        loss_v = objective(action_scores_v, acts_v)
        loss_v.backward()
        optimizer.step()

        # Read the scalar loss once and reuse it for printing and logging.
        loss = loss_v.item()
        print(
            "%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f"
            % (iter_no, loss, reward_m, reward_b)
        )
        writer.add_scalars(
            "graph",
            {"loss": loss, "reward_bound": reward_b, "reward_mean": reward_m},
            iter_no,
        )
        # Stop once the batch's mean episode reward exceeds 199.
        if reward_m > 199:
            print("Solved!")
            break
finally:
    # Close the writer even if training is interrupted or raises, so the
    # scalars logged so far are flushed to disk.
    writer.close()

0: loss=0.687, reward_mean=21.2, rw_bound=30.8
1: loss=0.586, reward_mean=14.6, rw_bound=18.0
2: loss=0.664, reward_mean=18.5, rw_bound=24.4
3: loss=0.662, reward_mean=28.7, rw_bound=39.0
4: loss=0.659, reward_mean=29.2, rw_bound=40.4
5: loss=0.627, reward_mean=31.9, rw_bound=41.8
6: loss=0.624, reward_mean=29.4, rw_bound=38.8
7: loss=0.614, reward_mean=43.6, rw_bound=57.4
8: loss=0.591, reward_mean=60.0, rw_bound=83.4
9: loss=0.599, reward_mean=64.8, rw_bound=88.4
10: loss=0.592, reward_mean=73.9, rw_bound=101.2
11: loss=0.582, reward_mean=77.1, rw_bound=110.6
12: loss=0.579, reward_mean=63.5, rw_bound=78.8
13: loss=0.575, reward_mean=71.9, rw_bound=103.6
14: loss=0.567, reward_mean=75.5, rw_bound=104.2
15: loss=0.556, reward_mean=87.3, rw_bound=124.6
16: loss=0.544, reward_mean=95.3, rw_bound=144.8
17: loss=0.525, reward_mean=95.3, rw_bound=128.8
18: loss=0.512, reward_mean=94.0, rw_bound=129.0
19: loss=0.520, reward_mean=103.5, rw_bound=141.8
20: loss=0.501, reward_mean=106.5, rw_bo