# 吸收状态

In [1]:
from muzero.config import PLANE_NUM, MuZeroConfig
from muzero.self_play import SelfPlay
from muzero.buffer_utils import make_target
import xqcpp
import numpy as np


## 辅助函数

In [2]:
def view_game_history(gh, title):
    """Print a per-step summary of a game history.

    The policy shown for each step is taken from the child-visit
    distribution stored in ``gh.child_visits``; only entries with a
    probability greater than zero are listed.

    Args:
        gh: game history object exposing ``root_values``, ``child_visits``,
            ``action_history`` and ``reward_history``.
        title: label printed in the header line.
    """
    print(f"{title} {len(gh.root_values)=}")
    row_template = "根值 {:.2f} 政策 {} 合计 {} 下一步移动 {} reward = {:.2f}"
    for step, root_value in enumerate(gh.root_values):
        # Keep only the actions that were actually visited, keyed by the
        # label xqcpp.m2a returns for the action index.
        visited = {
            xqcpp.m2a(action): round(share, 4)
            for action, share in enumerate(gh.child_visits[step])
            if share > 0.0
        }
        policy_total = round(sum(visited.values()), 4)
        # The action and reward histories are read one position ahead of the
        # step index (entry 0 is not shown here).
        next_move = xqcpp.m2a(gh.action_history[step + 1])
        next_reward = gh.reward_history[step + 1]
        print(
            row_template.format(
                root_value, visited, policy_total, next_move, next_reward
            )
        )
    print("=" * 30)


In [3]:
# Experiment configuration. Results are reproducible once the random seed is
# fixed (the seeds are set later, in the training section).
config = MuZeroConfig()
config.batch_size = 128
config.training_steps = 200
init_fen = "3k5/2P1P4/9/9/9/9/9/9/4p1p2/5K3 r - 100 0 190"
# The environment below is created from config.init_fen, while the SelfPlay
# workers receive init_fen directly; store the FEN on the config as well so
# both always start from the same position.
config.init_fen = init_fen
config.num_simulations = 60


## 演示棋局

In [4]:
import gymnasium as gym
import gymxq


In [5]:
# Create the "xqv1" environment starting from the FEN stored on the config.
# render_mode="ansi" is requested so the board can be printed as text below.
# presumably "xqv1" is registered by importing gymxq — TODO confirm
env = gym.make(
    "xqv1",
    init_fen=config.init_fen,
    render_mode="ansi",
)


In [6]:
# Reset to the starting position and print the rendered board.
# obs and info are reused later when running MCTS on this position.
obs, info = env.reset()
print(env.render())



9 [30m＋[0m[30m＋[0m[30m＋[0m[34m将[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m
8 [30m＋[0m[30m＋[0m[31m兵[0m[30m＋[0m[31m兵[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m
7 [30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m
6 [30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m
5 [30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m
4 [30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m
3 [30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m
2 [30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m
1 [30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[34m卒[0m[30m＋[0m[34m卒[0m[30m＋[0m[30m＋[0m
0 [30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[30m＋[0m[31m帅[0m[30m＋[0m[30m＋[0m[30m＋[0m
  ０１２３４５６７８
轮到红方走子



## 修正模拟生成数据

In [7]:
from muzero.buffer_utils import Buffer


In [8]:
# Buffer that collects the game histories saved below and later yields the training batch.
buffer = Buffer(config)


In [37]:
# Current model: roll out three games and keep their histories in ghs.
# No trained weights are copied into the SelfPlay workers in this cell.
# NOTE(review): the execution count of this cell (37) is out of sequence with
# its position in the notebook — confirm it still works on Restart & Run All.
ghs = []
for i in range(3):
    player = SelfPlay(config, i,init_fen)
    gh = player.rollout(1)
    ghs.append(gh)
    view_game_history(gh, f"rollout_{i}")


调试MCST目录/home/ldf/muzero_experiment/runs_000/train_mcts_000

rollout_0 len(gh.root_values)=1
根值 -0.03 政策 {'2818': 0.15, '2829': 0.1833, '2838': 0.1833, '4838': 0.1833, '4849': 0.15, '4858': 0.15} 合计 0.9999 下一步移动 2838 reward = 1.00
调试MCST目录/home/ldf/muzero_experiment/runs_000/train_mcts_001

rollout_1 len(gh.root_values)=7
根值 -0.01 政策 {'2818': 0.1667, '2829': 0.1667, '2838': 0.1667, '4838': 0.1667, '4849': 0.1667, '4858': 0.1667} 合计 1.0002 下一步移动 2818 reward = 0.00
根值 -0.01 政策 {'4131': 0.1333, '4140': 0.2, '4151': 0.1667, '6151': 0.2, '6160': 0.1333, '6171': 0.1667} 合计 1.0 下一步移动 4140 reward = 0.00
根值 0.04 政策 {'5040': 1.0} 合计 1.0 下一步移动 5040 reward = 0.00
根值 0.02 政策 {'6151': 0.2667, '6160': 0.4667, '6171': 0.2667} 合计 1.0001 下一步移动 6160 reward = 0.00
根值 -0.00 政策 {'1808': 0.1333, '1819': 0.1333, '1828': 0.2, '4041': 0.1333, '4838': 0.1333, '4849': 0.1333, '4858': 0.1333} 合计 0.9998 下一步移动 1828 reward = 0.00
根值 0.00 政策 {'6050': 0.5, '6070': 0.5} 合计 1.0 下一步移动 6050 reward = 0.00
根值 0.04 政策 {'4041'

In [10]:
# Case 1: reuse the first rollout and overwrite its root value by hand.
# case1 is the same object as ghs[0], so ghs[0] is modified too.
# NOTE(review): replacing root_values with a one-element list assumes this
# rollout has exactly one step — confirm, since rollout lengths vary.
case1 = ghs[0]
case1.root_values = [0.999]


In [11]:
# Hand-written target policy for case 1, keyed by move string.
# Two moves receive 0.490 each and the other four 0.005 each (total 1.0).
moves_policy = {
    "2818": 0.005,
    "2829": 0.490,
    "2838": 0.490,
    "4838": 0.005,
    "4849": 0.005,
    "4858": 0.005,
}


In [12]:
# Expand the move -> probability dict into a dense list of 2086 entries and
# install it as the visit distribution of step 0 of case 1.
# presumably 2086 is the size of the action space — TODO confirm
actions_policy = [0] * 2086
for k, v in moves_policy.items():
    # xqcpp.m2a(k) is used here as the list index for move string k.
    actions_policy[xqcpp.m2a(k)] = v
case1.child_visits[0] = actions_policy


In [13]:
# Show the policy after the manual edit.
view_game_history(case1, "case1")


case1 len(gh.root_values)=1
根值 1.00 政策 {'2818': 0.005, '2829': 0.49, '2838': 0.49, '4838': 0.005, '4849': 0.005, '4858': 0.005} 合计 1.0 下一步移动 2838 reward = 1.00


In [14]:
# Inspect the raw action indices recorded for case 1.
case1.action_history


[2086, 643]

In [15]:
# Add the hand-edited case 1 to the buffer.
buffer.save_game(case1)


In [16]:
# Case 2: a second hand-edited example derived from case 1.
# Work on a deep copy: a plain assignment would only alias case1, so editing
# case2 would also change the history that was already passed to the buffer.
import copy

case2 = copy.deepcopy(case1)
case2.action_history[1] = xqcpp.m2a("2838")
# NOTE(review): case 1 is already displayed with next move 2838, so this
# assignment leaves the move unchanged — confirm whether another move
# (e.g. "2829") was intended for case 2.
# Show the policy after the manual edit.
view_game_history(case2, "case2")


case2 len(gh.root_values)=1
根值 1.00 政策 {'2818': 0.005, '2829': 0.49, '2838': 0.49, '4838': 0.005, '4849': 0.005, '4858': 0.005} 合计 1.0 下一步移动 2838 reward = 1.00


In [17]:
# Add case 2 to the buffer.
buffer.save_game(case2)


In [18]:
# Add the two remaining, unedited rollouts to the buffer.
buffer.save_game(ghs[1])
buffer.save_game(ghs[2])


In [19]:
# Draw one batch from the buffer: get_batch returns the index batch together
# with a 7-tuple of training components, unpacked here by name.
index_batch, (
    observation_batch,
    action_batch,
    value_batch,
    reward_batch,
    policy_batch,
    weight_batch,
    gradient_scale_batch,
) = buffer.get_batch()


In [20]:
# Re-pack the seven components (without index_batch) into the tuple that is
# passed to update_weights in the training loop.
batch = (
    observation_batch,
    action_batch,
    value_batch,
    reward_batch,
    policy_batch,
    weight_batch,
    gradient_scale_batch,
)


In [21]:
# 解码查看目标移动
from muzero.feature_utils import decode_action


In [22]:
# Note: natural index 0 -> 0001; only a blank entry is encoded as 0000.
# Decode and print the action sequence of the first sample in the batch.
for a in action_batch[0]:
    print(decode_action(a, True))


0000
2838
0000
0000
0000
0000


## 训练

In [23]:
import torch


In [24]:
# Seed the NumPy and PyTorch global RNGs from config.seed.
# NOTE(review): cells placed above this one (e.g. the rollouts) are not
# covered by this seeding when the notebook runs top to bottom — confirm
# whether the seed should be set earlier.
np.random.seed(config.seed)
torch.manual_seed(config.seed)


<torch._C.Generator at 0x7f60b8fbca90>

In [25]:
from muzero.models import MuZeroNetwork


In [26]:
# Build the network and its SGD optimizer from the config hyper-parameters.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook does not fail on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = MuZeroNetwork(config)
model.to(device)
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=config.lr_init,
    momentum=config.momentum,
    weight_decay=config.weight_decay,
)


In [27]:
from muzero.trainer_utils import update_weights, update_lr


### 方案比较

批量设定64，训练200次
吸收状态其值及即时奖励全部设置为0，反向传播时由于访问次数增加，导致数值变小。

1. `action`以`NUM_ACTIONS`填充,政策为空 值估计 1.07 损失 value:0.00 reward:0.00 policy:0.81
2. 维持终止状态`action`及上一`action`设为1，其余为0的政策 值估计 0.97 value:0.00 reward:0.00 policy:0.81

训练时间：
batch_size = 128 使用 GPU 4.4 G -> 256 ~9 G
+ cpu 21m4s
+ GPU  1m7s

In [28]:
# Train for config.training_steps steps on the same fixed batch and print the
# losses and the current learning rate every 20 steps.
for training_step in range(config.training_steps):
    (
        total_loss,
        value_loss,
        reward_loss,
        policy_loss,
    ) = update_weights(batch, model, optimizer, config, False)
    # presumably applies the learning-rate schedule for this step — TODO confirm
    update_lr(training_step, optimizer, config)
    if training_step % 20 == 0:
        print(
            "{:>7d}nth loss [total:{:.2f} value:{:.2f} reward:{:.2f} policy:{:.2f}] lr:{:.5f}".format(
                training_step,
                total_loss,
                value_loss,
                reward_loss,
                policy_loss,
                optimizer.param_groups[0]["lr"],
            )
        )


      0nth loss [total:13.57 value:1.91 reward:0.61 policy:12.48] lr:0.00200
     20nth loss [total:4.44 value:0.37 reward:0.13 policy:4.22] lr:0.00200
     40nth loss [total:2.73 value:0.14 reward:0.05 policy:2.64] lr:0.00199
     60nth loss [total:2.16 value:0.06 reward:0.03 policy:2.12] lr:0.00199
     80nth loss [total:2.02 value:0.04 reward:0.02 policy:1.98] lr:0.00198
    100nth loss [total:1.98 value:0.03 reward:0.02 policy:1.94] lr:0.00198
    120nth loss [total:1.96 value:0.03 reward:0.02 policy:1.93] lr:0.00197
    140nth loss [total:1.95 value:0.03 reward:0.02 policy:1.92] lr:0.00197
    160nth loss [total:1.95 value:0.03 reward:0.02 policy:1.92] lr:0.00197
    180nth loss [total:1.94 value:0.03 reward:0.02 policy:1.91] lr:0.00196


In [33]:
# Save the model weights (state dict) to model_weights.pth in the working directory.
torch.save(model.state_dict(), "model_weights.pth")


## 验证

In [34]:
from muzero.feature_utils import obs2feature
from muzero.mcts import MCTS, render_root


In [35]:
# Enable debugging of the MCTS search tree.
config.debug_mcts = True


In [36]:
# Run one MCTS search with the trained model on the initial position
# (obs/info come from the env.reset() call above) and render the tree.
observation = obs2feature(obs, info, flatten=False)
to_play = info["to_play"]
# NOTE(review): reset is assigned but not used anywhere in this cell — confirm it is still needed.
reset = False
# Gradients are not needed for the search.
with torch.no_grad():
    legal_actions = info["legal_actions"]
    # NOTE(review): the meaning of the final positional False is not visible here — confirm against MCTS.run.
    root, mcts_info = MCTS(config).run(
        model,
        observation,
        legal_actions,
        to_play,
        False,
    )
    # presumably writes the search tree as an SVG named "test" under "mcts_tree" — TODO confirm
    render_root(root, "test", "svg", "mcts_tree")


## 预测、调整

In [39]:
# Create a self-play worker on the same starting FEN and copy the trained weights into its model.
player = SelfPlay(config, 1, init_fen)
player.model.set_weights(model.get_weights())


调试MCST目录/home/ldf/muzero_experiment/runs_000/train_mcts_000



In [None]:
def new_fen(env, moves):
    """Placeholder, not implemented yet: does nothing and returns None.

    TODO: implement; env and moves are currently unused.
    """
    pass


In [None]:
def predicate(env, moves, model):
    """Unfinished stub: only constructs a SelfPlay worker and implicitly returns None.

    NOTE(review): env, moves and model are not used yet, and the name looks
    like it may have been meant as "predict" — confirm before relying on it.
    """
    # The local is called gh but holds a SelfPlay instance, not a game history.
    gh = SelfPlay(config, 1)


In [38]:
# Roll out three games again, this time with the trained weights copied into
# each self-play worker, and print every resulting history.
ghs = []
for i in range(3):
    # Pass init_fen explicitly, as the earlier rollout cell does, so these
    # games are guaranteed to start from the same position.
    player = SelfPlay(config, i, init_fen)
    player.model.set_weights(model.get_weights())
    gh = player.rollout(1)
    ghs.append(gh)
    view_game_history(gh, f"rollout_{i}")


调试MCST目录/home/ldf/muzero_experiment/runs_000/train_mcts_000

rollout_0 len(gh.root_values)=1
根值 1.10 政策 {'2829': 0.4833, '2838': 0.4333, '4858': 0.0833} 合计 0.9999 下一步移动 2829 reward = 1.00
调试MCST目录/home/ldf/muzero_experiment/runs_000/train_mcts_001

rollout_1 len(gh.root_values)=1
根值 1.10 政策 {'2818': 0.0833, '2829': 0.4833, '2838': 0.4333} 合计 0.9999 下一步移动 2829 reward = 1.00
调试MCST目录/home/ldf/muzero_experiment/runs_000/train_mcts_002

rollout_2 len(gh.root_values)=1
根值 1.10 政策 {'2829': 0.5167, '2838': 0.4833} 合计 1.0 下一步移动 2829 reward = 1.00
