In [1]:
import sys

# make mygo available
sys.path.insert(0, "../src")

In [2]:
import torch
import numpy as np
import torch.nn.functional as F
import mygo
from mygo.model import SmallModel, ZeroModel
from mygo.encoder import OnePlaneEncoder, ZeroEncoder
from mygo.agent import MLBot, ZeroAgent
from mygo.game import Game, Player
from mygo.dataset import ExperienceBuffer, ZeroExpDataset
from torch.utils.data import Dataset, DataLoader
import copy as cp
from scipy.stats import binomtest
import time
from mygo.tool import ModelTrainer
import matplotlib.pyplot as plt

plt.style.use('dark_background')
torch.set_float32_matmul_precision("high")

In [3]:
board_size=5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder = ZeroEncoder(board_size=board_size)
model = ZeroModel(encoder.plane_count, board_size=board_size)
model_opt = torch.compile(model.to(device), mode="reduce-overhead")
print(model_opt.info)
agent = ZeroAgent(model_opt, encoder, rounds=1600)

Device: cuda:0
Parameters: 22,491,448
Structure:
ZeroModel(
  (conv_block): ConvBlock(
    (conv): Conv2d(17, 256, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (res_blocks): Sequential(
    (0): ResBlock(
      (conv_block): ConvBlock(
        (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=same)
        (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU()
      )
      (conv_layer): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=same)
      (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
    )
    (1): ResBlock(
      (conv_block): ConvBlock(
        (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=same)
        (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   

In [4]:
def self_play(agent, n_iters=10, log_interval=1):
    
    black_agent = cp.copy(agent)
    white_agent = cp.copy(agent)
    black_agent.exp_buffer = ExperienceBuffer()
    white_agent.exp_buffer = ExperienceBuffer()
    
    agents = {
        Player.black: black_agent,
        Player.white: white_agent,
    }
    
    for i in range(n_iters):
        ti = time.perf_counter()
        
        for a in agents.values():
            a.exp_buffer.begin()
            
        game = Game.new(board_size=agent.encoder.size, komi=0.5)
        n_moves = 0
        while not game.is_over:
            state = encoder.encode(game)
            move = agents[game.next_player].select_move(game)
            game.apply_move(move)
    
            if (n_moves := n_moves + 1) > 2 * game.board_size**2:
                print(f"terminate game because it exceed max moves: {n_moves}")
                break
    
        winner = game.winner
        agents[winner].exp_buffer.complete(1)
        agents[-winner].exp_buffer.complete(-1)

        dt = ModelTrainer.pretty_time(time.perf_counter() - ti)
        if i % log_interval == 0:
            print(f"[{i/n_iters:5.1%}] selfplay iter {i:,d}, time {dt}")
    
    buffer = black_agent.exp_buffer + white_agent.exp_buffer
    
    return buffer

In [5]:
buffer = self_play(agent, 10)

  return func(*args, **kwargs)


terminate game because it exceed max moves: 51
[ 0.0%] selfplay iter 0, time 7.513m
terminate game because it exceed max moves: 51
[10.0%] selfplay iter 1, time 8.317m
terminate game because it exceed max moves: 51
[20.0%] selfplay iter 2, time 8.063m
terminate game because it exceed max moves: 51
[30.0%] selfplay iter 3, time 7.469m
terminate game because it exceed max moves: 51
[40.0%] selfplay iter 4, time 7.445m
terminate game because it exceed max moves: 51
[50.0%] selfplay iter 5, time 7.359m
terminate game because it exceed max moves: 51
[60.0%] selfplay iter 6, time 7.110m
terminate game because it exceed max moves: 51
[70.0%] selfplay iter 7, time 7.967m
terminate game because it exceed max moves: 51
[80.0%] selfplay iter 8, time 9.394m
terminate game because it exceed max moves: 51
[90.0%] selfplay iter 9, time 7.946m


In [6]:
def rl_train(agent):
    assert agent.root is None
    assert agent.exp_buffer is None
    agent = cp.copy(agent)
    new_agent = cp.deepcopy(agent)
    
    learning_rate = 1e-3
    optimizer = torch.optim.SGD(new_agent.model.parameters(), lr=1e-2)
    
    batch_size = 16
    dataset = ZeroExpDataset(
        buffer,
        dtype=np.float32,
        symmetries=True,
        transform=new_agent.model.transform,
        target_transform=new_agent.model.transform
    )
    data_loader = DataLoader(dataset, batch_size=batch_size)
    # loss_fn = F.cross_entropy
    log_interval = 1
    
    new_player = Player.black
    n_wins = 0
    n_draws = 0
    
    eval_iters = 100
    eval_interval = 1
    
    for i, (x, (p, v)) in enumerate(data_loader):
    # for ix in range(0, len(dataset), batch_size):
        ti = time.perf_counter()
    
        # i = ix / batch_size
        # x, (p, v) = dataset[ix:ix+batch_size]
        
        # x = np.array(buffer.states[ix:ix+batch_size])
        # x = torch.from_numpy(x).to(new_agent.model.device)
        # p = np.array(buffer.actions[ix:ix+batch_size])
        # p = torch.from_numpy(p).to(new_agent.model.device)
        # v = torch.tensor(buffer.rewards[ix:ix+batch_size], device=new_agent.model.device, dtype=torch.float32).unsqueeze(1)
        
        new_agent.model.train()
    
        optimizer.zero_grad()
        pred_p, pred_v = new_agent.model(x)
        loss = F.cross_entropy(pred_p, p) + F.mse_loss(pred_v, v)
    
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
        if i % log_interval == 0:
            dt = ModelTrainer.pretty_time(time.perf_counter() - ti)
            
            print(f"train iter {i}, loss {loss.item():.3f}, time {dt}")
    
    
    for i in range(eval_iters):
        ti = time.perf_counter()
        
        game = Game.new(board_size, komi=0.5)
        agents = {
            new_player: new_agent,
            -new_player: agent,
        }
    
        while not game.is_over:
            player = game.next_player
            move = agents[player].select_move(game)
            game.apply_move(move)
    
        if game.winner == new_player:
            n_wins += 1
    
        if i % eval_interval == 0:
            dt = ModelTrainer.pretty_time(time.perf_counter() - ti)
            print(f"[{i/eval_iters:5.1%}] eval iter {i}, time {dt}")
    
        new_player = -new_player
    
    print(f"win: {n_wins}/{eval_iters}")
    
    if n_wins > 0.55 * eval_iters:
        return new_agent
    
    return None

In [8]:
agent = rl_train(agent) or agent

train iter 0, loss 4.298, time 1.043s
train iter 1, loss 3.564, time 5.883ms
train iter 2, loss 3.242, time 3.877ms
train iter 3, loss 3.220, time 3.458ms
train iter 4, loss 3.020, time 3.828ms
train iter 5, loss 3.059, time 3.344ms
train iter 6, loss 3.067, time 3.243ms
train iter 7, loss 2.830, time 3.031ms
train iter 8, loss 2.997, time 3.177ms
train iter 9, loss 2.864, time 3.111ms
train iter 10, loss 2.833, time 3.069ms
train iter 11, loss 3.001, time 3.079ms
train iter 12, loss 2.695, time 2.995ms
train iter 13, loss 2.895, time 2.900ms
train iter 14, loss 2.844, time 3.028ms
train iter 15, loss 2.670, time 2.711ms
train iter 16, loss 5.672, time 2.710ms
train iter 17, loss 6.190, time 2.580ms
train iter 18, loss 4.900, time 3.166ms
train iter 19, loss 3.786, time 4.378ms
train iter 20, loss 3.044, time 4.138ms
train iter 21, loss 3.157, time 3.517ms
train iter 22, loss 2.902, time 3.850ms
train iter 23, loss 2.792, time 3.860ms
train iter 24, loss 2.916, time 3.385ms
train iter 