## AlphaZero Implementation
One step demonstration

note: still in progress

#### Import common library

In [1]:
import time
import shlex
import hashlib
from collections import defaultdict
import numpy as np
import dill

#### Import MCTS library

In [2]:
from MCTS.utils import *
from MCTS.Agent import *
from MCTS.MCTS import *

# note: The MCTS library was designed for 'vanilla' MCTS and not specifically
# for Zero's implementation. We shall modify the library as we go.

#### Import game rules

In [3]:
import rules.Othello as Othello
# Short aliases for the two names this notebook uses from the rules module.
OthelloGame, OthelloHelper = Othello.OthelloGame, Othello.OthelloHelper

#### Define hyperparameters

In [4]:
# Game-specific settings
board_size = [8, 8]   # board dimensions
state_memory_n = 1    # number of state entries stored in a node's state list

# MCTS search settings
allowed_time = 10     # search budget, compared against time.time() deltas (seconds)
c = 1.2               # weight of the exploration term in the selection score

#### Import and implement data structure for the game

In [5]:
class OthelloDataNode(ZeroDataNode):
    """Search-tree node for Othello.

    Adds nothing beyond defaulting ``Game`` to ``OthelloGame`` and ``player``
    to 1; construction is delegated entirely to ``ZeroDataNode``.
    """

    def __init__(self, name, Game=OthelloGame, player=1):
        # Everything is forwarded by keyword, so the base class's positional
        # parameter order is irrelevant here.
        super().__init__(name=name, player=player, Game=Game)
    # end def
# end class

#### Define neural network

In [32]:
import torch
class Net(torch.nn.Module):
    """Small fully connected network: 64 inputs -> 200 hidden -> 62 outputs.

    Each linear layer is followed by batch normalisation; the hidden layer
    additionally goes through ReLU, and the final 62 values are passed
    through a softmax over dim 1.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn
        # Attribute names and registration order are kept as they were, so
        # saved models and the printed module summary stay the same.
        self.fc1 = nn.Linear(64, 200)
        self.relu1 = nn.ReLU()
        self.norm1 = nn.BatchNorm1d(200)
        self.fc2 = nn.Linear(200, 62)
        self.norm2 = nn.BatchNorm1d(62)
        self.softmax = nn.Softmax(dim=1)
    # end def

    def forward(self, x):
        """Map a ``(batch, 64)`` input to a ``(batch, 62)`` softmax output."""
        # linear -> batch norm -> ReLU
        hidden = self.relu1(self.norm1(self.fc1(x)))
        # linear -> batch norm -> softmax
        return self.softmax(self.norm2(self.fc2(hidden)))
    # end def
# end class

#### Import a pre-trained model (on human expert dataset)

In [86]:
# Path of the pre-trained network, stored as a dill-serialized object.
model_file = 'expert_prediction/data/models/oth_exp_pred-iter03500.dill'
# NOTE(review): dill.load, like pickle.load, can execute arbitrary code while
# deserializing -- only load model files that come from a trusted source.
with open(model_file, 'rb') as fin:
    net = dill.load(fin)
# end with

In [87]:
# Move the loaded model's parameters and buffers to the CPU.
net = net.cpu()

In [88]:
# Switch to evaluation mode: the BatchNorm1d layers then normalise with their
# running statistics instead of per-batch statistics.
net.eval()

Net(
  (fc1): Linear(in_features=64, out_features=200, bias=True)
  (relu1): ReLU()
  (norm1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=200, out_features=62, bias=True)
  (norm2): BatchNorm1d(62, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (softmax): Softmax()
)

In [89]:
# Lookup tables between network output labels and moves.
#
# Labels 0..59 enumerate the 8x8 board squares in row-major order as
# [row, col], skipping the four centre squares (3,3), (3,4), (4,3), (4,4);
# label 60 is the pass move, stored as the string 'PASS'.
_centre_squares = {(3, 3), (3, 4), (4, 3), (4, 4)}
label2move = dict(enumerate(
    [row, col]
    for row in range(8)
    for col in range(8)
    if (row, col) not in _centre_squares
))
label2move[len(label2move)] = 'PASS'

# Inverse table. Squares are keyed by (row, col) tuples; the pass move keeps
# its string key, because tuple('PASS') would split it into
# ('P', 'A', 'S', 'S').
move2label = {
    (move if isinstance(move, str) else tuple(move)): label
    for label, move in label2move.items()
}

In [117]:
# Sample flattened board vectors used to probe the network by hand.
# Only the LAST `_input` assignment takes effect; the first two are
# overwritten and are kept as alternative positions to try.
_input = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
_input = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
_input = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 0]
# Wrap the vector in a batch of one, negate every entry, and convert to a
# float32 tensor.
# NOTE(review): the reason for the sign flip is not visible here --
# presumably it switches the player perspective; confirm.
_input = torch.from_numpy(-np.array([_input,])).float()

In [118]:
# Forward pass; per Net.forward the result is a softmax over 62 values per
# input row.
outputs = net(_input)

In [119]:
# Last output of the first row, mapped linearly from [0, 1] to [-1, 1].
outputs[0][-1] * 2 - 1

tensor(0.7052, grad_fn=<SubBackward>)

In [116]:
# Split along dim 1 into the first 61 columns (one per label2move entry) and
# the single last column.
move_probs, pred_vals = torch.split(outputs, (61, 1), 1)

In [22]:
# The first row's last-column value as a Python float, rounded to 2 decimals.
round(float(pred_vals.detach().numpy()[0][0]), 2)

0.0

In [16]:
# Move outputs of the first row as a NumPy array, detached from autograd.
z = move_probs.detach().numpy()[0]

In [17]:
# Re-key the per-label values by move. tuple() is applied to every entry, so
# the 'PASS' string ends up under the key ('P', 'A', 'S', 'S').
z = {tuple(label2move[label]): prob for label, prob in enumerate(z)}

In [None]:
# Move the pass entry from the key ('P', 'A', 'S', 'S') -- which is what
# tuple('PASS') evaluates to -- to the plain string key 'PASS'.
z['PASS'] = z.pop(tuple('PASS'))

In [None]:
z

#### wrap the neuralnet in a predictor

In [None]:
def nnet_pred(state):
    """Evaluate a position with the network and return ``(P, v)``.

    The network returns ONE tensor of shape ``(batch, 62)``, not a
    ``(P, v)`` pair, so the output is split here into the per-move columns
    (one per ``label2move`` entry) and the final value column. The move
    columns cover every square except the four centre ones plus the pass
    move, so they cannot simply be reshaped to the board; they are written
    onto a ``board_size`` grid through ``label2move`` instead.

    Args:
        state: network input.
            NOTE(review): assumed to be a float tensor of shape
            ``(batch, 64)`` -- the caller currently passes ``node.state``,
            which must be converted first; confirm.

    Returns:
        P: tensor of shape ``board_size`` holding the move outputs of the
            FIRST row of the batch; squares without a label stay 0. The
            pass-move output has no square and is not included.
        v: the first row's value output as a Python float.
    """
    # Inference only, so no autograd graph is needed. Calling the module
    # (instead of .forward directly) keeps any registered hooks working.
    with torch.no_grad():
        outputs = net(state)
    # end with

    n_moves = len(label2move)
    move_probs, pred_vals = torch.split(outputs, (n_moves, 1), 1)

    P = torch.zeros(board_size, dtype=move_probs.dtype)
    for label, move in label2move.items():
        if isinstance(move, str):
            # the pass move does not correspond to a board square
            continue
        # end if
        row, col = move
        P[row, col] = move_probs[0, label]
    # end for

    v = float(pred_vals[0, 0])
    return P, v
# end def

#### Define exit conditions

In [None]:
def exit_cond(time0, time_thr):
    """Tell whether the search should stop.

    Args:
        time0: start time, as returned by ``time.time()``.
        time_thr: allowed duration in seconds.

    Returns:
        True once more than ``time_thr`` seconds have passed since
        ``time0``, False otherwise (always a bool, never None).
    """
    # time based
    if time.time() - time0 > time_thr:
        return True
    # end if

    # winning prob based
    # TODO

    return False
# end def

#### Main logic

In [None]:
def _select_action(node):
    """Return the action of an expanded node with the highest Q + U score."""
    total_visits = sum(node.N.values())
    # the part of the exploration term that is the same for every action
    explore = c * np.sqrt(total_visits)

    actions = list(node.Q)
    scores = [
        node.Q[a] + explore / (1 + node.N[a]) * node.P[a]
        for a in actions
    ]
    # np.argmax returns the first maximum, so ties go to the earliest action
    return actions[int(np.argmax(scores))]
# end def


def start_mcts(node, allowed_time):
    """Run the tree search from ``node`` until the exit condition fires.

    Every iteration starts at the root, descends through expanded nodes by
    always taking the best-scoring action, and then either
      * backs up the game result if the reached node ends the game, or
      * expands the node, evaluates it with ``nnet_pred`` and backs up the
        predicted value.

    Args:
        node: root of the search tree (``node.parent`` must be None).
        allowed_time: time budget handed to ``exit_cond``.

    Returns:
        The root node that was passed in, with the search results attached
        to its tree.
    """
    assert node.parent is None
    root = node
    # The clock starts here; it was previously read from an undefined name.
    time0 = time.time()
    # game result -> value that is backed up
    winner_value_dict = {1: 1, -1: 0, 0: 0.5}

    while not exit_cond(time0, allowed_time):
        # Always restart from the root. Before, the end-of-game branch
        # skipped the reset, so the same leaf was backed up over and over
        # and could even be returned instead of the root.
        node = root

        # descend to the first node that is un-expanded or ends the game
        while node.end_game() is not True and node.expanded is True:
            node = node.child_dict[_select_action(node)]
        # end while

        # if end_game, simply evaluate
        _winner = node.get_winner()
        if _winner is not None:
            node.backprob(winner_value_dict[_winner])
            continue
        # end if

        # if not, expand and evaluate
        # - list all possible moves and append the legal child nodes
        child_nodes = node.grow_branches()
        node.append_children(child_nodes)

        # - neuralnet evaluation here
        P, v = nnet_pred(node.state)

        # back-propagation
        node.assign_probs(P)
        node.backprob(v)
    # end while

    return root
# end def

#### Initialize the board

In [None]:
# Fresh board as produced by the rules helper.
new_board = OthelloHelper.new_board()

In [None]:
# Initial state list: `state_memory_n` entries, each wrapping the new board
# with player=1.
# NOTE(review): every entry receives the same `new_board` object. This is
# harmless while state_memory_n == 1; confirm before raising it.
init_state = [stateType(data=new_board, player=1) for _ in range(state_memory_n)]

In [None]:
# show the board (optional)
# OthelloHelper.print_board(init_state[0])

#### Initialize a node

In [None]:
# Root of the search tree. `self` does not exist at notebook top level, so
# the node is built directly from the Othello node class defined in this
# notebook.
node = OthelloDataNode("root")
node.state = init_state

In [None]:
# sample the node: run the tree search from the root for `allowed_time`
sampled_node = start_mcts(node, allowed_time=allowed_time)

#### Choose a node using the tree search result

In [None]:
# Pick one of the searched node's children.
# NOTE(review): start_mcts walks the tree through `node.child_dict`; confirm
# that the node type also exposes `.children` in the form `choose_node`
# expects.
chosen_node = choose_node(sampled_node.children)

#### Print the selected action

In [None]:
print(chosen_node.action)