In [2]:
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
from matplotlib import colors
import math
from math import sqrt
from math import log

# Move class for Breakthrough

In [3]:
class Move(object):
    def __init__(self, color, x1, y1, x2, y2):
        self.color = color
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        
    def valid (self, board):
        if self.x2 >= Dx or self.y2 >= Dy or self.x2 < 0 or self.y2 < 0:
            return False
        if self.color == White:
            if self.x2 != self.x1 + 1:
                return False
            if board.board [self.x2] [self.y2] == Black:
                if self.y2 == self.y1 + 1 or self.y2 == self.y1 - 1:
                    return True
                return False
            elif board.board [self.x2] [self.y2] == Empty:
                if self.y2 == self.y1 + 1 or self.y2 == self.y1 - 1 or self.y2 == self.y1:
                    return True
                return False
        elif self.color == Black:
            if self.x2 != self.x1 - 1:
                return False
            if board.board [self.x2] [self.y2] == White:
                if self.y2 == self.y1 + 1 or self.y2 == self.y1 - 1:
                    return True
                return False
            elif board.board [self.x2] [self.y2] == Empty:
                if self.y2 == self.y1 + 1 or self.y2 == self.y1 - 1 or self.y2 == self.y1:
                    return True
                return False
        return False
    
    def code (self, board):
        direction = 0
        if self.y2 > self.y1:
            if board.board [self.x2] [self.y2] == Empty:
                direction = 1
            else: 
                direction = 2
        if self.y2 < self.y1:
            if board.board [self.x2] [self.y2] == Empty:
                direction = 3
            else:
                direction = 4
        if self.color == White:
            return 5 * (Dy * self.x1 + self.y1) + direction
        else:
            return 5 * Dx * Dy + 5 * (Dy * self.x1 + self.y1) + direction


        

# Board class to play Breakthrough 5x5

In [4]:
import random

Dx = 5
Dy = 5
Empty = 0
White = 1
Black = 2

class Board(object):
    def __init__(self):
        self.h = 0
        self.turn = White
        self.board = np.zeros ((Dx, Dy))
        for i in range (0, 2):
            for j in range (0, Dy):
                self.board [i] [j] = White
        for i in range (Dx - 2, Dx):
            for j in range (0, Dy):
                self.board [i] [j] = Black
    
    def legalMoves(self):
        moves = []
        for i in range (0, Dx):
            for j in range (0, Dy):
                if self.board [i] [j] == self.turn:
                    for k in [-1, 0, 1]:
                        for l in [-1, 0, 1]:
                            m = Move (self.turn, i, j, i + k, j + l)
                            if m.valid (self):
                                moves.append (m)
        return moves
    
    def score (self):
        for i in range (0, Dy):
            if (self.board [Dx - 1] [i] == White):
                return 1.0
            elif (self.board [0] [i] == Black):
                return 0.0
        l = self.legalMoves ()
        if len (l) == 0:
            if self.turn == Black:
                return 1.0
            else:
                return 0.0
        return 0.5

    def terminal (self):
        if self.score () == 0.5:
            return False
        return True
    
    def play (self, move):
        self.board [move.x1] [move.y1] = Empty
        self.board [move.x2] [move.y2] = move.color
        if (self.turn == White):
            self.turn = Black
        else:
            self.turn = White

    def playout (self):
        while (True):
            moves = self.legalMoves ()
            if self.terminal ():
                return self.score ()
            n = random.randint (0, len (moves) - 1)
            self.play (moves [n])

    def print(self):
        print("   1 2 3 4 5")
        for i in range(Dy):
            print("{} |".format(i + 1), end="")
            for j in range(Dx):
                if self.board [i] [j] == Black:
                    print("\u265F", end="")
                elif self.board [i] [j] == White:
                    print("\u2659", end="")
                else:
                    print(" ", end="")
                if j < Dx:
                    print("|", end="")

            if i < Dy:
                print()        


# Flat Monte Carlo

In [5]:
import copy
# n: the number of simulations to run
def flat (board, n):
    moves = board.legalMoves ()
    bestScore = 0
    bestMove = 0
    for m in range (len(moves)):
        sum = 0
        for i in range (n // len (moves)):
            b = copy.deepcopy (board)
            b.play (moves [m])
            r = b.playout ()
            if board.turn == Black:
                r = 1 - r
            sum = sum + r
        if sum > bestScore:
            bestScore = sum
            bestMove = m
    return moves [bestMove], bestScore

In [8]:
board = Board()

bestMove, bestScore = flat(board, 1000)
print(bestScore)
board.play(bestMove)

bestMove, bestScore = flat(board, 1000)
print(bestScore)

49.0
55.0


# UCB

In [11]:
def UCB (board, n):
    """Algorithme UCB permettant d'estimer le meilleur mouvement possbile à partir d'un état du jeu

    Args:
        board (Board): Jeu dans lequel on souhaite extraire le meilleur mouvement
        n (int): Nombre d'estimation que l'on souhaite réaliser pour estimer le meilleur mouvement

    Returns:
        Move: Meilleur mouvement sélectionné
    """    
    moves = board.legalMoves ()
    sumScores = [0.0 for x in range (len (moves))]
    nbVisits = [0 for x in range (len(moves))]
    for i in range (n):
        bestScore = 0
        bestMove = 0
        for m in range (len(moves)):
            score = 1000000
            if nbVisits [m] > 0:
                 score = sumScores [m] / nbVisits [m] + 0.4 * math.sqrt (math.log (i) / nbVisits [m])
            if score > bestScore:
                bestScore = score
                bestMove = m
        b = copy.deepcopy (board)
        b.play (moves [bestMove])
        r = b.playout ()
        sumScores [bestMove] += r
        nbVisits [bestMove] += 1
    bestScore = 0
    bestMove = 0
    for m in range (len(moves)):
        score = nbVisits [m]
        if score > bestScore:
            bestScore = score
            bestMove = m
    return moves [bestMove], bestScore

In [12]:
board = Board()

bestMove, bestScore = UCB(board, 1000)
print(bestScore)

253


# Board class with hashcode

In [15]:
import random

Dx = 5
Dy = 5
Empty = 0
White = 1
Black = 2

hashTable = []
for k in range (3):
    l = []
    for i in range (Dx):
        l1 = []
        for j in range (Dy):
            l1.append (random.randint (0, 2 ** 64))
        l.append (l1)
    hashTable.append (l)
hashTurn = random.randint (0, 2 ** 64)

class Board(object):
    def __init__(self):
        self.h = 0
        self.turn = White
        self.board = np.zeros ((Dx, Dy))
        for i in range (0, 2):
            for j in range (0, Dy):
                self.board [i] [j] = White
        for i in range (Dx - 2, Dx):
            for j in range (0, Dy):
                self.board [i] [j] = Black
    
    def legalMoves(self):
        moves = []
        for i in range (0, Dx):
            for j in range (0, Dy):
                if self.board [i] [j] == self.turn:
                    for k in [-1, 0, 1]:
                        for l in [-1, 0, 1]:
                            m = Move (self.turn, i, j, i + k, j + l)
                            if m.valid (self):
                                moves.append (m)
        return moves
    
    def score(self):
        for i in range (0, Dy):
            if (self.board [Dx - 1] [i] == White):
                return 1.0
            elif (self.board [0] [i] == Black):
                return 0.0
        l = self.legalMoves ()
        if len (l) == 0:
            if self.turn == Black:
                return 1.0
            else:
                return 0.0
        return 0.5

    def terminal(self):
        if self.score () == 0.5:
            return False
        return True
    
    def playout(self):
        while (True):
            moves = self.legalMoves ()
            if self.terminal ():
                return self.score ()
            n = random.randint (0, len (moves) - 1)
            self.play (moves [n])
            
    def play(self, move):
        col = int (self.board [move.x2] [move.y2])
        if col != Empty:
            self.h = self.h ^ hashTable [col] [move.x2] [move.y2]
        self.h = self.h ^ hashTable [move.color] [move.x2] [move.y2]
        self.h = self.h ^ hashTable [move.color] [move.x1] [move.y1]
        self.h = self.h ^ hashTurn
        self.board [move.x2] [move.y2] = move.color
        self.board [move.x1] [move.y1] = Empty
        if (move.color == White):
            self.turn = Black
        else:
            self.turn = White
            
    def print(self):
        print("   1 2 3 4 5")
        for i in range(Dy):
            print("{} |".format(i + 1), end="")
            for j in range(Dx):
                if self.board [i] [j] == Black:
                    print("\u265F", end="")
                elif self.board [i] [j] == White:
                    print("\u2659", end="")
                else:
                    print(" ", end="")
                if j < Dx:
                    print("|", end="")

            if i < Dy:
                print()


# Transposition Table

In [16]:
MaxLegalMoves = 6 * Dx
Table = {}

def add (board):
    nplayouts = [0.0 for x in range (MaxLegalMoves)]
    nwins = [0.0 for x in range (MaxLegalMoves)]
    Table [board.h] = [0, nplayouts, nwins]

def look (board):
    return Table.get (board.h, None)


# UCT

In [17]:
def UCT(board):
    if board.terminal():
        return board.score()

    t = look(board)

    if t != None:
        bestValue = 0
        bestMove = 0
        moves = board.legalMoves()
        for i in range(0, len(moves)):
            val = 100000.0
            n = t[0]
            ni = t[1][i] # number of playouts after move i
            wi = t[2][i] # number of wins after move i

            if ni > 0:
                Q = wi/ni
                if board.turn == Black:
                    Q  = 1-Q
                val = Q + 0.4 * math.sqrt(log(n)/ni)
            if val > bestValue:
                bestValue = val
                best = i
        board.play(moves[best])
        res = UCT(board)
        t[0] += 1
        t[1][best] += 1
        t[2][best] += res
        return res
    else:
        add(board)
        return board.playout()


# Tournament between Flat, UCB and UCT

In [35]:
board = Board()
print(UCT(board))
board.print()

1.0
   1 2 3 4 5
1 |♙| |♙|♙|♙|
2 |♙|♟| |♟| |
3 | |♟| | | |
4 | | |♟|♟|♟|
5 |♟|♙| |♟| |


AMAF

In [43]:
def playoutAMAF (board, played): 
    while (True):
        moves = []
        moves = board.legalMoves ()
        if len(moves) == 0 or board.terminal ():
            return board.score ()
        n = random.randint (0, len(moves) - 1)
        played.append (moves[n].code(board))
        board.play (moves[n])

In [44]:
MaxCodeLegalMoves = 2 * Dx * Dy * 5

In [45]:
def addAMAF (board):  
    nplayouts = [0.0 for x in range (MaxLegalMoves)]
    nwins = [0.0 for x in range (MaxLegalMoves)]
    nplayoutsAMAF = [0.0 for x in range (MaxCodeLegalMoves)]
    nwinsAMAF = [0.0 for x in range (MaxCodeLegalMoves)]
    Table [board.h] = [0, nplayouts, nwins, nplayoutsAMAF, nwinsAMAF]

In [46]:
def updateAMAF(t, played, res):
    for i in range(len(played)):
        if played[:i].count(played[i]) == 0:
            t[3][played[i]] += 1
            t[4][played[i]] += res

RAVE

In [47]:
def RAVE(board, played):
    if board.terminal ():
        return board.score ()
    t = look (board)
    if t != None:
        bestValue = 0
        best = 0
        moves = board.legalMoves()
        bestcode = moves[0].code(board)
        for i in range (0, len (moves)):
            val = 1000000.0
            code = moves[i].code(board)
            if t[3][code] > 0:
                beta  = t[3][code] / (t[1][i] + t[3][code] + 1e-5 * t[3][code] * t[1][i])
                Q = 1
                if t[1][i] > 0:
                    Q = t[2][i] / t[1][i]
                    if board.turn == Black:
                        Q = 1 - Q
                AMAF = t[4][code] / t[3][code]
                if board.turn == Black:
                    AMAF = 1 - AMAF
                val = beta * Q + (1 - beta) * AMAF
            if val > bestValue:
                bestValue = val
                best = i
                bestcode = code
        board.play (moves [best])
        played.append (bestcode)
        res = RAVE(board, played)
        t[0] += 1
        t[1][best] += 1
        t[2][best] += res
        updateAMAF (t, played, res)
        return res
    else:
        addAMAF (board)
        return playoutAMAF (board, played)
                


In [42]:

class Sequential_Halving():
    def __init__(self):
            self.TMC = TranspoMonteCarlo()
            #self.Dx=5
            #self.Dy=5
            self.Side=8
            self.MaxCodeLegalMoves = 2 * self.Side * self.Side * 8
            self.MaxLegalMoves=32
            self.UCT = UCT()

    def BestMove (self, state, budget):
        """Algorithme de Sequantial Halving minimisant le regret

        Args:
            state (Board): Etat du jeu actuel
            budget (int): Budget à allouer à chaque noeud

        Returns:
            Move: Mouvement sélectionné, minimisant le regret
        """    
        
        #Table = {}
        self.TMC = TranspoMonteCarlo()
        self.TMC.add (state)
        moves = state.legalMoves ()
        total = len (moves)
        nplayouts = [0.0 for x in range (self.MaxCodeLegalMoves)]
        nwins = [0.0 for x in range (self.MaxCodeLegalMoves)]
        while (len (moves) > 1):
            for m in moves:
                rang = budget // (len (moves) * np.log2 (total))
                rang=rang.astype(int)
                for i in range (rang):
                    s = copy.deepcopy (state)
                    s.play (m)
                    res = self.UCT.Update_Tree (s)
                    nplayouts [m.code ()] += 1
                    if state.turn == White:
                        nwins [m.code ()] += res
                    else:
                        nwins [m.code ()] += 1.0 - res
            moves = self.bestHalf (state, moves, nwins, nplayouts)
        return moves [0]


    def bestHalf (self, state, moves, nwins, nplayouts):
        """Choisi le meilleur mouvement pour un état du jeu

        Args:
            state (Board): Etat du jeu
            moves (list<Move>): Liste des mouvements
            nwins (list<int>): liste du nombre de victoire pour chaque noeud/état
            nplayouts (list<int>): liste du nombre de parties jouées pour chaque noeud/état

        Returns:
            list<Move>: Liste des meilleurs mouvements
        """    
        half = []
        notused = [True for x in range (self.MaxCodeLegalMoves)]
        rang=np.ceil(len (moves) / 2)
        rang=rang.astype(int)
        for i in range (rang):
            best = -1.0
            bestMove = moves [0]
            for m in moves:
                code = m.code ()
                if notused [code]:
                    mu = nwins [code] / nplayouts [code]
                    if mu > best:
                        best = mu
                        bestMove = m
            notused[bestMove.code ()] = False
            half.append (bestMove)
        return half

In [None]:
def bestHalf (state, moves, nwins, nplayouts):      
    half = []
    notused = list(np.full(MaxCodeLegalMoves, True))
    rang=np.ceil(len (moves) / 2)
    rang=rang.astype(int)
    for i in range (rang):
        best = -1.0
        bestMove = moves [0]
        for m in moves:
            code = m.code ()
            if notused [code]:
                mu = nwins [code] / nplayouts [code]
                if mu > best:
                    best = mu
                    bestMove = m
        notused[bestMove.code ()] = False
        half.append (bestMove)
    return half

In [None]:
def SHUSS(state, budget):
    """Algorithme de Sequantial Halving minimisant le regret

    Args:
        state (Board): Etat du jeu actuel
        budget (int): Budget à allouer à chaque noeud

    Returns:
        Move: Mouvement sélectionné, minimisant le regret
    """    
    global Table
    Table = {}
    addAMAF(state)
    root = look(state)
    moves = state.legalMoves ()
    total = len (moves)
    nplayouts = [0.0 for x in range (MaxCodeLegalMoves)]
    nwins = [0.0 for x in range (MaxCodeLegalMoves)]
    while (len (moves) > 1):
        for m in moves:
            rang = budget // (len (moves) * np.log2 (total))
            rang=rang.astype(int)
            for i in range (rang):
                s = copy.deepcopy (state)
                s.play (m)
                code = m.code(state)
                played = [code]
                res = GRAVE(s.played, root)
                updateAMAF(root, played, res)
                nplayouts [code] += 1
                if state.turn == White:
                    nwins [code] += res
                else:
                    nwins [code] += 1.0 - res
        moves = bestHalfSHUSS(root, state, moves, nwins, nplayouts)
    return moves [0]

In [None]:
def bestHalfSHUSS(t, state, moves, nwins, nplayouts):
    """Choisi le meilleur mouvement pour un état du jeu

    Args:
        state (Board): Etat du jeu
        moves (list<Move>): Liste des mouvements
        nwins (list<int>): liste du nombre de victoire pour chaque noeud/état
        nplayouts (list<int>): liste du nombre de parties jouées pour chaque noeud/état

    Returns:
        list<Move>: Liste des meilleurs mouvements
    """    
    half = []
    notused = list(np.full(MaxCodeLegalMoves, True))
    c = 128
    rang=np.ceil(len (moves) / 2)
    rang=rang.astype(int)
    for i in range (rang):
        best = -1.0
        bestMove = moves [0]
        for m in moves:
            code = m.code(state)
            if notused [code]:
                AMAF = t[4][code] / t[3][code]
                if state.turn == Black:
                    AMAF = 1 - AMAF
                mu = nwins [code] / nplayouts[code] + c * AMAF / nplayouts [code]
                if mu > best:
                    best = mu
                    bestMove = m
        notused[bestMove.code (state)] = False
        half.append (bestMove)
    return half

Nested Monte Carlo Search

In [1]:
import random
import copy
 
def legalMoves (state):
    return [0, 1]
 
def play (state, move):
    state.append (move)
    return state
 
def terminal (state):
    return len (state) >= 60
 
def score (state):
    s = 0
    for i in state:
        if i == 0:
            s = s + 1
    return s
    
def playout (state):
    while not terminal (state):
        moves = legalMoves (state)
        move = moves [int(random.random () * len (moves))]
        state = play (state, move)                                                                          
    return state

def nested (state, n):
    if (n == 0):
        s1 = copy.deepcopy (state)
        return playout (s1)
    bestSequence = []
    while not terminal (state):
        moves = legalMoves (state)
        for m in moves:
            s1 = copy.deepcopy (state)
            s1 = play (s1, m)
            s1 = nested (s1, n - 1)
            if score (s1) > score (bestSequence):
                bestSequence = s1
        state = play (state, bestSequence [len (state)])
    return state


print(nested([],10))

KeyboardInterrupt: 

Misere Breakthrough

In [None]:
class Board_Misere(Board):

    def misereScore (self):
        """Permet d'attribuer un score à la partie
M
        Returns:
            [float]: renvoie 0.5 si rien ne se passe, 1 (resp 0) si les blancs (resp noirs)
            atteignent la ligne ou que les noirs (resp blancs) n'ont plus de coups possibles
        """        
        s = self.score()
        if s==1:
            return 1
        elif s==0:
            return -1
        return s

    def discountedPlayout (self, t):
        """Permet de jouer aléatoirement un partie sans imposer une politique

        Returns:
            [float]: Score de la partie aléatoire jouée
        """        
        while (True):
            moves = self.legalMoves ()
            if self.terminal ():
                return self.misereScore ()/(t+1)
            n = random.randint (0, len (moves) - 1)
            self.play (moves [n])
            t+=1


    def nestedDiscountedPlayout (self, t):
        while (True):
            if self.terminal ():
                return self.misereScore () / (t + 1)
            moves = self.legalMoves ()
            bestMove = moves [0]
            best = -2
            for i in range (len (moves)):
                b = copy.deepcopy (self)
                b.play (moves [i])
                s = b.discountedPlayout(t+1)
                if self.turn == Black:
                    s = -s
                if s > best:
                    best = s
                    bestMove = moves [i]
            self.play (bestMove)
            t = t + 1

    # def code (self, move):
    #     direction = 1
    #     if move.y2 > move.y1:
    #         direction = 0
    #     if move.y2 < move.y1:
    #         direction = 2
    #     capture = 0
    #     if self.board [move.x2] [move.y2] != Empty:
    #         capture = 1
    #     if move.color == White:
    #         return 6 * (Dy * move.x1 + move.y1) + 2 * direction + capture
    #     else:
    #         return 6 * Dx * Dy + 6 * (Dy * move.x1 + move.y1) + 2 * direction + capture
    
    def UCTNested(board, t):
        if board.terminal():
            return board.misereScore()
        t = look(board)
        if t != None:
            bestValue = -1000000.0
            best = 0
            moves = board.legalMoves()
            for i in range(0, len(moves)):
                val = 100000.0
                if t[1][i] > 0:
                    Q = t[2][i] / t[1][i]
                    if board.turn == Black:
                        Q = - Q
                    val = Q + 0.4 * math.sqrt(log(t[0])/t[1][i])
                if val > bestValue:
                    bestValue = val
                    best = i

leftmove NRPA

In [None]:
from turtle import st

from sympy import sequence


def randomMove(state, policy):
    moves = legalMoves(state)
    z = 0.0
    for m in moves:
        if policy.get(code(state.m)) == None:
            policy[code(state.m)] = 0.0
        z = z + math.exp(policy[code(state.m)])
    stop = random.random() * z
    sum = 0.0
    for m in moves:
        sum = sum + math.exp(policy[code(state.m)])
        if sum > stop:
            return m

def playout(state, policy):
    while not terminal(state):
        move = randomMove(state, policy)
        state = play(state, move)
    return score(state), sequence(state)

In [None]:
def adapt(policy, sequence, alpha=1.0):
    s = []
    polp = copy.deepcopy(policy)
    for best in sequence:
        moves = legalMoves(best)
        z = 0.0
        for m in moves:
            if policy.get(code(s, m)) == None:
                policy[code(s, m)] = 0.0
            z = z + math.exp(policy[code(m)])
        for m in moves:
            if polp.get(code(s, m)) == None:
                polp[code(s, m)] = 0.0
            polp[code(s, m)] -= alpha * math.exp(policy[code(m)]) / z
        polp[code(s, best)] += alpha
        play(s, best)
    return polp

In [None]:
def NRPA(level, policy):
    if level == 0:
        return playout([], policy)
    best = -np.inf
    seq = []
    for i in range(10):
        pol = copy.deepcopy(policy)
        score, sequence = NRPA(level - 1, pol)
        if score > best:
            best = score
            seq = sequence
        policy = adapt(policy, sequence)
    return best, seq

In [None]:
def score(state):
    return sum(state)

def play(state, move):
    state.append(move)

def legalMoves(state):
    return [0, 1]

def terminal(state):
    return len(state) >= 60

def sequence(state):
    return state

def code(state, m):
    return 2*len(state) + m

sc, s = NRPA(1, {})
print(sc, s)
sc, s = NRPA(2, {})
print(sc, s)

In [None]:
def randomMove (state, policy):

    moves = legalMoves (state)

    z = 0.0

    for m in moves:

        if policy.get (code(state,m)) == None:

            policy [code(state,m)] = 0.0

        z = z + math.exp (policy [code(state,m)])

    stop = random.random () * z

    sum = 0.0

    for m in moves:

        sum = sum + math.exp (policy [code(state,m)])

        if (sum >= stop):

            return m

def playout (state, policy):

    while not terminal (state):

        move = randomMove (state, policy)

        play (state, move)

    return score (state),sequence(state)


def adapt (policy, sequence, alpha = 1.0):

    s = []

    polp = copy.deepcopy (policy)

    for best in sequence:

        moves = legalMoves (s)

        z = 0.0

        for m in moves:

            if policy.get (code(s,m)) == None:

                policy [code(s,m)] = 0.0

            z = z + math.exp (policy [code(s,m)])

        for m in moves:

            if polp.get (code(s,m)) == None:

                polp [code(s,m)] = 0.0

            polp [code(s,m)] -= alpha * math.exp (policy [code(s,m)]) / z

        polp [code(s,best)] += alpha

        play (s, best)

    return polp

def NRPA (level, policy):

    if level == 0:

        return playout ([], policy)

    best = -np.inf

    seq = []

    for i in range (100):

        pol = copy.deepcopy (policy)

        sc, s = NRPA (level - 1, pol)

        if sc > best:

            best = sc

            seq = s

        policy = adapt (policy, seq)

    return best, seq

In [None]:
import random
import math
import numpy as np
N = 3
MaxNumber = 10000
class WS (object):
    def __init__ (self):
        self.partitions = [[] for i in range (N)]
        self.possible = np.full((N,MaxNumber),True)
        self.next = 1
        self.sequence = []
 
    def legalMoves (self):
        l = []
        for i in range (N):
            if self.possible [i] [self.next]:
                l.append (i)
        return l
 
    def code (self, p):
        return N * self.next + p
    
    def terminal (self):
        l = self.legalMoves ()
        if l == []:
            return True
        return False
 
    def score (self):
        return self.next - 1
 
    def play (self, p):
        for i in range (len (self.partitions [p])):
            self.possible [p] [self.next + self.partitions [p] [i]] = False
        self.partitions [p].append (self.next)
        self.next = self.next + 1
        self.sequence.append (p)

In [None]:
class Policy (object):
    def __init__ (self):
        self.dict = {}
 
    def get (self, code):
        w = 0
        if code in self.dict:
            w = self.dict [code]
        return w
 
    def put (self, code, w):
        self.dict [code] = w

In [None]:
def playout (state, policy):
    while not state.terminal ():
        l = state.legalMoves ()
        z = 0
        for i in range (len (l)):
            z = z + math.exp (policy.get (state.code (l [i])))
        stop = random.random () * z
        move = 0
        z = 0
        while True:
            z = z + math.exp (policy.get (state.code (l [move])))
            if z >= stop:
                break
            move = move + 1
        state.play (l [move])

In [None]:
def adapt (sequence, policy):
    polp = copy.deepcopy (policy)
    s = WS ()
    while not s.terminal ():
        l = s.legalMoves ()
        z = 0
        for i in range (len (l)):
            z = z + math.exp (policy.get (s.code (l [i])))
        move = sequence [len (s.sequence)]
        polp.put (s.code (move), polp.get(s.code (move)) + 1)
        for i in range (len (l)):
            proba = math.exp (policy.get (s.code (l [i]))) / z
            polp.put (s.code (l [i]), polp.get(s.code (l [i])) - proba)
        s.play (move)
    return polp

In [None]:
def NRPA (level, policy):
    state = WS ()
    if level == 0:
        playout (state, policy)
        return state
    pol = copy.deepcopy (policy)
    for i in range (100):
        ws = NRPA (level - 1, pol)
        if ws.score () >= state.score ():
            state = ws
        pol = adapt (state.sequence, pol)
    return state
 
ws = NRPA (2, Policy ())
print (ws.partitions)
[[1, 2, 4, 8, 11, 16, 22], [3, 5, 6, 7, 19, 21, 23], [9, 10, 12, 13, 14, 15, 17, 18, 20]]

In [None]:
def playout (state, policy):
    while not state.terminal ():
        l = state.legalMoves ()
        z = 0
        for i in range (len (l)):
            z = z + math.exp (policy.get (state.code (l [i])) + state.beta (l [i]))
        stop = random.random () * z
        move = 0
        z = 0
        while True:
            z = z + math.exp (policy.get (state.code (l [move])) + state.beta (l [move]))
            if z >= stop:
                break
            move = move + 1
        state.play (l [move])

In [None]:

def adapt (sequence, policy):

    polp = copy.deepcopy (policy)

    s = WS ()

    while not s.terminal ():

        l = s.legalMoves ()

        z = 0

        for i in range (len (l)):

            z = z + math.exp (policy.get (s.code (l [i])) + s.beta (l [i]))

        move = sequence [len (s.sequence)]

        polp.put (s.code (move), polp.get(s.code (move)) + 1)

        for i in range (len (l)):

            proba = math.exp (policy.get (s.code (l [i])) + s.beta (l [i])) / z

            polp.put (s.code (l [i]), polp.get(s.code (l [i])) - proba)

        s.play (move)

    return polp

    def beta (self, p):

        last = len (self.sequence)

        if last == 0:

            return 0

        if p == self.sequence [last – 1]:

            return 10

        return 0


In [None]:
# PPAF
def code (self, move):
        direction = 1
        if move.y2 > move.y1:
            direction = 0
        if move.y2 < move.y1:
            direction = 2
        capture = 0
        if self.board [move.x2] [move.y2] != Empty:
            capture = 1
        if move.color == White:
            return 6 * (Dy * move.x1 + move.y1) + 2 * direction + capture
        else:
            return 6 * Dx * Dy + 6 * (Dy * move.x1 + move.y1) + 2 * direction + capture

def playout (state, policy):
    while not state.terminal ():
        l = state.legalMoves ()
        z = 0
        for i in range (len (l)):
            z = z + math.exp (policy.get (state.code (l [i])))
        stop = random.random () * z
        move = 0
        z = 0
        while True:
            z = z + math.exp (policy.get (state.code (l [move])))
            if z >= stop:
                break
            move = move + 1
        state.play (l [move])
    return state.score ()

def adapt (s, winner, state, policy):
    polp = copy.deepcopy (policy)
    alpha = 0.32
    while not s.terminal ():
        l = s.legalMoves ()
        move = state.rollout [len (s.rollout)]
        if s.turn == winner:
            z = 0
            for i in range (len (l)):
                z = z + math.exp (policy.get (s.code (l [i])))
            polp.put (s.code (move), polp.get(s.code (move)) + alpha)
            for i in range (len (l)):
                proba = math.exp (policy.get (s.code (l [i]))) / z
                polp.put (s.code (l [i]), polp.get(s.code (l [i])) - alpha * proba)
        s.play (move)
    return polp

In [None]:
def PPAF (board, policy):
    if board.terminal ():
        return board.score ()
    t = look (board)
    if t != None:
        bestValue = -1000000.0
        best = 0
        moves = board.legalMoves()
        for i in range (0, len (moves)):
            val = 1000000.0
            if t [1] [i] > 0:
                Q = t [2] [i] / t [1] [i]
                if board.turn == Black:
                    Q = 1 - Q
                val = Q + 0.4 * sqrt (log (t [0]) / t [1] [i])
            if val > bestValue:
                bestValue = val
                best = i
        board.play (moves [best])
        res = PPAF (board, policy)
        t [0] += 1
        t [1] [best] += 1
        t [2] [best] += res
        return res
    else:
        add (board)
        return playout (board, policy)

In [None]:
def BestMovePPAF (board, n):
    global Table
    Table = {}
    policy = Policy ()
    for i in range (n):
        b1 = copy.deepcopy (board)
        res = PPAF (b1, policy)
        b2 = copy.deepcopy (board)
        if res == 1:
            policy = adapt (b2, White, b1, policy)
        else:
            policy = adapt (b2, Black, b1, policy)
    t = look (board)
    moves = board.legalMoves ()
    best = moves [0]
    bestValue = t [1] [0]
    for i in range (1, len(moves)):
        if (t [1] [i] > bestValue):
            bestValue = t [1] [i]
            best = moves [i]
    return best

In [None]:
# Modify GRAVE to incorporate a policy and a bias.
def GRAVEPolicyBias (board, played, tref, root, policy):
    if (board.terminal ()):
        return board.score ()
    t = look (board)
    if t != None:
        tr = tref
        if t [0] > 50:
            tr = t
        bestValue = -1000000.0
        best = 0
        moves = board.legalMoves ()
        bestcode = board.code (moves [0])
        for i in range (0, len (moves)):
            val = 1000000.0
            code = board.code (moves [i])
            if tr [3] [code] > 0:
                beta = tr [3] [code] / (t [1] [i] + tr [3] [code] + 1e-5 * t [1] [i] * tr [3] [code])
                Q = 1
                if t [1] [i] > 0:
                    Q = t [2] [i] / t [1] [i]
                    if board.turn == Black:
                        Q = 1 - Q
                AMAF = tr [4] [code] / tr [3] [code]
                if board.turn == Black:
                    AMAF = 1 - AMAF
                val = (1.0 - beta) * Q + beta * AMAF
            if val > bestValue:
                bestValue = val
                best = i
                bestcode = code
        board.play (moves [best])
        played.append (bestcode)
        res = GRAVEPolicyBias (board, played, tr, root, policy)
        t [0] += 1
        t [1] [best] += 1
        t [2] [best] += res
        updateAMAF (t, played, res)
        return res
    else:
        addAMAF (board)
        return playoutBias (board, played, root, policy)
    
def playoutBias (state, played, root, policy):
        while not state.terminal ():
            l = state.legalMoves ()
            z = 0
            for i in range (len (l)):
                code = board.code (l [i])
                AMAF = 1
                if root [3] [code] > 0:
                    AMAF = root [4] [code] / root [3] [code]
                    if board.turn == Black:
                        AMAF = 1 – AMAF
                if AMAF > 0:
                    z = z + math.exp (policy.get (state.code (l [i])) + math.log (AMAF))
            stop = random.random () * z
            move = 0
            z = 0
            while True:
                code = board.code (l [move])
                AMAF = 1
                if root [3] [code] > 0:
                    AMAF = root [4] [code] / root [3] [code]
                    if board.turn == Black:
                        AMAF = 1 - AMAF
                if AMAF > 0:
                    z = z + math.exp (policy.get (state.code (l [move])) + math.log(AMAF))
                if z >= stop or move == len (l) - 1:
                    break
                move = move + 1
            played.append (state.code(l [move]))
            state.play (l [move])
        return state.score ()