In [9]:
import random
import math
import numpy as np
# Number of partitions a value can be placed into.
N = 3

# Width of the per-partition legality table; indices reached by play() must stay below it.
MaxNumber = 10000
class WS (object):
    """Game state in which the integers 1, 2, 3, ... are placed one by one
    into one of N partitions.

    Placing the current integer into a partition forbids, for that partition
    only, every value equal to the current integer plus a value already stored
    there. (The class name suggests the Weak Schur problem -- TODO confirm.)

    Attributes:
        partitions: list of N lists holding the integers placed so far.
        possible:   boolean array of shape (N, MaxNumber); possible[p][v] is
                    False once v has been forbidden for partition p.
        next:       the integer that the next call to play() will place.
        sequence:   partition indices chosen so far, in playing order.
    """

    def __init__(self):
        self.partitions = [[] for _ in range(N)]
        self.possible = np.full((N, MaxNumber), True)
        self.next = 1
        self.sequence = []

    def legalMoves(self):
        """Return the partition indices that may still receive self.next."""
        return [p for p in range(N) if self.possible[p][self.next]]

    def code(self, p):
        """Return an integer identifying the move 'put self.next into partition p'."""
        return self.next * N + p

    def terminal(self):
        """Return True when no partition can accept self.next."""
        return len(self.legalMoves()) == 0

    def score(self):
        """Return how many integers have been placed so far."""
        return self.next - 1

    def play(self, p):
        """Place self.next into partition p and advance to the following integer."""
        chosen = self.partitions[p]
        # Every sum of the new integer with an existing member becomes illegal for p.
        for member in chosen:
            self.possible[p][self.next + member] = False
        chosen.append(self.next)
        self.sequence.append(p)
        self.next += 1

    def beta(self, p):
        """Return the bias feature of move p: 10 if p repeats the last move, else 0."""
        if self.sequence and self.sequence[-1] == p:
            return 10
        return 0

In [2]:
class Policy (object):
    """Sparse table mapping a move code to its weight.

    Codes that were never stored have an implicit weight of 0.
    """

    def __init__(self):
        self.dict = {}

    def get(self, code):
        """Return the weight stored for code, or 0 when none was stored."""
        return self.dict.get(code, 0)

    def put(self, code, w):
        """Store w as the weight of code, replacing any previous value."""
        self.dict[code] = w

In [3]:
def playout (state, policy):
    """Play state to the end, sampling each move from a softmax over policy weights.

    At every step a legal move m is drawn with probability proportional to
    exp(policy.get(state.code(m))). The state is modified in place; nothing
    is returned.
    """
    while not state.terminal ():
        moves = state.legalMoves ()
        weights = [math.exp (policy.get (state.code (m))) for m in moves]
        # Accumulate left to right so the selection scan below reaches exactly
        # the same total.
        total = 0
        for weight in weights:
            total = total + weight
        threshold = random.random () * total
        # Roulette-wheel selection: first move whose cumulative weight reaches
        # the threshold.
        running = 0
        for chosen, weight in zip (moves, weights):
            running = running + weight
            if running >= threshold:
                break
        state.play (chosen)

In [5]:
import copy

def adapt (sequence, policy):
    """Return a copy of policy shifted towards the moves of sequence.

    A fresh WS game is replayed along sequence. At each step the weight of the
    played move is raised by 1 and the weight of every legal move is lowered by
    its softmax probability under the unmodified input policy. The input
    policy itself is left untouched.
    """
    updated = copy.deepcopy (policy)
    replay = WS ()
    step = 0
    while not replay.terminal ():
        moves = replay.legalMoves ()
        # Probabilities are always taken from the original policy, not from
        # the copy being updated.
        weights = [math.exp (policy.get (replay.code (m))) for m in moves]
        total = 0
        for weight in weights:
            total = total + weight
        move = sequence [step]
        played_code = replay.code (move)
        updated.put (played_code, updated.get (played_code) + 1)
        for candidate, weight in zip (moves, weights):
            candidate_code = replay.code (candidate)
            updated.put (candidate_code, updated.get (candidate_code) - weight / total)
        replay.play (move)
        step += 1
    return updated

In [6]:
def NRPA (level, policy, iterations=100):
    """Nested Rollout Policy Adaptation search; returns the best WS state found.

    Args:
        level:      nesting depth. Level 0 performs a single playout with the
                    given policy.
        policy:     starting Policy; it is copied, never modified.
        iterations: number of lower-level searches run at every level above 0
                    (defaults to 100, the value that used to be hard-coded).

    Returns:
        The WS state with the highest score() seen at this level; ties are
        resolved in favour of the most recent one.

    Note: this uses the two-argument playout/adapt. The bias-learning cell
    later in the notebook rebinds both names to three-argument versions, so
    this function only works while the two-argument definitions are live.
    """
    state = WS ()
    if level == 0:
        playout (state, policy)
        return state
    pol = copy.deepcopy (policy)
    for _ in range (iterations):
        ws = NRPA (level - 1, pol, iterations)
        # ">=" lets an equally good newer sequence replace the current best.
        if ws.score () >= state.score ():
            state = ws
        pol = adapt (state.sequence, pol)
    return state
 
# Run a level-2 search from an empty policy and print the partitions of the
# best state found. The search is randomized and no seed is set here, so the
# printed result can differ from one run to the next.
ws = NRPA (2, Policy ())
print (ws.partitions)
[[1, 2, 4, 8, 11, 16, 22], [3, 5, 6, 7, 19, 21, 23], [9, 10, 12, 13, 14, 15, 17, 18, 20]]

[[3, 5, 6, 7, 19, 21, 23], [1, 2, 4, 8, 11, 16, 22], [9, 10, 12, 13, 14, 15, 17, 18, 20]]


[[1, 2, 4, 8, 11, 16, 22],
 [3, 5, 6, 7, 19, 21, 23],
 [9, 10, 12, 13, 14, 15, 17, 18, 20]]

In [7]:
# Bias weights learning

def playout(state, policy, w1):
    """Play state to the end, sampling moves from a biased softmax.

    A legal move m is drawn with probability proportional to
    exp(policy.get(state.code(m)) + w1 * state.beta(m)). The state is modified
    in place; nothing is returned.

    This rebinds the name of the earlier two-argument playout in this notebook.

    Args:
        state:  game state offering terminal/legalMoves/code/beta/play.
        policy: object whose get(code) returns the learned weight of a move.
        w1:     weight applied to the bias feature state.beta(move).
    """
    while not state.terminal():
        moves = state.legalMoves()
        logits = [policy.get(state.code(m)) + w1 * state.beta(m) for m in moves]
        # Subtract the largest logit before exponentiating: the probabilities
        # are unchanged but math.exp can no longer overflow (beta is 10, so
        # w1 * beta exceeds the ~709 limit of exp for fairly small w1).
        shift = max(logits)
        if math.isinf(shift):
            shift = 0.0  # degenerate logits: fall back to the unshifted form
        weights = [math.exp(x - shift) for x in logits]
        # Accumulate left to right so the selection scan reaches the same total.
        total = 0.0
        for weight in weights:
            total = total + weight
        stop = random.random() * total
        # Roulette-wheel selection over the cumulative weights.
        running = 0.0
        for chosen, weight in zip(moves, weights):
            running = running + weight
            if running >= stop:
                break
        state.play(chosen)

def adapt (sequence, policy, w1):
    """Return an adapted copy of policy and an updated bias weight.

    A fresh WS game is replayed along sequence. With p(m) the softmax
    probability of legal move m under logits
    policy.get(code(m)) + w1 * beta(m), each step:
      * raises the weight of the played move by 1,
      * lowers the weight of every legal move m by p(m),
      * adds beta(played move) - sum_m p(m) * beta(m) to the bias weight.
    Probabilities always use the input policy and the input w1, not the
    values being updated.

    This rebinds the name of the earlier two-argument adapt in this notebook.

    Args:
        sequence: partition indices to replay, one per step.
        policy:   Policy to adapt; it is copied, never modified.
        w1:       current bias weight.

    Returns:
        (new_policy, new_w1)
    """
    polp = copy.deepcopy (policy)
    w = w1
    s = WS ()
    while not s.terminal ():
        moves = s.legalMoves ()
        betas = [s.beta (m) for m in moves]
        logits = [policy.get (s.code (m)) + w1 * b for m, b in zip (moves, betas)]
        # Shift by the largest logit so math.exp cannot overflow; the softmax
        # probabilities are unaffected by the shift.
        shift = max (logits)
        if math.isinf (shift):
            shift = 0.0  # degenerate logits: fall back to the unshifted form
        weights = [math.exp (x - shift) for x in logits]
        z = sum (weights)
        expected_beta = sum (b * wt for b, wt in zip (betas, weights)) / z
        move = sequence [len (s.sequence)]
        # Plain ASCII minus here: the original line used an en dash (U+2013),
        # which is a syntax error.
        w = w + s.beta (move) - expected_beta
        polp.put (s.code (move), polp.get (s.code (move)) + 1)
        for m, wt in zip (moves, weights):
            polp.put (s.code (m), polp.get (s.code (m)) - wt / z)
        s.play (move)
    return (polp, w)

def BLGNRPA (level, policy, w, iterations=100):
    """Nested policy-adaptation search that also adapts a bias weight.

    Args:
        level:      nesting depth. Level 0 performs a single biased playout.
        policy:     starting Policy; it is copied, never modified.
        w:          starting weight of the bias feature WS.beta.
        iterations: number of lower-level searches run at every level above 0
                    (defaults to 100, the value that used to be hard-coded).

    Returns:
        The WS state with the highest score() seen at this level; ties are
        resolved in favour of the most recent one. The adapted bias weight is
        local to the call and is not returned.
    """
    state = WS ()
    if level == 0:
        playout (state, policy, w)
        return state
    pol = copy.deepcopy (policy)
    w1 = w
    for _ in range (iterations):
        ws = BLGNRPA (level - 1, pol, w1, iterations)
        # ">=" lets an equally good newer sequence replace the current best.
        if ws.score () >= state.score ():
            state = ws
        (pol, w1) = adapt (state.sequence, pol, w1)
    return state