In [None]:
import numpy as np

In [None]:
class Board:
    """Square Reversi-style board stored as a 2-D NumPy integer array.

    Cells hold ``BLANK``, ``BLACK`` or ``WHITE``.  Coordinates are always
    given as ``(y, x)`` = ``(row, column)``.

    Directions are numbered 0-7 in row-major order of the eight neighbours::

        0 1 2
        3 . 4
        5 6 7
    """

    BLANK = 0
    BLACK = 1
    WHITE = 2

    # (dy, dx) step for every direction id; the order matches the order in
    # which check_arround() yields the neighbouring cells.
    _STEPS = {
        0: (-1, -1),  # up-left
        1: (-1, 0),   # up
        2: (-1, 1),   # up-right
        3: (0, -1),   # left
        4: (0, 1),    # right
        5: (1, -1),   # down-left
        6: (1, 0),    # down
        7: (1, 1),    # down-right
    }

    def __init__(self, size):
        """Create a ``size`` x ``size`` board in the starting position.

        Raises:
            ValueError: if ``size`` is smaller than 2.
        """
        self.size = size
        # Flattened 3x3 mask selecting the eight neighbours of a cell
        # (everything except the centre).
        self.mask = [True, True, True, True, False, True, True, True, True]
        self.reset()

    def reset(self):
        """Restore the starting position: a 2x2 block surrounded by blanks.

        For an odd ``size`` the extra row/column goes to the bottom/right so
        that the array is always exactly ``size`` x ``size``.

        Raises:
            ValueError: if ``size`` is smaller than 2.
        """
        margin = self.size - 2
        if margin < 0:
            raise ValueError("board size must be at least 2, got %r" % (self.size,))
        before = margin // 2
        after = margin - before
        center = np.array([[Board.BLACK, Board.WHITE],
                           [Board.WHITE, Board.BLACK]])
        self.board = np.pad(center, [(before, after), (before, after)], 'constant')

    def count(self, color):
        """Return how many cells currently hold ``color``."""
        return np.count_nonzero(self.board == color)

    def get_state(self):
        """Return the board as a string of cell values in row-major order."""
        return ''.join(map(str, self.board.flatten()))

    def get_vector(self, y, x, direction):
        """List the cells reached by walking from ``(y, x)`` in ``direction``.

        The starting cell itself is not included; cells are ordered from the
        nearest to the farthest and stop at the board edge.  An unknown
        direction id yields an empty list.
        """
        step = Board._STEPS.get(direction)
        if step is None:
            return []
        dy, dx = step
        # Plain ints keep the returned tuples free of NumPy scalar types.
        i, j = int(y) + dy, int(x) + dx
        cells = []
        while 0 <= i < self.size and 0 <= j < self.size:
            cells.append((i, j))
            i += dy
            j += dx
        return cells

    def check_vector(self, color, vector):
        """Return the leading cells of ``vector`` that ``color`` would capture.

        Walks ``vector`` in order.  The cells seen before the first cell that
        holds ``color`` are returned; if a blank cell or the end of the vector
        is reached first, nothing can be captured and ``[]`` is returned.
        """
        captured = []
        for grid in vector:
            cell = self.board[grid]
            if cell == Board.BLANK:
                return []
            if cell == color:
                return captured
            captured.append(grid)
        # Ran off the board without meeting a stone of our own colour.
        return []

    def check_arround(self, pboard, y, x):
        """Return the eight neighbours of board cell ``(y, x)``.

        ``pboard`` is the board padded by one cell on every side, so the 3x3
        window starting at ``(y, x)`` in ``pboard`` is centred on ``(y, x)``
        of the unpadded board.  Values come back in direction order 0-7.
        """
        return pboard[y: y + 3, x: x + 3][np.array(self.mask).reshape(3, -1)]

    def availables(self, color):
        """Find every legal move for ``color``.

        Returns:
            dict mapping ``(y, x)`` of each blank cell where ``color`` may be
            placed to the list of opponent cells that move would flip.
        """
        blank_rows, blank_cols = np.where(self.board == Board.BLANK)
        # Padding with 0 (== BLANK) lets border cells use the same 3x3 window.
        pboard = np.pad(self.board, [(1, 1), (1, 1)], 'constant')
        accept = {}
        for y, x in zip(blank_rows, blank_cols):
            y, x = int(y), int(x)
            for direction, neighbour in enumerate(self.check_arround(pboard, y, x)):
                # Only a direction that starts with an opponent stone can capture.
                if neighbour in (Board.BLANK, color):
                    continue
                grids = self.check_vector(color, self.get_vector(y, x, direction))
                if grids:
                    accept.setdefault((y, x), []).extend(grids)
        return accept

    def flip(self, color, grids):
        """Set every cell in ``grids`` to ``color``.

        Returns:
            The total number of cells holding ``color`` after the update
            (not the number of cells changed).
        """
        for grid in grids:
            self.board[grid] = color
        return self.count(color)

In [None]:
class Quantity:
    """Table of per-cell values, one flat row per board-state string.

    ``Q`` maps a state key to a 1-D array of ``size * size`` values, one per
    board cell, indexed by ``y * size + x``.  ``last_q`` remembers the most
    recent increment passed to ``update_q``.
    """

    def __init__(self, size, alpha=0.1, gamma=0.9, init=1.0):
        """Store the hyper-parameters and start with an empty table.

        Args:
            size: board edge length.
            alpha: factor applied to the whole increment in ``get_q``.
            gamma: factor applied to the stored value in ``get_q``.
            init: initial value for every cell outside the central 2x2 block.
        """
        self.size, self.alpha, self.gamma, self.init = size, alpha, gamma, init
        self.Q = {}
        self.last_q = 0

    def reset(self):
        """Forget the last increment; the table ``Q`` itself is kept."""
        self.last_q = 0

    def init_row(self):
        """Build the initial row for a previously unseen state.

        The central 2x2 block starts at 0 and every other cell at ``init``.
        For an odd ``size`` the extra row/column goes to the bottom/right so
        the row always has ``size * size`` entries, matching ``to_idx``.

        Raises:
            ValueError: if ``size`` is smaller than 2.
        """
        margin = self.size - 2
        if margin < 0:
            raise ValueError("size must be at least 2, got %r" % (self.size,))
        before = margin // 2
        after = margin - before
        center = np.zeros((2, 2))
        grid = np.pad(center, [(before, after), (before, after)], 'constant',
                      constant_values=self.init)
        return grid.flatten()

    def to_idx(self, pos):
        """Convert a ``(y, x)`` position into its flat row index."""
        y, x = pos[0], pos[1]
        return y * self.size + x

    def to_pos(self, idx):
        """Convert a flat row index back into a ``(y, x)`` tuple."""
        return divmod(idx, self.size)

    def _row(self, state):
        """Return the row for ``state``, creating it on first use."""
        row = self.Q.get(state)
        if row is None:
            row = self.Q[state] = self.init_row()
        return row

    def latest_q(self, state, action):
        """Return the stored value for placing a stone at ``action`` in ``state``.

        An unseen ``state`` is added to the table as a side effect.
        """
        return self._row(state)[self.to_idx(action)]

    def get_q(self, state, action, reward=0):
        """Compute the increment ``alpha * (reward + gamma * Q - last_q)``.

        ``Q`` is the stored value for ``(state, action)`` and ``last_q`` the
        increment most recently applied through ``update_q``.  Nothing is
        written except that an unseen ``state`` is added to the table.
        """
        q = self.latest_q(state, action)
        return self.alpha * (reward + self.gamma * q - self.last_q)

    def update_q(self, state, action, q):
        """Add increment ``q`` to the stored value and remember it in ``last_q``.

        A ``state`` that has not been seen yet is initialised first instead
        of raising ``KeyError``.
        """
        self.last_q = q
        self._row(state)[self.to_idx(action)] += q

In [None]:
class Player():
    """A player of one colour that picks moves from a shared value table."""

    def __init__(self, color, epsilon=0.5):
        """
        Args:
            color: the stone colour this player places.
            epsilon: probability of sampling a move from the softmax of the
                candidate values instead of taking the highest-valued move.
        """
        self.color, self.epsilon = color, epsilon
        # Number of stones of this colour on the board (2 at the start).
        self.count = 2

    def put(self, board, quantity, train=False):
        """Choose a legal move, play it on ``board`` and optionally learn.

        Each legal move is scored with ``quantity.get_q`` using the number of
        stones it would flip as the reward.

        Args:
            board: object providing ``availables``, ``get_state`` and ``flip``.
            quantity: object providing ``get_q`` and ``update_q``.
            train: when true, the chosen move's score is written back with
                ``quantity.update_q``.

        Returns:
            ``0`` when there is no legal move (the board is left untouched),
            otherwise the number of stones of this colour after the move.
        """
        moves = board.availables(self.color)
        if not moves:
            return 0

        state = board.get_state()
        keys = list(moves)
        scores = [quantity.get_q(state, pos, len(moves[pos])) for pos in keys]

        if np.random.rand() < self.epsilon:
            # Softmax sampling; subtracting the maximum keeps exp() stable.
            shifted = np.asarray(scores, dtype=float)
            weights = np.exp(shifted - np.max(shifted))
            idx = np.random.choice(len(keys), p=weights / np.sum(weights))
        else:
            idx = np.argmax(scores)

        chosen = keys[idx]
        # The placed stone comes first, followed by the stones it captures.
        stones = [chosen] + list(moves[chosen])
        # flip() reports the total stone count for this colour, so it replaces
        # the counter rather than being added to it.
        self.count = board.flip(self.color, stones)
        if train:
            quantity.update_q(state, chosen, scores[idx])
        return self.count

In [None]:
def learning(board, quantity, player1, player2, stories=2000):
    """Run ``stories`` self-play games with training enabled.

    Before every game ``board`` and ``quantity`` are reset.  The two players
    then alternate (``player1`` first) calling ``put(..., train=True)`` until
    a round in which both of them return 0.
    """
    for _ in range(stories):
        board.reset()
        quantity.reset()
        game_over = False
        while not game_over:
            first_result = player1.put(board, quantity, train=True)
            second_result = player2.put(board, quantity, train=True)
            # The game ends only when both players returned 0 in the same round.
            game_over = first_result == 0 and second_result == 0

In [None]:
# Training: build a 6x6 board, a value table of matching size and one player
# per colour (default epsilon), then run self-play with the default number
# of stories.
b = Board(6)
q = Quantity(6)
p1 = Player(Board.BLACK)
p2 = Player(Board.WHITE)
learning(b, q, p1, p2)

In [None]:
# Reset for an individual match: restore the starting position and clear the
# table's last increment (the learned values in q.Q are kept), then show the
# board.
b.reset()
q.reset()
print(b.board)