In [1]:
from helpers import *
from simulator import *
from agents import *

In [2]:
x = ConnectN(7, 4)
x.print_grid()

[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]


In [3]:
def train_Q_learner(num_trials = 1000, k=2, n=4, grid_size = 7):
    """
    Trains the Q Learner against Minimax Depth k
    
    Inputs:
    Number of games to play
    On grid_size x
    N tokens to connect
    
    Outputs:
    Q Learner value table after training
    """
    depth = k
    N = n
    grid_size = grid_size
    x = ConnectN(grid_size, N)
    
    p1 = Q_Learner(x, None, None, player=1)
    p2 = Random_Learner(x)
    play_game_no_output(x, p1, p2)
    
    for game in xrange(1, num_trials):
                
        x = ConnectN(grid_size, N)
        p1 = Q_Learner(x, p1.value_table, None, player=1)
        p2 = Random_Learner(x)
        play_game_no_output(x, p1, p2)

        if game == num_trials - 1:
            return p1.value_table


In [5]:
x = ConnectN(7,4)
x.move(0,1)
x.move(1,-1)
x.move(1,1)
x.move(2,-1)
x.move(2,-1)
ql = Q_Learner(x, None, True, player=1)
ql.calc_next_move(None, x)

2

In [6]:
ql.value_table

{'0000000000000000000000000000000000001-100001-1-10000': array([  0.,   0.,  15.,   0.,   0.,   0.,   0.])}

In [7]:
train_Q_learner(num_trials=2)

{'00000000000000-100000010000001000000-100000011-100-10': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 '00000000000000000000000000000000000-100000010010-10': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 '00000000000000000000000000000000000-100000011-10000': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 '0000000000000000000000000000000000000000000000000': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 '00000000000000000000000000000000000000000000010-10': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 '00000000000000000000000000000000000000000001-10000': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 '00000000000000000000000000000000000000000010010-10': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 '00000000000000000000000000000000000000000011-10000': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 '00000000000000000000000000001000000-10000-1010010-10': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 '00000000000000000000000000001000000-100000010010-10': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 '0

In [35]:
class Q_Learner(TD_Learner):
    """
    Implementation of Q Learning, inheriting from TD Learner base class.
    """

    def __init__(self, task, value_table, known_states, epsilon=.1, discount_factor=.9, learning_rate=.5, player=1, trace_size=.1):
        TD_Learner.__init__(self, task, value_table, epsilon, discount_factor, learning_rate, player, trace_size)
        self.known_states = known_states

    def calc_next_move(self, reward, next_board_state):
        if reward is None:
            # Approximation of known states. Since too many states, instead, given a board position,
            # explore possible moves and give 15 reward to creating streaks of length 3 or 4 and
            # 20 reward for preventing an opponent win.
            if (self.known_states):
                for col in self.task.next_possible_moves():
                    row = np.sum([abs(x) for x in next_board_state.grid[col]])
                    if next_board_state.streakVertical(next_board_state.grid, col, row - 2, self.player) >= 2:
                        self.value_table[grid_to_key(next_board_state.grid)][col] = 15
                    temp_board = deepcopy(next_board_state.grid)
                    temp_board[col][row] = self.player
                    for i in range(0, 4):
                        if next_board_state.streakHorizontal(temp_board, col - i, row, self.player) >= 3:
                            self.value_table[grid_to_key(next_board_state.grid)][col] = 15
                        if next_board_state.streakDiagonalUp(temp_board, col - i, row - i, self.player) >= 3:
                            self.value_table[grid_to_key(next_board_state.grid)][col] = 15
                        if next_board_state.streakDiagonalDown(temp_board, col - i, row + i, self.player) >= 3:
                            self.value_table[grid_to_key(next_board_state.grid)][col] = 15

                    if next_board_state.streakVertical(next_board_state.grid, col, row - 3, -self.player) == 3:
                        self.value_table[grid_to_key(next_board_state.grid)][col] = 20
                    temp_board = deepcopy(next_board_state.grid)
                    temp_board[col][row] = -1*self.player
                    for i in range(0, 4):
                        if next_board_state.streakHorizontal(temp_board, col - i, row, -1*self.player) == 4:
                            self.value_table[grid_to_key(next_board_state.grid)][col] = 20
                        if next_board_state.streakDiagonalUp(temp_board, col - i, row - i, -1*self.player) == 4:
                            self.value_table[grid_to_key(next_board_state.grid)][col] = 20
                        if next_board_state.streakDiagonalDown(temp_board, col - i, row + i, -1*self.player) == 4:
                            self.value_table[grid_to_key(next_board_state.grid)][col] = 20

            next_action = self.softmax(next_board_state)

            self.last_board_state = next_board_state.grid

            self.last_action = next_action
            return self.last_action

        if reward == 50:
            delta = delta = reward - self.value_table[grid_to_key(self.last_board_state)][self.last_action]
            self.value_table[grid_to_key(self.last_board_state)][self.last_action] += self.learning_rate * delta

            return self.last_action

        if reward == -50:
            delta = delta = reward - self.value_table[grid_to_key(self.last_board_state)][self.last_action]
            self.value_table[grid_to_key(self.last_board_state)][self.last_action] += self.learning_rate * delta

            return self.last_action

        """
        VDBE-Softmax policy. If draw < epsilon, perform Softmax. Else do best action.
        """
        draw = np.random.uniform(0,1,1)

        if draw < self.epsilon:
            next_action = self.softmax(next_board_state)
        else:
            next_action = np.argmax(self.value_table[grid_to_key(next_board_state.grid)])

        # Update value function.

        print(next_board_state.grid)
        delta = reward + self.discount_factor * np.amax(self.value_table[grid_to_key(next_board_state.grid)]) - self.value_table[grid_to_key(self.last_board_state)][self.last_action]
        self.value_table[grid_to_key(self.last_board_state)][self.last_action] += self.learning_rate * delta

        # Update eligibility traces (Watson's Q(lambda))
        self.e[grid_to_key(self.last_board_state)][self.last_action] += 1

        # Eligibility traces
        # Note that here we do not implement classic eligibility traces, which iterate over all state, action pairs
        # Instead we consider all next possible board states and update those (for easier computation)
        next_possible_moves = next_board_state.next_possible_moves()
        next_possible_boards = []

        for i in next_possible_moves:
            temp_board = deepcopy(next_board_state)
            temp_board.move(next_action, self.player)
            next_possible_boards.append(temp_board)

        for board in next_possible_boards:
            valid_actions = board.next_possible_moves()
            for action in valid_actions:
                self.value_table[grid_to_key(board.grid)][action] += self.learning_rate * delta \
                                                                    * self.e[grid_to_key(board.grid)][action]
                if self.last_action == action:
                    self.e[grid_to_key(board.grid)][action] = self.discount_factor * self.trace_size \
                                                                    * self.e[grid_to_key(board.grid)][action]
                else:
                    self.e[grid_to_key(board.grid)][action] = 0

        self.last_board_state = next_board_state.grid
        self.last_action = next_action

        if next_board_state.simulate_move(self.last_action, self.player) == 1:
            self.last_action = self.softmax(next_board_state)

        return self.last_action

In [36]:
x = ConnectN(7, 4)
p1 = Q_Learner(x, None, None, player=1)
p2 = Random_Learner(x)
y = play_game_no_output(x, p1, p2)

[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1. -1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]
[[ 1.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1. -1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]]
[[ 1.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1. -1. -1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]]
[[ 1.  1.  1. -1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1. -1. -1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1

In [37]:
y

2

In [38]:
x = ConnectN(7, 4)
p1 = Random_Learner(x)
p2 = Q_Learner(x, None, None, player=-1)
y = play_game_no_output(x, p1, p2)

[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]


ValueError: Input must >= 2-d.

In [33]:
y

1