# Create tick-tack-toe agent by "MonteCalroES"
## Reference

Reinforcement Learning: An Introduction (Richard S. Sutton and Andrew G. Barto)
  - chapter 5.3
  - https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node53.html

## algorithm
<img src="https://webdocs.cs.ualberta.ca/~sutton/book/ebook/pseudotmp4.png" />
https://webdocs.cs.ualberta.ca/~sutton/book/ebook/pseudotmp4.png

## Prepare functions to play tick-tack-toe

In [362]:
def visualize_board(state):
    board = [-1 for i in range(9)]
    for i in range(2):
        for j in range(9):
            if state[i] >> j &1 == 1:
                board[j] = i
    icon_map = {-1: "-", 0:"o", 1:"x"}
    board = map(lambda i: icon_map[i], board)
    for i in range(3):
        print "%s %s %s" % tuple(board[i*3:(i+1)*3])

# return empty cell position on the board
def possible_actions(state):
    board = state[0] | state[1]
    def add(ary, pos):
        if (board >> pos)&1 == 0:
            ary.append(1<<pos)
        return ary
    return reduce(add, range(9), [])

def is_terminated(player_board):
    bin2i = lambda b: int(b, 2)
    line_horizon = any([player_board & mask == mask for mask in map(bin2i, ['000000111', '000111000', '111000000'])])
    line_vertical = any([player_board & mask == mask for mask in map(bin2i, ['001001001', '010010010', '100100100'])])
    line_diagonal = any([player_board & mask == mask for mask in map(bin2i, ['100010001', '001010100'])])
    return line_horizon | line_vertical | line_diagonal

def is_draw(state):
    return len(possible_actions(state))==0

def calc_reward(state):
    first_player_board, second_player_board = state
    if (is_terminated(first_player_board)):
        return 1
    elif(is_terminated(second_player_board)):
        return -1
    else:
        return 0

def apply_action(state, action):
    first_player_board ,second_player_board = state
    count_move = lambda : bin(first_player_board | second_player_board).count("1")
    if count_move()%2==0:
        first_player_board |= action
    else:
        second_player_board |= action
    return first_player_board, second_player_board

def play_a_game():
    state = (0,0)
    visualize_board(state)
    while not any(map(is_terminated, state) + [is_draw(state)]):
        act = int(raw_input("action => %s" % possible_actions(state)))
        state = apply_action(state, act)
        visualize_board(state)

In [365]:
play_a_game()

- - -
- - -
- - -
action => [1, 2, 4, 8, 16, 32, 64, 128, 256]1
o - -
- - -
- - -
action => [2, 4, 8, 16, 32, 64, 128, 256]8
o - -
x - -
- - -
action => [2, 4, 16, 32, 64, 128, 256]32
o - -
x - o
- - -
action => [2, 4, 16, 64, 128, 256]16
o - -
x x o
- - -
action => [2, 4, 64, 128, 256]64
o - -
x x o
o - -
action => [2, 4, 128, 256]256
o - -
x x o
o - x
action => [2, 4, 128]128
o - -
x x o
o o x
action => [2, 4]4
o - x
x x o
o o x
action => [2]2
o o x
x x o
o o x


## Prepare functions for GPI 

In [418]:
# Play 1 game and return Q-value (state, reward pair) array watched in the game
def gen_episode(Q, debug=False):
    state = (0, 0)
    episode = []
    while not any(map(is_terminated, state) + [is_draw(state)]):
        action = policy(Q, state, eps=0.3, debug=debug)
        episode.append((state, action))
        _, state = transition(state, action)
        if debug: visualize_board(state)
    return episode, calc_reward(state)

import math
# Epsiron-greedy
def policy(Q, current_state, eps=0.1, debug=False):
    do_random = lambda : random.random() < eps
    transition_curry = lambda action: transition(current_state, action)
    Q_value_curry = lambda action: Q[current_state[0]][current_state[1]][int(math.log(action,2))]
    if do_random():
        if debug: print "do random"
        return random.choice(possible_actions(current_state))
    else:
        if debug: print "do greedy"
        Q_value_for_actions = map(Q_value_curry, possible_actions(current_state))
        best_act_idx = Q_value_for_actions.index(max(Q_value_for_actions))
        return possible_actions(current_state)[best_act_idx]

# return  next state and reward after passed action is applied
def transition(state, action):
    next_state =  apply_action(state, action)
    reward = calc_reward(next_state)
    return reward, next_state

In [329]:
# Initialization
Q_table = [[[0 for a in range(9)] for j in range(2**9)] for i in range(2**9)]
Returns = [[[0 for a in range(9)] for j in range(2**9)] for i in range(2**9)]
gen_episode(Q_table, debug=True)

do greedy
o - -
- - -
- - -
do random
o - -
- - -
x - -
do greedy
o o -
- - -
x - -
do random
o o -
- - -
x x -
do greedy
o o o
- - -
x x -


([((0, 0), 1), ((1, 0), 64), ((1, 64), 2), ((3, 64), 128), ((3, 192), 4)], 1)

In [415]:
# policy evaluation => policy improvement process
def GPIProcess(Q):
    episode, reward = gen_episode(Q)
    for state, action in episode:
        s1, s2, a = state[0], state[1], int(math.log(action,2))
        Returns[s1][s2][a].append(reward)
        Q[s1][s2][a]= 1.0*sum(Returns[s1][s2][a]) / len(Returns[s1][s2][a])

def display_Q_value(Q, state):
    Q_value_curry = lambda action: Q[state[0]][state[1]][int(math.log(action,2))]
    Q_values =map(Q_value_curry, [1<<n for n in range(9)])
    max_len = max([len(str(val)) for val in Q_values])
    for i in range(3):
        line = []
        for j in range(3):
            pos = 3*i+j
            board = state[0] | state[1]
            line.append(str(Q_values[pos]) if (board>>pos)&1 == 0 else "x")
        print "%s %s %s" % tuple([elem.rjust(max_len, '0') for elem in line])

## Start learning !! Iterate GPI process for 50000 times

In [419]:
Q_table = [[[0 for a in range(9)] for j in range(2**9)] for i in range(2**9)]
Returns = [[[[] for a in range(9)] for j in range(2**9)] for i in range(2**9)]

# Improve agent by iterating GPI process
for i in range(50000):
    GPIProcess(Q_table)
display_Q_value(Q_table, (0,0))

0.898170731707 0.869512928443 0.889096573209
0.887445887446 0.936357908003 0.813437312537
0.961614734726 0.869718309859 0.874560375147


## Prepare functions to play with agent through console

In [402]:
def is_next_player_agent(state):
    first_player_board ,second_player_board = state
    count_move = lambda : bin(first_player_board | second_player_board).count("1")
    return count_move() % 2 == 0

def play_with_agent():
    state = (0,0)
    while not any(map(is_terminated, state) + [is_draw(state)]):
        action, player = None, None
        if is_next_player_agent(state):
            display_Q_value(Q_table, state)
            action = policy(Q_table, state)
            player = "agent"
        else:
            action = int(raw_input("action => %s" % possible_actions(state)))
            player = "you"
        state = apply_action(state, action)
        visualize_board(state)
        print "player : %s, action : %d" % (player, int(math.log(action, 2)))

## Let's play tick-tack-toe with our agent

In [420]:
play_with_agent()

0.898170731707 0.869512928443 0.889096573209
0.887445887446 0.936357908003 0.813437312537
0.961614734726 0.869718309859 0.874560375147
- - -
- - -
o - -
player : agent, action : 6
action => [1, 2, 4, 8, 16, 32, 128, 256]16
- - -
- x -
o - -
player : you, action : 4
0.911076443058 0.817518248175 0.413793103448
0.861538461538 0000000000000x 0.661538461538
0000000000000x 000000000000.6 00000000000.64
o - -
- x -
o - -
player : agent, action : 0
action => [2, 4, 8, 32, 128, 256]4
o - x
- x -
o - -
player : you, action : 2
0000000000000x 0.780487804878 0000000000000x
000000000001.0 0000000000000x 0.926829268293
0000000000000x 0.918918918919 0.965517241379
o - x
o x -
o - -
player : agent, action : 3


`>>> Good!! Agent seems understanding how to win!!`

In [421]:
play_with_agent()

0.898170731707 0.869512928443 0.889096573209
0.887445887446 0.936357908003 0.813437312537
0.961614734726 0.869718309859 0.874560375147
- - -
- - -
o - -
player : agent, action : 6
action => [1, 2, 4, 8, 16, 32, 128, 256]1
x - -
- - -
o - -
player : you, action : 0
0000000000000x 0.872093023256 0.945365853659
00000000000.75 0.849056603774 0.789473684211
0000000000000x 0.866666666667 0.904761904762
x - o
- - -
o - -
player : agent, action : 2
action => [2, 8, 16, 32, 128, 256]16
x - o
- x -
o - -
player : you, action : 4
0000000000000x 000000000000.5 0000000000000x
000000000000.0 0000000000000x 0.622641509434
0000000000000x 000000000000.0 000000000000.5
x - o
- x o
o - -
player : agent, action : 5
action => [2, 8, 128, 256]256
x - o
- x o
o - x
player : you, action : 8


`>>> Woops... Agent made a silly mistake :(`

In [458]:
import numpy as np
visited = reduce(lambda acc, e: sum(e) + acc, np.ndarray.flatten(np.array(Returns)), 0)
all_state = reduce(lambda acc, e: acc*e, np.array(Returns).shape, 1)
print "visited = %d" % visited
print "All state = %d" % all_state
print "visit rate = %f" % (1.0 * visited / all_state)

visited = 321255
All state = 2359296
visit rate = 0.136166


`>>> It seems that 50000 GPI iteration was too small (Most of states are never experienced)`