In [8]:
from __future__ import print_function

import pandas as pd
import numpy as np

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [374]:
def is_winning_board(board, winner = ['X', 'O']):
    
    ways_to_win = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 3, 6], [1, 4, 7], [2, 5, 8], [0, 4, 8], [2, 4, 6]])

    i = 0
    check_win = False

    while(check_win is False and i < len(ways_to_win)):
        
        win_vec = ways_to_win[i]
        
        vals_x = [board[w]== "X" for w in win_vec]
        vals_o = [board[w]== "O" for w in win_vec]
        
        if winner == 'X':
            check_win = (sum(vals_x) == 3)
        elif winner == 'O':
            check_win = (sum(vals_o) == 3)
        else:
             check_win = (sum(vals_x) == 3) | (sum(vals_o) == 3)
            
        i = i + 1    

    return check_win

def unique_boards(all_boards):
    return [np.array(x) for x in set(tuple(x) for x in all_boards)]

def is_full_board(board):
    return sum([board[i] in ["X", "O"] for i in range(9)]) == 9

def get_baseline_value(board):
    
    if is_winning_board(board, winner = 'X'):
        return 1    
    elif is_winning_board(board, winner = 'X') or is_full_board(board):
        return 0
    else:
        return .5
    
def get_value_board(board, value_df):
    board_df = pd.DataFrame([board])
    return board_df.merge(value_df, how = "left")['value'].loc[0]


In [375]:
round = 0
player_turns = ['X', 'O', 'X', 'O', 'X', 'O', 'X', 'O', 'X']

prev_round_boards = [np.array(["", "", "", "", "", "", "", "", ""])]

all_boards = prev_round_boards

while(round < 9):
    
    current_player = player_turns[round]
    round_boards = []

    for prev_board in prev_round_boards:
        
        if not is_winning_board(prev_board):
            still_open = np.where(prev_board == '')[0]

            for move in still_open:
                new_board = prev_board.copy()
                new_board[move] = current_player
                round_boards = round_boards + [new_board]
                            
    round_boards_unique = unique_boards(round_boards)
    
    prev_round_boards =  round_boards_unique
    
    all_boards = all_boards + round_boards_unique
        
    round = round + 1
    

In [376]:
init_values = [get_baseline_value(board) for board in all_boards]

value_df = pd.DataFrame(all_boards)

all_boards_str = [','.join(board) for board in all_boards]

value_df['board_str'] = all_boards_str

value_df['value'] = init_values

value_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,board_str,value
0,,,,,,,,,,",,,,,,,,",0.5
1,X,,,,,,,,,"X,,,,,,,,",0.5
2,,,X,,,,,,,",,X,,,,,,",0.5
3,,,,,,,X,,,",,,,,,X,,",0.5
4,,,,,,X,,,,",,,,,X,,,",0.5


In [377]:
value_df.value.value_counts()

0.5    4836
1.0     626
0.0      16
Name: value, dtype: int64

Let the learner play 300 rounds of games against a random-selection oponent with decaying learning rate and epsilon in epsilon-greedy strategy.

$V(S_t) \leftarrow V(S_t) + \alpha[ V(S_{t+1}) - V(S_t)]$ 

For example, if $\alpha  = .1$ and $V(S_t) = .4$ and $V(S_{t+1})$ = .5, then $V(S_t)$ becomes 0.41.    

Epsilon = % of time the learner takes a random step
(otherwise it takes the move with the highest 

In [378]:
alpha_list = np.arange(.3, 0, -.001)
epsilon_list = np.arange(.3, 0, -.001)

In [380]:
epoch = 0
curr_board = np.array(['X', '', '', '', 'X', '', 'O', '', 'O'])

def learner_move(curr_board, epoch):

    curr_board_str = ','.join(curr_board)

    curr_board_value = get_value_board(curr_board, value_df)

    still_open = np.where(curr_board == '')[0]

    next_boards = []

    for move in still_open:
        next_board = curr_board.copy()
        next_board[move] = 'X'
        next_boards = next_boards + [next_board]

    epsilon = epsilon_list[iter]
    alpha = alpha_list[iter]

    greedy = np.random.random_sample() > epsilon

    values = [get_value_board(board, value_df) for board in next_boards]

    if greedy:
        choices = still_open[np.where(values == max(values))]
        if len(choices) > 1:
            next_move = np.random.choice(choices)
        else:
            next_move = choices[0]
    else:
        next_move = np.random.choice(still_open)

    new_board = curr_board.copy()
    new_board[next_move] = 'X'

    new_board_value = get_value_board(new_board, value_df)

    curr_board_value  = curr_board_value + alpha*(new_board_value - curr_board_value)

    value_df.loc[value_df['board_str'] == curr_board_str, 'value'] = curr_board_value
    
    return new_board

def random_move(curr_board):
    still_open = np.where(curr_board == '')[0]

    next_move = np.random.choice(still_open)
    
    new_board = curr_board.copy()
    new_board[next_move] = 'O'
    
    return new_board
    
def make_move(curr_board, turn, epoch):
    
    if turn in [0,2,4,6,8]:
        new_board = learner_move(curr_board, epoch)
        
    else:
        new_board = random_move(curr_board)
        
    return new_board

def play_round(epoch):
    curr_board = np.array(['', '', '', '', '', '', '', '', ''])

    turn = 0

    check_win = False

    while(turn < 9 and check_win is False):

        new_board = make_move(curr_board, turn, epoch)

        x_wins = is_winning_board(new_board, 'X')
        o_wins = is_winning_board(new_board, 'O')

        check_win = x_wins or o_wins

        curr_board = new_board
        turn = turn + 1

    if x_wins:
        ending = "X wins"
    elif o_wins:
        ending = "O wins"
    else:
        ending = "cats"
        
    return curr_board, ending


In [382]:
for epoch in range(300):
    play_round(epoch)

In [383]:
value_df.value.value_counts()

0.5000    4595
1.0000     626
0.6500     195
0.7550      20
0.3500      17
0.0000      16
0.2450       4
0.6050       4
0.1715       1
Name: value, dtype: int64