In [1]:
import numpy as np
import itertools
import pickle
import os.path

In [2]:
# Function which checks whether the input state is a winning state.
# playerIdx = 1 means the player is 'X', playerIdx = 2 means the player is 'O'
def isWinningState(inState, playerIdx=1):
    
    # Check for horizontal win states
    idx1 = 0
    idx2 = 1
    idx3 = 2
    for i in xrange(3):
        if (inState[idx1] == inState[idx2] == inState[idx3]) and inState[idx1] == playerIdx:
            return True
        
        idx1 = idx1 + 3
        idx2 = idx2 + 3
        idx3 = idx3 + 3
    
    # Check for vertical win states
    idx1 = 0
    idx2 = 3
    idx3 = 6
    
    for i in xrange(3):
        if (inState[idx1] == inState[idx2] == inState[idx3]) and inState[idx1] == playerIdx:
            return True
        
        idx1 = idx1 + 1
        idx2 = idx2 + 1
        idx3 = idx3 + 1
    
    # Check for diagonal win state 1
    if (inState[0] == inState[4] == inState[8]) and inState[0] == playerIdx:
        return True
    
    # Check for diagonal win state 2
    if (inState[2] == inState[4] == inState[6]) and inState[2] == playerIdx:
        return True
    
    # If we got to here, no win states were found
    return False
            

In [3]:
# Checks if this is a drawn state.
# Note that this should be called AFTER checking if 
# this state is a winning state because this does no
# explicit win check
def isDrawn(state):
    for i in xrange(9):
        if state[i] == 0:
            return False
        
    # If we get to here then it is a drawn game
    return True

In [4]:
# Determines possible next states for the computer
def nextStates(state, playerIdx=1):
    nextStates = []
    stateList = list(state)
    for i in xrange(9):
        if stateList[i] == 0:
            newState = list(stateList)
            newState[i] = playerIdx
            nextStates.append(tuple(newState))
            
    return nextStates

In [5]:
# Takes a state as a tuple, e.g. (1, 0, 0, 1, 1, 0, 0, 0, 0)
# and prints out the appearance of the board, e.g:
#
#           X  |      |    
#           X  |   X  |    
#              |      |
def printBoardState(state):
    idx = 0
    lookup = {0: ' ', 1: 'X', 2: 'O'}
    for i in xrange(3):
        c0 = lookup[state[idx]]
        c1 = lookup[state[idx+1]]
        c2 = lookup[state[idx+2]]
        if c0 == ' ':
            c0 = str(idx)
        if c1 == ' ':
            c1 = str(idx + 1)
        if c2 == ' ':
            c2 = str(idx + 2)
        print c0 + '\t|\t' + c1 + '\t|\t' + c2
        idx = idx + 3

In [6]:
# Our computer will play player 1, i.e. it will play as 'X'
# Create lookup table for all possible board states and set initial values to zero
V = {}
for combination in itertools.product(xrange(3), repeat=9):
    if isWinningState(combination, 1):
        V[combination] = 1.0
    elif isDrawn(combination) or isWinningState(combination, 2):
        V[combination] = 0.0
    else:
        V[combination] = 0.5

In [7]:
nTrainingEpisodes = 1000000
stepSize = 0.01
exploreProbability = 0.4
trainFromScratch = True
trainAtAll = True

# Load in a previously trained value table
if not trainFromScratch and os.path.exists('valuetable.txt'):
    print 'Loading pretrained file...'
    with open('valuetable.txt', 'rb') as handle:
        V = pickle.loads(handle.read())
        
tally1 = 0
tally2 = 0

if trainAtAll:
    for i in xrange(nTrainingEpisodes):
        # Initialise the game
        curState = (0,0,0,0,0,0,0,0,0)
        moveNum = 0
        while True:
            futureStates = nextStates(curState, 1)
            nFutureStates = len(futureStates)
            # Iterate through future states and find best one
            # in terms of value function, V
            maxValue = V[futureStates[0]]
            maxIdx = 0
            for j in xrange(nFutureStates):
                if V[futureStates[j]] > maxValue:
                    maxValue = V[futureStates[j]]
                    maxIdx = j
                    
            # Explore suboptimal states with some probability
            if np.random.uniform() < exploreProbability:
                #print 'Explore step'
                # Don't randomly explore if we are on the verge of a win
                if abs(maxValue - 1.0) < 1e-12:
                    newState = futureStates[maxIdx]
                    V[curState] = V[curState] + stepSize * (V[newState] - V[curState]) 
                    #print V[curState]
                    curState = newState
                else:
                    # Select a random state to explore
                    randomStateIdx = int(nFutureStates * np.random.uniform())
                    curState = futureStates[randomStateIdx]
                    tally1 = tally1 + 1
            else:
                #print 'TD step'
                # Perform Temporal Difference update: V(s) = V(s) + stepSize*[V(s') - V(s)]
                newState = futureStates[maxIdx]
                V[curState] = V[curState] + stepSize * (V[newState] - V[curState]) 
                #print V[curState]
                curState = newState
                tally2 = tally2 + 1
                
            #print curState
            #printBoardState(curState)
            #print ''

            # Check if the game is over
            if isWinningState(curState, 1):
                #print 'X won'
                break

            # Check if game is drawn
            if isDrawn(curState):
                #print 'X drawn'
                break
            

            # NOW PLAY AS THE OTHER PLAYER (i.e. player 2)
            
            futureStates = nextStates(curState, 2)
            nFutureStates = len(futureStates)
            # Iterate through future states and find best one
            # in terms of value function, V
            minValue = V[futureStates[0]]
            minIdx = 0
            for j in xrange(nFutureStates):
                if V[futureStates[j]] < min:
                    minValue = V[futureStates[j]]
                    minIdx = j
                    
            # Explore suboptimal states with some probability
            if np.random.uniform() < exploreProbability:
                if abs(minValue - 0.0) < 1e-12:
                    newState = futureStates[minIdx]
                    V[curState] = V[curState] + stepSize * (V[newState] - V[curState]) 
                    curState = newState
                # Select a random state to explore
                randomStateIdx = int(nFutureStates * np.random.uniform())
                curState = futureStates[randomStateIdx]
            else:
                        
                newState = futureStates[minIdx]
                V[curState] = V[curState] + stepSize * (V[newState] - V[curState]) 
                curState = newState
            
            #print curState
            #printBoardState(curState)
            #print ''

            # Check if the game is over
            if isWinningState(curState, 2):
                #print 'O won'
                break

            # Check if game is drawn
            if isDrawn(curState):
                #print 'O drawn'
                break
                
            moveNum = moveNum + 1
            
            
    
    print 'Expored ' + str(tally1) + ' times'
    print 'Learned ' + str(tally2) + ' times'

    # Save the value table to file
    with open('valuetable.txt', 'wb') as handle:
        pickle.dump(V, handle)

Expored 52923 times
Learned 1845056 times


In [8]:
print V[(1, 1, 0, 0, 2, 0, 0, 0, 2)]

1.0


In [10]:
# ********* Play against the computer **********
# Initialise the game
curState = (0,0,0,0,0,0,0,0,0)
while True:
    # Iterate through future states and find best one
    # in terms of value function, V
    futureStates = nextStates(curState)
    nFutureStates = len(futureStates)
    maxValue = V[futureStates[0]]
    maxIdx = 0
    print futureStates
    for j in xrange(1, nFutureStates):
        print V[futureStates[j]]
        if V[futureStates[j]] > maxValue:
            maxValue = V[futureStates[j]]
            maxIdx = j

    curState = futureStates[maxIdx]
    
    printBoardState(curState)

    # Check if the game is over
    if isWinningState(curState, 1):
        print 'Game Over! Bot Wins!'
        break

    # Check if game is drawn
    if isDrawn(curState):
        print 'Game Over! Drawn.'
        break
    
    # Ask the user what their move is
    userMoveIdx = raw_input('Please enter your move: ')
    curStateList = list(curState)
    curStateList[int(userMoveIdx)] = 2
    curState = tuple(curStateList)
    
        # Check if the game is over
    if isWinningState(curState, 2):
        print 'Game Over! You Win!'
        break

    # Check if game is drawn
    if isDrawn(curState):
        print 'Game Over! Drawn.'
        break
        
    

[(1, 0, 0, 0, 0, 0, 0, 0, 0), (0, 1, 0, 0, 0, 0, 0, 0, 0), (0, 0, 1, 0, 0, 0, 0, 0, 0), (0, 0, 0, 1, 0, 0, 0, 0, 0), (0, 0, 0, 0, 1, 0, 0, 0, 0), (0, 0, 0, 0, 0, 1, 0, 0, 0), (0, 0, 0, 0, 0, 0, 1, 0, 0), (0, 0, 0, 0, 0, 0, 0, 1, 0), (0, 0, 0, 0, 0, 0, 0, 0, 1)]
0.909007709231
0.779437519279
0.793665092587
0.793786485248
0.619857596472
0.801001705539
0.787122520652
0.659774579783
X	|	1	|	2
3	|	4	|	5
6	|	7	|	8
Please enter your move: 4
[(1, 1, 0, 0, 2, 0, 0, 0, 0), (1, 0, 1, 0, 2, 0, 0, 0, 0), (1, 0, 0, 1, 2, 0, 0, 0, 0), (1, 0, 0, 0, 2, 1, 0, 0, 0), (1, 0, 0, 0, 2, 0, 1, 0, 0), (1, 0, 0, 0, 2, 0, 0, 1, 0), (1, 0, 0, 0, 2, 0, 0, 0, 1)]
0.911165897392
0.885951506137
0.630566028236
0.889579653384
0.547001320071
0.559601151042
X	|	X	|	2
3	|	O	|	5
6	|	7	|	8
Please enter your move: 2
[(1, 1, 2, 1, 2, 0, 0, 0, 0), (1, 1, 2, 0, 2, 1, 0, 0, 0), (1, 1, 2, 0, 2, 0, 1, 0, 0), (1, 1, 2, 0, 2, 0, 0, 1, 0), (1, 1, 2, 0, 2, 0, 0, 0, 1)]
0.581595851636
0.999993352706
0.141127971051
0.538278913446
X	|	X	

#### V[(1, 1, 0, 0, 0, 0, 0, 0, 2)]

In [140]:
print nextStates((1, 1, 0, 0, 0, 0, 1, 2, 2))

[(1, 1, 1, 0, 0, 0, 1, 2, 2), (1, 1, 0, 1, 0, 0, 1, 2, 2), (1, 1, 0, 0, 1, 0, 1, 2, 2), (1, 1, 0, 0, 0, 1, 1, 2, 2)]


In [141]:
print V[(1, 1, 1, 0, 0, 0, 1, 2, 2)]
print V[]

1.0


In [22]:
printBoardState((1, 2, 1, 1, 1, 0, 2, 0, 2))

X	|	O	|	X
X	|	X	|	5
O	|	7	|	O
