In [1]:
import numpy as np
import random
from tqdm import tqdm # loops show smart progress meter
from enum import Enum

In [2]:
#class Player(Enum):
#    NOUGHTS = -1
#    CROSSES = 1

class Board:
    """A 3x3 noughts-and-crosses board backed by a NumPy array.

    ``self.b[x, y]`` is 0 for an empty cell and 1 or -1 for a cell taken by
    the corresponding player.  Cells can also be addressed by a single
    "linear" coordinate ``pos = x + 3 * y``.
    """

    @staticmethod
    def coordFromLinearCoord(pos):
        """Split a linear coordinate into ``(x, y)``.

        Inverse of :meth:`coordToLinearCoord` for ``pos`` in 0..8.
        """
        return pos % 3, int(pos // 3)

    @staticmethod
    def coordToLinearCoord(x, y):
        """Return the linear coordinate ``x + 3 * y`` of cell ``(x, y)``."""
        return x + 3 * y

    def __init__(self, bInit):
        """Wrap ``bInit``, a 3x3 numpy array, without copying it."""
        assert(bInit.shape == ((3,3)))

        self.b = bInit

    def hasEmptyCell(self):
        """Return True if at least one cell is still empty (equal to 0)."""
        return bool(np.any(self.b == 0))

    def isEmptyCell(self, x, y):
        """Return whether cell ``(x, y)`` is empty (equal to 0)."""
        return self.b[x, y] == 0

    def check(self, player):
        """Report the state of the game from ``player``'s point of view.

        Only ``player``'s lines are examined; the opponent's are not.

        Returns:
            ``player`` if ``player`` owns a complete row, column or diagonal,
            2 if that is not the case and the board is full (draw),
            0 otherwise (game not over).
        """
        assert(player == 1 or player == -1)

        cells = self.b

        # Rows and columns.
        for i in range(3):
            if all(cells[i, j] == player for j in range(3)) or \
               all(cells[j, i] == player for j in range(3)):
                return player

        # Both diagonals.
        if all(cells[j, j] == player for j in range(3)) or \
           all(cells[j, 2 - j] == player for j in range(3)):
            return player

        if not self.hasEmptyCell():   # board full and nobody completed a line: draw.
            return 2

        return 0    # game not over.

    def toKey(self):
        """Encode the board as an int with one decimal digit per cell.

        Cell ``(x, y)`` contributes the digit ``b[x, y] + 2`` (1, 2 or 3 for
        the values -1, 0, 1) at decimal position ``3 * x + y``.
        """
        key = 0
        place = 1

        for x in range(3):
            for y in range(3):
                key += (int(self.b[x, y]) + 2) * place
                place *= 10

        return key

    def fromKey(self, i):
        """Replace the board contents with the position encoded by key ``i``.

        Reverses :meth:`toKey`; ``self.b`` is rebound to a fresh float array.
        """
        self.b = np.zeros((3, 3))

        for x in range(3):
            for y in range(3):
                self.b[x, y] = i % 10 - 2
                i //= 10

    def move(self, x, y, player):
        """Put ``player`` on cell ``(x, y)`` and return ``self.check(player)``.

        The cell is overwritten without checking that it was empty.
        """
        self.b[x, y] = player

        return self.check(player)

    def possibleActions(self):
        """Return the linear coordinates of all empty cells.

        The cells are visited with ``x`` as the outer loop and ``y`` as the
        inner loop, so the result is not sorted by linear coordinate.
        """
        return [Board.coordToLinearCoord(x, y)
                for x in range(3)
                for y in range(3)
                if self.isEmptyCell(x, y)]

def getAllStartStates():
    """Return the key of every board position in which the game is still open.

    A position is kept when player 1 has placed as many marks as player -1 or
    exactly one more, and ``Board.check`` returns 0 for both players (nobody
    has a complete line and the board is not full).

    Every one of the 3**9 cell assignments is examined exactly once, so the
    returned list contains no duplicate keys.
    """
    from itertools import product

    boardKeys = []

    for cells in product((-1, 0, 1), repeat=9):
        surplus = cells.count(1) - cells.count(-1)
        if surplus not in (0, 1):
            continue

        board = Board(np.array(cells, dtype=float).reshape(3, 3))
        if board.check(1) == 0 and board.check(-1) == 0:
            boardKeys.append(board.toKey())

    return boardKeys

In [3]:
def policyRandomMove(board, pure = False):
    """Draw a random empty cell of ``board`` and return it as ``(x, y)``.

    With ``pure`` set, every empty cell is equally likely.  Otherwise an empty
    centre cell gets weight 3, empty corners weight 2 and the remaining empty
    cells weight 1.  The weights are normalised and handed to ``chooseMove``.
    """
    # 1 for every empty cell, 0 elsewhere.
    weights = (board.b == 0).astype(int)

    if not pure:
        # Per-cell preference: centre 3, corners 2, edges 1.
        weights = weights * np.array([[2, 1, 2],
                                      [1, 3, 1],
                                      [2, 1, 2]])

    return chooseMove(weights / weights.sum())

def chooseMove(policy):
    """Sample a cell ``(x, y)`` from the 3x3 probability grid ``policy``.

    One uniform number is drawn and compared against the running sum of the
    probabilities, visiting the cells with ``x`` as the outer loop.

    The running sum can end marginally below the drawn number because of
    floating-point round-off (or because ``policy`` sums to less than one).
    In that case the last cell with positive probability is returned, so the
    function always yields a cell that ``policy`` allows.

    Raises:
        ValueError: if no cell of ``policy`` has positive probability.
    """
    u = np.random.uniform()
    cumulative = 0
    lastAllowed = None

    for x in range(3):
        for y in range(3):
            if policy[x, y] > 0:
                lastAllowed = (x, y)
            cumulative += policy[x, y]
            if u < cumulative:
                return x, y

    if lastAllowed is None:
        raise ValueError("policy assigns no positive probability to any cell")

    return lastAllowed

def policyMCMove(board, q, player):
    """Return the greedy move ``(x, y)`` for the current position.

    Among the empty cells of ``board``, the one whose entry
    ``q[(board key, action)]`` is largest is chosen; on ties the first such
    cell in ``board.possibleActions()`` order wins.  ``player`` is not used.
    """
    state = board.toKey()
    candidates = board.possibleActions()  # linear coordinates of the empty cells

    greedy = max(candidates, key=lambda action: q[state, action])

    return Board.coordFromLinearCoord(greedy)

def policyMCMoveEpsGreedy(board, q, player, epsilon):
    """Return an epsilon-greedy move ``(x, y)``.

    With probability ``epsilon`` a uniformly random empty cell is returned;
    otherwise the choice is delegated to ``policyMCMove``.
    """
    explore = random.random() < epsilon

    if not explore:
        return policyMCMove(board, q, player)

    return Board.coordFromLinearCoord(random.choice(board.possibleActions()))

def expectedTargetUnderEpsGreedy(board, q, epsilon):
    """Return the expected q-value of the next move under an epsilon-greedy policy.

    With probability ``1 - epsilon`` the policy takes the best available
    action, and with probability ``epsilon`` one of the available actions
    uniformly at random, so the expectation is
    ``(1 - epsilon) * max(q) + epsilon * mean(q)`` over the empty cells of
    ``board``.

    Raises:
        ValueError: if ``board`` has no empty cell (``max`` of an empty list).
    """
    key = board.toKey()

    qValues = [q[(key, a)] for a in board.possibleActions()]
    bestq = max(qValues)

    return bestq * (1 - epsilon) + epsilon * sum(qValues) / len(qValues)
    
def playGame(nGames, q, epsilon = -1):
    """Play ``nGames`` games and print the outcome frequencies.

    Player 1 moves with ``policyRandomMove``; player -1 moves with
    ``policyMCMoveEpsGreedy`` using the table ``q`` and ``epsilon`` (the
    default of -1 means the random branch is never taken).  The side that
    moves first is drawn at random for every game.

    Prints three fractions: wins of player 1, wins of player -1, draws.
    """
    # Outcome code returned by Board.move -> number of games ending that way.
    tally = {1: 0, -1: 0, 2: 0}

    for _ in tqdm(range(nGames)):
        board = Board(np.zeros((3,3)))
        mover = random.choice([-1,1])
        outcome = 0

        while outcome == 0:
            if mover == 1:
                x, y = policyRandomMove(board)
            else:
                x, y = policyMCMoveEpsGreedy(board, q, mover, epsilon)

            outcome = board.move(x, y, mover)
            mover = -mover

        tally[outcome] += 1

    print(tally[1]/nGames, tally[-1]/nGames, tally[2]/nGames)

In [4]:
def initialise(keys):
    """Create fresh tables for every (board key, action) pair.

    For each key in ``keys`` the board is decoded and every empty cell becomes
    an action.  Returns two dicts keyed by ``(key, action)``: the first holds
    0.5 for every pair, the second holds 0 for every pair.
    """
    q, n = {}, {}

    for key in keys:
        board = Board(np.zeros((3,3)))
        board.fromKey(key)

        for pair in ((key, action) for action in board.possibleActions()):
            q[pair] = 0.5
            n[pair] = 0

    return q, n

# Fresh tables for every open position: q starts at 0.5 and n at 0 for each (key, action) pair.
q, n = initialise(getAllStartStates())

Here we look at the $\epsilon$-greedy method.  This means that we choose the currently best action most of the time, but sometimes, namely with probability $\epsilon$, we explore by choosing an action at random.  In our case, one player plays randomly and the other follows this strategy.  The action-value function $q(s, a)$ is a function of the state $s$ and the action $a$.

In [10]:
def playSingleGame(player):
    """Play one game and record the moves of player -1.

    Player 1 moves with ``policyRandomMove``; player -1 moves epsilon-greedily
    using the module-level ``q`` and ``epsilon``.

    Args:
        player: ignored.  The side that moves first is always drawn at random;
            the parameter is kept so existing calls keep working.

    Returns:
        ``(gameState, listStateAction)`` where ``gameState`` is the final value
        returned by ``Board.move`` (1, -1 or 2) and ``listStateAction`` lists
        ``(board key before the move, (x, y))`` for every move of player -1.
    """
    player = random.choice([-1,1])
    board = Board(np.zeros((3,3)))

    listStateAction = []

    gameState = 0

    # Loop on the local game state, not on a global left over from another cell.
    while gameState == 0:
        if player == 1:
            x, y = policyRandomMove(board)
        else:
            x, y = policyMCMoveEpsGreedy(board, q, player, epsilon)
            listStateAction.append((board.toKey(), (x, y)))

        gameState = board.move(x, y, player)
        player *= -1

    return gameState, listStateAction

#epsilon greedy
# On-policy Monte Carlo control: player -1 follows an epsilon-greedy policy
# derived from q, and every (state, action) it visited in a game is moved
# towards that game's final reward by a running average.

epsilon = 0.05
nGames = 100000

for _ in tqdm(range(nGames)):

    # playSingleGame draws the starting side itself and ignores its argument,
    # so pass the learner's mark explicitly instead of relying on a global
    # named `player` that this cell never defines.
    finalGameState, listStateAction = playSingleGame(-1)

    # r is the reward from the point of view of player -1.

    if finalGameState == 2: # draw
        r = 0.5
    elif finalGameState == 1: # loss
        r = 0
    else: # win
        r = 1

    for key, (x, y) in listStateAction:
        action = Board.coordToLinearCoord(x, y)

        u = (key, action)

        # Incremental mean: n[u] counts how many rewards q[u] already averages.
        q[u] = q[u] * n[u] + r
        n[u] += 1
        q[u] /= n[u]

playGame(nGames, q, epsilon)

100%|██████████| 100000/100000 [01:43<00:00, 961.73it/s]
100%|██████████| 100000/100000 [01:01<00:00, 1620.44it/s]

0.04176 0.89466 0.06358





In [218]:
# Start again from fresh tables (q = 0.5, n = 0 for every pair) before the next experiment.
q, n = initialise(getAllStartStates())

In [None]:
#exploring starts
# Monte Carlo control in which player -1 always opens, plays its first move
# uniformly at random and is greedy with respect to q afterwards.
nGames = 100000
for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3,3)))
    player = -1
    randomAction = True      # True until player -1 has made its first move
    gameOver = 0
    listStateAction=[]

    while gameOver == 0:
        if player != 1:
            key = b.toKey()
            if not randomAction:
                x,y = policyMCMove(b, q, player)
            else:
                # uniform distribution over the empty cells
                p = np.where(b.b==0, 1, 0)
                policy = p / np.sum(p)
                x,y = chooseMove(policy)
                randomAction = False

            listStateAction.append((key, (x, y)))
        else:
            x, y = policyRandomMove(b)

        gameOver = b.move(x, y, player)
        player *= -1

    # reward for player -1: draw 0.5, loss 0, win 1
    r = 0.5 if gameOver == 2 else (0 if gameOver == 1 else 1)

    for key, (x,y) in listStateAction:
        a = Board.coordToLinearCoord(x, y)
        u = (key, a)

        # running average of the rewards seen for this pair
        q[u] = (q[u] * n[u] + r) / (n[u] + 1)
        n[u] += 1

playGame(nGames, q)

In [None]:
# Fresh q table; the zero-initialised second table is bound to c
# (presumably the cumulative importance weights of the off-policy cell that follows).
q, c = initialise(getAllStartStates())

In [None]:
#off policy
# Off-policy Monte Carlo control with weighted importance sampling.
# Behaviour policy of player -1: uniformly random over the empty cells.
# Target policy: greedy with respect to q.  c accumulates the importance
# weights per (state, action) pair.
nGames = 1000000
for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3,3)))
    player = random.choice([-1,1])
    gameOver = 0
    listStateAction = []
    while gameOver == 0:
        if player == 1:
            x, y = policyRandomMove(b)
        else:
            x, y = policyRandomMove(b, True)
            listStateAction.append((b.toKey(), (x, y)))
        gameOver = b.move(x, y, player)
        player = -player

    # reward for player -1: draw 0.5, loss 0, win 1
    if gameOver == 2:
        r = 0.5
    elif gameOver == 1:
        r = 0
    else:
        r = 1

    # Walk the episode backwards from the final move: W is the importance
    # ratio of everything that happened AFTER the move being updated, so it
    # has to be built up starting at the end of the game.
    W = 1
    for k, (x, y) in reversed(listStateAction):
        a = Board.coordToLinearCoord(x, y)
        c[(k, a)] = c[(k, a)] + W
        q[(k, a)] = q[(k, a)] + W * (r - q[(k, a)]) / c[(k, a)]

        # Greedy action in state k (first one on ties).
        b = Board(np.zeros((3,3)))
        b.fromKey(k)
        pA = b.possibleActions()
        bestA = max(pA, key=lambda aa: q[(k, aa)])

        # The target policy would not have played a: all earlier moves have
        # importance ratio zero, so stop here.
        if bestA != a:
            break

        # Target probability 1 divided by behaviour probability 1/len(pA).
        W = W * len(pA)

playGame(nGames, q)

In [None]:
# Fresh q table (0.5 everywhere); the second table returned by initialise is discarded.
q, _ = initialise(getAllStartStates())

In [None]:
#sarsa
# On-policy TD control for player -1 (step size alpha, no discounting).
# Intermediate moves carry reward 0; the move after which the game ends
# receives the final reward (win 1, draw 0.5, loss 0).
nGames = 100000
alpha=0.1
epsilon = 0.1
for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3,3)))
    player = random.choice([-1,1])
    gameOver = 0
    # (key, action): the most recent state-action pair of player -1,
    # None until player -1 has moved.
    key = None
    action = None
    while gameOver == 0:
        if player == 1:
            x, y = policyRandomMove(b)
        else:
            # Explore with the epsilon declared above rather than a separate
            # hard-coded rate.
            x, y = policyMCMoveEpsGreedy(b, q, player, epsilon)
            newKey = b.toKey()
            newAction = Board.coordToLinearCoord(x, y)
            if action is not None:
                # SARSA step: move the previous pair towards the value of the
                # pair actually chosen now.
                q[(key,action)] = q[(key,action)] + alpha*(q[(newKey,newAction)] - q[(key,action)])
            key = newKey
            action = newAction
        gameOver = b.move(x, y, player)
        if gameOver != 0 and action is not None:
            # Terminal step: whoever made the last move, the learner's latest
            # pair is moved towards the final reward.
            if gameOver == -1:
                r = 1
            elif gameOver == 1:
                r = 0
            else:
                r = 0.5
            q[(key,action)] = q[(key,action)] + alpha*(r - q[(key,action)])
        player = -player

playGame(nGames, q)

In [None]:
# Fresh q table (0.5 everywhere); the second table returned by initialise is discarded.
q, _ = initialise(getAllStartStates())

In [None]:
#q learning
# Off-policy TD control for player -1 (step size alpha, no discounting).
# Player -1 behaves epsilon-greedily (epsilon 0.1) but each pair is moved
# towards the best q-value available in the next state it reaches.
# Intermediate moves carry reward 0; the move after which the game ends
# receives the final reward (win 1, draw 0.5, loss 0).
nGames = 100000
alpha=0.1
for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3,3)))
    player = random.choice([-1,1])
    gameOver = 0
    # (key, action): the most recent state-action pair of player -1,
    # None until player -1 has moved.
    key = None
    action = None
    while gameOver == 0:
        if player == 1:
            x, y = policyRandomMove(b)
        else:
            x, y = policyMCMoveEpsGreedy(b,q,player,0.1)
            newKey = b.toKey()
            newAction = Board.coordToLinearCoord(x, y)
            if action is not None:
                # Bootstrap exactly once per transition, from the state the
                # learner has just arrived in.
                maxq = max(q[(newKey,aa)] for aa in b.possibleActions())
                q[(key,action)] = q[(key,action)] + alpha*(maxq - q[(key,action)])
            key = newKey
            action = newAction
        gameOver = b.move(x, y, player)
        if gameOver != 0 and action is not None:
            # Terminal step: whoever made the last move, the learner's latest
            # pair is moved towards the final reward.
            if gameOver == -1:
                r = 1
            elif gameOver == 1:
                r = 0
            else:
                r = 0.5
            q[(key,action)] = q[(key,action)] + alpha*(r - q[(key,action)])
        player = -player

playGame(nGames, q)

In [None]:
# Fresh q table (0.5 everywhere); the second table returned by initialise is discarded.
q, _ = initialise(getAllStartStates())

In [None]:
# expected sarsa where behaviour policy is eps greedy derived from q
# (expected sarsa where behaviour policy is greedy derived from q = q learning)
#
# TD control for player -1 (step size alpha, no discounting).  Each pair is
# moved towards the expected q-value of the next state under the same
# epsilon-greedy policy.  Intermediate moves carry reward 0; the move after
# which the game ends receives the final reward (win 1, draw 0.5, loss 0).
nGames = 100000
alpha = 0.1
epsilon = 0.1

for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3,3)))

    player = random.choice([-1,1])

    gameOver = 0

    # (key, action): the most recent state-action pair of player -1,
    # None until player -1 has moved.
    key = None
    action = None

    while gameOver == 0:
        if player == 1:
            x, y = policyRandomMove(b)
        else:
            x, y = policyMCMoveEpsGreedy(b, q, player, epsilon)
            newKey = b.toKey()
            newAction = Board.coordToLinearCoord(x, y)
            if action is not None:
                # Bootstrap exactly once per transition, from the state the
                # learner has just arrived in.
                expectedTarget = expectedTargetUnderEpsGreedy(b, q, epsilon)
                q[(key,action)] = q[(key,action)] + alpha*(expectedTarget - q[(key,action)])
            key = newKey
            action = newAction
        gameOver = b.move(x,y,player)
        if gameOver != 0 and action is not None:
            # Terminal step: whoever made the last move, the learner's latest
            # pair is moved towards the final reward.
            if gameOver == -1:
                r = 1
            elif gameOver == 1:
                r = 0
            else:
                r = 0.5
            q[(key,action)] = q[(key,action)] + alpha*(r - q[(key,action)])
        player = -player

playGame(nGames, q)