In [46]:
import numpy as np
import random
from tqdm import tqdm # loops show smart progress meter
from enum import Enum

In [47]:
#class Player(Enum):
#    NOUGHTS = -1
#    CROSSES = 1

class Board:
    """A 3x3 noughts-and-crosses grid backed by a numpy array.

    Cells contain 1 or -1 for the two players and 0 when empty.
    """

    def __init__(self, bInit):
        """Wrap ``bInit``, a numpy array of shape (3, 3); the array is stored, not copied."""
        assert bInit.shape == (3, 3)

        self.b = bInit

    def isGameOver(self):
        """Return True when no cell is empty (the grid is full)."""
        return np.all(self.b != 0)

    def isEmptyCell(self, x, y):
        """Return True when cell (x, y) holds 0."""
        return self.b[x, y] == 0

    def check(self, player):
        """Report the state of the game from ``player``'s point of view.

        Returns ``player`` if that player owns a complete row, column or
        diagonal, 2 if there is no such line and the grid is full (draw),
        and 0 otherwise.
        """
        assert player in (1, -1)

        owned = self.b == player

        # Collect the eight winning lines as boolean triples.
        lines = [owned[i, :] for i in range(3)]
        lines += [owned[:, i] for i in range(3)]
        lines += [owned.diagonal(), owned[::-1].diagonal()]

        if any(line.all() for line in lines):
            return player

        return 2 if self.isGameOver() else 0

    def toKey(self):
        """Encode the grid as a 9-digit integer.

        Cell (x, y) contributes the decimal digit ``value + 2`` (so 1, 2 or 3)
        at position ``3*x + y``, counted from the least significant digit.
        """
        digits = self.b.flatten() + 2       # row-major order: index 3*x + y
        weights = 10 ** np.arange(9)

        return int(np.dot(digits, weights))

    def fromKey(self, i):
        """Replace the grid with the position encoded by key ``i`` (inverse of toKey)."""
        grid = np.zeros((3, 3))

        for cell in range(9):
            i, digit = divmod(i, 10)        # peel off the least significant digit
            grid[cell // 3, cell % 3] = digit - 2

        self.b = grid

    def move(self, x, y, player):
        """Write ``player`` into cell (x, y) and return ``check(player)``."""
        self.b[x, y] = player

        return self.check(player)

    def possibleActions(self):
        """Return the empty cells as linear coordinates ``x + 3*y``.

        The cells are scanned with x as the outer and y as the inner index,
        so an empty grid yields [0, 3, 6, 1, 4, 7, 2, 5, 8].
        """
        return [x + 3 * y
                for x in range(3)
                for y in range(3)
                if self.isEmptyCell(x, y)]
    
def getAllStartStates():
    """Return the keys of every undecided position with a legal piece balance.

    A position is kept when
      * the number of 1s equals the number of -1s, or exceeds it by one
        (presumably the positions in which player -1 is about to move), and
      * ``Board.check`` returns 0 for both players, i.e. nobody has won and
        at least one cell is still empty.

    Every one of the 3**9 cell assignments is visited exactly once, so each
    key appears exactly once in the returned list.
    """
    from itertools import product

    board = Board(np.zeros((3, 3)))
    keys = []

    for cells in product((-1, 0, 1), repeat=9):
        # Cheap piece-count filter before touching the board.
        if cells.count(1) - cells.count(-1) not in (0, 1):
            continue

        # Linear index i maps to (x, y) = (i % 3, i // 3), the same
        # convention Board.possibleActions uses for its actions.
        for i, p in enumerate(cells):
            board.b[i % 3, i // 3] = p

        if board.check(1) == 0 and board.check(-1) == 0:
            keys.append(board.toKey())

    return keys

In [48]:
def policyRandomMove(board, pure = False):
    """Sample an empty cell of ``board`` and return it as ``(x, y)``.

    With ``pure`` set, every empty cell is equally likely.  Otherwise the
    centre cell gets weight 3, the four corners weight 2 and the remaining
    cells weight 1 before the weights are normalised into probabilities.
    The actual draw is delegated to ``chooseMove``.
    """
    empty = (board.b == 0).astype(int)      # 1 where the cell is free, else 0

    if pure:
        weights = empty
    else:
        preference = np.array([[2, 1, 2],
                               [1, 3, 1],
                               [2, 1, 2]])
        weights = empty * preference

    return chooseMove(weights / np.sum(weights))

def chooseMove(policy):
    """Sample a cell ``(x, y)`` from a 3x3 grid of probabilities.

    A uniform number ``u`` in [0, 1) is drawn and the cells are scanned with
    x as the outer and y as the inner index; the first cell at which the
    running sum of probabilities exceeds ``u`` is returned.

    If the running sum never exceeds ``u`` (the probabilities add up to
    slightly less than 1 because of floating-point rounding, or the policy is
    under-normalised), the last cell with positive probability is returned
    instead of falling off the end of the function.

    Raises:
        ValueError: if no cell has positive probability.
    """
    u = np.random.uniform()
    cumulative = 0
    lastPositive = None

    for x in range(3):
        for y in range(3):
            # Cells with zero (or NaN) probability can never be selected.
            if policy[x, y] > 0:
                cumulative += policy[x, y]
                lastPositive = (x, y)
                if u < cumulative:
                    return x, y

    if lastPositive is None:
        raise ValueError("policy has no cell with positive probability")

    return lastPositive


def policyMCMove(board, q, player):
    """Return the greedy move ``(x, y)`` for the position on ``board``.

    Args:
        board: object providing ``toKey()`` and ``possibleActions()``.
        q: mapping from ``(key, action)`` to the estimated action value.
        player: unused; kept so the signature matches the other policies.

    Returns:
        The coordinates of the available action with the highest q-value,
        decoded from its linear form ``action = x + 3*y``.  Ties go to the
        action that appears first in ``board.possibleActions()``.

    Raises:
        ValueError: if the board has no available action.
    """
    key = board.toKey()
    actions = board.possibleActions()

    # max() keeps the first maximal element, i.e. a strict ">" comparison
    # against the best value seen so far.
    bestAction = max(actions, key=lambda a: q[key, a])

    return bestAction % 3, bestAction // 3

def policyMCMoveEpsGreedy(board, q, player, epsilon):
    """Epsilon-greedy move selection on top of ``policyMCMove``.

    With probability ``epsilon`` a uniformly random available action is
    returned; otherwise the choice is delegated to ``policyMCMove``.  Either
    way the result is an ``(x, y)`` pair.
    """
    explore = random.random() < epsilon

    if not explore:
        return policyMCMove(board, q, player)

    a = random.choice(board.possibleActions())
    y, x = divmod(a, 3)     # actions are encoded as x + 3*y

    return x, y

def expectedTargetUnderEpsGreedy(board, q, epsilon):
    """Expected q-value of the position on ``board`` under an epsilon-greedy policy.

    Each available action contributes ``epsilon * q / (number of actions)``
    and the largest q-value additionally contributes ``(1 - epsilon)`` times
    its value.
    """
    key = board.toKey()
    actions = board.possibleActions()
    values = [q[(key, a)] for a in actions]

    # Exploration part: every action is equally likely.
    expected = 0
    for value in values:
        expected += epsilon * value / len(actions)

    # Exploitation part: the greedy action takes the remaining probability.
    greedyValue = max(values) if values else None
    expected += greedyValue * (1 - epsilon)

    return expected
    
def playGame(q, epsilon = -1, nGames = 100000):
    """Evaluate a q-table by playing it against the weighted-random opponent.

    Player 1 always moves with ``policyRandomMove`` (default weighting).
    Player -1 moves greedily with respect to ``q`` (``policyMCMove``), or
    epsilon-greedily (``policyMCMoveEpsGreedy``) when ``epsilon`` is positive.
    The side that moves first is drawn at random for every game.

    Prints three fractions: games won by player 1, games won by player -1,
    and draws.

    Args:
        q: mapping from ``(board key, action)`` to action value.
        epsilon: exploration rate for player -1; values <= 0 mean greedy play.
        nGames: number of evaluation games to play (must be at least 1).

    Raises:
        ValueError: if ``nGames`` is smaller than 1.
    """
    if nGames < 1:
        raise ValueError("nGames must be at least 1")

    win1 = 0
    winM1 = 0
    draw = 0

    for _ in tqdm(range(nGames)):
        b = Board(np.zeros((3,3)))
        player = random.choice([-1,1])
        gameOver = 0

        while gameOver == 0:
            if player == 1:
                x, y = policyRandomMove(b)
            elif epsilon > 0:
                x, y = policyMCMoveEpsGreedy(b, q, player, epsilon)
            else:
                x, y = policyMCMove(b, q, player)

            # Board.move returns the mover on a win, 2 on a draw, 0 otherwise.
            gameOver = b.move(x, y, player)

            player = -player

        if gameOver == 2:
            draw += 1
        elif gameOver == 1:
            win1 += 1
        else:
            winM1 += 1

    print(win1/nGames, winM1/nGames, draw/nGames)

In [49]:
# Build the tables used by the Monte Carlo cell below: for every start state
# and every action available in it, q starts at 0.5 and the visit count n at 0.
startStates = getAllStartStates()

q, n = {}, {}

for key in startStates:
    board = Board(np.zeros((3, 3)))
    board.fromKey(key)

    pairs = [(key, action) for action in board.possibleActions()]
    q.update(dict.fromkeys(pairs, 0.5))
    n.update(dict.fromkeys(pairs, 0))


In [None]:
#epsilon greedy
# Every-visit Monte Carlo control for player -1.  Player 1 moves with
# policyRandomMove; player -1 moves epsilon-greedily with respect to q.
# After each game every (state, action) pair visited by player -1 has its
# running average updated with the final reward:
# 1 if -1 won, 0.5 for a draw, 0 if 1 won.
epsilon = 0.05
nGames = 100000

for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3, 3)))
    player = random.choice([-1, 1])     # random side moves first
    gameOver = 0
    listStateAction = []                # (key before the move, (x, y)) for player -1

    while gameOver == 0:
        if player == -1:
            x, y = policyMCMoveEpsGreedy(b, q, player, epsilon)
            listStateAction.append((b.toKey(), (x, y)))
        else:
            x, y = policyRandomMove(b)

        gameOver = b.move(x, y, player)
        player = -player

    if gameOver == 2:
        r = 0.5
    elif gameOver == 1:
        r = 0
    else:
        r = 1

    for k, (x, y) in listStateAction:
        a = x + 3*y
        u = (k, a)

        # Running mean: rebuild the sum, add the new return, divide by the new count.
        total = q[u] * n[u] + r
        n[u] += 1
        q[u] = total / n[u]


playGame(q, epsilon)

 56%|█████▌    | 55872/100000 [00:37<00:29, 1493.60it/s]

In [52]:
# Fresh tables for the next experiment: q = 0.5 and visit count n = 0 for
# every (start state, available action) pair.
startStates = getAllStartStates()

q, n = {}, {}

for k in startStates:
    b = Board(np.zeros((3, 3)))
    b.fromKey(k)

    for a in b.possibleActions():
        pair = (k, a)
        q[pair], n[pair] = 0.5, 0


In [53]:
#exploring starts
# Monte Carlo control in which player -1 always opens the game.  Its first
# move is drawn uniformly from the empty cells; all its later moves are
# greedy with respect to q.  Player 1 answers with policyRandomMove.
# Every (state, action) pair visited by player -1 is averaged with the final
# reward: 1 if -1 won, 0.5 for a draw, 0 if 1 won.
nGames = 100000
for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3, 3)))
    player = -1
    gameOver = 0
    randomAction = True         # True until player -1 has made its first move
    listStateAction = []

    while gameOver == 0:
        if player == 1:
            x, y = policyRandomMove(b)
        else:
            key = b.toKey()
            if randomAction:
                # pure=True: uniform over the empty cells
                x, y = policyRandomMove(b, True)
                randomAction = False
            else:
                x, y = policyMCMove(b, q, player)
            listStateAction.append((key, (x, y)))

        gameOver = b.move(x, y, player)
        player = -player

    # Board.move yields 2 for a draw, 1 when player 1 won, -1 when player -1 won.
    r = {2: 0.5, 1: 0}.get(gameOver, 1)

    for k, (x, y) in listStateAction:
        pair = (k, x + 3*y)
        total = q[pair] * n[pair] + r
        n[pair] += 1
        q[pair] = total / n[pair]

playGame(q)

 13%|█▎        | 13186/100000 [00:11<01:13, 1178.79it/s]


KeyboardInterrupt: 

In [None]:
# Tables for the off-policy experiment: q = 0.5 and cumulative weight c = 0
# for every (start state, available action) pair.
startStates = getAllStartStates()
q, c = {}, {}
for k in startStates:
    b = Board(np.zeros((3, 3)))
    b.fromKey(k)
    pairs = [(k, a) for a in b.possibleActions()]
    q.update(dict.fromkeys(pairs, 0.5))
    c.update(dict.fromkeys(pairs, 0))

In [None]:
#off policy
# Off-policy Monte Carlo control with weighted importance sampling.
# Behaviour policy of player -1: uniform over the empty cells
# (policyRandomMove with pure=True).  Target policy: greedy with respect to q.
# Player 1 moves with the default policyRandomMove.
# Reward at the end of the game: 1 if -1 won, 0.5 for a draw, 0 if 1 won.
nGames = 1000000
for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3,3)))
    player = random.choice([-1,1])
    gameOver=0
    listStateAction=[]
    while gameOver==0:
        if player==1:
            x,y=policyRandomMove(b)
        else:
            x,y=policyRandomMove(b, True)
            listStateAction.append((b.toKey(), (x, y)))
        gameOver = b.move(x,y,player)
        player=-player

    if gameOver==2:
        r=0.5
    elif gameOver==1:
        r=0
    else:
        r=1

    # W is the importance ratio of the part of the episode that FOLLOWS the
    # pair being updated, so the episode has to be walked from its last
    # move back to its first.
    W=1
    for k, (x,y) in listStateAction[::-1]:
        a=x+3*y
        c[(k,a)]=c[(k,a)]+W
        q[(k,a)]=q[(k,a)]+W*(r-q[(k,a)])/c[(k,a)]

        # Greedy action of the target policy in state k (first maximum wins ties).
        b = Board(np.zeros((3,3)))
        b.fromKey(k)
        pA=b.possibleActions()
        bestA=max(pA, key=lambda aa: q[(k,aa)])

        # The target policy would not have taken this action, so every
        # earlier pair has importance ratio 0 and the walk can stop.
        if bestA!=a:
            break

        # Target probability is 1, behaviour probability is 1/len(pA).
        W=W*len(pA)

playGame(q)

In [None]:
# Fresh action-value table: q = 0.5 for every (start state, available action) pair.
startStates = getAllStartStates()

q = {}

for k in startStates:
    b = Board(np.zeros((3, 3)))
    b.fromKey(k)

    q.update(dict.fromkeys(((k, a) for a in b.possibleActions()), 0.5))

In [None]:
#sarsa
# On-policy TD(0) control (SARSA) for player -1 against policyRandomMove.
# Intermediate rewards are 0 and there is no discounting; the terminal
# reward is 1 if -1 won, 0.5 for a draw and 0 if 1 won.
nGames = 100000
alpha=0.1           # step size
epsilon = 0.05      # exploration rate of the epsilon-greedy behaviour policy
for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3,3)))
    player = random.choice([-1,1])
    gameOver=0
    # (key, action) is the latest move of player -1 that has not yet
    # received its TD target; None until player -1 has moved.
    key=None
    action=None
    while gameOver==0:
        if player==1:
            x,y=policyRandomMove(b)
        else:
            x, y = policyMCMoveEpsGreedy(b,q,player,epsilon)
            newAction = x+3*y
            newKey = b.toKey()
            if action is not None:
                # Non-terminal step: bootstrap the previous pair from the
                # pair that was actually chosen in the successor state.
                q[(key,action)]=q[(key,action)]+alpha*(q[(newKey,newAction)]-q[(key,action)])
            action=newAction
            key=newKey
        gameOver = b.move(x,y,player)
        if gameOver!=0 and action is not None:
            # Terminal step: whoever made the last move, the latest pair of
            # player -1 is the one that led to the outcome.
            if gameOver==-1:
                r=1
            elif gameOver==1:
                r=0
            else:
                r=0.5
            q[(key,action)]=q[(key,action)]+alpha*(r-q[(key,action)])
        player=-player

playGame(q)

In [None]:
# Fresh action-value table: q = 0.5 for every (start state, available action) pair.
startStates = getAllStartStates()
q = {}
for k in startStates:
    b = Board(np.zeros((3, 3)))
    b.fromKey(k)
    q.update(dict.fromkeys(((k, a) for a in b.possibleActions()), 0.5))

In [None]:
#q learning
# Off-policy TD(0) control (Q-learning) for player -1 against
# policyRandomMove.  Player -1 behaves epsilon-greedily (epsilon = 0.1).
# Intermediate rewards are 0 and there is no discounting; the terminal
# reward is 1 if -1 won, 0.5 for a draw and 0 if 1 won.
nGames = 100000
alpha=0.1           # step size
for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3,3)))
    player = random.choice([-1,1])
    gameOver=0
    # (key, action) is the latest move of player -1 that has not yet
    # received its TD target; None until player -1 has moved.
    key=None
    action=None
    while gameOver==0:
        if player==1:
            x,y=policyRandomMove(b)
        else:
            x, y = policyMCMoveEpsGreedy(b,q,player,0.1)
            newAction = x+3*y
            newKey = b.toKey()
            if action is not None:
                # Non-terminal step: bootstrap the previous pair from the
                # best action value available in the successor state.
                maxq = max(q[(newKey,aa)] for aa in b.possibleActions())
                q[(key,action)]=q[(key,action)]+alpha*(maxq-q[(key,action)])
            action=newAction
            key=newKey
        gameOver = b.move(x,y,player)
        if gameOver!=0 and action is not None:
            # Terminal step: whoever made the last move, the latest pair of
            # player -1 is the one that led to the outcome.
            if gameOver==-1:
                r=1
            elif gameOver==1:
                r=0
            else:
                r=0.5
            q[(key,action)]=q[(key,action)]+alpha*(r-q[(key,action)])
        player=-player

playGame(q)

In [None]:
# Fresh action-value table: q = 0.5 for every (start state, available action) pair.
startStates = getAllStartStates()
q = {}
for k in startStates:
    b = Board(np.zeros((3, 3)))
    b.fromKey(k)
    q.update(dict.fromkeys(((k, a) for a in b.possibleActions()), 0.5))

In [None]:
# expected sarsa where behaviour policy is eps greedy derived from q
# (expected sarsa where behaviour policy is greedy derived from q = q learning)
# Player -1 learns against policyRandomMove.  Intermediate rewards are 0 and
# there is no discounting; the terminal reward is 1 if -1 won, 0.5 for a
# draw and 0 if 1 won.
nGames = 100000
alpha=0.1           # step size
epsilon = 0.1       # exploration rate of the epsilon-greedy policy
for _ in tqdm(range(nGames)):
    b = Board(np.zeros((3,3)))

    player = random.choice([-1,1])

    gameOver=0

    # (key, action) is the latest move of player -1 that has not yet
    # received its TD target; None until player -1 has moved.
    key = None
    action = None

    while gameOver == 0:
        if player == 1:
            x,y=policyRandomMove(b)
        else:
            x, y = policyMCMoveEpsGreedy(b,q,player,epsilon)
            newAction = x+3*y
            newKey = b.toKey()
            if action is not None:
                # Non-terminal step: bootstrap the previous pair from the
                # expected action value of the successor state under the
                # epsilon-greedy policy.
                expectedTarget = expectedTargetUnderEpsGreedy(b, q, epsilon)
                q[(key,action)]=q[(key,action)]+alpha*(expectedTarget-q[(key,action)])
            action=newAction
            key=newKey
        gameOver = b.move(x,y,player)
        if gameOver!=0 and action is not None:
            # Terminal step: whoever made the last move, the latest pair of
            # player -1 is the one that led to the outcome.
            if gameOver==-1:
                r=1
            elif gameOver==1:
                r=0
            else:
                r=0.5
            q[(key,action)]=q[(key,action)]+alpha*(r-q[(key,action)])
        player=-player

playGame(q)