In [1]:
from enum import Enum
import numpy as np
import random

In [2]:
# constants
# Rewards handed out by Gym.action after each agent move (and the trainer's reply).
REWARD_FOR_WINNING = 10  # X (the agent) has three in a row
REWARD_FOR_LOSING = -10  # O (the trainer) has three in a row
REWARD_FOR_KEEPING_THE_GAME_GOING = -1  # game not over yet; per-step penalty
REWARD_FOR_DRAWING = 0  # board full, nobody won

N_EPISODES = 10000  # intended number of training episodes
EPSILON = 0.15  # probability that Agent._pick_action plays a random free cell

In [3]:
class Player(Enum):
    """The two sides of a tic-tac-toe game.

    Note that these enum values are not the marks stored on the board:
    Board.make_play writes 1 for X and 2 for O.
    """
    X = 0  # played by the agent in Gym.action and Agent.interactive_game
    O = 1  # played by the random trainer (training) or the human (interactive)

        
class Board:
    """A 3x3 tic-tac-toe board.

    Cells are kept in a NumPy integer array using the marks ``0`` (empty),
    ``1`` (``Player.X``) and ``2`` (``Player.O``).

    Moves are addressed by an *action*: an integer from 0 to 8, where 0 is
    the top-left cell and 8 the bottom-right one, counting row by row.
    """

    EMPTY = 0
    X_MARK = 1
    O_MARK = 2

    def __init__(self):
        self._board = np.zeros((3, 3), dtype=int)

    def _position_from_action(self, action: int):
        """Translate an action (0-8) into its ``(row, col)`` position."""
        return divmod(action, 3)

    def make_play(self, action, player):
        """Place ``player``'s mark on the cell addressed by ``action``.

        Raises:
            ValueError: if ``action`` is outside 0-8 (a negative value would
                otherwise silently wrap around to another cell) or if the
                cell is already occupied.
        """
        if not 0 <= action <= 8:
            raise ValueError('action must be between 0 and 8, got {}'.format(action))
        row, col = self._position_from_action(action)
        if self._board[row][col] != self.EMPTY:
            raise ValueError('cell {} is already taken'.format(action))

        if player == Player.X:
            mark = self.X_MARK
        else:
            assert player == Player.O
            mark = self.O_MARK
        self._board[row][col] = mark

    def free_cells(self):
        """Return the actions (0-8, ascending) whose cells are still empty."""
        return [
            action
            for action in range(9)
            if self._board[self._position_from_action(action)] == self.EMPTY
        ]

    def get_state(self):
        """Encode the board as a plain Python ``int``.

        Cell ``k`` (same numbering as actions) contributes ``mark * 10**k``,
        so every cell occupies one decimal digit.  For example X on cell 0 and
        O on cell 8 gives ``200000001``.
        """
        encoding = 0
        for i in range(3):
            for j in range(3):
                # int() keeps the result a built-in int instead of a NumPy scalar.
                encoding += int(self._board[i][j]) * 10 ** (3 * i + j)
        return encoding

    def print(self):
        """Print the board, one row per line, followed by a blank line."""
        symbols = {self.EMPTY: '-', self.X_MARK: 'X', self.O_MARK: 'O'}
        for row in self._board:
            for cell in row:
                print(symbols[int(cell)], end=' ')
            print()
        print()

    # assumes the other software parts work correctly (only one can have three in a row. game ends.)
    def won_by(self, player):
        """Return True if ``player`` owns a complete row, column or diagonal."""
        mark = self.X_MARK if player == Player.X else self.O_MARK
        grid = self._board
        # 3 rows, 3 columns and the 2 diagonals, each checked exactly once.
        lines = list(grid) + list(grid.T) + [np.diag(grid), np.diag(np.fliplr(grid))]
        return any(bool(np.all(line == mark)) for line in lines)

    def drawn(self):
        """Return True if the board is full and neither player has won."""
        if (self._board == self.EMPTY).any():
            return False
        return not self.won_by(Player.X) and not self.won_by(Player.O)

    def terminal(self):
        """Return True once the game is over (a win for either side, or a draw)."""
        return self.drawn() or self.won_by(Player.X) or self.won_by(Player.O)
    

            
        

# Environment the agent interacts with: pass it an action and get the resulting state and reward back. The state is just an integer identifier.
class Gym:
    """Training environment in which the agent plays X against a random O.

    The agent submits an action through ``action``; the environment applies
    it, lets the trainer reply with a randomly chosen free cell, and reports
    the resulting state identifier together with a reward.
    """

    def __init__(self):
        self.board = Board()

    def get_possible_actions(self):
        """Return the actions that are currently legal (the free cells)."""
        return self.board.free_cells()

    def get_state(self):
        """Return the identifier of the current board state."""
        return self.board.get_state()

    def new_session(self):
        """Discard the current game and start over on an empty board."""
        self.board = Board()

    def finished(self):
        """Return True once the current game has ended."""
        return self.board.terminal()

    def action(self, action):
        """Apply the agent's move and the trainer's reply.

        Returns a ``(state, reward)`` pair: the state landed in after both
        moves and the reward earned for it.
        """
        assert action in self.get_possible_actions()
        current = self.board
        current.make_play(action, Player.X)

        # The trainer only answers if the agent's move did not end the game.
        if not current.terminal():
            reply = random.choice(current.free_cells())
            current.make_play(reply, Player.O)

        reward = self._reward_for_board()
        return current.get_state(), reward

    def _reward_for_board(self):
        """Pick the reward that matches the current board situation."""
        current = self.board
        if current.won_by(Player.X):
            return REWARD_FOR_WINNING
        if current.won_by(Player.O):
            return REWARD_FOR_LOSING
        if current.drawn():
            return REWARD_FOR_DRAWING
        assert not current.terminal()
        return REWARD_FOR_KEEPING_THE_GAME_GOING
    
    
    
    
class Agent:
    """Learner that plays X and estimates action values from episode returns.

    ``policy`` is a list of dicts, one per visited (state, action) pair::

        {'s': state, 'a': action, 'Q': mean return observed, 'N': visit count}
    """

    def __init__(self):
        self.policy = []
        self.gym = Gym()

    def _pick_action(self, current_state, possible_actions, explore=True):
        """Choose an action for ``current_state`` out of ``possible_actions``.

        With ``explore=True`` (the default, used for training) a random
        action is played with probability ``EPSILON``.  Otherwise the known
        action with the highest ``Q`` is returned; if the state has no known
        action yet, a random one is used.

        Pass ``explore=False`` to always play the best known action, e.g.
        when playing against a human.
        """
        if explore and random.random() < EPSILON:
            return random.choice(possible_actions)

        best_Q = -float('Inf')
        best_action = random.choice(possible_actions)  # fallback for unseen states
        for entry in self.policy:
            if entry['s'] != current_state or entry['a'] not in possible_actions:
                continue
            if entry['Q'] > best_Q:
                best_Q = entry['Q']
                best_action = entry['a']
        return best_action

    def _update_policy(self, state, action, Return):
        """Fold ``Return`` into the running mean ``Q`` of (state, action).

        A pair seen for the first time gets a new entry with ``Q = Return``
        and ``N = 1``.
        """
        for entry in self.policy:
            if entry['s'] == state and entry['a'] == action:
                entry['Q'] = (entry['Q'] * entry['N'] + Return) / (entry['N'] + 1)
                entry['N'] += 1
                return  # so don't append twice

        self.policy.append({
            's': state,
            'a': action,
            'Q': Return,
            'N': 1
        })

    def train_for_one_episode(self):
        """Play one game in the gym and update ``policy`` from its returns."""
        trajectory = []
        self.gym.new_session()
        while not self.gym.finished():
            state = self.gym.get_state()
            possible_actions = self.gym.get_possible_actions()
            action = self._pick_action(state, possible_actions)  # according to self.policy
            _, reward = self.gym.action(action)
            trajectory.append((state, action, reward))

        # Walk backwards so Return is the (undiscounted) sum of the rewards
        # collected from each step until the end of the episode.
        Return = 0
        for state, action, reward in reversed(trajectory):
            Return += reward
            self._update_policy(state, action, Return)

    @staticmethod
    def _ask_for_move(possible_actions):
        """Prompt until the user enters one of ``possible_actions``."""
        prompt = 'Where do you want to play? (0-8)'
        while True:
            try:
                action = int(input(prompt))
            except ValueError:
                action = None  # not a number: ask again instead of crashing
            if action in possible_actions:
                return action
            prompt = 'Choose empty field'

    def interactive_game(self):
        """Play one game on the console: the agent is X, the user is O."""
        board = Board()
        while not board.terminal():
            # agent's turn
            board.print()
            state = board.get_state()
            possible_actions = board.free_cells()
            # Play the best known move; exploration is only useful in training.
            action = self._pick_action(state, possible_actions, explore=False)
            board.make_play(action, Player.X)
            # user's turn
            if board.terminal():
                break
            board.print()
            possible_actions = board.free_cells()  # since not terminal
            action = self._ask_for_move(possible_actions)
            board.make_play(action, Player.O)
        board.print()
        print('The game concluded')






In [4]:
# Create an agent with an empty policy and its own training environment.
agent = Agent()

In [10]:
# Train the agent; the episode count comes from the constants cell
# instead of being repeated here as a magic number.
for _ in range(N_EPISODES):
    agent.train_for_one_episode()

In [15]:
# Play one console game against the agent (agent is X, the human is O).
agent.interactive_game()

- - - 
- - - 
- - - 

- - - 
X - - 
- - - 



Where do you want to play? (0-8) 4


- - - 
X O - 
- - - 

X - - 
X O - 
- - - 



Where do you want to play? (0-8) 6


X - - 
X O - 
O - - 

X - - 
X O - 
O X - 



Where do you want to play? (0-8) 2


X - O 
X O - 
O X - 

The game concluded


In [11]:
# Inspect the learned entries: state 's', action 'a', mean return 'Q', visit count 'N'.
agent.policy

[{'s': 220201011, 'a': 4, 'Q': -10, 'N': 1},
 {'s': 220000011, 'a': 3, 'Q': -11.0, 'N': 2},
 {'s': 20000001, 'a': 1, 'Q': -12.0, 'N': 2},
 {'s': 0, 'a': 0, 'Q': 1.0060606060606063, 'N': 165},
 {'s': 21120201, 'a': 8, 'Q': -4.6, 'N': 5},
 {'s': 20100201, 'a': 6, 'Q': -5.0, 'N': 3},
 {'s': 201, 'a': 5, 'Q': -8.0, 'N': 12},
 {'s': 121200021, 'a': 4, 'Q': 10.0, 'N': 5},
 {'s': 21200001, 'a': 8, 'Q': 4.384615384615385, 'N': 13},
 {'s': 200001, 'a': 6, 'Q': 5.928571428571429, 'N': 28},
 {'s': 102020001, 'a': 3, 'Q': -10.666666666666666, 'N': 3},
 {'s': 20001, 'a': 8, 'Q': -11.5, 'N': 2},
 {'s': 212112021, 'a': 2, 'Q': 0.0, 'N': 6},
 {'s': 210012021, 'a': 5, 'Q': -1.0, 'N': 2},
 {'s': 12021, 'a': 7, 'Q': 4.833333333333333, 'N': 12},
 {'s': 21, 'a': 4, 'Q': 2.2, 'N': 10},
 {'s': 201012210, 'a': 7, 'Q': 10.0, 'N': 7},
 {'s': 201000210, 'a': 4, 'Q': 0.8, 'N': 5},
 {'s': 201000000, 'a': 1, 'Q': -3.2857142857142856, 'N': 7},
 {'s': 0, 'a': 6, 'Q': 1.3023255813953485, 'N': 172},
 {'s': 11022012, 'a

In [None]:
# Play another console game against the agent.
agent.interactive_game()

In [None]:
# Scratch cell: a small 3x3 array for trying out the NumPy helpers below.
a = np.array([[1,2,3],[4,5,6],[7,8,9]])
a

In [None]:
# Scratch cell: extract the main diagonal of `a`.
np.diag(a, )

In [None]:
# Scratch cell: transpose `a` (rows become columns).
np.transpose(a)



def terminal(state):
    """Return True when ``state`` is a finished game.

    Works on the values returned by ``row_col_diag_sums(state)``
    (presumably one sum per row, column and diagonal -- the helper is not
    defined in the visible part of the notebook; confirm).  A sum of 3 or 30
    counts as a win; otherwise the game is over only if no sum reaches 100,
    i.e. no line contains an empty cell (other functions in this cell use
    100 for an empty cell).
    """
    line_sums = row_col_diag_sums(state)
    someone_won = 3 in line_sums or 30 in line_sums
    if someone_won:
        return True
    # Still running as long as any line holds an empty cell.
    return not any(total >= 100 for total in line_sums)


def rand_state():
    """Return a fresh copy of ``init_board``.

    NOTE(review): despite the name nothing here is random, and ``init_board``
    is not defined in the visible part of the notebook -- confirm where it
    comes from.
    """
    return np.copy(init_board)


def possible_next_states(state):
    """List every board reachable by writing a 1 into one empty cell.

    A cell is empty when it holds 100.  ``state`` itself is left untouched;
    each successor is an independent copy.  Successors are ordered row by
    row, left to right.
    """
    successors = []
    for row, col in ((r, c) for r in range(3) for c in range(3)):
        if state[row][col] != 100:
            continue  # occupied
        child = np.copy(state)
        child[row][col] = 1
        successors.append(child)
    return successors


def make_action(state, policy):
    """Pick the next state, either at random or by best average return.

    With probability ``epsilon`` a random successor of ``state`` is
    returned.  Otherwise the successor with the highest average return
    according to ``policy.get`` wins; a random successor is the fallback
    when none has a usable value.

    NOTE(review): ``epsilon`` is not defined in the visible notebook (only
    ``EPSILON`` is), and ``policy.get`` is assumed to return an
    ``(average_return, count)`` pair for every state -- confirm both.
    """
    candidates = possible_next_states(state)
    if random.random() < epsilon:
        return random.choice(candidates)

    chosen = random.choice(candidates)  # default if none yet in policy
    best_so_far = -10000
    for candidate in candidates:
        mean_return, _count = policy.get(candidate)
        if mean_return is not None and mean_return > best_so_far:
            best_so_far, chosen = mean_return, candidate
    return chosen





def play_first_free_cell(state):
    """Return a copy of ``state`` with a 10 written into its first empty cell.

    Cells are scanned row by row, left to right; a cell is empty when it
    holds 100.  If there is no empty cell the copy is returned unchanged.
    ``state`` itself is never modified.
    """
    result = np.copy(state)
    empty_positions = (
        (r, c)
        for r in range(len(state))
        for c in range(len(state[0]))
        if state[r][c] == 100
    )
    first_empty = next(empty_positions, None)
    if first_empty is not None:
        result[first_empty] = 10
    return result


epi = 0  # number of master_move calls made so far


def master_move(intermediate_state):
    """Produce the opponent's reply to ``intermediate_state``.

    Every call increments the global counter ``epi``.  While it stays below
    ``n_episodes - 1`` the reply is scripted (first free cell).  After that
    the board is printed and the user is asked for a two-character answer,
    read as row digit followed by column digit; an answer of any other
    length returns the state unchanged.

    NOTE(review): ``n_episodes`` is not defined in the visible notebook
    (only ``N_EPISODES`` is) -- confirm.
    """
    global epi
    epi += 1

    scripted_phase = epi < n_episodes - 1
    if scripted_phase:
        return play_first_free_cell(intermediate_state)

    print(intermediate_state)
    answer = input('where do you wanna put your O?')
    if len(answer) != 2:
        return intermediate_state

    row, col = int(answer[0]), int(answer[1])
    updated = np.copy(intermediate_state)
    updated[row][col] = 10
    return updated




def get_reward(state):
    """Return the reward attached to ``state``.

    -1 while the game is still running; once it is over, 20 if ``X_won``
    reports a win, -10 if ``O_won`` does, and 0 otherwise.
    """
    if not terminal(state):
        return -1
    if X_won(state):
        return 20
    return -10 if O_won(state) else 0

In [None]:
# Scratch cell: check that np.all reports True for an all-True array.
np.all(np.array([True, True, True]))