<a href="https://colab.research.google.com/github/hinsley/colabs/blob/master/TDt3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TDt3

Temporal difference learning model for tic-tac-toe

Inspired by Sutton & Barto

The model plays as X; the human plays as O.

Board position encodings:

$\begin{bmatrix}0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8\end{bmatrix} \to \begin{pmatrix} 0 & 1 & 2 \\ 3 & 4 & 5 \\ 6 & 7 & 8 \end{pmatrix}$

In [0]:
import numpy as np
from typing import Dict, List, Optional, Tuple

In [0]:
# Estimated worth of a board for X, in [0.0, 1.0]; higher values are better.
# eval_state seeds fresh entries with 1.0 for an X win, 0.0 for an O win, 0.75
# for a draw and 0.5 for a game still in progress.
ActionValue = Optional[float]

# Contents of a single board position: one of the VALUE_* constants below.
PositionState = int
VALUE_BLANK = 0
VALUE_X = 1
VALUE_O = 2

# The 3 x 3 board flattened into a list of 9 positions, row by row
# (index = row * 3 + col).
BoardState = List[PositionState]

# A whole board packed into one integer, base 3: position i contributes
# value * 3 ** i (see encode_board / decode_board).
EncodedBoardState = int

In [0]:
#@title f_eq(a: float, b: float, epsilon: float=1e-4) -> bool
def f_eq(a: float, b: float, epsilon: float=1e-4) -> bool:
  """
  Approximate float equality: True when `a` and `b` differ by strictly less
  than `epsilon`, which sets the tolerated rounding error.
  """

  difference = a - b
  return -epsilon < difference < epsilon

In [0]:
#@title eval_state(state: BoardState) -> ActionValue
def eval_state(state: BoardState) -> ActionValue:
  """
  Produce a fresh action value for a board using only the end-of-game rules.

  Returns 1.0 when X holds a complete line, 0.0 when O does, 0.75 for a full
  board with no winner (a draw) and 0.5 while the game is still in progress.

  A legal Tic-Tac-Toe board never contains complete lines for both players, so
  the first complete line found settles the result.
  """

  winning_lines = (
    (0, 1, 2), (3, 4, 5), (6, 7, 8), # Rows
    (0, 3, 6), (1, 4, 7), (2, 5, 8), # Columns
    (0, 4, 8), (2, 4, 6),            # Diagonals
  )

  for first, second, third in winning_lines:
    owner = state[first]
    if not (owner == state[second] == state[third]):
      continue
    if owner == VALUE_X:
      return 1.0
    if owner == VALUE_O:
      return 0.0
    # Three matching positions that belong to neither player (a line of
    # blanks): keep looking.

  for position in state:
    if position == VALUE_BLANK:
      return 0.5 # No winner yet and at least one empty position remains.

  return 0.75 # Draw

In [0]:
#@title encode_board(state: BoardState) -> EncodedBoardState
def encode_board(state: BoardState) -> EncodedBoardState:
  """
  Pack a board into one hashable integer by reading it as a base-3 number.
  The top-left position is the least significant digit.
  """

  encoded = 0
  place_value = 1
  for position in state:
    encoded += position * place_value
    place_value *= 3

  return encoded

In [0]:
#@title decode_board(encoded_state: EncodedBoardState) -> BoardState
def decode_board(encoded_state: EncodedBoardState) -> BoardState:
  """
  Unpack an encoded board back into its list of 9 positions by peeling off
  base-3 digits, least significant (top-left position) first.
  """

  positions = []
  remaining = encoded_state
  for _ in range(9):
    remaining, digit = divmod(remaining, 3)
    positions.append(digit)

  return positions

In [0]:
#@title pprint(state: Optional[BoardState])
def pprint(state: Optional[BoardState]) -> None:
  """
  Pretty prints a board state as three rows of space-separated symbols,
  preceded by a blank line.

  Does nothing when `state` is None.

  Raises KeyError if a position holds anything other than VALUE_BLANK, VALUE_X
  or VALUE_O.
  """

  if state is None:
    return

  # Built once per call rather than once per printed position.
  symbols = {
      VALUE_BLANK: "-",
      VALUE_X: "X",
      VALUE_O: "O",
  }

  print()
  print(*[symbols[pstate] for pstate in state[:3]])
  print(*[symbols[pstate] for pstate in state[3:6]])
  print(*[symbols[pstate] for pstate in state[6:]])

In [0]:
#@title class ActionEvaluator()

from random import randint

class ActionEvaluator():
  """
  Tabular temporal-difference learner that plays X.

  Holds a table of action values keyed by encoded board state, plus the encoded
  state most recently backed up from, so that each newly reached state can be
  used to correct the estimate of the one recorded before it.
  """

  # Learned action values, keyed by encoded board state.
  _action_values: Dict[EncodedBoardState, ActionValue]
  # Encoded state whose value the next back_up_value call will adjust.
  _prev_state: EncodedBoardState


  def __init__(self):
    """ Start knowing only the blank board, valued at 0.5. """

    self._action_values = {0: 0.5}
    self.reset_board_state()
  

  def reset_board_state(self):
    """ Make the blank board the previous state, ready for a new game. """

    self._prev_state = encode_board([VALUE_BLANK] * 9)


  def evaluate(self, state: BoardState) -> ActionValue:
    """
    Return the stored action value of `state`. A state seen for the first time
    is added to the table with the value given by `eval_state`.
    """

    encoded_state = encode_board(state)

    if encoded_state not in self._action_values: # Unexplored action.
      self._action_values[encoded_state] = eval_state(state)
    
    return self._action_values[encoded_state]


  def back_up_value(self,
                    new_state: BoardState,
                    learning_rate: float=0.9):
    """
    Move the value of the previously recorded state toward the value of
    `new_state`, then record `new_state` as the previous state:

      V(prev) <- V(prev) + learning_rate * (V(new_state) - V(prev))
    """

    # Read both values through evaluate() so that a previous state missing
    # from the table (for example after a partial table has been loaded) is
    # seeded first. An in-place `+=` on the table entry would raise KeyError
    # in that case, because it reads the entry before anything can seed it.
    prev_value = self.evaluate(decode_board(self._prev_state))
    new_value = self.evaluate(new_state)

    self._action_values[self._prev_state] = prev_value + learning_rate * (new_value - prev_value)
    self._prev_state = encode_board(new_state)


  def best_move(self, state: BoardState) -> BoardState:
    """
    Return a copy of `state` with X placed on the blank position that leads to
    the highest-valued board. When several candidates share exactly the same
    best value, one of them is picked at random. `state` is not modified.

    Assumes stored action values are not negative, as the search starts from
    a best value of 0.0.

    Raises ValueError if no move was selected, i.e. `state` has no blank
    position.
    """

    def possible_move_generator():
      # Yield every board reachable by a single X move.
      for i, position_value in enumerate(state):
        if position_value == VALUE_BLANK:
          state_after_move = state.copy()
          state_after_move[i] = VALUE_X
          yield state_after_move

    max_action_value, argmax = 0.0, []
    for new_state in possible_move_generator():
      action_value = self.evaluate(new_state)
      if action_value >= max_action_value: # Found a better/equal move!
        if action_value != max_action_value:
          argmax = [] # Strictly better: drop the previous candidates.
        max_action_value = action_value
        argmax.append(new_state)
      if f_eq(max_action_value, 1.0): # No point in looking for a better move.
        break

    if not argmax:
      # Fail with a clear message instead of randint's "empty range" error.
      raise ValueError("No move available for X: the board has no blank position.")

    return argmax[randint(0, len(argmax)-1)]

In [0]:
#@title move_o(action_evaluator: ActionEvaluator, state: BoardState, row: int, col: int) -> Optional[BoardState]
def move_o(action_evaluator: ActionEvaluator, state: BoardState, row: int, col: int) -> Optional[BoardState]:
  """
  Makes a move for O in place. Row and col are zero-indexed.

  Returns the mutated `state`. If the target position is already taken, prints
  a message, leaves `state` untouched and returns None.

  When the move wins the game for O, announces it and backs the resulting
  value up into `action_evaluator`.
  """

  position = row * 3 + col

  if state[position] != VALUE_BLANK:
    print("That's not valid -- someone has already moved there!")
    return None

  state[position] = VALUE_O

  if f_eq(0.0, eval_state(state)): # eval_state gives 0.0 only for an O win.
    print("O wins!")
    action_evaluator.back_up_value(state)

  return state

In [0]:
#@title move_x(action_evaluator: ActionEvaluator, state: BoardState) -> BoardState
def move_x(action_evaluator: ActionEvaluator, state: BoardState) -> BoardState:
  """
  Makes the best known move for X in place and returns the mutated `state`.

  The move comes from `action_evaluator.best_move`, and the resulting board is
  backed up into the evaluator. Announces an X win or a draw when the move ends
  the game.
  """

  # Slice assignment overwrites the caller's list rather than rebinding the
  # local name, so the caller's board reflects the move.
  state[:] = action_evaluator.best_move(state)
  action_evaluator.back_up_value(state)

  state_value = eval_state(state)
  if f_eq(1.0, state_value):
    print("X wins!")
  elif f_eq(0.75, state_value):
    print("Draw!")

  return state

In [0]:
#@title Delete knowledge of the game
# Drop the game counter and the learner. Deleting a name that is not defined
# raises NameError, which is the only failure expected here, so nothing else
# is caught.
try:
  del initialized
  del action_evaluator
  print("Knowledge deleted successfully.")
except NameError:
  print("Cannot delete knowledge -- it doesn't exist!")

In [0]:
#@title Load knowledge of the game

knowledge = "" #@param {type: "string"}

from ast import literal_eval

# `initialized` doubles as the marker that a learner already exists; create
# one only when that name is not defined yet.
try:
  initialized
except NameError:
  action_evaluator = ActionEvaluator()

initialized = 0

# NOTE(security): the knowledge string is pasted in by the user and is meant to
# be shared between people, so it is parsed as a plain literal. Passing it to
# eval() would execute whatever code the string contains.
try:
  loaded_values = literal_eval(knowledge)
except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
  loaded_values = None

# Only a dict can serve as the value table; anything else is rejected and the
# learner keeps the table it already had.
if isinstance(loaded_values, dict):
  action_evaluator._action_values = loaded_values
  print("Knowledge loaded successfully.")
else:
  print("Cannot load knowledge -- invalid.")

In [807]:
#@title Turn 1: New game

# Continue with the existing learner when there is one; otherwise create it.
# A missing `initialized` or `action_evaluator` shows up as NameError, which is
# the only failure this fallback is meant to handle.
try:
  initialized += 1
  action_evaluator.reset_board_state()
except NameError:
  print("Initializing ActionEvaluator")
  initialized = 1
  action_evaluator = ActionEvaluator()

board_state = [VALUE_BLANK] * 9

# X always opens the game.
print(f"Game {initialized} - Turn 1")
pprint(move_x(action_evaluator, board_state))

Game 3 - Turn 1

- - -
- X -
- - -


In [808]:
#@title Turns 2-3

row =  "Bottom"#@param ["Top", "Middle", "Bottom"]
column =  "Middle"#@param ["Left", "Middle", "Right"]

# Convert the form labels into zero-based board coordinates.
row = {"Top": 0, "Middle": 1, "Bottom": 2}[row]
column = {"Left": 0, "Middle": 1, "Right": 2}[column]

# move_o returns None when the chosen position is taken; nothing is shown then.
if move_o(action_evaluator, board_state, row, column) is not None:
  if not f_eq(0.0, eval_state(board_state)):
    # O has not won: X replies and the resulting board is shown.
    pprint(move_x(action_evaluator, board_state))
  else:
    # O just won: show the final board.
    pprint(board_state)


- - -
- X -
X O -


In [809]:
#@title Turns 4-5

row =  "Top"#@param ["Top", "Middle", "Bottom"]
column =  "Right"#@param ["Left", "Middle", "Right"]

# Convert the form labels into zero-based board coordinates.
row = {"Top": 0, "Middle": 1, "Bottom": 2}[row]
column = {"Left": 0, "Middle": 1, "Right": 2}[column]

# move_o returns None when the chosen position is taken; nothing is shown then.
if move_o(action_evaluator, board_state, row, column) is not None:
  if not f_eq(0.0, eval_state(board_state)):
    # O has not won: X replies and the resulting board is shown.
    pprint(move_x(action_evaluator, board_state))
  else:
    # O just won: show the final board.
    pprint(board_state)


- - O
X X -
X O -


In [810]:
#@title Turns 6-7

row =  "Top"#@param ["Top", "Middle", "Bottom"]
column =  "Left"#@param ["Left", "Middle", "Right"]

# Convert the form labels into zero-based board coordinates.
row = {"Top": 0, "Middle": 1, "Bottom": 2}[row]
column = {"Left": 0, "Middle": 1, "Right": 2}[column]

# move_o returns None when the chosen position is taken; nothing is shown then.
if move_o(action_evaluator, board_state, row, column) is not None:
  if not f_eq(0.0, eval_state(board_state)):
    # O has not won: X replies and the resulting board is shown.
    pprint(move_x(action_evaluator, board_state))
  else:
    # O just won: show the final board.
    pprint(board_state)

X wins!

O - O
X X X
X O -


In [0]:
#@title Turns 8-9: Showdown!

row =  "Top"#@param ["Top", "Middle", "Bottom"]
column =  "Middle"#@param ["Left", "Middle", "Right"]

# Convert the form labels into zero-based board coordinates.
row = {"Top": 0, "Middle": 1, "Bottom": 2}[row]
column = {"Left": 0, "Middle": 1, "Right": 2}[column]

# move_o returns None when the chosen position is taken; nothing is shown then.
if move_o(action_evaluator, board_state, row, column) is not None:
  if not f_eq(0.0, eval_state(board_state)):
    # O has not won: X replies and the resulting board is shown.
    pprint(move_x(action_evaluator, board_state))
  else:
    # O just won: show the final board.
    pprint(board_state)

In [0]:
#@title Run and copy the result of this cell to save/share your model's learned knowledge.
# Print the learner's value table (encoded board state -> action value). The
# printed text is what the "Load knowledge of the game" cell expects to be
# pasted into its `knowledge` field.
print(action_evaluator._action_values)