In [1]:
!pip install tf-agents

Collecting tf-agents
[?25l  Downloading https://files.pythonhosted.org/packages/96/f5/4b5ddf7138d2fdaad2f7d44437372525859183cdac4ffad3fd86a94f8f52/tf_agents-0.8.0-py3-none-any.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 6.9MB/s 
Collecting tensorflow-probability==0.12.2
[?25l  Downloading https://files.pythonhosted.org/packages/9c/c0/d6a9212d3e74748474b59e077e85ca577308c808eee93f9d2e11c3f1cc16/tensorflow_probability-0.12.2-py2.py3-none-any.whl (4.8MB)
[K     |████████████████████████████████| 4.8MB 19.0MB/s 
Installing collected packages: tensorflow-probability, tf-agents
  Found existing installation: tensorflow-probability 0.12.1
    Uninstalling tensorflow-probability-0.12.1:
      Successfully uninstalled tensorflow-probability-0.12.1
Successfully installed tensorflow-probability-0.12.2 tf-agents-0.8.0


In [None]:
# Reference: https://towardsdatascience.com/creating-a-custom-environment-for-tensorflow-agent-tic-tac-toe-example-b66902f73059

In [4]:
import tensorflow as tf
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

In [16]:
class SimplifiedTicTacToe(py_environment.PyEnvironment):
    """Single-player board-filling task on a 9-cell board.

    The board is a flat list of nine cells holding 0 (empty) or 1 (occupied).
    An action is the index (0-8) of the cell to occupy:

    * occupying an empty cell yields a reward of 0.05 and the episode continues;
    * occupying the last empty cell ends the episode with a reward of 1;
    * choosing an already occupied cell ends the episode with a reward of -1.

    Observations are int32 arrays of shape (1, 9) containing the board.
    """

    def __init__(self):
        # Let the base class set up its own bookkeeping before adding ours.
        super().__init__()
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=8, name='play')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1, 9), dtype=np.int32, minimum=0, maximum=1, name='board')
        self._state = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        self._episode_ended = False

    def action_spec(self):
        """Return the spec of a valid action: a scalar int32 in [0, 8]."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec of an observation: a (1, 9) int32 array of 0/1."""
        return self._observation_spec

    def _observation(self):
        """Return the current board as a (1, 9) int32 array."""
        return np.array([self._state], dtype=np.int32)

    def _reset(self):
        """Clear the board and return the first time step of a new episode."""
        # state at the start of the game
        self._state = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        self._episode_ended = False
        return ts.restart(self._observation())

    def __is_spot_empty(self, ind):
        """Return True when cell `ind` has not been occupied yet."""
        return self._state[ind] == 0

    def __all_spots_occupied(self):
        """Return True when every cell of the board is occupied."""
        return all(i == 1 for i in self._state)

    def _step(self, action):
        """Apply `action` (a cell index) and return the resulting time step."""
        if self._episode_ended:
            # The previous step ended the episode: start a new one and ignore
            # the supplied action.
            return self.reset()

        # The action may arrive as a NumPy scalar / 0-d array; normalise it to
        # a plain int before using it as a list index.
        cell = int(action)

        if not self.__is_spot_empty(cell):
            # Illegal move: the cell is already taken.
            self._episode_ended = True
            return ts.termination(self._observation(), -1)

        self._state[cell] = 1

        if self.__all_spots_occupied():
            self._episode_ended = True
            return ts.termination(self._observation(), 1)

        return ts.transition(self._observation(), reward=0.05, discount=1.0)

In [17]:
# Build the pure-Python environment first ...
python_environment = SimplifiedTicTacToe()
# ... then wrap it so it can be driven through the TensorFlow environment interface.
tf_env = tf_py_environment.TFPyEnvironment(environment=python_environment)

In [25]:
# Roll out a uniformly random policy and record, per episode, the total reward
# and the number of non-terminal steps taken.
time_step = tf_env.reset()
rewards = []
steps = []
number_of_episodes = 10000

for _ in range(number_of_episodes):
  episode_reward = 0.0
  episode_steps = 0
  tf_env.reset()
  while True:
    # Sample one cell index in [0, 9) for the single (batched) environment.
    action = tf.random.uniform([1], 0, 9, dtype=tf.int32)
    next_time_step = tf_env.step(action)

    # Accumulate the reward before checking for termination so that the
    # terminal reward is part of the episode total. The reward is batched
    # with one entry; store it as a plain float so `rewards` is homogeneous.
    episode_reward += float(next_time_step.reward.numpy()[0])

    if next_time_step.is_last():
      break

    # Only non-terminal steps are counted; the terminating step is excluded.
    episode_steps += 1
  rewards.append(episode_reward)
  steps.append(episode_steps)

In [26]:
# Average number of non-terminal steps per episode across all simulated episodes.
mean_no_of_steps = np.asarray(steps).mean()

In [27]:
# Show the mean computed from `steps` (per-episode counts of non-terminal steps).
mean_no_of_steps

3.4586