In [1]:
from tictactoe import TicTacToeEnv
import random
import gym

from keras.models import Sequential
from keras import layers
from keras import losses
from keras import optimizers
import numpy as np

In [2]:
#from keras.model import Sequential

env = gym.make('tictactoe-v0')

def create_agent_model():
    model = Sequential()

    # each position can be empty, 0 or 1 and there are 9 positions
    # we encode each position with two 0/1 values.  empty = 0/0, player1 = 1/0, player2 = 0/1
    model.add(layers.Dense(18, activation="relu"))
    model.add(layers.Dense(100, activation="relu"))
    model.add(layers.Dense(100, activation="relu"))
    model.add(layers.Dense(9))
    
    optimizer =  optimizers.Adam(learning_rate=0.01)
    loss = losses.MeanSquaredError()
    model.compile(optimizer, loss)
    
    return model


class Agent:
    def __init__(self, model):
        self.model = model
        self.env = env
        self.use_randomness = False
        self.random_rate = 0.2
        
    def set_random_rate(self, rate):
        self.random_rate = rate
        
    def get_action(self, env):
        if self.use_randomness:
            return random.randint(0,8)
        else:
            best_action = self._get_best_action()
            return best_action
    
    def _get_best_action(self):
        best_action = np.argmax(self.model.predict(np.array([env._one_hot_board()])))
        return best_action
        
    def _get_random_action(self):
        # truly random, so could yield invalid moves
        return random.randint(0, 8)
    
    def set_use_randomness(self, value):
        self.use_randomness = value
    
    #TODO: get random *valid* action and get best *valid* action (to avoid invalid moves when not training)

In [3]:
def record_experience(env, agent):
    # plays a game until it's done, recording all steps into tuples (state, action, reward, next_state, done)
    env.reset()
    done = False
    experience = []
    while not done:
        action = agent.get_action(env)
        state = env._one_hot_board()
        step_result = env.step(action)
        (next_state, reward, done, info) = step_result
        
        experience.append((state, action, reward, next_state, done))
    return experience

In [4]:
model = create_agent_model()
agent = Agent(model)

In [5]:

#  work in progress

def train_model(experiences, model, verbose=False):
    start_states = np.array([e[0] for e in experiences])
    actions = np.array([e[1] for e in experiences])
    rewards = np.array([e[2] for e in experiences])
    next_states = np.array([e[3] for e in experiences])
    predictions  = model.predict(start_states)
    next_predictions = model.predict(next_states)
    
    gamma = 0.95
    for prediction,action,reward,next_prediction in zip(predictions, actions, rewards, next_predictions):
        prediction[action] = reward + gamma * np.max(next_prediction)
    
    model.fit(x=start_states, y=predictions, epochs=1, verbose=verbose)
    
def train_game_per_game(num_games, env, agent):
    for i in range(num_games):
        if i%100 == 0:
            experience = record_experience(env, agent)
            train_model(experience, model, verbose=True)
            print("round %d, game duration %d"%(i, len(experience)))
        else:
            experience = record_experience(env, agent)
            train_model(experience, model)
            
def train_batch_experiences(num_actions, env, agent, verbose=False):
    env.reset()
    done = False
    experiences = []
    for i in range(num_actions):
        action = agent.get_action(env)
        state = env._one_hot_board()
        
        step_result = env.step(action)
        (next_state, reward, done, info) = step_result
        
        experiences.append((state, action, reward, next_state, done))
        
        if done:
            env.reset()
      
    print("training with %d experiences"%(len(experiences)))
    train_model(experiences, model, verbose)

agent.set_use_randomness(True)
train_game_per_game(10000, env, agent)
#for epoch in range(5000):
#    print("epoch", epoch)
#    train_batch_experiences(100, env, agent, True)

round 0, game duration 5
round 100, game duration 5
round 200, game duration 3
round 300, game duration 3
round 400, game duration 5
round 500, game duration 4
round 600, game duration 5
round 700, game duration 2
round 800, game duration 4
round 900, game duration 3
round 1000, game duration 4
round 1100, game duration 4
round 1200, game duration 5
round 1300, game duration 4
round 1400, game duration 7
round 1500, game duration 5
round 1600, game duration 3
round 1700, game duration 3
round 1800, game duration 4
round 1900, game duration 7
round 2000, game duration 6
round 2100, game duration 3
round 2200, game duration 2
round 2300, game duration 6
round 2400, game duration 6
round 2500, game duration 6
round 2600, game duration 3
round 2700, game duration 6
round 2800, game duration 5
round 2900, game duration 3
round 3000, game duration 6
round 3100, game duration 4
round 3200, game duration 3
round 3300, game duration 5
round 3400, game duration 5
round 3500, game duration 7
roun

round 8500, game duration 5
round 8600, game duration 4
round 8700, game duration 4
round 8800, game duration 3
round 8900, game duration 5
round 9000, game duration 4
round 9100, game duration 4
round 9200, game duration 2
round 9300, game duration 6
round 9400, game duration 7
round 9500, game duration 5
round 9600, game duration 3
round 9700, game duration 2
round 9800, game duration 5
round 9900, game duration 2


In [6]:
def play_game(env, agent):
    done = False
    env.reset()
    env.render()
    while not done:
        action = agent.get_action(env)
        print("prediction", model.predict(np.array([env._one_hot_board()])))
        print("action:", action)
        obs,reward,done,info = env.step(action)
        env.render()

In [7]:
agent.set_use_randomness(False)
play_game(env, agent)

0 0 0
0 0 0
0 0 0
prediction [[104.399086 106.468605 107.04008  100.85167  117.147606 105.31688
   98.74773  113.27484   99.71636 ]]
action: 4
0 0 0
0 1 0
0 0 0
prediction [[157.175   160.42621 156.68333 158.6314  169.54083 166.26317 158.28264
  164.12338 156.46956]]
action: 4
0 0 0
0 1 0
0 0 0


In [8]:
play_game(env, agent)

0 0 0
0 0 0
0 0 0
prediction [[104.399086 106.468605 107.04008  100.85167  117.147606 105.31688
   98.74773  113.27484   99.71636 ]]
action: 4
0 0 0
0 1 0
0 0 0
prediction [[157.175   160.42621 156.68333 158.6314  169.54083 166.26317 158.28264
  164.12338 156.46956]]
action: 4
0 0 0
0 1 0
0 0 0
