In [1]:
from tictactoe import TicTacToeEnv
import random
import gym

from keras.models import Sequential
from keras import layers
from keras import losses
from keras import optimizers
import numpy as np

%matplotlib notebook
import matplotlib.pyplot as plt

In [2]:
# Shared game environment: used as the fallback env inside Agent and passed to the
# training / play functions in the cells below.
# NOTE(review): the 'tictactoe-v0' id is presumably registered with gym as a side
# effect of importing `tictactoe` in the first cell — confirm.

env = gym.make('tictactoe-v0')

def create_agent_model(learning_rate):
    """Build and compile the fully connected network an agent uses to score moves.

    Architecture: Dense(27, relu) -> 3 x Dense(100, relu) -> Dense(9, linear).
    The 9 outputs are indexed by action (board cell 0-8) elsewhere in this
    notebook. No explicit input shape is declared, so the model is built on
    first use.

    Args:
        learning_rate: learning rate handed to the Adam optimizer.

    Returns:
        A compiled keras ``Sequential`` model (Adam optimizer, mean squared error loss).
    """
    model = Sequential()

    # input = one hot encoding of the board state (3*9 = 27 inputs)
    # TODO: Use 2-d inputs so the encoding & relation between cells does not have to be learned by the model
    relu_widths = (27, 100, 100, 100)
    for width in relu_widths:
        model.add(layers.Dense(width, activation="relu"))

    # Linear output layer: one value per possible action.
    model.add(layers.Dense(9))

    adam = optimizers.Adam(learning_rate=learning_rate)
    mse = losses.MeanSquaredError()
    model.compile(adam, mse)

    return model

class Agent:
    """Epsilon-greedy tic-tac-toe player backed by a value model.

    With probability ``random_rate`` the agent plays a uniformly random cell
    (0-8); otherwise it plays the cell with the highest model output for the
    current board.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, model):
        # Model exposing ``predict``; its output is argmax-ed over 9 actions.
        self.model = model
        # Fallback environment (the notebook-level ``env``); only used when
        # ``_get_best_action`` is called without an explicit environment.
        self.env = env
        # Probability of playing a random move instead of the greedy one.
        self.random_rate = 0.01

    def set_random_rate(self, rate):
        """Set the exploration probability (0 disables random moves)."""
        self.random_rate = rate

    def get_action(self, env):
        """Choose a move for the current board of ``env``.

        Args:
            env: environment to act in; must provide ``_one_hot_board()``.

        Returns:
            int in 0..8. The move is NOT checked for validity.
        """
        if self.random_rate > 0 and random.random() < self.random_rate:
            return self._get_random_action()
        # Use the environment the caller handed in, not the notebook global.
        return self._get_best_action(env)

    def _get_best_action(self, env=None):
        """Return the action with the highest predicted value.

        Args:
            env: environment whose board is evaluated; defaults to ``self.env``
                so existing zero-argument calls keep working.
        """
        board_env = self.env if env is None else env
        q_values = self.model.predict(np.array([board_env._one_hot_board()]))
        # Batch of one board -> take row 0; convert to a plain int so both
        # code paths of get_action return the same type.
        return int(np.argmax(q_values, axis=1)[0])

    def _get_random_action(self):
        # truly random, so could yield invalid moves
        return random.randint(0, 8)

    #TODO: get random *valid* action and get best *valid* action (to avoid invalid moves when not training)

In [3]:
def record_experience(env, agent1, agent2):
    """Play one full game and return its transitions.

    The environment is reset first. On every turn the agent whose turn it is
    (``agent1`` when ``env.current_player == 0``, otherwise ``agent2``) picks
    an action, which is applied with ``env.step``.

    Returns:
        list of ``(state, action, reward, next_state, done)`` tuples in move
        order, where ``state`` is ``env._one_hot_board()`` taken before the
        step and the remaining fields come from the step result.
    """
    env.reset()
    transitions = []
    while True:
        mover = agent1 if env.current_player == 0 else agent2

        chosen = mover.get_action(env)
        board_before = env._one_hot_board()

        next_state, reward, finished, _info = env.step(chosen)
        transitions.append((board_before, chosen, reward, next_state, finished))

        if finished:
            return transitions

In [4]:
# Two separate networks with the same architecture (learning rate 0.01), one per
# player: agent1 moves when env.current_player == 0, agent2 otherwise.
model1 = create_agent_model(0.01)
model2 = create_agent_model(0.01)
agent1 = Agent(model1)
agent2 = Agent(model2)

In [5]:
#  work in progress

def split_experiences(experiences):
    """Split a game's transitions by the player who made them.

    Returns:
        ``(first, second)``: the entries at even indices (agent1's moves) and
        the entries at odd indices (agent2's moves), in original order.
    """
    first_player_moves = experiences[0::2]
    second_player_moves = experiences[1::2]
    return first_player_moves, second_player_moves
    
def train_model(experiences, model, verbose=False, gamma=0.95):
    """Run one Q-learning update of ``model`` on a batch of transitions.

    For every transition the target for the action that was taken becomes
    ``reward + gamma * max(Q(next_state))`` (just ``reward`` when the game
    ended on that move); the targets for all other actions stay at the model's
    current predictions. The model is then fitted for one epoch on the batch.

    Args:
        experiences: sequence of ``(state, action, reward, next_state, done)``
            tuples, as produced by ``record_experience``.
        model: object exposing keras-style ``predict(x)`` and
            ``fit(x=..., y=..., epochs=..., verbose=...)``.
        verbose: forwarded to ``model.fit``.
        gamma: discount factor applied to the best next-state value.

    Returns:
        None. Does nothing when ``experiences`` is empty.
    """
    # Nothing to learn from; also avoids handing empty arrays to predict/fit.
    if len(experiences) == 0:
        return

    start_states = np.array([e[0] for e in experiences])
    actions = np.array([e[1] for e in experiences], dtype=int)
    rewards = np.array([e[2] for e in experiences], dtype=float)
    next_states = np.array([e[3] for e in experiences])
    dones = np.array([e[4] for e in experiences], dtype=bool)

    # Copy so the targets can be edited regardless of how predict allocates.
    targets = np.array(model.predict(start_states))
    # NOTE(review): next_states go through the same model as start_states, so
    # they are assumed to use the same encoding — confirm what env.step returns.
    next_q = model.predict(next_states)

    # Update Q values for observed rewards of actions.
    # If the game is done we don't need to add the next Q value.
    future_value = np.where(dones, 0.0, gamma * np.max(next_q, axis=1))
    rows = np.arange(len(experiences))
    targets[rows, actions] = rewards + future_value

    # Fit the whole batch at once; fitting one sample at a time is much slower.
    model.fit(x=start_states, y=targets, epochs=1, verbose=verbose)
    
def train_game_per_game(env, agent1, agent2, num_batches = 100, batch_size = 100):
    """Train both agents by self-play, updating their models after every game.

    Each game is recorded with ``record_experience``, split per player with
    ``split_experiences`` and fed to ``train_model`` (agent1's moves train
    ``agent1.model``, agent2's moves train ``agent2.model``). After every batch
    of ``batch_size`` games one summary line with that batch's statistics is
    printed.

    Args:
        env: game environment; must provide ``get_winner()`` and
            ``current_player`` in addition to what ``record_experience`` needs.
        agent1: player used when ``env.current_player == 0``.
        agent2: the other player.
        num_batches: number of summary batches to run.
        batch_size: number of games per batch.
    """
    # Reward of the final move that this loop counts as a "fail".
    # NOTE(review): presumably the environment's invalid-move penalty — confirm
    # against TicTacToeEnv.
    fail_reward = -3

    for batch_num in range(num_batches):
        # All statistics are per batch, so they start from zero here.
        moves = 0
        agent1_wins = 0
        agent2_wins = 0
        agent1_fail = 0
        agent2_fail = 0
        draws = 0

        for _ in range(batch_size):
            experience = record_experience(env, agent1, agent2)
            winner = env.get_winner()
            if winner == 0:
                agent1_wins += 1
            elif winner == 1:
                agent2_wins += 1
            elif experience[-1][2] == fail_reward:
                # No winner and the game ended on a fail: blame the player the
                # environment reports as current after the final step.
                if env.current_player == 0:
                    agent1_fail += 1
                else:
                    agent2_fail += 1
            else:
                draws += 1

            moves += len(experience)

            e1, e2 = split_experiences(experience)

            # train model after every game
            train_model(e1, agent1.model)
            train_model(e2, agent2.model)

        print("batch %3d / %4d moves / %4d games / %4d a1 wins / %4d a1 fails / %4d a2 wins / %4d a2 fails / %4d draws "%( 
              batch_num, moves, batch_size, agent1_wins, agent1_fail, agent2_wins, agent2_fail, draws))
                    
# Explore with a 2% chance of a random move per turn, then run 100 batches of
# 100 self-play games (one summary line is printed per batch).
agent1.set_random_rate(0.02)
agent2.set_random_rate(0.02)
train_game_per_game(env, agent1, agent2, 100, 100)


KeyboardInterrupt: 

In [None]:
def play_game(env, agent1, agent2):
    """Play one game between the two agents, rendering the board after every move.

    Before each step the acting agent's model output for the current board and
    the chosen action are printed. ``agent1`` moves when
    ``env.current_player == 0``, ``agent2`` otherwise.
    """
    env.reset()
    env.render()
    finished = False
    while not finished:
        mover = agent1 if env.current_player == 0 else agent2
        chosen = mover.get_action(env)
        board_batch = np.array([env._one_hot_board()])
        print("prediction", mover.model.predict(board_batch))
        print("action:", chosen)
        _obs, _reward, finished, _info = env.step(chosen)
        env.render()

In [None]:
# Turn random exploration off so both agents always play their model's
# highest-valued move, then play and render a single game.
agent1.set_random_rate(0)
agent2.set_random_rate(0)
play_game(env, agent1, agent2)