In [1]:
from tictactoe import TicTacToeEnv
from train import update_Q
import random
import gym

from keras.models import Sequential
from keras import layers
from keras import losses
from keras import optimizers
from keras import initializers
import numpy as np

%matplotlib notebook
import matplotlib.pyplot as plt

In [2]:
#from keras.model import Sequential

# Shared tic-tac-toe environment used by the agents and the training/play cells below.
# NOTE(review): the 'tictactoe-v0' id is presumably registered as a side effect of
# importing `tictactoe` in the first cell — confirm.
env = gym.make('tictactoe-v0')

def create_agent_model(learning_rate):
    """Build and compile the fully connected network used as an agent's Q-function.

    The network stacks four 27-unit ReLU layers followed by a 9-unit linear
    output layer. Every layer's kernel is initialised from a normal
    distribution with stddev 0.01. The model is compiled with Adam and a
    mean-squared-error loss.

    Args:
        learning_rate: learning rate handed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    def small_normal():
        # A fresh initializer instance per layer, as in a hand-written stack.
        return initializers.RandomNormal(stddev=0.01)

    # input = one hot encoding of the board state (3*9 = 27 inputs)
    # TODO: Use 2-d inputs so the encoding & relation between cells does not have to be learned by the model
    layer_specs = [(27, 'relu')] * 4 + [(9, 'linear')]

    network = Sequential()
    for units, activation in layer_specs:
        network.add(layers.Dense(units, activation=activation, kernel_initializer=small_normal()))

    #sgd = optimizers.SGD(learning_rate=learning_rate, momentum=0.0, nesterov=False, name="SGD")
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    mse = losses.MeanSquaredError()
    network.compile(optimizer, mse)

    return network

class Agent:
    """Epsilon-greedy tic-tac-toe player driven by a Q-value model.

    With probability ``random_rate`` the agent plays a uniformly random cell
    (0-8, possibly an occupied one); otherwise it plays the cell with the
    highest value predicted by ``model`` for the current one-hot board.
    """

    def __init__(self, model, env=None):
        """Create an agent.

        Args:
            model: object exposing ``predict(batch)`` that returns one row of
                action values per board in the batch.
            env: optional default environment, used by ``_get_best_action``
                when no environment is passed explicitly. When omitted, the
                notebook-level ``env`` (if one exists) is used, which is what
                this constructor previously captured implicitly.
        """
        self.model = model
        self.env = env if env is not None else globals().get("env")
        self.random_rate = 0

    def set_random_rate(self, rate):
        """Set the probability (0..1) of playing a random move instead of the greedy one."""
        self.random_rate = rate

    def get_action(self, env):
        """Return the cell index (0-8) to play on ``env``'s current board.

        The random number generator is only consulted when ``random_rate`` is
        positive, so a rate of 0 gives fully deterministic greedy play.
        """
        if (self.random_rate > 0) and (random.random() < self.random_rate):
            return self._get_random_action()
        # Evaluate the environment that was actually passed in, not a global one.
        return self._get_best_action(env)

    def _get_best_action(self, env=None):
        """Return the argmax action of the model's prediction for the board.

        Args:
            env: environment whose ``_one_hot_board()`` is evaluated; falls
                back to ``self.env`` when omitted.
        """
        board_env = env if env is not None else self.env
        q_values = self.model.predict(np.array([board_env._one_hot_board()]))
        # predict() is given a batch of one board; take the argmax of that row.
        return np.argmax(q_values, axis=1)[0]

    def _get_random_action(self):
        """Return a uniformly random cell index in 0..8."""
        # truly random, so could yield invalid moves
        return random.randint(0, 8)

    #TODO: get random *valid* action and get best *valid* action (to avoid invalid moves when not training)

In [3]:
def record_experience(env, agent1, agent2):
    """Play one full game on ``env`` and return its list of transitions.

    The environment is reset first. On every turn the agent to move
    (``agent1`` when ``env.current_player`` is 0, otherwise ``agent2``)
    chooses an action, which is applied with ``env.step``.

    Returns:
        A list of ``(state, action, reward, next_state, done)`` tuples in
        play order, where ``state`` is the one-hot board before the move and
        the remaining values come from ``env.step``.
    """
    env.reset()
    transitions = []
    finished = False
    while not finished:
        mover = agent2 if env.current_player != 0 else agent1

        chosen = mover.get_action(env)
        # Snapshot the board before the move is applied.
        before = env._one_hot_board()

        after, reward, finished, _info = env.step(chosen)
        transitions.append((before, chosen, reward, after, finished))
    return transitions

In [4]:
# Two separately initialised Q-networks (learning rate 0.001), one per player.
model1, model2 = create_agent_model(0.001), create_agent_model(0.001)
agent1, agent2 = Agent(model1), Agent(model2)

In [6]:
#  work in progress

def split_experiences(experiences):
    """Split one game's transitions by mover.

    Returns:
        ``(first, second)`` where ``first`` holds the even-indexed entries
        (agent1's moves) and ``second`` the odd-indexed entries (agent2's
        moves), each in original order.
    """
    first_mover = experiences[0::2]
    second_mover = experiences[1::2]
    return first_mover, second_mover
    
def train_model(experiences, model, verbose=False, gamma=0.95, epochs=10):
    """Fit ``model`` on Q-targets derived from a list of transitions.

    Args:
        experiences: sequence of ``(state, action, reward, next_state, done)``
            tuples. An empty sequence is a no-op.
        model: object with ``predict(x)`` and ``fit(x=..., y=..., epochs=...,
            verbose=...)``.
        verbose: when true, print the intermediate arrays and let ``fit``
            report progress.
        gamma: discount factor passed to ``update_Q``.
        epochs: number of epochs passed to ``model.fit``.

    Returns:
        None.
    """
    # Nothing to learn from; also avoids calling predict() on an empty batch.
    if len(experiences) == 0:
        return

    start_states = np.array([e[0] for e in experiences])
    actions = np.array([e[1] for e in experiences])
    rewards = np.array([e[2] for e in experiences])
    next_states = np.array([e[3] for e in experiences])
    dones = [e[4] for e in experiences]
    Q = model.predict(start_states)
    nextQ = model.predict(next_states)

    if verbose:
        print("Q", Q)
        print("nextQ", nextQ)
        print("actions", actions)
        print("rewards", rewards)
        print("dones", dones)
    # update_Q's return value is not used: Q is expected to be modified in place
    # and is then used as the regression target.
    update_Q(Q, nextQ, actions, rewards, dones, gamma)
    if verbose:
        print("updated Q", Q)

    model.fit(x=start_states, y=Q, epochs=epochs, verbose=verbose)

def train_game_per_game(env, agent1, agent2, num_batches = 100, batch_size = 100, verbose=False, fail_reward=-3):
    """Run self-play and fit both agents' models after every single game.

    For each of ``num_batches`` batches, ``batch_size`` games are played with
    ``record_experience``. After each game the transitions are split per
    agent with ``split_experiences`` and each agent's model is fitted on its
    own transitions with ``train_model``. Wins and draws are printed as they
    happen, and one summary line is printed per batch.

    Args:
        env: environment exposing ``get_winner()`` and ``current_player``.
        agent1: agent playing as player 0; its ``model`` is trained.
        agent2: agent playing as player 1; its ``model`` is trained.
        num_batches: number of summary batches to run.
        batch_size: number of games per batch.
        verbose: forwarded to ``train_model``.
        fail_reward: final reward that marks a game without a winner as a
            failure of the player to move (presumably the env's penalty for
            an invalid move — confirm against the environment).

    Returns:
        None. Results are reported through printed output only.
    """
    for batch_num in range(num_batches):
        # All counters are per batch.
        moves = 0
        agent1_wins = 0
        agent2_wins = 0
        agent1_fail = 0
        agent2_fail = 0
        draws = 0

        for _ in range(batch_size):
            experience = record_experience(env, agent1, agent2)
            actions = [e[1] for e in experience]
            final_reward = experience[-1][2]

            if final_reward == 0:
                print("draw! actions:", actions)
            winner = env.get_winner()
            if winner == 0:
                print("agent 1 win! actions:", actions)
                agent1_wins += 1
            elif winner == 1:
                print("agent 2  win! actions:", actions)
                agent2_wins += 1
            elif final_reward == fail_reward:
                # No winner and the game ended on the failure reward:
                # attribute it to whoever the env reports as current player.
                if env.current_player == 0:
                    agent1_fail += 1
                else:
                    agent2_fail += 1
            else:
                draws += 1

            moves += len(experience)

            # train model after every game
            e1, e2 = split_experiences(experience)
            train_model(e1, agent1.model, verbose=verbose)
            train_model(e2, agent2.model, verbose=verbose)

        print("batch %3d / %4d moves / %4d games / %4d a1 wins / %4d a1 fails / %4d a2 wins / %4d a2 fails / %4d draws "%( 
              batch_num, moves, batch_size, agent1_wins, agent1_fail, agent2_wins, agent2_fail, draws))
    
# Train both agents with a 5% chance of a random move per turn:
# 1000 batches of 10 games, fitting each model after every game.
agent1.set_random_rate(0.05)
agent2.set_random_rate(0.05)
train_game_per_game(env, agent1, agent2, 1000, 10, verbose=False)


actions: [2, 6, 2]
actions: [2, 1, 2]
actions: [3, 1, 2, 1]
actions: [2, 1, 2]
actions: [2, 1, 2]
actions: [2, 1, 2]
actions: [2, 1, 2]
actions: [2, 1, 2]
actions: [2, 6, 2]
actions: [2, 1, 2]
batch   0 /   31 moves /   10 games /    0 a1 wins /    9 a1 fails /    0 a2 wins /    1 a2 fails /    0 draws 
actions: [2, 1, 2]
actions: [2, 1, 2]
actions: [2, 0, 2]
actions: [3, 0, 3]
actions: [7, 0, 7]
actions: [2, 1, 2]
actions: [2, 0, 2]
actions: [3, 1, 3]
actions: [3, 1, 3]
actions: [3, 1, 3]
batch   1 /   30 moves /   10 games /    0 a1 wins /   10 a1 fails /    0 a2 wins /    0 a2 fails /    0 draws 
actions: [3, 1, 3]
actions: [3, 1, 3]
actions: [8, 1, 3, 1]
actions: [7, 0, 3, 0]
actions: [3, 0, 3]
actions: [7, 0, 7]
actions: [3, 0, 3]
actions: [7, 0, 7]
actions: [7, 0, 3, 0]
actions: [2, 0, 3, 7, 3]
batch   2 /   35 moves /   10 games /    0 a1 wins /    7 a1 fails /    0 a2 wins /    3 a2 fails /    0 draws 
actions: [3, 0, 3]
actions: [3, 0, 5, 0]
actions: [3, 0, 3]
actions: [3, 0, 

actions: [6, 7, 6]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
batch  27 /   27 moves /   10 games /    0 a1 wins /    7 a1 fails /    0 a2 wins /    3 a2 fails /    0 draws 
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
batch  28 /   20 moves /   10 games /    0 a1 wins /    0 a1 fails /    0 a2 wins /   10 a2 fails /    0 draws 
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
batch  29 /   20 moves /   10 games /    0 a1 wins /    0 a1 fails /    0 a2 wins /   10 a2 fails /    0 draws 
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 2, 8, 7]
actions: [7, 7]
actions: [7, 7]
actions: [7, 7]
batch  30 /   22 moves /   10 games /    0 a1 wins /    0 a1 fails /    0 a2 wins /   10 a2 fails /    0 draws 

KeyboardInterrupt: 

In [None]:
def play_game(env, agent1, agent2, correct_invalid_actions = False):
    """Play one rendered game between two agents, printing each decision.

    On every turn the agent to move (``agent1`` when ``env.current_player``
    is 0, otherwise ``agent2``) picks an action; the model's raw prediction
    and the chosen action are printed. If the chosen cell is already
    occupied, the action is replaced by a random free cell before it is
    applied with ``env.step``.

    Args:
        env: environment exposing ``reset``, ``render``, ``step``, ``board``,
            ``current_player`` and ``_one_hot_board``.
        agent1: agent for player 0.
        agent2: agent for player 1.
        correct_invalid_actions: accepted for interface compatibility but not
            consulted; occupied-cell picks are always corrected.
            NOTE(review): decide whether this flag should gate the correction.
    """
    done = False
    env.reset()
    env.render()
    while not done:
        agent = agent1 if env.current_player == 0 else agent2
        action = agent.get_action(env)
        print("prediction", agent.model.predict(np.array([env._one_hot_board()])))
        print("action:", action)
        if env.board[action] is not None:
            # A cell is free when its board entry is None.
            valid_actions = [cell for cell, mark in enumerate(env.board) if mark is None]
            # Leave the action untouched if there is nothing to fall back to.
            if valid_actions:
                action = random.choice(valid_actions)
                print("Agent picked invalid action. Adjusting with random value")
        obs,reward,done,info = env.step(action)
        env.render()

In [None]:
# Turn off random moves so both agents always play their highest-valued action,
# then play and render a single game between them.
agent1.set_random_rate(0)
agent2.set_random_rate(0)
play_game(env, agent1, agent2)